From 0f2ee2d16c72fba5605aea2d6a2f48e93e7146ad Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Fri, 8 May 2026 11:22:57 +0800
Subject: [PATCH 01/43] [Feature] HunyuanImage-3.0 IT2I: support multi-image
input
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
HunyuanImage-3.0-Instruct supports up to 3 reference images for IT2I
"Multi-Image Fusion" upstream (README §200-216, §500). vllm-omni's DiT
pipeline, AR processor, OpenAI schema, and ar2diffusion bridge already
accepted list-shaped `multi_modal_data["image"]`, but four call sites
still encoded a hard "N=1" assumption that blocked real multi-image
runs. End-to-end smoke (4× L20X) on the official `input_1_0.png` +
`input_1_1.png` demo pair runs cleanly and preserves each image's
native bucket (no forced cropping of the second image).
Surgery points:
1. `vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py`:
`build_prompt` / `build_prompt_tokens` take `num_images: int`
(default 1, validated 1 <= N <= 3 for image-input tasks) and emit N
consecutive `<img>` placeholders between `User: ` and the user
prompt. Mirrors the official tokenizer where each cond_image becomes
its own user-role message and `apply_general_template` concatenates
successive user messages back-to-back inside one user_prefix /
user_suffix wrap.
2. `vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py`
`HunyuanImage3Processor.process_image`: each cond image now keeps
its own VAE `reso_group` bucket (mirrors the official ragged
behavior in `_encode_cond_image`). Per-image VAE pixel tensors are
flattened to 1-D and concatenated; `_get_mm_fields_config` declares
`vae_pixel_values` with `MultiModalFieldConfig.flat_from_sizes(...,
vae_pixel_size)` so vLLM splits the buffer back per image at
consumption time. Mirrors the GLM-Image / Ming-Flash-Omni pattern.
`_parse_and_validate_image_input` reconstructs a list of per-image
(3, H_i, W_i) tensors using `vae_token_grid_hw`; `embed_multimodal`
loops over the list for VAE encode + patch_embed (which was already
per-image after the encode call). VIT (Siglip2 naflex) keeps the
`batched("image")` path since naflex pads to `max_num_patches`.
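3. `vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py`
`instantiate_timestep_tokens`: `_encode_cond_image` returns
`cond_t` as `list[Tensor]` for the multi-image branch (one tensor
of N_cond_images timesteps per batch item). `instantiate_vae_image_tokens`
already had a per-batch zip loop for the list shape; this function
was missed and called `t.reshape(-1)` directly, which fails when `t`
is a list. The fix concatenates the per-item tensors into one flat
tensor before the existing `timestep_emb` + reshape + scatter.
NOTE: this is a global flatten, not a per-batch loop. It is only
correct while every batch item carries the same number of cond images
(the stage runtime currently pins the batch size to 1 via
`max_batch_size`); heterogeneous batches would need a per-batch loop
that slices both `t` and `timestep_scatter_index`, mirroring
`instantiate_vae_image_tokens`.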
4. `examples/offline_inference/hunyuan_image3/end2end.py`:
`--image-path` accepts comma-separated paths (matching the official
upstream CLI); `num_images` is threaded through to the prompt
builder.
Tests: new regression file pinning N=1/2/3 placeholder layout (string
+ token-id, FakeTokenizer for fast CPU coverage), default-N=1
byte-equivalence with legacy callers, ValueError for out-of-range N,
and three real-`AutoTokenizer.from_pretrained` cases proving N=1/2/3
produce N consecutive `<img>` token ids on the production tokenizer
path with no separator drift between successive `<img>` placeholders.
End-to-end smoke (4× L20X 143GB, AR=TP2 + DiT=TP2, 20 denoise steps,
multi-image fusion against the official demo pair):
- AR generated CoT tokens for the fused request
- DiT denoise 20/20 steps in 24s (~1.10 s/step)
- Peak GPU mem 95.52 GB reserved / 90.10 GB allocated, 5.7% pool
- Output PNG saved cleanly; second reference image's native aspect
visible in the fusion (vs the prior shared-bucket implementation
that forced it into the first image's square bucket).
Output-size handling for the AR/DiT ratio lifecycle is intentionally
NOT touched. The pre-existing `image_list[0]` raw-pixel fallback in
`pre_process_func` bypasses the AR's ratio-token prediction (the
`<img_ratio_*>` token sampled under `SliceVocabLogitsProcessor`);
properly wiring that into `ar2diffusion`'s width/height assignment is
a separate refactor.
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../hunyuan_image3/end2end.py | 34 ++-
.../test_hunyuan_image3_it2i_multi_image.py | 251 ++++++++++++++++++
.../hunyuan_image3/pipeline_hunyuan_image3.py | 7 +-
.../models/hunyuan_image3/prompt_utils.py | 36 ++-
.../models/hunyuan_image3/hunyuan_image3.py | 117 +++++---
5 files changed, 389 insertions(+), 56 deletions(-)
create mode 100644 tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 5232568f11e..f9f734c9f4a 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -10,6 +10,7 @@
Usage:
python end2end.py --modality text2img --prompts "A cute cat"
python end2end.py --modality img2img --image-path input.png --prompts "Make it snowy"
+ python end2end.py --modality img2img --image-path img1.png,img2.png --prompts "Combine"
python end2end.py --modality img2text --image-path input.png --prompts "Describe this image"
"""
@@ -71,7 +72,7 @@ def parse_args():
"--image-path",
type=str,
default=None,
- help="Path to input image (for img2img/img2text).",
+ help="Input image path(s) for img2img/img2text. Comma-separated for multi-image (up to 3).",
)
parser.add_argument(
"--output",
@@ -207,14 +208,19 @@ def main():
print("[Info] No prompts provided, using default.")
prompts = ["A cute cat"]
- # Load image if needed
- input_image = None
+ input_images: list = []
if args.modality in ("img2img", "img2text"):
- if not args.image_path or not os.path.exists(args.image_path):
+ if not args.image_path:
raise ValueError(f"--image-path required for {args.modality}, got: {args.image_path}")
from PIL import Image
- input_image = Image.open(args.image_path).convert("RGB")
+ image_paths = [p.strip() for p in args.image_path.split(",") if p.strip()]
+ for p in image_paths:
+ if not os.path.exists(p):
+ raise ValueError(f"Image path does not exist: {p}")
+ input_images.append(Image.open(p).convert("RGB"))
+ if not input_images:
+ raise ValueError(f"--image-path produced no usable paths: {args.image_path!r}")
# Load tokenizer for segment-wise prompt tokenization (matches HF
# apply_chat_template byte-for-byte; see build_prompt_tokens docstring).
@@ -222,10 +228,18 @@ def main():
tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+ mm_image_payload = (input_images[0] if len(input_images) == 1 else input_images) if input_images else None
+
# Format prompts
formatted_prompts: list[OmniPromptType] = []
for p in prompts:
- result = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type)
+ # Only pass `num_images` for modalities that actually consume images;
+ # text-only paths ignore the parameter, but threading it
+ # unconditionally reads as if t2i needed at least one image.
+ build_kwargs: dict = {"task": task, "sys_type": args.sys_type}
+ if input_images:
+ build_kwargs["num_images"] = len(input_images)
+ result = build_prompt_tokens(p, tokenizer, **build_kwargs)
token_ids = result.token_ids
effective_sys_type = result.system_prompt_type
@@ -243,12 +257,12 @@ def main():
prompt_dict["modalities"] = ["image"]
elif args.modality == "img2img":
prompt_dict["modalities"] = ["image"]
- prompt_dict["multi_modal_data"] = {"image": input_image}
- prompt_dict["height"] = input_image.height
- prompt_dict["width"] = input_image.width
+ prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
+ prompt_dict["height"] = input_images[0].height
+ prompt_dict["width"] = input_images[0].width
elif args.modality == "img2text":
prompt_dict["modalities"] = ["text"]
- prompt_dict["multi_modal_data"] = {"image": input_image}
+ prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
elif args.modality == "text2text":
prompt_dict["modalities"] = ["text"]
diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
new file mode 100644
index 00000000000..c8a9891385c
--- /dev/null
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
@@ -0,0 +1,251 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Multi-image input regression for HunyuanImage3 IT2I prompt construction.
+
+The official HunyuanImage-3.0-Instruct supports up to 3 reference images
+per IT2I request ("Multi-Image Fusion"; see hunyuan3.0_ins/README.md
+section 200-216 + line 500). Each cond image becomes its own user-role
+message and `apply_general_template` concatenates successive user
+messages back-to-back inside ONE user_prefix/user_suffix wrap (see
+hunyuan3.0_ins/tokenization_hunyuan_image_3.py:1399-1400, 1499-1515).
+The lightweight `
` + `multi_modal_data` builder used by the example
+flow must match that contract: N consecutive `
` placeholders sit
+between `User: ` and the user prompt, with no separator between them.
+
+This file pins:
+ 1. N consecutive `
` placeholders for N=1/2/3 across both the
+ string builder (`build_prompt`) and the token builder
+ (`build_prompt_tokens`).
+ 2. The N=1 path stays bit-identical to the legacy single-image builder
+ (regression guard so default callers don't notice).
+ 3. N=2 / N=3 token sequences differ from N=1 by exactly (N-1) extra
+ `
` ids inserted between `User: ` and `user_prompt`.
+ 4. Validation: N<1 and N>3 raise ValueError (hard cap N<=3 mirrors
+ official upstream).
+ 5. Text-only tasks ignore `num_images` (no validation, no extra ids).
+"""
+
+from __future__ import annotations
+
+import os
+
+import pytest
+
+from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+ MAX_IMAGES_PER_REQUEST,
+ build_prompt,
+ build_prompt_tokens,
+)
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+class FakeTokenizer:
+ """Recording fake tokenizer mirroring the one in test_prompt_utils.
+
+ Special token ids: `<|startoftext|>`=1, `
`=2, ``=3,
+ ``=4. encode() returns one id per character starting at
+ 100, so substring-position assertions are stable.
+ """
+
+ SPECIAL = {
+ "<|startoftext|>": 1,
+ "
": 2,
+ "": 3,
+ "": 4,
+ }
+
+ def __init__(self) -> None:
+ self.encode_calls: list[str] = []
+
+ def convert_tokens_to_ids(self, tok: str) -> int:
+ return self.SPECIAL.get(tok, 0)
+
+ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
+ self.encode_calls.append(text)
+ return list(range(100, 100 + len(text)))
+
+
+_IMAGE_TASKS = ("i2t", "it2i_think", "it2i_recaption")
+_TEXT_ONLY_TASKS = ("t2t",)
+
+
+# -------------------- string builder --------------------
+
+
+@pytest.mark.parametrize("task", _IMAGE_TASKS)
+@pytest.mark.parametrize("num_images", [1, 2, 3])
+def test_build_prompt_emits_N_consecutive_img_placeholders(task: str, num_images: int):
+ """N=1/2/3 -> exactly N `
` substrings appear consecutively
+ between `User: ` and the user prompt, with no separator between them."""
+ s = build_prompt("HELLO", task=task, num_images=num_images)
+ assert s.count("
") == num_images, (
+ f"task={task} num_images={num_images}: expected {num_images}
"
+ f"placeholders, found {s.count('
')} -- prompt was: {s!r}"
+ )
+
+ # All `
` placeholders must form one contiguous run "![]()
..."
+ # immediately after `User: ` and before HELLO.
+ user_idx = s.index("User: ") + len("User: ")
+ hello_idx = s.index("HELLO")
+ between = s[user_idx:hello_idx]
+ assert between == "
" * num_images, (
+ f"region between `User: ` and prompt must be exactly N
placeholders; got {between!r}"
+ )
+
+
+def test_build_prompt_default_num_images_matches_legacy():
+ """num_images default = 1 must produce a string bit-identical to the
+ pre-multi-image behavior (single `
` placeholder)."""
+ legacy = build_prompt("HELLO", task="it2i_think")
+ explicit = build_prompt("HELLO", task="it2i_think", num_images=1)
+ assert legacy == explicit, "default num_images=1 must match legacy single-image output"
+
+
+# -------------------- token builder --------------------
+
+
+@pytest.mark.parametrize("task", _IMAGE_TASKS)
+def test_build_prompt_tokens_inserts_N_img_ids(task: str):
+ """N=1/2/3 -> the resulting id sequence contains exactly N copies of
+ img_id (=2) sitting consecutively after the `User: ` segment."""
+ tok = FakeTokenizer()
+ ids_n1 = build_prompt_tokens("hi", tok, task=task, num_images=1)
+ tok = FakeTokenizer()
+ ids_n2 = build_prompt_tokens("hi", tok, task=task, num_images=2)
+ tok = FakeTokenizer()
+ ids_n3 = build_prompt_tokens("hi", tok, task=task, num_images=3)
+
+ assert ids_n1.count(2) == 1
+ assert ids_n2.count(2) == 2
+ assert ids_n3.count(2) == 3
+
+ # Each additional image must extend the sequence by exactly one img_id,
+ # not shift other tokens around.
+ assert len(ids_n2) == len(ids_n1) + 1
+ assert len(ids_n3) == len(ids_n1) + 2
+
+ # The img_ids must be CONSECUTIVE (no other token between successive
+ # `
` placeholders -- mirrors the official `process_successive_message`
+ # wrapping where successive user messages share one user_prefix/suffix).
+ for ids, n in [(ids_n2, 2), (ids_n3, 3)]:
+ first = ids.index(2)
+ for k in range(n):
+ assert ids[first + k] == 2, (
+ f"img_ids must be consecutive starting at position {first} for n={n}; got {ids[first : first + n]!r}"
+ )
+
+
+def test_build_prompt_tokens_default_num_images_matches_legacy():
+ """num_images default = 1 must produce the same id sequence as
+ omitting the parameter (regression guard for existing single-image
+ callers)."""
+ tok_a = FakeTokenizer()
+ legacy = build_prompt_tokens("hi", tok_a, task="it2i_think")
+ tok_b = FakeTokenizer()
+ explicit = build_prompt_tokens("hi", tok_b, task="it2i_think", num_images=1)
+ assert legacy == explicit
+ # Also: encode() must have been called on the same set of segments,
+ # so segment boundaries are preserved.
+ assert tok_a.encode_calls == tok_b.encode_calls
+
+
+# -------------------- validation --------------------
+
+
+@pytest.mark.parametrize("task", _IMAGE_TASKS)
+@pytest.mark.parametrize("bad", [0, -1, MAX_IMAGES_PER_REQUEST + 1, 99])
+def test_build_prompt_rejects_out_of_range_num_images(task: str, bad: int):
+ with pytest.raises(ValueError, match="num_images must be in"):
+ build_prompt("hi", task=task, num_images=bad)
+ with pytest.raises(ValueError, match="num_images must be in"):
+ build_prompt_tokens("hi", FakeTokenizer(), task=task, num_images=bad)
+
+
+@pytest.mark.parametrize("task", _TEXT_ONLY_TASKS)
+@pytest.mark.parametrize("num_images", [0, 1, 2, 99])
+def test_text_only_tasks_ignore_num_images(task: str, num_images: int):
+ """Validation only kicks in for image-input tasks; t2t et al. accept
+ any num_images and emit zero `
` placeholders."""
+ s = build_prompt("hi", task=task, num_images=num_images)
+ assert "
" not in s
+ ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, num_images=num_images)
+ assert 2 not in ids
+
+
+# -------------------- real HF tokenizer regression --------------------
+
+_HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct"
+
+
+def _hf_cached(model_id: str) -> bool:
+ hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
+ snap_dir = os.path.join(hf_home, "hub", f"models--{model_id.replace('/', '--')}", "snapshots")
+ return os.path.isdir(snap_dir) and any(os.scandir(snap_dir))
+
+
+@pytest.mark.skipif(not _hf_cached(_HUNYUAN_MODEL_ID), reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache")
+@pytest.mark.parametrize("num_images", [1, 2, 3])
+def test_real_tokenizer_emits_n_consecutive_img_ids(num_images: int):
+ """Real `AutoTokenizer.from_pretrained(...)` (the production path) must
+ encode N=1/2/3 prompts to a sequence with exactly N consecutive `
`
+ token-ids in the right place — proves the placeholder layout from
+ `build_prompt_tokens` survives a real BPE tokenizer, not just FakeTokenizer.
+ """
+ from transformers import AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
+ img_id = tok.convert_tokens_to_ids("
")
+ assert img_id is not None and img_id >= 0, f"
not in tokenizer vocab; got id={img_id}"
+
+ ids = build_prompt_tokens("hi", tok, task="it2i_think", num_images=num_images)
+
+ # Exactly N copies of
id, all consecutive.
+ img_positions = [i for i, x in enumerate(ids) if x == img_id]
+ assert len(img_positions) == num_images, (
+ f"expected {num_images}
ids, got {len(img_positions)} at positions {img_positions}"
+ )
+ assert img_positions == list(range(img_positions[0], img_positions[0] + num_images)), (
+ f"
ids must be contiguous; got positions {img_positions}"
+ )
+
+
+@pytest.mark.skipif(not _hf_cached(_HUNYUAN_MODEL_ID), reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache")
+def test_real_tokenizer_n_plus_one_extends_by_exactly_one_img_id():
+ """Going from N to N+1 images must extend the encoded id sequence by
+ exactly one extra `
` token-id and shift nothing else. Catches
+ accidental separator tokens between successive `
` placeholders
+ that a FakeTokenizer (deterministic encode) can't surface."""
+ from transformers import AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
+ img_id = tok.convert_tokens_to_ids("
")
+
+ ids_n1 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=1)
+ ids_n2 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=2)
+ ids_n3 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=3)
+
+ assert len(ids_n2) == len(ids_n1) + 1, f"N=2 should be N=1 + 1 token; got {len(ids_n2)} vs {len(ids_n1)}"
+ assert len(ids_n3) == len(ids_n1) + 2, f"N=3 should be N=1 + 2 tokens; got {len(ids_n3)} vs {len(ids_n1)}"
+
+ # Insert one img_id at the existing position; everything else unchanged.
+ p1 = ids_n1.index(img_id)
+ assert ids_n2[: p1 + 1] == ids_n1[: p1 + 1] + [], "prefix before extra
must match N=1"
+ assert ids_n2[p1] == img_id and ids_n2[p1 + 1] == img_id, "two consecutive
ids at the insertion point"
+ assert ids_n2[p1 + 2 :] == ids_n1[p1 + 1 :], "tail after the extra
must match N=1's tail"
+ # N=3 same pattern, three in a row.
+ assert ids_n3[p1 : p1 + 3] == [img_id, img_id, img_id]
+ assert ids_n3[p1 + 3 :] == ids_n1[p1 + 1 :]
+
+
+@pytest.mark.skipif(not _hf_cached(_HUNYUAN_MODEL_ID), reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache")
+def test_real_tokenizer_default_n1_byte_identical_to_legacy():
+ """Default `num_images=1` must produce the exact same id sequence as
+ omitting the parameter — pins the legacy single-image regression
+ against the real tokenizer (not just FakeTokenizer)."""
+ from transformers import AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
+ legacy = build_prompt_tokens("hi", tok, task="it2i_think")
+ explicit = build_prompt_tokens("hi", tok, task="it2i_think", num_images=1)
+ assert legacy == explicit, "real tokenizer: default num_images=1 must be byte-identical to legacy"
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 1f88e9e7155..74fe268babf 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -539,7 +539,12 @@ def instantiate_timestep_tokens(
timestep_scatter_index: BatchRaggedTensor,
):
batch_size, seq_len, n_embd = x.shape
- # batch_size x n x n_embd
+ # `_encode_cond_image` returns `t` as list[Tensor] for the
+ # multi-image branch (outer length = batch_size, currently fixed
+ # at 1 by the stage runtime `max_batch_size`); flatten to a Tensor
+ # before reshape.
+ if isinstance(t, list):
+ t = torch.cat([ti.reshape(-1) for ti in t], dim=0)
timestep_scatter_src = self.timestep_emb(t.reshape(-1)).reshape(batch_size, -1, n_embd)
x.scatter_(
dim=1,
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 5d8e9af6ab8..068dad87f8b 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -72,11 +72,21 @@ def resolve_stop_token_ids(
return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]]
+# Upstream "Multi-Image Fusion" caps reference images at 3 per request.
+MAX_IMAGES_PER_REQUEST = 3
+
+
+def _validate_num_images(num_images: int) -> None:
+ if not (1 <= num_images <= MAX_IMAGES_PER_REQUEST):
+ raise ValueError(f"num_images must be in [1, {MAX_IMAGES_PER_REQUEST}], got {num_images}")
+
+
def build_prompt(
user_prompt: str,
task: str = "it2i_think",
sys_type: str | None = None,
custom_system_prompt: str | None = None,
+ num_images: int = 1,
) -> str:
"""Build a HunyuanImage-3.0 prompt as a string (legacy/compat path).
@@ -85,6 +95,9 @@ def build_prompt(
tokens across segment boundaries (e.g. `。\\n\\n` -> id 3490). For
inputs that need to match HF baseline byte-for-byte, use
`build_prompt_tokens` instead and feed the result via prompt_token_ids.
+
+ `num_images` emits N consecutive `
` placeholders between
+ `User: ` and `user_prompt`. Ignored for text-only tasks.
"""
if task not in _TASK_PRESETS:
raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
@@ -96,6 +109,8 @@ def build_prompt(
sys_text = system_prompt.strip() if system_prompt else ""
has_image_input = task.startswith("i2t") or task.startswith("it2i")
+ if has_image_input:
+ _validate_num_images(num_images)
# t2i_vanilla: pretrain mode for direct text->image generation. The
# vanilla system prompt drives the model with no chat structure.
@@ -108,7 +123,7 @@ def build_prompt(
# All other tasks (t2t / i2t / t2i_think / t2i_recaption /
# it2i_think / it2i_recaption) use HunyuanImage3 Instruct chat template:
- # <|startoftext|>{system?}\n\nUser: {
?}{user_prompt}\n\nAssistant: {trigger?}
+ # <|startoftext|>{system?}\n\nUser: {
*N?}{user_prompt}\n\nAssistant: {trigger?}
# generation_config.json declares sequence_template="instruct", so the
# AR prefill MUST use this template -- verified to match HF's
# apply_chat_template output token-for-token (modulo BPE boundary merges).
@@ -121,7 +136,7 @@ def build_prompt(
parts.append(f"{sys_text}\n\n")
parts.append("User: ")
if has_image_input:
- parts.append("
")
+ parts.extend(["
"] * num_images)
parts.append(user_prompt)
parts.append("\n\nAssistant: ")
if trigger_tag:
@@ -142,6 +157,7 @@ def build_prompt_tokens(
task: str = "it2i_think",
sys_type: str | None = None,
custom_system_prompt: str | None = None,
+ num_images: int = 1,
) -> PromptTokensResult:
"""Segment-by-segment tokenization that matches HF apply_chat_template.
@@ -155,6 +171,8 @@ def build_prompt_tokens(
Returns:
PromptTokensResult
+
+ `num_images` inserts N `
` token ids; see `build_prompt`.
"""
if task not in _TASK_PRESETS:
raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
@@ -167,6 +185,8 @@ def build_prompt_tokens(
trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None
has_image_input = task.startswith("i2t") or task.startswith("it2i")
+ if has_image_input:
+ _validate_num_images(num_images)
# t2i_vanilla uses pretrain template with no chat structure; the vanilla
# system prompt drives the model directly. No segment boundaries to
@@ -190,7 +210,7 @@ def build_prompt_tokens(
ids += tokenizer.encode("\n\n", add_special_tokens=False)
ids += tokenizer.encode("User: ", add_special_tokens=False)
if has_image_input:
- ids += [img_id]
+ ids += [img_id] * num_images
ids += tokenizer.encode(user_prompt, add_special_tokens=False)
ids += tokenizer.encode("\n\nAssistant: ", add_special_tokens=False)
if trig_id is not None:
@@ -202,4 +222,12 @@ def build_prompt_tokens(
)
-__all__ = ["build_prompt", "build_prompt_tokens", "resolve_stop_token_ids", _TASK_PRESETS]
+__all__ = [
+ "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS",
+ "MAX_IMAGES_PER_REQUEST",
+ "_TASK_PRESETS",
+ "available_tasks",
+ "build_prompt",
+ "build_prompt_tokens",
+ "resolve_stop_token_ids",
+]
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 1e057a71efa..e9d41ebf958 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -860,6 +860,13 @@ def process_image(self, image_input: ImageInput):
else:
raise TypeError(f"Unsupported image type: {type(image_input)}.")
+ # Each cond image keeps its own VAE bucket (mirrors official HF's
+ # ragged behavior in `_encode_cond_image`). VAE pixel tensors have
+ # different (H_i, W_i) per image, so they're flattened to 1-D and
+ # concatenated; vLLM `flat_from_sizes("image", vae_pixel_size)` slices
+ # them back per-image at consumption time. VIT (Siglip2 naflex) pads
+ # to `max_num_patches` so VIT fields keep the existing `batched`
+ # stack path.
batch_data = []
for image in images:
current_info = {}
@@ -883,42 +890,49 @@ def process_image(self, image_input: ImageInput):
_ss = torch.tensor(_ss, dtype=torch.long)
current_info["vit_spatial_shapes"] = _ss.squeeze(0)
- # VAE processing.
- # The resize/crop math here mirrors HF's `resize_and_crop` with
- # crop_type="center" (hunyuan3.0_ins/image_processor.py:61). VAE
- # normalize uses the same transforms.Compose([ToTensor,
- # Normalize([0.5], [0.5])]) as HF's `pil_image_to_tensor`. So
- # numerical output of this branch should match HF up to floating-
- # point reduction order.
+ # VAE: per-image bucket via `reso_group.get_target_size`; mirrors
+ # HF's `resize_and_crop` (crop_type="center"). Keep fp32 — the
+ # VAE encoder casts to model dtype at its boundary (see
+ # `_vae_encode`).
image_width, image_height = self.reso_group.get_target_size(image.width, image.height)
resized_image = self._resize_and_crop(image, (image_width, image_height))
- vae_pixel_values = self.vae_processor(resized_image)
+ vae_pixel_values = self.vae_processor(resized_image).squeeze(0)
token_height = image_height // (self.hf_config.vae_downsample_factor[0] * self.hf_config.patch_size)
token_width = image_width // (self.hf_config.vae_downsample_factor[1] * self.hf_config.patch_size)
- # Keep fp32 — the VAE encoder casts to model dtype at its boundary
- # (see _vae_encode). Casting to bf16 here costs ~7e-4 mean-abs-diff
- # bf16 quantization error on every pixel vs HF (which keeps fp32
- # in build_cond_images), measurable as a real numerical drift in
- # downstream image embeddings.
- current_info["vae_pixel_values"] = vae_pixel_values.squeeze(0)
+
+ current_info["vae_pixel_values_flat"] = vae_pixel_values.reshape(-1)
+ current_info["vae_pixel_size"] = torch.tensor(vae_pixel_values.numel(), dtype=torch.long)
current_info["vae_token_grid_hw"] = torch.tensor([token_height, token_width])
- # size
base_size, ratio_index = self.reso_group.get_base_size_and_ratio_index(image_width, image_height)
current_info["base_size"] = torch.tensor(base_size)
current_info["ratio_index"] = torch.tensor(ratio_index)
batch_data.append(current_info)
- # Stack the tensors in the list into a batch dimension (B, ...)
- final_image_info = {}
- if len(batch_data) > 0:
- for key in batch_data[0].keys():
- final_image_info[key] = torch.stack([d[key] for d in batch_data], dim=0)
+ final_image_info: dict[str, torch.Tensor] = {}
+ if not batch_data:
+ return final_image_info
+
+ # Same-shape fields: stack along a new image-batch dim as before.
+ same_shape_keys = [
+ "vit_pixel_values",
+ "vit_pixel_attention_mask",
+ "vit_spatial_shapes",
+ "vae_token_grid_hw",
+ "vae_pixel_size",
+ "base_size",
+ "ratio_index",
+ ]
+ for key in same_shape_keys:
+ final_image_info[key] = torch.stack([d[key] for d in batch_data], dim=0)
+
+ # Variable-shape VAE pixels: 1-D concat across images (paired with
+ # `vae_pixel_size` via `flat_from_sizes` in `_get_mm_fields_config`).
+ final_image_info["vae_pixel_values"] = torch.cat([d["vae_pixel_values_flat"] for d in batch_data], dim=0)
- if final_image_info:
- shapes_info = {k: tuple(v.shape) for k, v in final_image_info.items()}
- logger.info(f"Successfully processed {len(images)} image(s). Final tensor shapes: {shapes_info}")
+ shapes_info = {k: tuple(v.shape) for k, v in final_image_info.items()}
+ logger.info(f"Successfully processed {len(images)} image(s). Final tensor shapes: {shapes_info}")
return final_image_info
@@ -1030,8 +1044,13 @@ def _get_mm_fields_config(
config["vit_pixel_attention_mask"] = MultiModalFieldConfig.batched("image")
if "vit_spatial_shapes" in hf_inputs:
config["vit_spatial_shapes"] = MultiModalFieldConfig.batched("image")
- if "vae_pixel_values" in hf_inputs:
- config["vae_pixel_values"] = MultiModalFieldConfig.batched("image")
+ # `vae_pixel_values` is a 1-D concatenation of variable-shape per-image
+ # VAE tensors (see `process_image`). `vae_pixel_size` carries the
+ # per-image flat length so vLLM can split the buffer back per image.
+ if "vae_pixel_values" in hf_inputs and "vae_pixel_size" in hf_inputs:
+ config["vae_pixel_values"] = MultiModalFieldConfig.flat_from_sizes("image", hf_inputs["vae_pixel_size"])
+ if "vae_pixel_size" in hf_inputs:
+ config["vae_pixel_size"] = MultiModalFieldConfig.batched("image")
if "vae_token_grid_hw" in hf_inputs:
config["vae_token_grid_hw"] = MultiModalFieldConfig.batched("image")
if "base_size" in hf_inputs:
@@ -1668,6 +1687,9 @@ def _parse_and_validate_image_input(
vit_pixel_attention_mask = kwargs.pop("vit_pixel_attention_mask", None)
vit_spatial_shapes = kwargs.pop("vit_spatial_shapes", None)
vae_pixel_values = kwargs.pop("vae_pixel_values", None)
+ # vae_pixel_size is only metadata for vLLM's flat_from_sizes split;
+ # we reconstruct per-image shapes from vae_token_grid_hw below.
+ kwargs.pop("vae_pixel_size", None)
vae_token_grid_hw = kwargs.pop("vae_token_grid_hw", None)
if vit_pixel_values is None or vae_pixel_values is None:
@@ -1677,13 +1699,36 @@ def _parse_and_validate_image_input(
if vit_pixel_values.numel() == 0 or vae_pixel_values.numel() == 0:
return None
+ # `vae_pixel_values` arrives as a 1-D concatenation of per-image flat
+ # buffers (see `process_image` + `flat_from_sizes`). Reconstruct a
+ # list of per-image (3, H_i, W_i) tensors using the per-image grid
+ # dims so the downstream VAE encoder can run image-by-image.
+ vae_factor_h = self.config.vae_downsample_factor[0] * self.config.patch_size
+ vae_factor_w = self.config.vae_downsample_factor[1] * self.config.patch_size
+ num_images = vae_token_grid_hw.shape[0]
+ vae_image_list: list[torch.Tensor] = []
+ offset = 0
+ flat = vae_pixel_values.reshape(-1)
+ for i in range(num_images):
+ token_h, token_w = vae_token_grid_hw[i].tolist()
+ h_i = int(token_h) * vae_factor_h
+ w_i = int(token_w) * vae_factor_w
+ n_i = 3 * h_i * w_i
+ vae_image_list.append(flat[offset : offset + n_i].reshape(3, h_i, w_i))
+ offset += n_i
+ if offset != flat.numel():
+ raise ValueError(
+ f"vae_pixel_values size mismatch: consumed {offset} of {flat.numel()} elements "
+ f"across {num_images} images (token_grid_hw={vae_token_grid_hw.tolist()})"
+ )
+
return HunyuanImage3PixelInputs(
type="pixel_values",
pixel_values={
"vit_pixel_values": vit_pixel_values,
"vit_pixel_attention_mask": vit_pixel_attention_mask,
"vit_spatial_shapes": vit_spatial_shapes,
- "vae_pixel_values": vae_pixel_values,
+ "vae_pixel_values": vae_image_list,
"vae_token_grid_hw": vae_token_grid_hw,
},
)
@@ -1795,22 +1840,12 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
# Perform ViT encoding
vit_embeddings = self._vit_encode(vit_pixel_values, vit_pixel_attention_mask, vit_spatial_shapes)
- # Perform VAE encoding
- t, latents = self._vae_encode(vae_pixel_values, vae_cfg_factor)
-
- # Process VAE latents through patch_embed to convert to token embeddings
- # VAE latents are in (B, C, H, W) format, need to be converted to (B, seq_len, hidden_size)
+ # VAE encode + patch_embed per image — each cond image is at its own
+ # `reso_group` bucket so shapes are ragged across the image-batch dim.
vae_token_embeddings = []
- batch_size = latents.shape[0]
- for i in range(batch_size):
- t_i = t[i]
- latents_i = latents[i : i + 1] # Shape: (1, C, H, W)
-
- # Time embedding for VAE processing
- t_emb = self.time_embed(t_i)
-
- # Process VAE latent through patch_embed
- # Input: (1, C, H, W) -> Output: (1, seq_len, hidden_size)
+ for vae_image_i in vae_pixel_values:
+ t_i, latents_i = self._vae_encode(vae_image_i.unsqueeze(0), vae_cfg_factor)
+ t_emb = self.time_embed(t_i[0])
vae_tokens, _, _ = self.patch_embed(latents_i, t_emb)
vae_token_embeddings.append(vae_tokens)
From 46b3b84091954588861edbcc62a9638ec5f4cb67 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Fri, 8 May 2026 21:55:21 +0800
Subject: [PATCH 02/43] [Refactor] HunyuanImage-3.0 prompt_utils: split task
and bot_task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Replace conflated `task` strings (`it2i_think`, `t2i_recaption`,
`t2i_vanilla`, ...) with two orthogonal axes:
task ∈ {t2t, i2t, it2i, t2i}
controls only `<img>` placeholder emission.
bot_task ∈ {None, think, recaption, think_recaption, vanilla}
controls system prompt + trigger tag.
Mapping (bot_task → system prompt, trigger tag):
bot_task=None → en_unified, no trigger
bot_task=think → en_unified, `<think>`
bot_task=recaption → en_unified, `<recaption>`
bot_task=think_recaption → en_think_recaption, `<think>`
bot_task=vanilla → en_vanilla, no trigger, no chat
(only valid with task='t2i')
The pre-existing `_TASK_PRESETS` carried a `bot_task` field that was
dead code under all paths actually exercised (`sys_type='en_unified' /
'en_vanilla'`); only `sys_type='dynamic'` consumed it, and nothing in
the repo ever set that. The refactor promotes `bot_task` to the
user-facing API and drops the `task` × mode conflation, also exposing
the previously-unreachable `en_think_recaption` system prompt.
Public helpers `available_bot_tasks()` and `resolve_sys_type(bot_task)`
let callers derive the default sys_type without re-encoding the table.
Side fix on `build_prompt`: the legacy code stripped the system
prompt's leading whitespace while `build_prompt_tokens` did not. This
was invisible while every system prompt was `unified_system_prompt_en`
(no leading newline) but would diverge byte-wise once
`bot_task='think_recaption'` exposes `en_think_recaption` (which
starts with `\n`). `build_prompt` now keeps the system prompt verbatim,
matching the segment-by-segment tokenization path and HF's
`apply_chat_template`.
end2end.py: `--bot-task` choices are now {none, think, recaption,
think_recaption, vanilla}. The literal `none` is the explicit way to
request `bot_task=None` on a modality whose default is `think`
(text2img / img2img); leaving --bot-task unset still falls back to the
modality default. The duplicated `_TASK_PRESETS` literal in the example
script is removed in favor of `resolve_sys_type(bot_task)`.
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../hunyuan_image3/README.md | 26 +-
.../hunyuan_image3/end2end.py | 97 ++-----
.../test_hunyuan_image3_it2i_multi_image.py | 62 +++--
.../hunyuan_image3/test_prompt_utils.py | 252 ++++++------------
.../models/hunyuan_image3/prompt_utils.py | 186 ++++++++-----
5 files changed, 254 insertions(+), 369 deletions(-)
diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md
index 6db4cbec9ed..8b90e6b7fa3 100644
--- a/examples/offline_inference/hunyuan_image3/README.md
+++ b/examples/offline_inference/hunyuan_image3/README.md
@@ -112,6 +112,7 @@ python end2end.py --modality text2img \
--additional-config '{"torchair_graph_config":{"enabled":true}}'
```
+
## Key Arguments
| Argument | Description |
@@ -123,16 +124,15 @@ python end2end.py --modality text2img \
| `--steps` | Number of diffusion inference steps for image generation. |
| `--guidance-scale` | Classifier-free guidance scale for image generation. |
| `--height`, `--width` | Output image size for `text2img`. |
-| `--bot-task` | Prompt behavior. `auto` selects the default from `--modality`; `think` adds ``; `recaption` adds ``; `vanilla` uses the text-to-image pretrain template. |
+| `--bot-task` | Override prompt mode. `none`, `think`, `recaption`, `think_recaption`, or `vanilla`. |
| `--sys-type` | Override the system prompt type, for example `en_unified` or `en_vanilla`. |
| `--vae-use-tiling` | Enable VAE tiling for memory reduction. |
## Notes
-- `hunyuan_image3_ar.yaml` is a 4-card AR-only text/comprehension deploy. It sets `engine_output_type: text`, `final_output_type: text`, and text sampling defaults.
-- `hunyuan_image3_dit.yaml` is a single-stage DiT deploy with `stage_id: 0`; it does not require stage 1 or a running AR stage.
+- `hunyuan_image3_ar.yaml` is a 4-card AR-only text/comprehension deploy.
+- `hunyuan_image3_dit.yaml` is a single-stage DiT deploy with `stage_id: 0`.
- The old HunyuanImage3 YAMLs under `model_executor/stage_configs/` and `platforms/*/stage_configs/` have been folded into the deploy YAMLs.
-- This PR does not keep the HunyuanImage3 AR-to-DiT KV reuse wiring. The deploy YAMLs describe the topology and platform settings only.
## Prompt Format
@@ -148,22 +148,8 @@ Assistant: {trigger_tag?}
- `<img>`: Placeholder for each input image (single token; expanded by the multimodal pipeline).
- Trigger tags: `<think>` for CoT and `<recaption>` for recaptioning, placed after `Assistant: `.
-- System prompt: Auto-selected based on task.
-- `t2i_vanilla` is the only task that uses the bare pretrain template without chat structure.
-- The example composes the internal prompt task from `--modality` and `--bot-task`
- before calling `prompt_utils`; for example, `img2text + think` becomes
- `i2t_think` for prompt and stop-token lookup.
+- System prompt: Auto-selected from `task` and `bot_task`.
+- `bot_task='vanilla'` with `task='t2i'` uses the bare pretrain template.
The shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils.build_prompt_tokens()`
helper handles segment-by-segment tokenization and matches HF `apply_chat_template`.
-
-## FAQ
-
-- **OOM errors**: Decrease `gpu_memory_utilization` in the deploy YAML, use a smaller `max_num_batched_tokens`, or enable VAE tiling with `--vae-use-tiling`.
-- **Custom image sizes**: Use `--height` and `--width` flags (multiples of 16 recommended).
-
-| Stage | VRAM (approx) |
-| :--- | :--- |
-| Stage 0 (AR) | ~15 GiB + KV Cache |
-| Stage 1 (DiT) | ~30 GiB |
-| Total (8-GPU) | ~45 GiB + KV Cache |
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index f9f734c9f4a..9d8f5113201 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -2,16 +2,10 @@
HunyuanImage-3.0-Instruct unified end-to-end inference script.
Supports all modalities through a single entry point:
- - text2img: Text → AR → DiT → Image
- - img2img: Text+Image → AR → DiT → Edited Image (IT2I)
- - img2text: Image+Text → AR → Text description (I2T)
- - text2text: Text → AR → Text (comprehension, no image)
-
-Usage:
- python end2end.py --modality text2img --prompts "A cute cat"
- python end2end.py --modality img2img --image-path input.png --prompts "Make it snowy"
- python end2end.py --modality img2img --image-path img1.png,img2.png --prompts "Combine"
- python end2end.py --modality img2text --image-path input.png --prompts "Describe this image"
+ - text2img: Text -> AR -> DiT -> Image
+ - img2img: Text+Image -> AR -> DiT -> Edited Image (IT2I)
+ - img2text: Image+Text -> AR -> Text description (I2T)
+ - text2text: Text -> AR -> Text (comprehension, no image)
"""
import argparse
@@ -20,9 +14,9 @@
from pathlib import Path
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
- _TASK_PRESETS,
build_prompt_tokens,
resolve_stop_token_ids,
+ resolve_sys_type,
)
from vllm_omni.entrypoints.omni import Omni
from vllm_omni.inputs.data import OmniPromptType
@@ -46,11 +40,12 @@
"text2text": "text-to-text",
}
-_MODALITY_TASK_MAP = {
- "text2img": "t2i",
- "img2img": "it2i",
- "img2text": "i2t",
- "text2text": "t2t",
+# Modality -> (task, default bot_task) mapping.
+_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
+ "text2img": ("t2i", "think"),
+ "img2img": ("it2i", "think"),
+ "img2text": ("i2t", None),
+ "text2text": ("t2t", None),
}
@@ -81,7 +76,6 @@ def parse_args():
help="Output directory to save results.",
)
- # Generation parameters
parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.")
parser.add_argument("--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale.")
parser.add_argument("--seed", type=int, default=42, help="Random seed.")
@@ -93,17 +87,12 @@ def parse_args():
help="Enable VAE tiling for memory optimization.",
)
- # Prompt configuration
parser.add_argument(
"--bot-task",
type=str,
- default="auto",
- choices=["auto", "think", "recaption", "think_recaption", "vanilla"],
- help=(
- "Prompt behavior. 'auto' selects the default for the modality; "
- "'think' adds ; 'recaption' adds ; "
- "'vanilla' uses the t2i pretrain template."
- ),
+ default=None,
+ choices=["none", "think", "recaption", "think_recaption", "vanilla"],
+ help="Override prompt mode. Default: auto from --modality.",
)
parser.add_argument(
"--sys-type",
@@ -112,7 +101,6 @@ def parse_args():
help="Override system prompt type (e.g. en_unified, en_vanilla).",
)
- # Omni init args
parser.add_argument("--deploy-config", type=str, default=None, help="Custom deploy YAML path.")
parser.add_argument("--stage-configs-path", type=str, default=None, help="Custom legacy stage config YAML path.")
parser.add_argument("--log-stats", action="store_true", default=False)
@@ -158,22 +146,13 @@ def main():
os.makedirs(args.output, exist_ok=True)
additional_config = parse_additional_config(args.additional_config)
- # Determine task for prompt formatting from modality + bot behavior.
- task = _MODALITY_TASK_MAP[args.modality]
- assert task is not None
- bot_task = args.bot_task
- if bot_task != "auto":
- task = task + "_" + bot_task
- if task not in _TASK_PRESETS:
- valid_bot_tasks = {
- "text2img": ["think", "recaption", "vanilla"],
- "img2img": ["think", "recaption", "think_recaption"],
- "img2text": ["auto"],
- "text2text": ["auto"],
- }[args.modality]
- raise ValueError(
- f"--bot-task {bot_task!r} is not supported for {args.modality}. Choose from: {valid_bot_tasks}"
- )
+ task, default_bot_task = _MODALITY_TASK_MAP[args.modality]
+ if args.bot_task is None:
+ bot_task: str | None = default_bot_task
+ elif args.bot_task == "none":
+ bot_task = None
+ else:
+ bot_task = args.bot_task
if args.deploy_config is not None and args.stage_configs_path is not None:
raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.")
@@ -183,7 +162,6 @@ def main():
if deploy_config is None and stage_configs_path is None:
deploy_config = _MODALITY_DEFAULT_DEPLOY_CONFIG[args.modality]
- # Build Omni
omni_kwargs = {
"model": args.model,
"vae_use_tiling": args.vae_use_tiling,
@@ -202,10 +180,8 @@ def main():
omni = Omni(**omni_kwargs)
- # Prepare prompts
prompts = args.prompts or ["A cute cat"]
if not prompts:
- print("[Info] No prompts provided, using default.")
prompts = ["A cute cat"]
input_images: list = []
@@ -222,34 +198,23 @@ def main():
if not input_images:
raise ValueError(f"--image-path produced no usable paths: {args.image_path!r}")
- # Load tokenizer for segment-wise prompt tokenization (matches HF
- # apply_chat_template byte-for-byte; see build_prompt_tokens docstring).
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
-
mm_image_payload = (input_images[0] if len(input_images) == 1 else input_images) if input_images else None
- # Format prompts
formatted_prompts: list[OmniPromptType] = []
- for p in prompts:
- # Only pass `num_images` for modalities that actually consume images;
- # text-only paths ignore the parameter, but threading it
- # unconditionally reads as if t2i needed at least one image.
- build_kwargs: dict = {"task": task, "sys_type": args.sys_type}
+ for prompt in prompts:
+ build_kwargs: dict = {"task": task, "bot_task": bot_task, "sys_type": args.sys_type}
if input_images:
build_kwargs["num_images"] = len(input_images)
- result = build_prompt_tokens(p, tokenizer, **build_kwargs)
+ result = build_prompt_tokens(prompt, tokenizer, **build_kwargs)
token_ids = result.token_ids
- effective_sys_type = result.system_prompt_type
+ effective_sys_type = args.sys_type or resolve_sys_type(bot_task)
- # `prompt_token_ids` drives the AR stage (matches HF byte-for-byte).
- # `prompt` and `use_system_prompt` are forwarded by ar2diffusion to
- # the DiT stage so the diffusion pipeline can rebuild the same
- # system prefix when constructing its model inputs.
prompt_dict: dict = {
"prompt_token_ids": token_ids,
- "prompt": p,
+ "prompt": prompt,
"use_system_prompt": effective_sys_type,
}
@@ -268,14 +233,11 @@ def main():
formatted_prompts.append(prompt_dict)
- # Build sampling params from defaults
params_list = list(omni.default_sampling_params_list)
- # Override diffusion params if applicable
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer)
- assert ar_stop_token_ids is not None
for sp in params_list:
if isinstance(sp, OmniDiffusionSamplingParams):
sp.num_inference_steps = args.steps
@@ -283,13 +245,12 @@ def main():
sp.guidance_scale_provided = True
if args.seed is not None:
sp.seed = args.seed
- if args.modality in ("text2img",):
+ if args.modality == "text2img":
sp.height = args.height
sp.width = args.width
elif hasattr(sp, "stop_token_ids"):
sp.stop_token_ids = ar_stop_token_ids
- # Print configuration
print(f"\n{'=' * 60}")
print("HunyuanImage-3.0 Generation Configuration:")
print(f" Model: {args.model}")
@@ -314,13 +275,10 @@ def main():
print(f" Prompts: {prompts}")
print(f"{'=' * 60}\n")
- # Generate
omni_outputs = list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list))
- # Process outputs
img_idx = 0
for req_output in omni_outputs:
- # Text output (AR stage or text-only)
ro = getattr(req_output, "request_output", None)
txt = ""
if ro and getattr(ro, "outputs", None):
@@ -334,7 +292,6 @@ def main():
if txt:
print(f"[Output] Text:\n{txt}")
- # Image output (DiT stage)
images = getattr(req_output, "images", None)
if not images and ro and hasattr(ro, "images"):
images = ro.images
diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
index c8a9891385c..7a1e266b936 100644
--- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
@@ -66,21 +66,25 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
return list(range(100, 100 + len(text)))
-_IMAGE_TASKS = ("i2t", "it2i_think", "it2i_recaption")
-_TEXT_ONLY_TASKS = ("t2t",)
+_IMAGE_TASK_COMBOS = (
+ ("i2t", None),
+ ("it2i", "think"),
+ ("it2i", "recaption"),
+)
+_TEXT_ONLY_TASK_COMBOS = (("t2t", None),)
# -------------------- string builder --------------------
-@pytest.mark.parametrize("task", _IMAGE_TASKS)
+@pytest.mark.parametrize("task,bot_task", _IMAGE_TASK_COMBOS)
@pytest.mark.parametrize("num_images", [1, 2, 3])
-def test_build_prompt_emits_N_consecutive_img_placeholders(task: str, num_images: int):
+def test_build_prompt_emits_N_consecutive_img_placeholders(task: str, bot_task: str | None, num_images: int):
"""N=1/2/3 -> exactly N `
` substrings appear consecutively
between `User: ` and the user prompt, with no separator between them."""
- s = build_prompt("HELLO", task=task, num_images=num_images)
+ s = build_prompt("HELLO", task=task, bot_task=bot_task, num_images=num_images)
assert s.count("
") == num_images, (
- f"task={task} num_images={num_images}: expected {num_images}
"
+ f"task={task} bot_task={bot_task} num_images={num_images}: expected {num_images}
"
f"placeholders, found {s.count('
')} -- prompt was: {s!r}"
)
@@ -97,24 +101,24 @@ def test_build_prompt_emits_N_consecutive_img_placeholders(task: str, num_images
def test_build_prompt_default_num_images_matches_legacy():
"""num_images default = 1 must produce a string bit-identical to the
pre-multi-image behavior (single `
` placeholder)."""
- legacy = build_prompt("HELLO", task="it2i_think")
- explicit = build_prompt("HELLO", task="it2i_think", num_images=1)
+ legacy = build_prompt("HELLO", task="it2i", bot_task="think")
+ explicit = build_prompt("HELLO", task="it2i", bot_task="think", num_images=1)
assert legacy == explicit, "default num_images=1 must match legacy single-image output"
# -------------------- token builder --------------------
-@pytest.mark.parametrize("task", _IMAGE_TASKS)
-def test_build_prompt_tokens_inserts_N_img_ids(task: str):
+@pytest.mark.parametrize("task,bot_task", _IMAGE_TASK_COMBOS)
+def test_build_prompt_tokens_inserts_N_img_ids(task: str, bot_task: str | None):
"""N=1/2/3 -> the resulting id sequence contains exactly N copies of
img_id (=2) sitting consecutively after the `User: ` segment."""
tok = FakeTokenizer()
- ids_n1 = build_prompt_tokens("hi", tok, task=task, num_images=1)
+ ids_n1 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=1)
tok = FakeTokenizer()
- ids_n2 = build_prompt_tokens("hi", tok, task=task, num_images=2)
+ ids_n2 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=2)
tok = FakeTokenizer()
- ids_n3 = build_prompt_tokens("hi", tok, task=task, num_images=3)
+ ids_n3 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=3)
assert ids_n1.count(2) == 1
assert ids_n2.count(2) == 2
@@ -141,9 +145,9 @@ def test_build_prompt_tokens_default_num_images_matches_legacy():
omitting the parameter (regression guard for existing single-image
callers)."""
tok_a = FakeTokenizer()
- legacy = build_prompt_tokens("hi", tok_a, task="it2i_think")
+ legacy = build_prompt_tokens("hi", tok_a, task="it2i", bot_task="think")
tok_b = FakeTokenizer()
- explicit = build_prompt_tokens("hi", tok_b, task="it2i_think", num_images=1)
+ explicit = build_prompt_tokens("hi", tok_b, task="it2i", bot_task="think", num_images=1)
assert legacy == explicit
# Also: encode() must have been called on the same set of segments,
# so segment boundaries are preserved.
@@ -153,23 +157,23 @@ def test_build_prompt_tokens_default_num_images_matches_legacy():
# -------------------- validation --------------------
-@pytest.mark.parametrize("task", _IMAGE_TASKS)
+@pytest.mark.parametrize("task,bot_task", _IMAGE_TASK_COMBOS)
@pytest.mark.parametrize("bad", [0, -1, MAX_IMAGES_PER_REQUEST + 1, 99])
-def test_build_prompt_rejects_out_of_range_num_images(task: str, bad: int):
+def test_build_prompt_rejects_out_of_range_num_images(task: str, bot_task: str | None, bad: int):
with pytest.raises(ValueError, match="num_images must be in"):
- build_prompt("hi", task=task, num_images=bad)
+ build_prompt("hi", task=task, bot_task=bot_task, num_images=bad)
with pytest.raises(ValueError, match="num_images must be in"):
- build_prompt_tokens("hi", FakeTokenizer(), task=task, num_images=bad)
+ build_prompt_tokens("hi", FakeTokenizer(), task=task, bot_task=bot_task, num_images=bad)
-@pytest.mark.parametrize("task", _TEXT_ONLY_TASKS)
+@pytest.mark.parametrize("task,bot_task", _TEXT_ONLY_TASK_COMBOS)
@pytest.mark.parametrize("num_images", [0, 1, 2, 99])
-def test_text_only_tasks_ignore_num_images(task: str, num_images: int):
+def test_text_only_tasks_ignore_num_images(task: str, bot_task: str | None, num_images: int):
"""Validation only kicks in for image-input tasks; t2t et al. accept
any num_images and emit zero `
` placeholders."""
- s = build_prompt("hi", task=task, num_images=num_images)
+ s = build_prompt("hi", task=task, bot_task=bot_task, num_images=num_images)
assert "
" not in s
- ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, num_images=num_images)
+ ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, bot_task=bot_task, num_images=num_images)
assert 2 not in ids
@@ -198,7 +202,7 @@ def test_real_tokenizer_emits_n_consecutive_img_ids(num_images: int):
img_id = tok.convert_tokens_to_ids("
")
assert img_id is not None and img_id >= 0, f"
not in tokenizer vocab; got id={img_id}"
- ids = build_prompt_tokens("hi", tok, task="it2i_think", num_images=num_images)
+ ids = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=num_images)
# Exactly N copies of
id, all consecutive.
img_positions = [i for i, x in enumerate(ids) if x == img_id]
@@ -221,9 +225,9 @@ def test_real_tokenizer_n_plus_one_extends_by_exactly_one_img_id():
tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
img_id = tok.convert_tokens_to_ids("
")
- ids_n1 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=1)
- ids_n2 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=2)
- ids_n3 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=3)
+ ids_n1 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1)
+ ids_n2 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=2)
+ ids_n3 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=3)
assert len(ids_n2) == len(ids_n1) + 1, f"N=2 should be N=1 + 1 token; got {len(ids_n2)} vs {len(ids_n1)}"
assert len(ids_n3) == len(ids_n1) + 2, f"N=3 should be N=1 + 2 tokens; got {len(ids_n3)} vs {len(ids_n1)}"
@@ -246,6 +250,6 @@ def test_real_tokenizer_default_n1_byte_identical_to_legacy():
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
- legacy = build_prompt_tokens("hi", tok, task="it2i_think")
- explicit = build_prompt_tokens("hi", tok, task="it2i_think", num_images=1)
+ legacy = build_prompt_tokens("hi", tok, task="it2i", bot_task="think")
+ explicit = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1)
assert legacy == explicit, "real tokenizer: default num_images=1 must be byte-identical to legacy"
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 1130c0f6db1..4d98bc5dcf2 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -1,20 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Regression tests for HunyuanImage3 prompt construction (PR #3243).
-
-Two layers:
- 1. Pure-logic tests with a recording fake tokenizer -- protect the
- prompt template structure (BOS, User:/Assistant: framing, trigger
- placement, image placeholder position) and protect the segment-
- by-segment tokenization contract (each segment must hit
- `tokenizer.encode` in isolation).
- 2. Real-tokenizer regression -- run when the HunyuanImage3-Instruct
- tokenizer is in the local HF cache. Asserts the segment-tokenized
- output diverges from the naive full-string encode, which is the
- bug-tripping fixture for the cross-segment BPE merge fix
- (commit 7bd429ed).
-"""
-
from __future__ import annotations
import ast
@@ -25,6 +10,8 @@
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS,
+ _TASK_PRESETS,
+ available_bot_tasks,
available_tasks,
build_prompt,
build_prompt_tokens,
@@ -34,18 +21,7 @@
pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
-# -------------------- Pure-logic structural tests --------------------
-
-
class FakeTokenizer:
- """Minimal tokenizer stub that records every encode() call.
-
- Returns deterministic ids from convert_tokens_to_ids while
- encode() returns one id per character starting at 100. This lets
- tests both verify segmentation (by inspecting `encode_calls`) and
- locate substrings inside the returned id list.
- """
-
SPECIAL = {
"<|startoftext|>": 1,
"
": 2,
@@ -72,85 +48,80 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
def test_available_tasks_covers_all_modalities():
- tasks = set(available_tasks())
- assert tasks >= {
- "t2t",
- "i2t",
+ assert set(available_tasks()) == {"t2t", "i2t", "it2i", "t2i"}
+
+
+def test_available_bot_tasks_covers_all_modes():
+ assert set(available_bot_tasks()) == {None, "think", "recaption", "think_recaption", "vanilla"}
+
+
+def test_legacy_task_presets_still_available():
+ assert {
"it2i_think",
"it2i_recaption",
"it2i_think_recaption",
"t2i_think",
"t2i_recaption",
"t2i_vanilla",
- }
+ } <= set(_TASK_PRESETS)
def test_resolve_stop_token_ids_uses_answer_for_generation_tasks():
tok = FakeTokenizer()
-
answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [answer_id]
assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [answer_id]
+ assert resolve_stop_token_ids(task="it2i", bot_task="think", tokenizer=tok) == [answer_id]
@pytest.mark.parametrize(
- "task",
+ "task,bot_task",
[
- "t2t",
- "i2t",
- "it2i_think",
- "it2i_recaption",
- "it2i_think_recaption",
- "t2i_think",
- "t2i_recaption",
+ ("t2t", None),
+ ("i2t", None),
+ ("it2i", "think"),
+ ("it2i", "recaption"),
+ ("it2i", "think_recaption"),
+ ("t2i", "think"),
+ ("t2i", "recaption"),
+ ("t2i", "think_recaption"),
],
)
-def test_build_prompt_string_structure_chat_template(task: str):
- """Chat-template tasks must produce <|startoftext|>...User: ...Assistant: ...
- with image placeholder (when applicable) and trigger tag AFTER `Assistant: `."""
- s = build_prompt("HELLO", task=task)
-
+def test_build_prompt_string_structure_chat_template(task: str, bot_task: str | None):
+ s = build_prompt("HELLO", task=task, bot_task=bot_task)
assert s.startswith("<|startoftext|>")
assert "User: " in s
assert "Assistant: " in s
assert s.index("User: ") < s.index("HELLO") < s.index("Assistant: ")
- if task.startswith(("i2t", "it2i")):
- assert s.index("User: ") < s.index("
") < s.index("HELLO"), (
- "
placeholder must sit between `User: ` and the user prompt"
- )
+ if task in ("i2t", "it2i"):
+ assert s.index("User: ") < s.index("
") < s.index("HELLO")
else:
assert "
" not in s
- # Trigger tag must be the FINAL token of the prompt (after `Assistant: `).
- # Note: the system prompt itself mentions / as mode
- # documentation, so substring index() catches the wrong occurrence -- use
- # endswith() which directly captures "trigger is at the tail" (the Part A
- # fix: trigger goes AFTER `Assistant: `, not before user_prompt).
- if task in ("it2i_think", "t2i_think", "it2i_think_recaption"):
- assert s.endswith("Assistant: "), (
- f"Trigger must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}"
- )
- if task in ("it2i_recaption", "t2i_recaption"):
- assert s.endswith("Assistant: "), (
- f"Trigger must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}"
- )
- if task in ("t2t", "i2t"):
- assert s.endswith("Assistant: "), "Plain (no-trigger) task must end at `Assistant: ` with no trailing tag."
+ if bot_task in ("think", "think_recaption"):
+ assert s.endswith("Assistant: ")
+ elif bot_task == "recaption":
+ assert s.endswith("Assistant: ")
+ elif bot_task is None:
+ assert s.endswith("Assistant: ")
def test_build_prompt_vanilla_uses_pretrain_template():
- """t2i_vanilla is the only task that bypasses chat structure -- direct
- text->image generation driven by the vanilla system prompt."""
- s = build_prompt("HELLO", task="t2i_vanilla")
+ s = build_prompt("HELLO", task="t2i", bot_task="vanilla")
assert s.startswith("<|startoftext|>")
assert "User: " not in s
assert "Assistant: " not in s
- assert "" not in s
- assert "" not in s
assert s.endswith("HELLO")
+def test_build_prompt_vanilla_rejects_non_t2i_task():
+ with pytest.raises(ValueError, match="bot_task='vanilla'"):
+ build_prompt("x", task="it2i", bot_task="vanilla")
+ with pytest.raises(ValueError, match="bot_task='vanilla'"):
+ build_prompt_tokens("x", FakeTokenizer(), task="i2t", bot_task="vanilla")
+
+
def test_build_prompt_unknown_task_raises():
with pytest.raises(ValueError, match="Unknown task"):
build_prompt("x", task="bogus")
@@ -158,127 +129,83 @@ def test_build_prompt_unknown_task_raises():
build_prompt_tokens("x", FakeTokenizer(), task="bogus")
+def test_build_prompt_unknown_bot_task_raises():
+ with pytest.raises(ValueError, match="Unknown bot_task"):
+ build_prompt("x", task="t2i", bot_task="bogus")
+ with pytest.raises(ValueError, match="Unknown bot_task"):
+ build_prompt_tokens("x", FakeTokenizer(), task="t2i", bot_task="bogus")
+
+
def test_build_prompt_tokens_segments_each_boundary():
- """Regression for cross-segment BPE merge bug (commit 7bd429ed):
- each template segment must hit tokenizer.encode() independently;
- user_prompt MUST NOT be concatenated with the following separator
- in the same encode() call."""
tok = FakeTokenizer()
- build_prompt_tokens("写诗。", tok, task="i2t")
-
- # Each canonical segment is encoded in its own call.
+ build_prompt_tokens("写诗。", tok, task="i2t", bot_task=None)
assert "User: " in tok.encode_calls
- assert "写诗。" in tok.encode_calls, (
- "user_prompt must be encoded alone -- if it is concatenated with the "
- "trailing separator, BPE will merge across the boundary (the PR-#3243 bug)."
- )
+ assert "写诗。" in tok.encode_calls
assert "\n\nAssistant: " in tok.encode_calls
-
- # No call must contain user_prompt glued to neighboring text.
for call in tok.encode_calls:
if call != "写诗。":
- assert "写诗。" not in call, f"user_prompt leaked into a multi-segment encode call: {call!r}"
+ assert "写诗。" not in call
def test_build_prompt_tokens_image_placeholder_present_for_image_tasks():
tok = FakeTokenizer()
- result = build_prompt_tokens("hi", tok, task="i2t")
+ result = build_prompt_tokens("hi", tok, task="i2t", bot_task=None)
ids = result.token_ids
- assert ids[0] == FakeTokenizer.SPECIAL["<|startoftext|>"], "BOS (<|startoftext|>) must be the first token"
- assert FakeTokenizer.SPECIAL["
"] in ids, "
placeholder must be present for i2t/it2i tasks"
+ assert ids[0] == FakeTokenizer.SPECIAL["<|startoftext|>"]
+ assert FakeTokenizer.SPECIAL["
"] in ids
def test_build_prompt_tokens_no_image_for_text_only_tasks():
tok = FakeTokenizer()
- result = build_prompt_tokens("hi", tok, task="t2t")
+ result = build_prompt_tokens("hi", tok, task="t2t", bot_task=None)
ids = result.token_ids
- assert FakeTokenizer.SPECIAL["
"] not in ids, "
must NOT appear for text-only tasks"
+ assert FakeTokenizer.SPECIAL["
"] not in ids
@pytest.mark.parametrize(
- "task,trigger_id",
+ "task,bot_task,trigger_id",
[
- ("it2i_think", FakeTokenizer.SPECIAL[""]),
- ("t2i_think", FakeTokenizer.SPECIAL[""]),
- ("it2i_recaption", FakeTokenizer.SPECIAL[""]),
- ("t2i_recaption", FakeTokenizer.SPECIAL[""]),
+ ("it2i", "think", FakeTokenizer.SPECIAL[""]),
+ ("t2i", "think", FakeTokenizer.SPECIAL[""]),
+ ("t2i", "think_recaption", FakeTokenizer.SPECIAL[""]),
+ ("it2i", "recaption", FakeTokenizer.SPECIAL[""]),
+ ("t2i", "recaption", FakeTokenizer.SPECIAL[""]),
+ ("it2i_think", None, FakeTokenizer.SPECIAL[""]),
+ ("it2i_recaption", None, FakeTokenizer.SPECIAL[""]),
],
)
-def test_build_prompt_tokens_trigger_is_last_token(task: str, trigger_id: int):
- """Trigger tag id must be the LAST token (after `Assistant: ` segment)."""
+def test_build_prompt_tokens_trigger_is_last_token(task: str, bot_task: str | None, trigger_id: int):
tok = FakeTokenizer()
- result = build_prompt_tokens("hi", tok, task=task)
- ids = result.token_ids
- assert ids[-1] == trigger_id
+ result = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task)
+ assert result.token_ids[-1] == trigger_id
def test_build_prompt_tokens_no_trigger_for_plain_tasks():
- """Tasks without trigger_tag (t2t / i2t) must NOT append a trigger id."""
tok = FakeTokenizer()
- result = build_prompt_tokens("hi", tok, task="t2t")
- ids = result.token_ids
- assert ids[-1] not in {
+ result = build_prompt_tokens("hi", tok, task="t2t", bot_task=None)
+ assert result.token_ids[-1] not in {
FakeTokenizer.SPECIAL[""],
FakeTokenizer.SPECIAL[""],
}
-# -------------------- end2end.py wiring guard --------------------
-
-
def _repo_root() -> pathlib.Path:
- # tests/diffusion/models/hunyuan_image3/test_prompt_utils.py -> repo root
return pathlib.Path(__file__).resolve().parents[4]
def test_end2end_routes_through_shared_prompt_utils():
- """Regression for the *delivery vector* of PR #3243.
-
- Background: the wrong-template bug that PR #3243 fixes was introduced
- when end2end.py grew its own hand-rolled prompt builder that diverged
- from the canonical instruct chat template. To prevent that exact
- failure mode from recurring, end2end.py MUST:
- 1. Import the prompt builders from the shared prompt_utils module.
- 2. NOT redefine `build_prompt` or `build_prompt_tokens` locally.
-
- A local redefinition is precisely how a future merge can silently
- re-introduce a pretrain-style template (trigger BEFORE user_prompt,
- no User:/Assistant: framing, etc.) without touching prompt_utils,
- bypassing every other test in this file.
- """
end2end_path = _repo_root() / "examples" / "offline_inference" / "hunyuan_image3" / "end2end.py"
- assert end2end_path.is_file(), f"end2end.py not found at {end2end_path}"
-
tree = ast.parse(end2end_path.read_text(encoding="utf-8"))
local_func_names = {n.name for n in ast.walk(tree) if isinstance(n, ast.FunctionDef)}
- forbidden = {"build_prompt", "build_prompt_tokens"}
- redefined = local_func_names & forbidden
- assert not redefined, (
- f"end2end.py defines {sorted(redefined)} locally. This is exactly how "
- "the wrong prompt template re-entered the example before PR #3243. "
- "Use the shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils` "
- "helpers instead."
- )
+ assert not (local_func_names & {"build_prompt", "build_prompt_tokens"})
imported_from_prompt_utils: set[str] = set()
for node in ast.walk(tree):
if isinstance(node, ast.ImportFrom) and node.module and node.module.endswith("hunyuan_image3.prompt_utils"):
imported_from_prompt_utils.update(alias.name for alias in node.names)
- expected_imports = {
- "_TASK_PRESETS",
- "build_prompt_tokens",
- "resolve_stop_token_ids",
- }
- assert expected_imports <= imported_from_prompt_utils, (
- "end2end.py must import the HunyuanImage3 prompt and stop-token helpers from "
- "vllm_omni.diffusion.models.hunyuan_image3.prompt_utils -- the shared "
- "module is the single source of truth for the AR-prefill template and "
- "bot_task-derived AR stop token ids."
- )
-
-
-# -------------------- Real-tokenizer regression --------------------
+ expected_imports = {"build_prompt_tokens", "resolve_stop_token_ids", "resolve_sys_type"}
+ assert expected_imports <= imported_from_prompt_utils
_HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct"
@@ -290,41 +217,14 @@ def _hf_cached(model_id: str) -> bool:
return os.path.isdir(snap_dir) and any(os.scandir(snap_dir))
-@pytest.mark.skipif(
- not _hf_cached(_HUNYUAN_MODEL_ID),
- reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache",
-)
+@pytest.mark.skipif(not _hf_cached(_HUNYUAN_MODEL_ID), reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache")
def test_segment_tokenize_diverges_from_full_string_encode():
- """Regression for PR #3243 segment-tokenization fix.
-
- The naive `tokenizer.encode(build_prompt(...))` lets BPE merge tokens
- across segment boundaries (notably `。\\n\\n` -> a single id), which
- drifts the AR prefill away from HF's apply_chat_template output. The
- segment-by-segment build_prompt_tokens must produce a STRICTLY
- DIFFERENT id sequence on a prompt that triggers the merge.
-
- If someone "simplifies" build_prompt_tokens to call encode() on the
- full string, this assertion fires.
- """
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
-
user_prompt = "写一首关于夜的诗。"
- result = build_prompt_tokens(user_prompt, tok, task="i2t")
+ result = build_prompt_tokens(user_prompt, tok, task="i2t", bot_task=None)
seg_ids = result.token_ids
- full_ids = tok.encode(build_prompt(user_prompt, task="i2t"), add_special_tokens=False)
-
- assert seg_ids != full_ids, (
- "build_prompt_tokens output equals naive full-string encode -- "
- "the BPE-merge-bypass behavior is no longer exercised. This means "
- "the segment-by-segment fix from PR #3243 has been silently undone."
- )
-
- # Segmenting prevents merges, so the segment id list should have AT LEAST
- # as many tokens as the merged version (a merge consumes 2+ ids -> 1).
- assert len(seg_ids) >= len(full_ids), (
- f"segment-encoded length ({len(seg_ids)}) shorter than full-string "
- f"merged length ({len(full_ids)}) -- impossible if segmentation is "
- f"genuinely bypassing merges."
- )
+ full_ids = tok.encode(build_prompt(user_prompt, task="i2t", bot_task=None), add_special_tokens=False)
+ assert seg_ids != full_ids
+ assert len(seg_ids) >= len(full_ids)
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 068dad87f8b..4ed277eeed2 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -11,8 +11,23 @@
`JointImageInfo` objects produced by image preprocessing. The example
flow uses an `<img>` placeholder + `multi_modal_data` instead, so it
needs a lighter-weight builder that only requires a HF tokenizer. This
-module provides that builder; the task -> template mapping below is the
-canonical mapping for both flows.
+module provides that builder; the (task, bot_task) -> template mapping
+below is the canonical mapping for both flows.
+
+Two orthogonal axes:
+
+ * `task` selects the I/O modality combination, which only controls
+ whether `<img>` placeholders are emitted between `User: ` and the
+ user prompt: ``i2t`` / ``it2i`` produce them, ``t2t`` / ``t2i`` do
+ not.
+
+ * `bot_task` selects the prompting mode and drives both the system
+ prompt and the trigger tag appended after ``Assistant: ``. ``None``
+ (default) gives a plain Assistant turn under the unified prompt;
+ ``think`` / ``recaption`` switch the trigger tag to ``<think>`` /
+ ``<recaption>``; ``think_recaption`` swaps the system prompt for
+ the dedicated combined-mode template; ``vanilla`` drops the chat
+ structure entirely (pretrain template, ``t2i`` only).
"""
from __future__ import annotations
@@ -45,30 +60,77 @@
"": 130106,
}
-# task -> (sys_type, bot_task, trigger_tag)
+# bot_task -> (sys_type, trigger_tag).
+# ``vanilla`` is special-cased downstream: it bypasses the chat template
+# (no ``User:`` / ``Assistant:`` framing) and is only valid with
+# ``task='t2i'``.
+_BOT_TASK_PRESETS: dict[str | None, tuple[str, str | None]] = {
+ None: ("en_unified", None),
+ "think": ("en_unified", ""),
+ "recaption": ("en_unified", ""),
+ "think_recaption": ("en_think_recaption", ""),
+ "vanilla": ("en_vanilla", None),
+}
+
+_TASKS: frozenset[str] = frozenset({"t2t", "i2t", "it2i", "t2i"})
+
+# Legacy composite task alias -> (task, bot_task). Keep this during rebase so
+# older callers and intermediate commits still resolve cleanly.
_TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
"t2t": ("en_unified", None, None),
"i2t": ("en_unified", None, None),
"it2i_think": ("en_unified", "think", ""),
"it2i_recaption": ("en_unified", "recaption", ""),
"it2i_think_recaption": ("en_unified", "think_recaption", ""),
- "t2i": ("en_unified", "image", None),
- "t2i_vanilla": ("en_vanilla", "image", None),
+ "t2i": ("en_unified", None, None),
+ "t2i_vanilla": ("en_vanilla", "vanilla", None),
"t2i_think": ("en_unified", "think", ""),
"t2i_recaption": ("en_unified", "recaption", ""),
}
+def _normalize_task_and_bot_task(task: str, bot_task: str | None) -> tuple[str, str | None]:
+ if task in _TASK_PRESETS:
+ _, legacy_bot_task, _ = _TASK_PRESETS[task]
+ base_task = task.split("_", 1)[0]
+ if base_task == "t2i" and task == "t2i":
+ base_task = "t2i"
+ if task in ("t2t", "i2t", "t2i"):
+ base_task = task
+ if bot_task is None:
+ bot_task = legacy_bot_task
+ task = base_task
+ return task, bot_task
+
+
def available_tasks() -> list[str]:
- """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
- return sorted(_TASK_PRESETS)
+ """Sorted list of `task` values accepted by the prompt builders."""
+ return sorted(_TASKS)
+
+
+def available_bot_tasks() -> list[str | None]:
+ """Sorted list of `bot_task` values (with ``None`` first)."""
+ rest = sorted(k for k in _BOT_TASK_PRESETS if k is not None)
+ return [None, *rest]
+
+
+def resolve_sys_type(bot_task: str | None) -> str:
+ """Default system-prompt type for a given ``bot_task``."""
+ if bot_task not in _BOT_TASK_PRESETS:
+ raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}")
+ return _BOT_TASK_PRESETS[bot_task][0]
def resolve_stop_token_ids(
- task: str = "it2i_think",
- bot_task: str = "think",
+ task: str = "it2i",
+ bot_task: str | None = "think",
tokenizer: Any | None = None,
-):
+) -> list[int]:
+ task, bot_task = _normalize_task_and_bot_task(task, bot_task)
+ if task not in _TASKS:
+ raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+ if bot_task not in _BOT_TASK_PRESETS:
+ raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}")
return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]]
@@ -81,56 +143,45 @@ def _validate_num_images(num_images: int) -> None:
raise ValueError(f"num_images must be in [1, {MAX_IMAGES_PER_REQUEST}], got {num_images}")
+def _resolve_preset(task: str, bot_task: str | None) -> tuple[str, str | None]:
+ """Validate (task, bot_task) and return ``(sys_type, trigger_tag)``."""
+ task, bot_task = _normalize_task_and_bot_task(task, bot_task)
+ if task not in _TASKS:
+ raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+ if bot_task not in _BOT_TASK_PRESETS:
+ raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}")
+ if bot_task == "vanilla" and task != "t2i":
+ raise ValueError(f"bot_task='vanilla' is only valid with task='t2i' (pretrain template); got task={task!r}")
+ return _BOT_TASK_PRESETS[bot_task]
+
+
def build_prompt(
user_prompt: str,
- task: str = "it2i_think",
+ task: str = "it2i",
+ bot_task: str | None = "think",
sys_type: str | None = None,
custom_system_prompt: str | None = None,
num_images: int = 1,
) -> str:
- """Build a HunyuanImage-3.0 prompt as a string (legacy/compat path).
-
- NOTE: when this string is passed to the engine, the engine's tokenizer
- will run a single BPE pass over the whole string, which can merge
- tokens across segment boundaries (e.g. `。\\n\\n` -> id 3490). For
- inputs that need to match HF baseline byte-for-byte, use
- `build_prompt_tokens` instead and feed the result via prompt_token_ids.
-
- `num_images` emits N consecutive `
` placeholders between
- `User: ` and `user_prompt`. Ignored for text-only tasks.
- """
- if task not in _TASK_PRESETS:
- raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
-
- preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+ """Build a HunyuanImage-3.0 prompt as a string (legacy/compat path)."""
+ task, bot_task = _normalize_task_and_bot_task(task, bot_task)
+ preset_sys_type, trigger_tag = _resolve_preset(task, bot_task)
effective_sys_type = sys_type or preset_sys_type
- system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt)
- sys_text = system_prompt.strip() if system_prompt else ""
+ system_prompt = get_system_prompt(effective_sys_type, bot_task, custom_system_prompt)
+ sys_text = system_prompt or ""
- has_image_input = task.startswith("i2t") or task.startswith("it2i")
+ has_image_input = task in ("i2t", "it2i")
if has_image_input:
_validate_num_images(num_images)
- # t2i_vanilla: pretrain mode for direct text->image generation. The
- # vanilla system prompt drives the model with no chat structure.
- if task == "t2i_vanilla":
+ if bot_task == "vanilla":
parts = ["<|startoftext|>"]
if sys_text:
parts.append(sys_text)
parts.append(user_prompt)
return "".join(parts)
- # All other tasks (t2t / i2t / t2i_think / t2i_recaption /
- # it2i_think / it2i_recaption) use HunyuanImage3 Instruct chat template:
- # <|startoftext|>{system?}\n\nUser: {
*N?}{user_prompt}\n\nAssistant: {trigger?}
- # generation_config.json declares sequence_template="instruct", so the
- # AR prefill MUST use this template -- verified to match HF's
- # apply_chat_template output token-for-token (modulo BPE boundary merges).
- # The trigger_tag (e.g. ) MUST come AFTER the `Assistant: ` prefix:
- # if it goes BEFORE user_prompt (the old pretrain layout) the model puts
- # the user's instructions inside the "thinking section" and collapses
- # into repetition garbage under greedy decoding.
parts = ["<|startoftext|>"]
if sys_text:
parts.append(f"{sys_text}\n\n")
@@ -141,67 +192,52 @@ def build_prompt(
parts.append("\n\nAssistant: ")
if trigger_tag:
parts.append(trigger_tag)
-
return "".join(parts)
@dataclass
class PromptTokensResult:
- token_ids: list[int] # The tokenized prompt
- system_prompt_type: str # The effective system prompt type used
+ token_ids: list[int]
+ system_prompt_type: str
def build_prompt_tokens(
user_prompt: str,
tokenizer,
- task: str = "it2i_think",
+ task: str = "it2i",
+ bot_task: str | None = "think",
sys_type: str | None = None,
custom_system_prompt: str | None = None,
num_images: int = 1,
) -> PromptTokensResult:
- """Segment-by-segment tokenization that matches HF apply_chat_template.
-
- Calling tokenizer.encode(build_prompt(...)) on the full string lets BPE
- merge tokens across segment boundaries (e.g. user_prompt ends with `。`
- and the next segment is `\\n\\n` -> they merge into a single token id
- 3490 instead of HF's [1811, 271]). HF's apply_chat_template tokenizes
- each segment independently and concatenates token_ids, so no cross-
- boundary merge happens. We replicate that here and feed the result to
- Omni via OmniTokensPrompt (prompt_token_ids).
-
- Returns:
- PromptTokensResult
-
- `num_images` inserts N `
` token ids; see `build_prompt`.
- """
- if task not in _TASK_PRESETS:
- raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
-
- preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+ """Segment-by-segment tokenization that matches HF apply_chat_template."""
+ task, bot_task = _normalize_task_and_bot_task(task, bot_task)
+ preset_sys_type, trigger_tag = _resolve_preset(task, bot_task)
effective_sys_type = sys_type or preset_sys_type
bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>")
img_id = tokenizer.convert_tokens_to_ids("
")
trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None
- has_image_input = task.startswith("i2t") or task.startswith("it2i")
+ has_image_input = task in ("i2t", "it2i")
if has_image_input:
_validate_num_images(num_images)
- # t2i_vanilla uses pretrain template with no chat structure; the vanilla
- # system prompt drives the model directly. No segment boundaries to
- # protect, fall back to whole-string encode.
- if task == "t2i_vanilla":
- s = build_prompt(user_prompt, task, sys_type, custom_system_prompt)
+ if bot_task == "vanilla":
+ s = build_prompt(
+ user_prompt,
+ task=task,
+ bot_task=bot_task,
+ sys_type=sys_type,
+ custom_system_prompt=custom_system_prompt,
+ )
token_ids = tokenizer.encode(s, add_special_tokens=False)
return PromptTokensResult(
token_ids=token_ids,
system_prompt_type=effective_sys_type,
)
- system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt)
- # Do NOT strip -- HF apply_chat_template keeps the system prompt's
- # natural trailing newline; stripping it would shift one token id.
+ system_prompt = get_system_prompt(effective_sys_type, bot_task, custom_system_prompt)
sys_text = system_prompt or ""
ids: list[int] = [bos_id]
@@ -226,8 +262,10 @@ def build_prompt_tokens(
"HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS",
"MAX_IMAGES_PER_REQUEST",
"_TASK_PRESETS",
+ "available_bot_tasks",
"available_tasks",
"build_prompt",
"build_prompt_tokens",
"resolve_stop_token_ids",
+ "resolve_sys_type",
]
From f4d76d5ea2b791b9a54fbc4daaa84242c89c0f62 Mon Sep 17 00:00:00 2001
From: TaffyOfficial
Date: Sat, 9 May 2026 14:47:51 +0800
Subject: [PATCH 03/43] [Feature] HunyuanImage-3.0 IT2I: wire multi-image
through online serving
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Multi-image IT2I worked offline but `/v1/images/edits` returned HTTP 400
"multi_modal_uuids['image'] must have same length as multi_modal_data['image']"
because the serving layer never expanded uuids past one-per-modality-key.
Two serving-side gaps the model PR did not cover:
1. `serving_chat.py:_build_multistage_generation_inputs` (and its mirror in
the chat-completion image-gen path) built `multi_modal_uuids` by iterating
over dict keys, producing one uuid per modality regardless of value
shape. For `engine_prompt_data = {"image": [pil1, pil2]}` this yielded
`{"image": ["img-image-0"]}` (1 uuid), which vLLM's renderer then
rejected against the 2-item parsed image list. Fixed by expanding the
uuid list to `len(value)` when the value is a list, while keeping the
single-uuid behavior for scalar values (e.g. `{"img2img": pil}`).
2. `model_metadata._DIFFUSION_MODEL_METADATA` only registered
`QwenImageEditPlusPipeline` as supports_multimodal_inputs=True, so
`od_config.supports_multimodal_inputs` defaulted to False for
HunyuanImage3Pipeline. The multistage edit path bypasses that check
on the way in, but the chat path's `generate_diffusion_images` does
query it (line 2322) and would reject multi-image with "Multiple
input images are not supported by the current diffusion model".
Registered `HunyuanImage3Pipeline` with `max_multimodal_image_inputs=3`
to match upstream's "Multi-Image Fusion" cap (README §200-216).
Static change only; uuid expansion was traced through serving_chat ->
async_omni -> async_omni_engine.add_request -> InputProcessor ->
OmniInputPreprocessor._process_text -> renderer._process_multimodal ->
_validate_mm_uuids. End-to-end smoke against /v1/images/edits with two
`-F image=@...` parts is left for a follow-up; reproducing requires
PYTHONPATH=<path-to-this-checkout> when launching `vllm serve` so the system
Python's editable vllm-omni install does not shadow the rebased branch.
Signed-off-by: TaffyOfficial
---
vllm_omni/diffusion/model_metadata.py | 6 ++++++
vllm_omni/entrypoints/openai/serving_chat.py | 13 +++++++++++--
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/vllm_omni/diffusion/model_metadata.py b/vllm_omni/diffusion/model_metadata.py
index ec133e7380e..f3346338434 100644
--- a/vllm_omni/diffusion/model_metadata.py
+++ b/vllm_omni/diffusion/model_metadata.py
@@ -13,6 +13,8 @@ class DiffusionModelMetadata:
QWEN_IMAGE_EDIT_PLUS_MAX_INPUT_IMAGES = 4
+# Upstream HunyuanImage-3.0 "Multi-Image Fusion" caps reference images at 3.
+HUNYUAN_IMAGE3_MAX_INPUT_IMAGES = 3
_DIFFUSION_MODEL_METADATA: dict[str, DiffusionModelMetadata] = {
@@ -20,6 +22,10 @@ class DiffusionModelMetadata:
supports_multimodal_inputs=True,
max_multimodal_image_inputs=QWEN_IMAGE_EDIT_PLUS_MAX_INPUT_IMAGES,
),
+ "HunyuanImage3Pipeline": DiffusionModelMetadata(
+ supports_multimodal_inputs=True,
+ max_multimodal_image_inputs=HUNYUAN_IMAGE3_MAX_INPUT_IMAGES,
+ ),
}
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 99827454e70..9ec626a3e74 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -419,7 +419,10 @@ async def create_chat_completion(
# consistency. After the multimodal processor consumes
# the image data, the uuids remain as a stable reference.
tprompt["multi_modal_uuids"] = {
- k: [f"{request_id}-{k}-{i}"] for i, k in enumerate(engine_prompt_image)
+ k: [f"{request_id}-{k}-{i}" for i in range(len(v))]
+ if isinstance(v, list)
+ else [f"{request_id}-{k}-0"]
+ for k, v in engine_prompt_image.items()
}
engine_prompts = [tprompt]
@@ -2295,7 +2298,13 @@ def _build_multistage_generation_inputs(
engine_prompt["multi_modal_data"] = engine_prompt_data
# Provide multi_modal_uuids so that newer vLLM versions can
# validate multi_modal_data / multi_modal_uuids consistency.
- engine_prompt["multi_modal_uuids"] = {k: [f"img-{k}-{i}"] for i, k in enumerate(engine_prompt_data)}
+ # Generate one uuid per image when the value is a list (multi-image inputs).
+ engine_prompt["multi_modal_uuids"] = {
+ k: [f"img-{k}-{i}" for i in range(len(v))]
+ if isinstance(v, list)
+ else [f"img-{k}-0"]
+ for k, v in engine_prompt_data.items()
+ }
comprehension_idx = None
for idx, stage in enumerate(stage_configs):
From c18f01674d457e7da3d7f79b93f7fe871a34fbb1 Mon Sep 17 00:00:00 2001
From: TaffyOfficial
Date: Sat, 9 May 2026 15:03:27 +0800
Subject: [PATCH 04/43] [Bugfix] HunyuanImage-3.0 ar2diffusion: honor
AR-predicted output ratio
DiT output collapsed to a square whenever the input bucket was square,
even though the AR engine had already predicted a different aspect via
its `<img_size_*><img_ratio_*>` tail. The bridge ignored the prediction
and forwarded the prompt-carried `height`/`width` straight to the
diffusion pipeline:
height = original_prompt.get("height", 1024)
width = original_prompt.get("width", 1024)
In the `/v1/images/edits` path that prompt height/width is filled with
`pil_images[0].size` (api_server.py:1808-1811) when the client does not
pass `--size`/`resolution`, so the first reference image's bucket
(typically a logo, square) determined the DiT canvas regardless of what
the prompt actually called for. Mirrors the issue called out in the
multi-image PR's commit message ("Output-size handling for the AR/DiT
ratio lifecycle is intentionally NOT touched ... properly wiring that
into ar2diffusion's width/height assignment is a separate refactor").
Wires the AR's ratio_index back into the bridge:
1. Recover ratio_index from the AR output. Probe the detokenized text
first (cheap, works under `skip_special_tokens: False` like
`hunyuan_image3_it2i_kv_reuse.yaml`); fall back to scanning
`cumulative_token_ids` against the tokenizer's
`<img_ratio_0>..<img_ratio_36>` id range so the fix also holds when
the AR engine strips special tokens from text. The token-id table
is loaded once via AutoTokenizer (cached, model name overridable
via `VLLM_OMNI_HUNYUAN_IMAGE3_MODEL`) and shaped to mirror
`HunyuanImage3ForCausalMM.__init__:1523-1531` (contiguous main
slice 0..32 plus extra slice 33..36).
2. Resolve ratio_index to (height, width) via
`ResolutionGroup(base_size=1024).data[ratio_index]`, which is the
same reverse lookup `HunyuanImage3ImageProcessor.build_image_info`
uses upstream when constructing the DiT image_info from
`<img_ratio_*>`. Falls back to the prompt-carried height/width
when no ratio token is present (comprehension paths, AR aborted
before the size+ratio tail) so non-IT2I/T2I flows are unaffected.
End-to-end smoke is left for a follow-up: test/repro requires
`PYTHONPATH=<path-to-this-checkout> vllm serve ...` to keep the system
Python's editable vllm-omni install from shadowing this branch (same
caveat as the prior multi-image uuid commit).
Signed-off-by: TaffyOfficial
---
.../stage_input_processors/hunyuan_image3.py | 137 +++++++++++++++++-
1 file changed, 136 insertions(+), 1 deletion(-)
diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index b7630bb8ac8..9a53bf4be06 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -12,6 +12,9 @@
from __future__ import annotations
+import os
+import re
+from functools import lru_cache
from typing import Any
import torch
@@ -22,6 +25,108 @@
logger = init_logger(__name__)
+# AR emits `` after `` in IT2I/T2I
+# (see `HunyuanImage3ForCausalMM.sample` and `_stage_transitions`). The
+# ratio_index resolves to a (height, width) bucket via ResolutionGroup, which
+# is the official upstream's mechanism for AR-driven output aspect — without
+# this lookup the DiT pipeline falls back to the user-provided width/height
+# (in the `/v1/images/edits` path that defaults to `pil_images[0].size`,
+# i.e. the first reference image's bucket — usually square, see
+# api_server.py:1808-1811).
+_RATIO_TOKEN_RE = re.compile(r"")
+_DEFAULT_HUNYUAN_IMAGE3_MODEL = "tencent/HunyuanImage-3.0-Instruct"
+
+
+@lru_cache(maxsize=4)
+def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]:
+ """Return `[(height, width)]` indexed by ratio_index for HunyuanImage-3.
+
+ Mirrors `HunyuanImage3ImageProcessor.build_image_info`'s
+ `reso_group[ratio_index]` reverse lookup. Cached because the table
+ is constant per `base_size`.
+ """
+ from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import ResolutionGroup
+
+ reso_group = ResolutionGroup(base_size=base_size)
+ return [(int(r.height), int(r.width)) for r in reso_group.data]
+
+
+@lru_cache(maxsize=4)
+def _build_ratio_id_lookup(model_name_or_path: str) -> dict[int, int]:
+ """Return `{token_id: ratio_index}` for `` in the tokenizer.
+
+ Loads the tokenizer once per model path and walks the contiguous
+ `<img_ratio_0>..<img_ratio_32>` plus the extra slice
+ `<img_ratio_33>..<img_ratio_36>` (the same shape
+ `HunyuanImage3ForCausalMM.__init__` registers at lines 1523-1531).
+ Empty dict on lookup failure so callers can degrade gracefully.
+ """
+ try:
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+ except Exception as e: # pragma: no cover - environment-dependent
+ logger.warning("[ar2diffusion] failed to load tokenizer for ratio token lookup: %s", e)
+ return {}
+
+ def _id(name: str) -> int | None:
+ tid = tokenizer.convert_tokens_to_ids(name)
+ return None if tid is None or tid == tokenizer.unk_token_id else int(tid)
+
+ ratio_0 = _id("")
+ ratio_32 = _id("")
+ ratio_33 = _id("")
+ ratio_36 = _id("")
+ if None in (ratio_0, ratio_32, ratio_33, ratio_36):
+ logger.warning("[ar2diffusion] tokenizer is missing one of tokens")
+ return {}
+
+ table: dict[int, int] = {}
+ for i in range(ratio_32 - ratio_0 + 1):
+ table[ratio_0 + i] = i
+ base_idx = ratio_32 - ratio_0 + 1
+ for j in range(ratio_36 - ratio_33 + 1):
+ table[ratio_33 + j] = base_idx + j
+ return table
+
+
+def _extract_ratio_index(generated_text: str, generated_token_ids, model_name_or_path: str) -> int | None:
+ """Resolve the AR-predicted ratio_index from this stage's output.
+
+ Two probe paths:
+ 1. Text regex on `generated_text` — works when the AR engine is
+ configured with `skip_special_tokens: False` (e.g.
+ `hunyuan_image3_it2i_kv_reuse.yaml`). Cheap and avoids loading
+ the tokenizer.
+ 2. Token-id scan over `cumulative_token_ids` against the tokenizer's
+ `<img_ratio_*>` id range — survives `skip_special_tokens: True`
+ where the special tokens are stripped from text but still present
+ in the raw token stream.
+
+ Takes the LAST ratio token in the stream because the AR's
+ stage-transition logic emits exactly one such token at the tail of the
+ `` sequence; using "last" is robust to
+ any earlier accidental occurrences in the prompt scaffold.
+ """
+ matches = _RATIO_TOKEN_RE.findall(generated_text or "")
+ if matches:
+ try:
+ return int(matches[-1])
+ except ValueError:
+ pass
+
+ if generated_token_ids is None:
+ return None
+ table = _build_ratio_id_lookup(model_name_or_path)
+ if not table:
+ return None
+ last_ratio_idx: int | None = None
+ for tid in generated_token_ids:
+ idx = table.get(int(tid))
+ if idx is not None:
+ last_ratio_idx = idx
+ return last_ratio_idx
+
def ar2diffusion(
source_outputs: list[Any],
@@ -65,13 +170,43 @@ def ar2diffusion(
text_prompt = original_prompt.get("prompt", "")
use_system_prompt = original_prompt.get("use_system_prompt")
+ # Prefer the AR's predicted output aspect (`<img_size_*><img_ratio_*>`
+ # tail emitted by `HunyuanImage3ForCausalMM.sample` under the
+ # ratio-restriction logits processor) over the carried-through
+ # height/width, which the serving layer fills with the first
+ # reference image's bucket and so collapses non-square targets to
+ # square in the multi-image / mismatched-aspect case. Mirrors the
+ # official upstream where `reso_group[ratio_index]` is the
+ # canonical source of the diffusion target shape.
+ model_name_or_path = original_prompt.get("model") or os.environ.get(
+ "VLLM_OMNI_HUNYUAN_IMAGE3_MODEL", _DEFAULT_HUNYUAN_IMAGE3_MODEL
+ )
+ ratio_idx = _extract_ratio_index(generated_text, generated_token_ids, model_name_or_path)
+ ar_predicted = False
+ if ratio_idx is not None:
+ base_size = int(original_prompt.get("image_base_size", 1024))
+ size_table = _build_ratio_size_table(base_size)
+ if 0 <= ratio_idx < len(size_table):
+ height, width = size_table[ratio_idx]
+ ar_predicted = True
+ else:
+ logger.warning(
+ "[ar2diffusion] Request %d: ratio_index=%d out of range [0,%d), keeping prompt size %dx%d",
+ i,
+ ratio_idx,
+ len(size_table),
+ height,
+ width,
+ )
+
logger.info(
- "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, target size=%dx%d",
+ "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, target size=%dx%d (%s)",
i,
len(generated_token_ids),
len(generated_text),
height,
width,
+ f"AR ratio_idx={ratio_idx}" if ar_predicted else "from prompt (no AR ratio token)",
)
token_tensor = torch.tensor(generated_token_ids, dtype=torch.long)
From c5f2f9bd618e4b5998ba8fbe53ccca7bb3b894a2 Mon Sep 17 00:00:00 2001
From: TaffyOfficial
Date: Sat, 9 May 2026 15:23:39 +0800
Subject: [PATCH 05/43] [Chore] HunyuanImage-3.0 end2end: accept internal task
names as --modality aliases
`--modality img2img` historically pointed at the internal task `it2i`,
so users who think in the post-`prompt_utils` task vocabulary
(`t2i`/`it2i`/`i2t`/`t2t`, see `_TASK_PRESETS`) had to translate.
Common enough that two recent reproduction commands hit the
`invalid choice: 'it2i'` argparse error before getting any actual
output.
Accepts both spellings on the CLI and canonicalizes the short forms to
the verbose names right after parsing so the downstream
`args.modality == "img2img"` branches stay one-line and do not have to
enumerate aliases. Default value, choices listing, and behavior for
existing verbose names unchanged.
Signed-off-by: TaffyOfficial
---
.../hunyuan_image3/end2end.py | 78 +++++++------------
1 file changed, 28 insertions(+), 50 deletions(-)
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 9d8f5113201..b560926f1b7 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -1,11 +1,5 @@
"""
HunyuanImage-3.0-Instruct unified end-to-end inference script.
-
-Supports all modalities through a single entry point:
- - text2img: Text -> AR -> DiT -> Image
- - img2img: Text+Image -> AR -> DiT -> Edited Image (IT2I)
- - img2text: Image+Text -> AR -> Text description (I2T)
- - text2text: Text -> AR -> Text (comprehension, no image)
"""
import argparse
@@ -21,11 +15,29 @@
from vllm_omni.entrypoints.omni import Omni
from vllm_omni.inputs.data import OmniPromptType
-# Default deploy configs are absolute so this example works from any cwd.
_REPO_ROOT = Path(__file__).resolve().parents[3]
_DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml")
_DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml")
+# Both verbose and short-form aliases are accepted.
+_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
+ "text2img": ("t2i", "think"),
+ "t2i": ("t2i", "think"),
+ "img2img": ("it2i", "think"),
+ "it2i": ("it2i", "think"),
+ "img2text": ("i2t", None),
+ "i2t": ("i2t", None),
+ "text2text": ("t2t", None),
+ "t2t": ("t2t", None),
+}
+
+_MODALITY_CANONICAL = {
+ "t2i": "text2img",
+ "it2i": "img2img",
+ "i2t": "img2text",
+ "t2t": "text2text",
+}
+
_MODALITY_DEFAULT_DEPLOY_CONFIG = {
"text2img": _DEFAULT_DEPLOY_CONFIG,
"img2img": _DEFAULT_DEPLOY_CONFIG,
@@ -40,27 +52,15 @@
"text2text": "text-to-text",
}
-# Modality -> (task, default bot_task) mapping.
-_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
- "text2img": ("t2i", "think"),
- "img2img": ("it2i", "think"),
- "img2text": ("i2t", None),
- "text2text": ("t2t", None),
-}
-
def parse_args():
parser = argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.")
- parser.add_argument(
- "--model",
- default="tencent/HunyuanImage-3.0-Instruct",
- help="Model name or local path.",
- )
+ parser.add_argument("--model", default="tencent/HunyuanImage-3.0-Instruct", help="Model name or local path.")
parser.add_argument(
"--modality",
default="text2img",
- choices=["text2img", "img2img", "img2text", "text2text"],
- help="Modality mode to control stage execution.",
+ choices=["text2img", "t2i", "img2img", "it2i", "img2text", "i2t", "text2text", "t2t"],
+ help="Verbose and internal short task names are both accepted.",
)
parser.add_argument("--prompts", nargs="+", default=None, help="Input text prompts.")
parser.add_argument(
@@ -69,24 +69,14 @@ def parse_args():
default=None,
help="Input image path(s) for img2img/img2text. Comma-separated for multi-image (up to 3).",
)
- parser.add_argument(
- "--output",
- type=str,
- default=".",
- help="Output directory to save results.",
- )
+ parser.add_argument("--output", type=str, default=".", help="Output directory to save results.")
parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.")
parser.add_argument("--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale.")
parser.add_argument("--seed", type=int, default=42, help="Random seed.")
parser.add_argument("--height", type=int, default=1024, help="Output image height.")
parser.add_argument("--width", type=int, default=1024, help="Output image width.")
- parser.add_argument(
- "--vae-use-tiling",
- action="store_true",
- help="Enable VAE tiling for memory optimization.",
- )
-
+ parser.add_argument("--vae-use-tiling", action="store_true", help="Enable VAE tiling.")
parser.add_argument(
"--bot-task",
type=str,
@@ -94,13 +84,7 @@ def parse_args():
choices=["none", "think", "recaption", "think_recaption", "vanilla"],
help="Override prompt mode. Default: auto from --modality.",
)
- parser.add_argument(
- "--sys-type",
- type=str,
- default=None,
- help="Override system prompt type (e.g. en_unified, en_vanilla).",
- )
-
+ parser.add_argument("--sys-type", type=str, default=None, help="Override system prompt type.")
parser.add_argument("--deploy-config", type=str, default=None, help="Custom deploy YAML path.")
parser.add_argument("--stage-configs-path", type=str, default=None, help="Custom legacy stage config YAML path.")
parser.add_argument("--log-stats", action="store_true", default=False)
@@ -146,6 +130,7 @@ def main():
os.makedirs(args.output, exist_ok=True)
additional_config = parse_additional_config(args.additional_config)
+ args.modality = _MODALITY_CANONICAL.get(args.modality, args.modality)
task, default_bot_task = _MODALITY_TASK_MAP[args.modality]
if args.bot_task is None:
bot_task: str | None = default_bot_task
@@ -168,6 +153,7 @@ def main():
"log_stats": args.log_stats,
"init_timeout": args.init_timeout,
"enforce_eager": args.enforce_eager,
+ "mode": _MODALITY_MODE[args.modality],
}
if additional_config is not None:
@@ -176,14 +162,10 @@ def main():
omni_kwargs["deploy_config"] = deploy_config
else:
omni_kwargs["stage_configs_path"] = stage_configs_path
- omni_kwargs["mode"] = _MODALITY_MODE[args.modality]
omni = Omni(**omni_kwargs)
prompts = args.prompts or ["A cute cat"]
- if not prompts:
- prompts = ["A cute cat"]
-
input_images: list = []
if args.modality in ("img2img", "img2text"):
if not args.image_path:
@@ -217,7 +199,6 @@ def main():
"prompt": prompt,
"use_system_prompt": effective_sys_type,
}
-
if args.modality == "text2img":
prompt_dict["modalities"] = ["image"]
elif args.modality == "img2img":
@@ -228,9 +209,8 @@ def main():
elif args.modality == "img2text":
prompt_dict["modalities"] = ["text"]
prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
- elif args.modality == "text2text":
+ else:
prompt_dict["modalities"] = ["text"]
-
formatted_prompts.append(prompt_dict)
params_list = list(omni.default_sampling_params_list)
@@ -276,7 +256,6 @@ def main():
print(f"{'=' * 60}\n")
omni_outputs = list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list))
-
img_idx = 0
for req_output in omni_outputs:
ro = getattr(req_output, "request_output", None)
@@ -295,7 +274,6 @@ def main():
images = getattr(req_output, "images", None)
if not images and ro and hasattr(ro, "images"):
images = ro.images
-
if images:
for j, img in enumerate(images):
save_path = os.path.join(args.output, f"output_{img_idx}_{j}.png")
From 2ff92b7f0002a6fe957e2247d7ea205d92a13467 Mon Sep 17 00:00:00 2001
From: skf1999 <13234016272@163.com>
Date: Sun, 10 May 2026 01:41:23 +0800
Subject: [PATCH 06/43] feat(end2end): semantic output shape for multi-image
IT2I
Signed-off-by: skf1999 <13234016272@163.com>
---
.../hunyuan_image3/end2end.py | 48 +++++++++++++++----
1 file changed, 40 insertions(+), 8 deletions(-)
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index b560926f1b7..b46e326d1c8 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -5,6 +5,7 @@
import argparse
import json
import os
+import re
from pathlib import Path
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
@@ -19,7 +20,6 @@
_DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml")
_DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml")
-# Both verbose and short-form aliases are accepted.
_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
"text2img": ("t2i", "think"),
"t2i": ("t2i", "think"),
@@ -70,7 +70,6 @@ def parse_args():
help="Input image path(s) for img2img/img2text. Comma-separated for multi-image (up to 3).",
)
parser.add_argument("--output", type=str, default=".", help="Output directory to save results.")
-
parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.")
parser.add_argument("--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale.")
parser.add_argument("--seed", type=int, default=42, help="Random seed.")
@@ -125,6 +124,30 @@ def parse_additional_config(raw_value: str | None) -> dict | None:
return additional_config
+def _infer_shape_reference_index(prompt: str, num_images: int) -> int:
+ chinese_nums = {"一": 1, "二": 2, "三": 3}
+
+ def _to_idx(match: re.Match[str]) -> int | None:
+ token = match.group(1).strip()
+ value = chinese_nums.get(token, int(token) if token.isdigit() else None)
+ return value - 1 if value and 1 <= value <= num_images else None
+
+ for pattern in (
+ r"参考图\s*([一二三123])",
+ r"参考第\s*([一二三123])\s*张",
+ r"参考\s*image\s*([123])",
+ r"ref(?:erence)?\s*image\s*([123])",
+ r"基于图\s*([一二三123])",
+ r"基于第\s*([一二三123])\s*张",
+ r"基于\s*image\s*([123])",
+ r"based\s*on\s*image\s*([123])",
+ ):
+ match = re.search(pattern, prompt, re.IGNORECASE)
+ if match and (idx := _to_idx(match)) is not None:
+ return idx
+ return 0
+
+
def main():
args = parse_args()
os.makedirs(args.output, exist_ok=True)
@@ -173,10 +196,10 @@ def main():
from PIL import Image
image_paths = [p.strip() for p in args.image_path.split(",") if p.strip()]
- for p in image_paths:
- if not os.path.exists(p):
- raise ValueError(f"Image path does not exist: {p}")
- input_images.append(Image.open(p).convert("RGB"))
+ for image_path in image_paths:
+ if not os.path.exists(image_path):
+ raise ValueError(f"Image path does not exist: {image_path}")
+ input_images.append(Image.open(image_path).convert("RGB"))
if not input_images:
raise ValueError(f"--image-path produced no usable paths: {args.image_path!r}")
@@ -186,6 +209,7 @@ def main():
mm_image_payload = (input_images[0] if len(input_images) == 1 else input_images) if input_images else None
formatted_prompts: list[OmniPromptType] = []
+ shape_indices: list[int] = []
for prompt in prompts:
build_kwargs: dict = {"task": task, "bot_task": bot_task, "sys_type": args.sys_type}
if input_images:
@@ -204,8 +228,10 @@ def main():
elif args.modality == "img2img":
prompt_dict["modalities"] = ["image"]
prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
- prompt_dict["height"] = input_images[0].height
- prompt_dict["width"] = input_images[0].width
+ shape_idx = _infer_shape_reference_index(prompt, len(input_images))
+ prompt_dict["height"] = input_images[shape_idx].height
+ prompt_dict["width"] = input_images[shape_idx].width
+ shape_indices.append(shape_idx)
elif args.modality == "img2text":
prompt_dict["modalities"] = ["text"]
prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
@@ -218,6 +244,7 @@ def main():
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer)
+ diffusion_idx = 0
for sp in params_list:
if isinstance(sp, OmniDiffusionSamplingParams):
sp.num_inference_steps = args.steps
@@ -228,6 +255,11 @@ def main():
if args.modality == "text2img":
sp.height = args.height
sp.width = args.width
+ elif args.modality == "img2img":
+ shape_idx = shape_indices[diffusion_idx]
+ sp.height = input_images[shape_idx].height
+ sp.width = input_images[shape_idx].width
+ diffusion_idx += 1
elif hasattr(sp, "stop_token_ids"):
sp.stop_token_ids = ar_stop_token_ids
From 74e5caca3b8d8b8ff7e3b3a529ad33cd3567c1e5 Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Sun, 10 May 2026 03:57:53 +0800
Subject: [PATCH 07/43] [Chore] Apply pre-commit formatting fixes
Auto-applied by ruff/whitespace hooks: extra blank lines between
top-level functions, stripped trailing whitespace, and collapsed a
dict-comprehension expression onto a single line.
Signed-off-by: zuiho <2324465096@qq.com>
---
vllm_omni/entrypoints/openai/serving_chat.py | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 9ec626a3e74..a5ca494c89e 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2300,9 +2300,7 @@ def _build_multistage_generation_inputs(
# validate multi_modal_data / multi_modal_uuids consistency.
# Generate one uuid per image when the value is a list (multi-image inputs).
engine_prompt["multi_modal_uuids"] = {
- k: [f"img-{k}-{i}" for i in range(len(v))]
- if isinstance(v, list)
- else [f"img-{k}-0"]
+ k: [f"img-{k}-{i}" for i in range(len(v))] if isinstance(v, list) else [f"img-{k}-0"]
for k, v in engine_prompt_data.items()
}
From d7400dca983a03c9c74bbb59fa6288b226e17452 Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Sun, 10 May 2026 15:40:10 +0800
Subject: [PATCH 08/43] fix(hunyuan_image3): honor ar2diffusion's predicted
shape in pre_process_func
pre_process_func was unconditionally filling None sampling_params.height/width
with image_list[0].size, burying the AR-predicted ratio that ar2diffusion
(e31197f0) had written into prompt["height"]/["width"]. forward() reads only
sampling_params, so the bridge was a silent no-op on the IT2I path -- DiT
output collapsed to the first reference image's bucket regardless of what
the AR predicted via its `<img_ratio_*>` token.
Now prefer prompt["height"]/["width"] (bridge-supplied) over image_list[0]
when sampling_params is None. Caller-explicit sampling_params still wins
via the surrounding `is None` guards. Mirrors GLM-Image's precedent at
pipeline_glm_image.py:718-737 and matches official HunyuanImage-3.0
image_size=="auto" semantics where vae_reso_group[ratio_index] is the
canonical source of DiT shape.
Signed-off-by: zuiho <2324465096@qq.com>
---
.../models/hunyuan_image3/pipeline_hunyuan_image3.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 74fe268babf..b1ba2687f86 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -283,11 +283,13 @@ def pre_process_func(request: OmniDiffusionRequest):
cond_image_infos = [_build_cond_joint_image(image) for image in image_list]
prompt["additional_information"]["batch_cond_image_info"] = cond_image_infos
+ bridge_h = prompt.get("height") if isinstance(prompt, dict) else None
+ bridge_w = prompt.get("width") if isinstance(prompt, dict) else None
first_image_w, first_image_h = _to_pil_image(image_list[0]).size
if request.sampling_params.width is None:
- request.sampling_params.width = int(first_image_w)
+ request.sampling_params.width = int(bridge_w or first_image_w)
if request.sampling_params.height is None:
- request.sampling_params.height = int(first_image_h)
+ request.sampling_params.height = int(bridge_h or first_image_h)
request.prompts[i] = prompt
From d7c760e258c4e4ec1896768fd5e0e5c7d5d4c6bd Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Sun, 10 May 2026 15:40:23 +0800
Subject: [PATCH 09/43] refactor(end2end): drop multi-image regex shape
heuristic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Reverts 6a1985f1 ("feat(end2end): semantic output shape for multi-image
IT2I"). With the prior commit's pipeline fix in place, AR-predicted
`<img_ratio_*>` tokens flow through ar2diffusion to DiT output shape, so
the prompt-regex layer (parsing "参考图二" / "based on image 2" to pick
a reference image's H/W) is no longer needed and contradicts official
HunyuanImage-3.0 image_size=="auto" semantics.
Signed-off-by: zuiho <2324465096@qq.com>
---
.../hunyuan_image3/end2end.py | 38 +------------------
1 file changed, 2 insertions(+), 36 deletions(-)
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index b46e326d1c8..82e8c194c5a 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -5,7 +5,6 @@
import argparse
import json
import os
-import re
from pathlib import Path
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
@@ -124,30 +123,6 @@ def parse_additional_config(raw_value: str | None) -> dict | None:
return additional_config
-def _infer_shape_reference_index(prompt: str, num_images: int) -> int:
- chinese_nums = {"一": 1, "二": 2, "三": 3}
-
- def _to_idx(match: re.Match[str]) -> int | None:
- token = match.group(1).strip()
- value = chinese_nums.get(token, int(token) if token.isdigit() else None)
- return value - 1 if value and 1 <= value <= num_images else None
-
- for pattern in (
- r"参考图\s*([一二三123])",
- r"参考第\s*([一二三123])\s*张",
- r"参考\s*image\s*([123])",
- r"ref(?:erence)?\s*image\s*([123])",
- r"基于图\s*([一二三123])",
- r"基于第\s*([一二三123])\s*张",
- r"基于\s*image\s*([123])",
- r"based\s*on\s*image\s*([123])",
- ):
- match = re.search(pattern, prompt, re.IGNORECASE)
- if match and (idx := _to_idx(match)) is not None:
- return idx
- return 0
-
-
def main():
args = parse_args()
os.makedirs(args.output, exist_ok=True)
@@ -209,7 +184,6 @@ def main():
mm_image_payload = (input_images[0] if len(input_images) == 1 else input_images) if input_images else None
formatted_prompts: list[OmniPromptType] = []
- shape_indices: list[int] = []
for prompt in prompts:
build_kwargs: dict = {"task": task, "bot_task": bot_task, "sys_type": args.sys_type}
if input_images:
@@ -228,10 +202,8 @@ def main():
elif args.modality == "img2img":
prompt_dict["modalities"] = ["image"]
prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
- shape_idx = _infer_shape_reference_index(prompt, len(input_images))
- prompt_dict["height"] = input_images[shape_idx].height
- prompt_dict["width"] = input_images[shape_idx].width
- shape_indices.append(shape_idx)
+ prompt_dict["height"] = input_images[0].height
+ prompt_dict["width"] = input_images[0].width
elif args.modality == "img2text":
prompt_dict["modalities"] = ["text"]
prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
@@ -244,7 +216,6 @@ def main():
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer)
- diffusion_idx = 0
for sp in params_list:
if isinstance(sp, OmniDiffusionSamplingParams):
sp.num_inference_steps = args.steps
@@ -255,11 +226,6 @@ def main():
if args.modality == "text2img":
sp.height = args.height
sp.width = args.width
- elif args.modality == "img2img":
- shape_idx = shape_indices[diffusion_idx]
- sp.height = input_images[shape_idx].height
- sp.width = input_images[shape_idx].width
- diffusion_idx += 1
elif hasattr(sp, "stop_token_ids"):
sp.stop_token_ids = ar_stop_token_ids
From 2175a9974bfbb0b3f6a85d26c070f2c22329df8f Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Sun, 10 May 2026 21:42:44 +0800
Subject: [PATCH 10/43] fix(hunyuan_image3): add official extra resolution
buckets (idx 33-36)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
`ResolutionGroup` only walked the step-based buckets (idx 0-32) and
dropped the official's four extra resolutions at indices 33-36. The
trained model has ratio token vocabulary 0-36, and AR was trained to
address all 37 buckets; without the extras, wide reference images
bucket-collapse to the closest base ratio (e.g. input_1_1's 1179x685
maps to idx=12 / 1280x768 instead of idx=36 / 720x1280) and the AR's
`<img_ratio_*>` token range can't address the missing aspects.
Adds `HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS` in `hunyuan_image3_transformer.py`
as the single source of truth (mirrors official `image_processor.py:
147-152`) and threads it through both:
- `HunyuanImage3Processor.ResolutionGroup` (AR-side cond-image bucket
selection)
- `_build_ratio_size_table` (bridge's reverse lookup ratio_idx →
(h, w) for ar2diffusion → DiT shape)
Signed-off-by: zuiho <2324465096@qq.com>
---
.../hunyuan_image3_transformer.py | 20 ++++++++++++++++-
.../models/hunyuan_image3/hunyuan_image3.py | 22 +++++++++++++++++--
.../stage_input_processors/hunyuan_image3.py | 13 ++++++++---
3 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py
index 1eb0cdf113b..5a707acbda5 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py
@@ -471,8 +471,21 @@ def __str__(self):
return f"{self.h}x{self.w}"
+# Baked-in extras matching the official model's
+# `HunyuanImage3ImageProcessor.vae_reso_group` (image_processor.py:147-152).
+# These four aspect buckets sit at ratio_token indices 33-36 in the trained
+# model and the AR was trained to address them, so any deviation breaks the
+# ratio-token vocab → output-shape lookup.
+HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS: tuple[str, ...] = (
+ "1024x768",
+ "1280x720",
+ "768x1024",
+ "720x1280",
+)
+
+
class ResolutionGroup:
- def __init__(self, base_size=None, step=None, align=1):
+ def __init__(self, base_size=None, step=None, align=1, extra_resolutions=None):
self.align = align
self.base_size = base_size
assert base_size % align == 0, f"base_size {base_size} is not divisible by align {align}"
@@ -486,6 +499,11 @@ def __init__(self, base_size=None, step=None, align=1):
self.step = step
self.data = self._calc_by_step()
+ if extra_resolutions is not None:
+ for er in extra_resolutions:
+ if not any(r.ratio == er.ratio for r in self.data):
+ self.data.append(er)
+
self.ratio = np.array([x.ratio for x in self.data])
self.attr = ["" for _ in range(len(self.data))]
self.prefix_space = 0
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index e9d41ebf958..bdafa5c6f87 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -737,7 +737,7 @@ def __str__(self):
class ResolutionGroup:
"""Group of resolutions for image processing."""
- def __init__(self, base_size=None, step=None, align=1):
+ def __init__(self, base_size=None, step=None, align=1, extra_resolutions=None):
self.align = align
self.base_size = base_size
assert base_size % align == 0, f"base_size {base_size} is not divisible by align {align}"
@@ -751,6 +751,11 @@ def __init__(self, base_size=None, step=None, align=1):
self.step = step
self.data = self._calc_by_step()
+ if extra_resolutions is not None:
+ for er in extra_resolutions:
+ if not any(r.ratio == er.ratio for r in self.data):
+ self.data.append(er)
+
self.ratio = np.array([x.ratio for x in self.data])
self.attr = ["" for _ in range(len(self.data))]
self.prefix_space = 0
@@ -815,7 +820,20 @@ def get_base_size_and_ratio_index(self, width, height):
def __init__(self, tokenizer, hf_config, **kwargs: object):
self.tokenizer = tokenizer
self.hf_config = hf_config
- self.reso_group = self.ResolutionGroup(base_size=hf_config.image_base_size)
+ # `HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS` mirrors the official
+ # `vae_reso_group` extras (image_processor.py:147-152). Build with
+ # this processor's inner Resolution class so `data` stays
+ # type-homogeneous.
+ from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import (
+ HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS,
+ )
+
+ self.reso_group = self.ResolutionGroup(
+ base_size=hf_config.image_base_size,
+ extra_resolutions=[
+ HunyuanImage3Processor.Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS
+ ],
+ )
self.vision_encoder_processor = Siglip2ImageProcessorFast.from_dict(hf_config.vit_processor)
self.vae_processor = transforms.Compose(
[
diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index 9a53bf4be06..63af2f7f1dd 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -45,9 +45,16 @@ def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]:
`reso_group[ratio_index]` reverse lookup. Cached because the table
is constant per `base_size`.
"""
- from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import ResolutionGroup
-
- reso_group = ResolutionGroup(base_size=base_size)
+ from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import (
+ HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS,
+ Resolution,
+ ResolutionGroup,
+ )
+
+ reso_group = ResolutionGroup(
+ base_size=base_size,
+ extra_resolutions=[Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS],
+ )
return [(int(r.height), int(r.width)) for r in reso_group.data]
From 4aaa77261b303322a907cfe0b7fe4e71b7cf6782 Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Sun, 10 May 2026 21:43:13 +0800
Subject: [PATCH 11/43] fix(hunyuan_image3): default cond image preprocessing
to resize-stretch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Match official `infer_align_image_size=True` path (image_processor.py:355
→ crop_type="resize") for IT2I cond-image preprocessing. Previously
hardcoded to center crop, which lost content from non-square reference
images and produced a near-correct-but-not-equal pixel buffer compared to
the HF reference run.
Center-crop mode is preserved as opt-in via `crop_type="center"` for
callers that want the legacy behavior.
Signed-off-by: zuiho <2324465096@qq.com>
---
.../models/hunyuan_image3/hunyuan_image3.py | 24 +++++++++++--------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index bdafa5c6f87..f6bd31283d9 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -954,29 +954,33 @@ def process_image(self, image_input: ImageInput):
return final_image_info
- def _resize_and_crop(self, image: Image.Image, target_size: tuple[int, int]) -> Image.Image:
+ def _resize_and_crop(
+ self,
+ image: Image.Image,
+ target_size: tuple[int, int],
+ crop_type: str = "resize",
+ ) -> Image.Image:
+ # Default mode mirrors the official `infer_align_image_size=True`
+ # path (image_processor.py:355 → crop_type="resize") used by the
+ # IT2I demo: stretch the cond image to the bucket dims so its
+ # `` tag and ViT/VAE features stay aligned with the
+ # bucket, instead of dropping content via center crop.
tw, th = target_size
+ if crop_type == "resize":
+ return image.resize((tw, th), resample=Image.Resampling.LANCZOS)
w, h = image.size
-
tr = th / tw
r = h / w
-
- # resize
if r < tr:
resize_height = th
resize_width = int(round(th / h * w))
else:
resize_width = tw
resize_height = int(round(tw / w * h))
-
image = image.resize((resize_width, resize_height), resample=Image.Resampling.LANCZOS)
-
- # center crop
crop_top = int(round((resize_height - th) / 2.0))
crop_left = int(round((resize_width - tw) / 2.0))
-
- image = image.crop((crop_left, crop_top, crop_left + tw, crop_top + th))
- return image
+ return image.crop((crop_left, crop_top, crop_left + tw, crop_top + th))
class HunyuanImage3ProcessingInfo(BaseProcessingInfo):
From d0c2acbfb07debda01a68b87181db3c21cbf70ac Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Sun, 10 May 2026 21:43:59 +0800
Subject: [PATCH 12/43] fix(hunyuan_image3): use real <timestep> token id at
scaffold slot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Per-image scaffold timestep slot was placeholder'd with the `<img>` token id
(128006) instead of the real `<timestep>` token id (128017), as a
workaround for vLLM's `PromptUpdateDetails.select_token_id` accepting
only a single `embed_token_id`. The runtime embedding was patched in via
the multimodal-embedding merger, so single-image numerics matched HF.
But under the AR's multimodal-bidirectional attention, that
`<img>`-as-timestep slot folded into each image's MM region. With
multi-image input, this asymmetry biased the AR's `<img_ratio_*>`
greedy argmax to the FIRST conditioning image's bucket regardless of
prompt semantics:
input order | image_1 bucket | image_2 bucket | AR predicts
-------------------|----------------|----------------|------------
square + wide | 16 | 36 | 16
wide + square | 36 | 16 | 36
single wide | -- | -- | 36 (correct)
Recaption text in both broken cases explicitly said "use image_2
resolution" but the model's ratio token still landed on image_1's
bucket. Single-image worked because there was no second region to
contaminate.
Switches the slot to the real `<timestep>` id and patches its embedding
with `timestep_emb(0)` in `embed_input_ids` via a token-id mask — same
effect as HF's `instantiate_continuous_tokens` scatter-replace
(modeling_hunyuan_image_3.py:1964). Numerically equivalent for
single-image while removing the multi-image attention pollution.
Touches: `_get_prompt_updates` scaffold, `embed_multimodal` (no longer
prepends timestep_emb), `embed_input_ids` (new mask-based replacement),
`__init__` (caches `_timestep_token_id`), `get_mrope_input_positions`
(timestep slot check now matches the real token id).
Signed-off-by: zuiho <2324465096@qq.com>
---
.../models/hunyuan_image3/hunyuan_image3.py | 77 +++++++++----------
1 file changed, 36 insertions(+), 41 deletions(-)
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index f6bd31283d9..ab9c2ee4d6e 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -1126,31 +1126,22 @@ def get_replacement_image(item_idx: int) -> PromptUpdateDetails:
ratio_token_id = tokenizer.convert_tokens_to_ids(f"")
if ratio_token_id is None:
raise ValueError(f"Ratio token '' not found in tokenizer vocabulary")
-
- # NOTE on the timestep slot:
- # HF's apply_chat_template emits the literal token id
- # 128017 here. HF's modeling forward (`instantiate_continuous_tokens`,
- # see hunyuan3.0_ins/modeling_hunyuan_image_3.py:1964) then *scatter-
- # replaces* the embedding at that position with `timestep_emb(0)`
- # for cond images. So the wte embedding of is irrelevant
- # at runtime — what matters is the timestep_emb injection.
- #
- # vllm-omni achieves the same effect via the multimodal-embedding
- # merger: we put an
(128006) placeholder here and ship a
- # `timestep_emb(0)` tensor at the head of `embed_multimodal()`'s
- # combined_embeddings. The merger replaces this placeholder's
- # embedding with the timestep tensor, yielding a final hidden
- # state numerically equivalent to HF at that position.
- #
- # Keep this slot as
(NOT ): switching to
- # requires either (a) a second PromptReplacement targeting 128017,
- # or (b) the merger's embed_token_id to be a list — neither is
- # currently supported by PromptUpdateDetails.select_token_id.
+ timestep_token_id = tokenizer.convert_tokens_to_ids("")
+ if timestep_token_id is None:
+ raise ValueError("Timestep token '' not found in tokenizer vocabulary")
+
+ # Use the real token id (HF parity). The trained wte
+ # at this slot is overwritten with timestep_emb(0) at runtime by
+ # `embed_input_ids` — same effect as HF's
+ # `instantiate_continuous_tokens` scatter-replace. Keeping the
+ # slot as
would have folded the timestep position into the
+ # multimodal bidirectional region, which empirically biased
+ # multi-image AR ratio prediction to the first image's bucket.
replacement = (
[boi_token_id]
+ [base_size_token_id]
+ [ratio_token_id]
- + [img_token_id] * timestep_token_num
+ + [timestep_token_id] * timestep_token_num
+ [img_token_id] * vae_token_num
+ [joint_img_sep_token_id]
+ [img_token_id] * vit_token_num
@@ -1542,6 +1533,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self._end_of_answer_id = tokenizer.convert_tokens_to_ids("")
image_base_size = getattr(config, "image_base_size", 1024)
self._size_token_id = tokenizer.convert_tokens_to_ids(f"")
+ self._timestep_token_id = tokenizer.convert_tokens_to_ids("")
self._start_ratio_id = tokenizer.convert_tokens_to_ids("")
self._end_ratio_id = tokenizer.convert_tokens_to_ids("")
ratio_33 = tokenizer.convert_tokens_to_ids("")
@@ -1877,27 +1869,18 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
"Each image should have both VAE and ViT embeddings."
)
- # Order per image: timestep -> VAE tokens -> ViT tokens.
- # The
placeholder at the timestep slot (see _get_prompt_updates)
- # gets its embedding replaced by `timestep_emb(0)` here, which is what
- # HF achieves via instantiate_continuous_tokens at runtime.
+ # Order per image: VAE tokens -> ViT tokens. The slot at
+ # the head of each per-image scaffold is NOT included here — its
+ # embedding is patched in by `embed_input_ids` via a token-id mask,
+ # mirroring HF's `instantiate_continuous_tokens` scatter-replace.
combined_embeddings: list[torch.Tensor] = []
num_images = len(vae_token_embeddings)
for img_idx in range(num_images):
- # 1. Timestep embedding (cond image timestep == 0)
- timestep = torch.zeros((1,)).to(vit_embeddings.device).to(vit_embeddings.dtype)
- timestep_emb = self._timestep_encode(timestep)
-
- # 2. VAE image token embeddings
vae_token_embed = vae_token_embeddings[img_idx]
- # Remove batch dimension if present: (B, seq_len, hidden_size) -> (seq_len, hidden_size)
if vae_token_embed.ndim == 3:
vae_token_embed = vae_token_embed.squeeze(0)
-
- # 3. ViT image embeddings
vit_embed = vit_embeddings[img_idx]
-
- stacked_embed = torch.cat([timestep_emb, vae_token_embed, vit_embed], dim=0)
+ stacked_embed = torch.cat([vae_token_embed, vit_embed], dim=0)
combined_embeddings.append(stacked_embed)
return combined_embeddings
@@ -1910,14 +1893,25 @@ def embed_input_ids(
is_multimodal: torch.Tensor | None = None,
) -> torch.Tensor:
"""Embed input IDs with optional multimodal embeddings."""
- # Get text embeddings
inputs_embeds = self.model.embed_input_ids(input_ids)
- # If no multimodal embeddings, return text embeddings
+ # Patch slots with timestep_emb(0). HF parity: the trained
+ # wte at this slot is irrelevant; runtime uses
+ # `instantiate_continuous_tokens(timestep_emb(0))`. With multi-image,
+ # keeping these slots as
ids merged the timestep position into
+ # the bidirectional MM region and biased AR ratio prediction toward
+ # the first image's bucket.
+ timestep_mask = input_ids == self._timestep_token_id
+ n_timestep = int(timestep_mask.sum().item())
+ if n_timestep > 0:
+ timestep_input = torch.zeros(
+ (n_timestep,), device=inputs_embeds.device, dtype=inputs_embeds.dtype
+ )
+ inputs_embeds[timestep_mask] = self._timestep_encode(timestep_input)
+
if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
return inputs_embeds
- # Merge multimodal embeddings with text embeddings
merged_embeds = _merge_multimodal_embeddings(
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
@@ -2133,6 +2127,7 @@ def get_mrope_input_positions(
boi_token_id = self._mrope_boi_token_id
eoi_token_id = self._mrope_eoi_token_id
joint_img_sep_token_id = self._mrope_joint_img_sep_token_id
+ timestep_token_id = self._timestep_token_id
# Build position arrays
t_pos: list[int] = [] # temporal (same as 1D for this model)
@@ -2149,7 +2144,7 @@ def get_mrope_input_positions(
if tok == boi_token_id:
# Found start of image block.
- # Structure:
*timestep
*vae
+ # Structure:
*vae
#
*vit
# token
t_pos.append(pos)
@@ -2174,8 +2169,8 @@ def get_mrope_input_positions(
pos += 1
i += 1
- # Timestep token (1
token)
- if i < n and input_tokens[i] == img_token_id:
+ # token (1 token)
+ if i < n and input_tokens[i] == timestep_token_id:
t_pos.append(pos)
h_pos.append(pos)
w_pos.append(pos)
From f83c2814a6c853f24a050330d8544cb395203d0c Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Mon, 11 May 2026 02:40:19 +0800
Subject: [PATCH 13/43] fix(hunyuan_image3): include <joint_img_sep> in
per-image MM region
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The per-image embed mask in `_get_prompt_updates` only marked `<img>`
tokens via `PromptUpdateDetails.select_token_id(<img>)`, so vLLM's
prefix-LM bidirectional region for each image was split into TWO
contiguous runs: VAE block, then ViT block — with `<joint_img_sep>`
sitting between them as a non-MM (causal-only) token.
Official `Tencent-Hunyuan/HunyuanImage-3.0` builds its full-attention
range via `joint_image_slices` (image_processor.py:388, default
`cond_token_attn_type` flow), spanning VAE + sep + ViT as ONE
continuous bidirectional slice per cond image. The trained model
expects this layout.
In the multi-image case the asymmetry between training (sep inside
the MM region) and our inference (sep outside) was the dominant
remaining mismatch: empirically AR's `<img_ratio_*>` greedy argmax
landed on the FIRST conditioning image's bucket regardless of
prompt semantics. Single-image and dup-bucket cases worked because
there was no second region to be asymmetric against.
Switches `_get_prompt_updates` to
`PromptUpdateDetails.select_token_ids([<img>, <joint_img_sep>])` so
the embed mask now spans VAE+sep+ViT as one True run per image, and
inserts the `<joint_img_sep>` wte tensor in `embed_multimodal`'s
per-image stack between VAE and ViT — numerically identical to what
`model.embed_input_ids` would have produced for that token, so
single-image semantics don't change.
Verified end-to-end on 47.79.124.13 (4× L20X, AR=TP2 + DiT=TP2):
case | image_1 | image_2 | AR ratio
--------------------------------|---------|---------|---------
multi (1_0+1_1, prompt → img2) | 16 | 36 | 36 ✓
multi swap (1_1+1_0) | 36 | 16 | 36 ✓
single 1_1 (regression) | -- | -- | 36 ✓
single 1_0 (regression) | -- | -- | 16 ✓
multi dup wide | 36 | 36 | 36 ✓
Pre-fix behavior on the same setup had AR landing on the first
conditioning image's bucket regardless of prompt, output collapsing
to a square instead of image_2's wide aspect.
Signed-off-by: zuiho <2324465096@qq.com>
---
.../models/hunyuan_image3/hunyuan_image3.py | 45 ++++++++++++++-----
1 file changed, 34 insertions(+), 11 deletions(-)
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index ab9c2ee4d6e..08d25e9c896 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -1132,11 +1132,16 @@ def get_replacement_image(item_idx: int) -> PromptUpdateDetails:
# Use the real token id (HF parity). The trained wte
# at this slot is overwritten with timestep_emb(0) at runtime by
- # `embed_input_ids` — same effect as HF's
- # `instantiate_continuous_tokens` scatter-replace. Keeping the
- # slot as
would have folded the timestep position into the
- # multimodal bidirectional region, which empirically biased
- # multi-image AR ratio prediction to the first image's bucket.
+ # `embed_input_ids`.
+ #
+ # Mark
*VAE + +
*ViT as one contiguous
+ # embed run so vLLM's prefix-LM mask treats it as a single
+ # bidirectional region, mirroring official `joint_image_slices`
+ # full-attention range (image_processor.py:388, with
+ # cond_token_attn_type effectively spanning VAE+sep+ViT). With the
+ # default `select_token_id(
)` mask, sep splits the run into
+ # two regions; that asymmetry is what biased multi-image AR
+ # ratio prediction to the first image's bucket.
replacement = (
[boi_token_id]
+ [base_size_token_id]
@@ -1148,7 +1153,10 @@ def get_replacement_image(item_idx: int) -> PromptUpdateDetails:
+ [eoi_token_id]
)
logger.debug(f"actual replacement token count: {timestep_token_num + vae_token_num + vit_token_num}")
- return PromptUpdateDetails.select_token_id(replacement, embed_token_id=img_token_id)
+ return PromptUpdateDetails.select_token_ids(
+ replacement,
+ embed_token_ids=[img_token_id, joint_img_sep_token_id],
+ )
return [
PromptReplacement(modality="image", target=[img_token_id], replacement=get_replacement_image),
@@ -1869,10 +1877,25 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
"Each image should have both VAE and ViT embeddings."
)
- # Order per image: VAE tokens -> ViT tokens. The slot at
- # the head of each per-image scaffold is NOT included here — its
- # embedding is patched in by `embed_input_ids` via a token-id mask,
- # mirroring HF's `instantiate_continuous_tokens` scatter-replace.
+ # Order per image: VAE tokens -> wte -> ViT tokens.
+ # The wte is included so it joins the bidirectional
+ # MM region (matching the official `joint_image_slices` full-attn
+ # range that spans VAE+sep+ViT). The merger replaces the sep slot
+ # with this wte tensor, which is numerically identical to what
+ # `model.embed_input_ids` would produce — no semantic change for
+ # single-image, but with multi-image the sep position now sits
+ # inside the bidirectional region (matching how the model was
+ # trained).
+ sep_token_id = self._mrope_joint_img_sep_token_id
+ sep_input_ids = torch.tensor(
+ [sep_token_id], device=vit_embeddings.device, dtype=torch.long
+ )
+ sep_embed = self.model.embed_input_ids(sep_input_ids).to(vit_embeddings.dtype)
+
+ # The slot at the head of each per-image scaffold is NOT
+ # included here — its embedding is patched in by `embed_input_ids`
+ # via a token-id mask, mirroring HF's `instantiate_continuous_tokens`
+ # scatter-replace.
combined_embeddings: list[torch.Tensor] = []
num_images = len(vae_token_embeddings)
for img_idx in range(num_images):
@@ -1880,7 +1903,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
if vae_token_embed.ndim == 3:
vae_token_embed = vae_token_embed.squeeze(0)
vit_embed = vit_embeddings[img_idx]
- stacked_embed = torch.cat([vae_token_embed, vit_embed], dim=0)
+ stacked_embed = torch.cat([vae_token_embed, sep_embed, vit_embed], dim=0)
combined_embeddings.append(stacked_embed)
return combined_embeddings
From b7c968bd5547d6188ecc3f21d76903080369c695 Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Mon, 11 May 2026 02:40:48 +0800
Subject: [PATCH 14/43] fix(hunyuan_image3): pass extra resolutions to DiT-side
reso_group
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
`HunyuanImage3ImageProcessor.__init__` (DiT-side image processor in
`hunyuan_image3_transformer.py`) constructed `ResolutionGroup` without
the `HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS` extras, so it only knew the 33
step-based buckets (idx 0-32). When the AR predicted `<img_ratio_36>`
and the bridge resolved it to (h=720, w=1280), the DiT pipeline's
`get_target_size` re-bucketed those dims to the closest 33-bucket
ratio (idx 12 = 1280×768) and the final output PNG came out at
1280×768 instead of 1280×720.
Threads the same `extra_resolutions` constant the AR-side processor
(commit b3f91f3d) already uses, so the DiT side recognizes idx 33-36
as valid buckets and respects the AR's predicted dims end-to-end.
Verified output PIL.size now matches AR's predicted bucket: multi-image
prediction `<img_ratio_36>` → (h=720, w=1280) → output (1280, 720).
Signed-off-by: zuiho <2324465096@qq.com>
---
.../models/hunyuan_image3/hunyuan_image3_transformer.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py
index 5a707acbda5..4edcfb6ca3a 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py
@@ -1369,7 +1369,10 @@ class HunyuanImage3ImageProcessor:
def __init__(self, config):
self.config = config
- self.reso_group = ResolutionGroup(base_size=config.image_base_size)
+ self.reso_group = ResolutionGroup(
+ base_size=config.image_base_size,
+ extra_resolutions=[Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS],
+ )
self.vae_processor = transforms.Compose(
[
transforms.ToTensor(),
From 3b73eabe9f0b3b087785012ceaecb3fce093e35f Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Mon, 11 May 2026 08:46:54 +0800
Subject: [PATCH 15/43] fix(hunyuan_image3 ar2diffusion): truncate AR cot_text
at </recaption> / </think>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The bridge was forwarding the full AR `generated_text` (including the
`<answer><boi><img_size_*><img_ratio_*>` stage-transition tail) into
`extra.ar_generated_text` for DiT's prompt builder. The tail's
purpose is purely to drive the AR's greedy ratio prediction inside
`_apply_ratio_restriction` — the size/ratio info is already routed to
DiT via `height` / `width` (translated from `ratio_idx`), so the tail
has no remaining job downstream and just contaminates cot_text with
an extra `<boi>` + size + ratio that DiT's prompt builder isn't
expecting.
Mirrors official upstream `HunyuanImage3ForCausalMM.generate_image`
(modeling_hunyuan_image_3.py:3343-3354), which decodes only
`generated_tokens[0, :end_pos + 1]` where `end_pos` is the position
of `</recaption>` (think_recaption / recaption bot_task) or
`</think>` (think-only bot_task).
Adds `_truncate_at_cot_end()` that finds the first cot-end marker in
the generated text, truncates both the text and the token-id stream
at that position (token side uses `</recaption>` / `</think>` token
ids from the tokenizer, cached via `_build_cot_end_token_ids`), and
returns them for downstream consumption.
`ratio_idx` extraction in `_extract_ratio_index` still runs on the
FULL output before truncation, since the ratio token lives in the
trailing segment that we're about to drop.
Addresses PR #3444 review comment from @Bounty-hunter.
Signed-off-by: zuiho <2324465096@qq.com>
---
.../stage_input_processors/hunyuan_image3.py | 80 ++++++++++++++++++-
1 file changed, 77 insertions(+), 3 deletions(-)
diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index 63af2f7f1dd..158ea86dbf2 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -58,6 +58,67 @@ def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]:
return [(int(r.height), int(r.width)) for r in reso_group.data]
+@lru_cache(maxsize=4)
+def _build_cot_end_token_ids(model_name_or_path: str) -> dict[str, int]:
+ """Return `{'': id, '': id}` for cot-boundary
+ truncation. Empty dict on lookup failure so callers degrade to a
+ pure text-based search.
+ """
+ try:
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+ except Exception as e: # pragma: no cover - environment-dependent
+ logger.warning("[ar2diffusion] failed to load tokenizer for cot-end lookup: %s", e)
+ return {}
+
+ result: dict[str, int] = {}
+ for marker in ("", ""):
+ tid = tokenizer.convert_tokens_to_ids(marker)
+ if tid is not None and tid != tokenizer.unk_token_id:
+ result[marker] = int(tid)
+ return result
+
+
+def _truncate_at_cot_end(
+ generated_text: str,
+ generated_token_ids,
+ model_name_or_path: str,
+) -> tuple[str, list[int]]:
+ """Truncate AR output at first `` (or `` fallback).
+
+ Mirrors `HunyuanImage3ForCausalMM.generate_image` in the official
+ upstream, which decodes only `generated_tokens[0, :end_pos + 1]` as
+ `cot_text` for DiT. The trailing ``
+ sequence is a stage-transition trigger consumed via `image_size` /
+ height/width — it must NOT be forwarded to DiT's prompt builder, or
+ the extra `` and ratio tokens drift the DiT's own prompt
+ structure.
+ """
+ token_list = list(generated_token_ids) if generated_token_ids is not None else []
+
+ end_ids = _build_cot_end_token_ids(model_name_or_path)
+
+ for marker in ("", ""):
+ idx = generated_text.find(marker)
+ if idx == -1:
+ continue
+ text_end = idx + len(marker)
+ truncated_text = generated_text[:text_end]
+
+ truncated_tokens = token_list
+ end_id = end_ids.get(marker)
+ if end_id is not None and token_list:
+ try:
+ token_end = token_list.index(end_id)
+ truncated_tokens = token_list[: token_end + 1]
+ except ValueError:
+ pass
+ return truncated_text, truncated_tokens
+
+ return generated_text, token_list
+
+
@lru_cache(maxsize=4)
def _build_ratio_id_lookup(model_name_or_path: str) -> dict[int, int]:
"""Return `{token_id: ratio_index}` for `` in the tokenizer.
@@ -206,17 +267,30 @@ def ar2diffusion(
width,
)
+ # Truncate the AR output at `` (or ``) before
+ # passing to DiT. Mirrors official `generate_image` which keeps
+ # `cot_text` clean and routes size/ratio via `image_size` only —
+ # we already extracted `ratio_idx` above and translated it into
+ # `height` / `width`, so the ``
+ # tail has no remaining job and would only contaminate DiT's
+ # prompt builder if forwarded.
+ cot_text_for_dit, cot_token_ids_for_dit = _truncate_at_cot_end(
+ generated_text, generated_token_ids, model_name_or_path
+ )
+
logger.info(
- "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, target size=%dx%d (%s)",
+ "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, "
+ "cot_text length=%d, target size=%dx%d (%s)",
i,
len(generated_token_ids),
len(generated_text),
+ len(cot_text_for_dit),
height,
width,
f"AR ratio_idx={ratio_idx}" if ar_predicted else "from prompt (no AR ratio token)",
)
- token_tensor = torch.tensor(generated_token_ids, dtype=torch.long)
+ token_tensor = torch.tensor(cot_token_ids_for_dit, dtype=torch.long)
diffusion_input: dict[str, Any] = {
"prompt": text_prompt,
@@ -224,7 +298,7 @@ def ar2diffusion(
"width": width,
"extra": {
"ar_token_ids": token_tensor,
- "ar_generated_text": generated_text,
+ "ar_generated_text": cot_text_for_dit,
},
}
From 284783940116757be0f23fc80e0402ad74789a62 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Mon, 11 May 2026 11:37:11 +0800
Subject: [PATCH 16/43] chore(hunyuan_image3): apply ruff format
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../models/hunyuan_image3/hunyuan_image3.py | 12 +++---------
1 file changed, 3 insertions(+), 9 deletions(-)
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 08d25e9c896..756a7a27c9b 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -830,9 +830,7 @@ def __init__(self, tokenizer, hf_config, **kwargs: object):
self.reso_group = self.ResolutionGroup(
base_size=hf_config.image_base_size,
- extra_resolutions=[
- HunyuanImage3Processor.Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS
- ],
+ extra_resolutions=[HunyuanImage3Processor.Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS],
)
self.vision_encoder_processor = Siglip2ImageProcessorFast.from_dict(hf_config.vit_processor)
self.vae_processor = transforms.Compose(
@@ -1887,9 +1885,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
# inside the bidirectional region (matching how the model was
# trained).
sep_token_id = self._mrope_joint_img_sep_token_id
- sep_input_ids = torch.tensor(
- [sep_token_id], device=vit_embeddings.device, dtype=torch.long
- )
+ sep_input_ids = torch.tensor([sep_token_id], device=vit_embeddings.device, dtype=torch.long)
sep_embed = self.model.embed_input_ids(sep_input_ids).to(vit_embeddings.dtype)
# The slot at the head of each per-image scaffold is NOT
@@ -1927,9 +1923,7 @@ def embed_input_ids(
timestep_mask = input_ids == self._timestep_token_id
n_timestep = int(timestep_mask.sum().item())
if n_timestep > 0:
- timestep_input = torch.zeros(
- (n_timestep,), device=inputs_embeds.device, dtype=inputs_embeds.dtype
- )
+ timestep_input = torch.zeros((n_timestep,), device=inputs_embeds.device, dtype=inputs_embeds.dtype)
inputs_embeds[timestep_mask] = self._timestep_encode(timestep_input)
if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
From 3b4f885cf2a2d84275691c7961fb93290c27fa13 Mon Sep 17 00:00:00 2001
From: TaffyOfficial
Date: Mon, 11 May 2026 13:08:24 +0800
Subject: [PATCH 17/43] fix(hunyuan_image3): online IT2I multi-image and AR
bucket override
Two related bugs in the online /v1/images/edits path prevented this PR's
multi-image IT2I from working end-to-end and silently suppressed the AR
ratio decision for AR-driven pipelines:
1. serving_chat._build_multistage_generation_inputs invoked build_prompt
without num_images, defaulting to 1. N reference images then only got
a single <img> placeholder in the AR prompt; vLLM's _process_multimodal
raised AssertionError("Failed to apply prompt replacement for
mm_items[image][1]") on the second image.
2. edit_images resolved size=auto to the first input image's dimensions
and forwarded them through extra_body to
chat_handler.generate_diffusion_images, which then built a fresh
gen_params with those dimensions. Multi-stage AR-driven pipelines
(e.g. HunyuanImage-3.0) rely on ar2diffusion to override the final
bucket from the AR ratio token; DiT's pre_process_func only does that
when sampling_params.width is None (see pipeline_hunyuan_image3.py:290).
The forwarded input-image size suppressed the AR decision, producing
the wrong bucket (e.g. 1024x1024 square instead of the AR-decided
1280x720 landscape for multi-image fusion).
The fix mirrors the offline end2end.py img2img path which never sets
sampling_params.height/width for img2img. Single-stage diffusion
(_generate_with_async_omni path) still pins gen_params.width/height
from input image size for backward compat.
End-to-end smoke (4x L20X, HunyuanImage-3.0-Instruct, 2 ref images via
curl /v1/images/edits with size=auto, same prompt as offline):
- before fix 1: HTTP 500, AssertionError on mm_items[image][1]
- before fix 2: HTTP 200 but 1024x1024 square (wrong bucket)
- after both: HTTP 200, 1280x720 landscape -- AR ratio_idx=36 honored,
matches offline end2end.py for the same inputs
Tests:
- tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py::
test_build_multistage_generation_inputs_multi_image_emits_n_img_placeholders
Pins build_prompt(num_images=N) for N=1,2,3 reference images.
- tests/entrypoints/openai_api/test_image_server.py::
test_image_edits_size_auto_preserves_bridge_size
Pins diffusion sampling_params.height/width staying None through the
/v1/images/edits API on the multi-stage path, with multi-image
placeholder cross-check.
- test_image_edit_parameter_default updated to assert the new contract
(None on multi-stage); test_image_edit_parameter_default_single_stage
unchanged.
Signed-off-by: TaffyOfficial
---
.../openai_api/test_image_server.py | 68 ++++++++++++++++++-
...test_serving_chat_multistage_generation.py | 44 ++++++++++++
vllm_omni/entrypoints/openai/api_server.py | 18 +++--
vllm_omni/entrypoints/openai/serving_chat.py | 6 +-
4 files changed, 126 insertions(+), 10 deletions(-)
diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py
index b5ff891f8f6..fb9c126d3fe 100644
--- a/tests/entrypoints/openai_api/test_image_server.py
+++ b/tests/entrypoints/openai_api/test_image_server.py
@@ -1349,8 +1349,16 @@ def test_image_edit_parameter_default(async_omni_test_client):
engine = async_omni_test_client.app.state.engine_client
captured_sampling_params = engine.captured_sampling_params_list[-1]
- assert captured_sampling_params.width == 24
- assert captured_sampling_params.height == 16
+ # size="auto" on multi-stage pipelines deliberately leaves the diffusion
+ # stages sampling_params width/height unset so AR-driven pipelines (e.g.
+ # HunyuanImage-3.0) can let ar2diffusion override the final bucket from
+ # the AR-predicted ratio token; see
+ # test_image_edits_size_auto_preserves_bridge_size for the contract.
+ # Single-stage diffusion (test_image_edit_parameter_default_single_stage)
+ # still pins width/height to the input image size via api_servers
+ # gen_params, which is unchanged.
+ assert captured_sampling_params.width is None
+ assert captured_sampling_params.height is None
assert captured_sampling_params.num_outputs_per_prompt == 1
assert captured_sampling_params.num_inference_steps == 4
assert captured_sampling_params.guidance_scale == 7.5
@@ -1649,3 +1657,59 @@ def __init__(self):
assert len(images) == 1
assert isinstance(images[0], Image.Image)
assert images[0].size == (32, 32)
+
+
+def test_image_edits_size_auto_preserves_bridge_size(async_omni_stage_configs_only_client):
+ """size=auto must NOT pin the diffusion stage sampling_params.height/width.
+
+ Regression: prior to the fix, edit_images resolved size=auto to the
+ first input image dimensions and forwarded them through gen_params +
+ extra_body to the diffusion stages sampling_params. AR-driven
+ pipelines (e.g. HunyuanImage-3.0) rely on ar2diffusions
+ bridge to override the final bucket via the AR-predicted ratio token,
+ and the DiT pre_process_func only fills sampling_params from the
+ bridge value when sampling_params.width is None (see
+ pipeline_hunyuan_image3.py:290). Non-None width from the input image
+ silently suppressed the AR decision, producing the wrong bucket
+ (e.g. 1024x1024 square instead of the AR-decided 1280x720 landscape
+ for multi-image fusion).
+
+ Cross-pins the multi-image fix at the API level: 2 reference images
+ with bot_task=it2i must produce 2
placeholders in the captured
+ AR prompt (build_prompt called with num_images=2).
+ """
+ img_a = make_test_image_bytes((32, 32))
+ img_b = make_test_image_bytes((128, 64))
+ response = async_omni_stage_configs_only_client.post(
+ "/v1/images/edits",
+ files=[("image", img_a), ("image", img_b)],
+ data={
+ "prompt": "fuse",
+ "size": "auto",
+ "bot_task": "it2i",
+ },
+ )
+ assert response.status_code == 200, response.text
+
+ engine = async_omni_stage_configs_only_client.app.state.engine_client
+ captured = engine.captured_sampling_params_list
+ assert captured is not None
+ assert len(captured) == 2
+
+ diffusion_params = captured[1]
+ assert diffusion_params.height is None, (
+ f"size=auto leaked into diffusion sampling_params.height={diffusion_params.height}; "
+ "must stay None so AR-driven pipelines can apply the bridges decision."
+ )
+ assert diffusion_params.width is None, (
+ f"size=auto leaked into diffusion sampling_params.width={diffusion_params.width}; "
+ "must stay None so AR-driven pipelines can apply the bridges decision."
+ )
+
+ KEY = "prompt"
+ IMG = "
"
+ captured_prompt = engine.captured_prompt
+ if isinstance(captured_prompt, dict) and isinstance(captured_prompt.get("prompt"), str):
+ assert captured_prompt["prompt"].count("
") == 2, (
+ f"N=2 reference images must emit 2
placeholders in AR prompt; got {captured_prompt[KEY].count(IMG)} -- prompt: {captured_prompt[KEY]!r}"
+ )
diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index 144a0e97a6c..618c2573078 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -91,3 +91,47 @@ def test_build_multistage_generation_inputs_applies_stage_specific_overrides(ser
assert engine.default_sampling_params_list[1].lora_request is None
assert engine.default_sampling_params_list[2].resolution == 640
assert engine.default_sampling_params_list[2].lora_request is None
+
+
+def test_build_multistage_generation_inputs_multi_image_emits_n_img_placeholders(serving_chat):
+ """N reference images with bot_task set must emit N
placeholders.
+
+ Regression: prior to the multi-image online fix, build_prompt was
+ called without num_images, defaulting to 1. A 2-image edit request
+ would only get a single
placeholder in the AR prompt; vLLMs
+ _process_multimodal then raised
+ AssertionError(Failed to apply prompt replacement for mm_items[image][1])
+ when trying to replace the second image (no placeholder left for it).
+
+ Pins the contract that build_prompt() is invoked with the actual image
+ count so multi-image IT2I is wired correctly through the online
+ /v1/images/edits path.
+ """
+ from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+ engine = SimpleNamespace(
+ stage_configs=[
+ SimpleNamespace(stage_type="llm", is_comprehension=True),
+ SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+ ],
+ default_sampling_params_list=[
+ SamplingParams(temperature=0.0),
+ OmniDiffusionSamplingParams(),
+ ],
+ )
+ IMG = "
"
+ images = [Image.new("RGB", (32, 32), color="red") for _ in range(3)]
+
+ for n in (1, 2, 3):
+ engine_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="edit me",
+ extra_body={"bot_task": "it2i"},
+ reference_images=images[:n],
+ gen_params=OmniDiffusionSamplingParams(),
+ )
+ prompt_str = engine_prompt["prompt"]
+ assert prompt_str.count("
") == n, (
+ f"N={n}: expected {n}
placeholders, got {prompt_str.count(IMG)} -- prompt: {prompt_str!r}"
+ )
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 06fb0a7f4cb..4227cff2fb6 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1811,7 +1811,8 @@ async def edit_images(
# 3.3 Parse and add size if provided
width, height = None, None
- if size.lower() == "auto":
+ size_was_auto = size.lower() == "auto"
+ if size_was_auto:
if resolution is None:
# No resolution specified, use input image size
width, height = pil_images[0].size
@@ -1882,10 +1883,17 @@ async def edit_images(
"seed": effective_seed,
"num_outputs_per_prompt": n,
}
- if width is not None:
- extra_body["width"] = width
- if height is not None:
- extra_body["height"] = height
+ # When size="auto", width/height were resolved from the first
+ # input images size (e.g. 512x512 logo), NOT a client-requested
+ # output dimension. Forwarding them to extra_body would override
+ # AR-driven pipelines (e.g. HunyuanImage-3.0) AR ``
+ # token decision via gen_params -> sampling_params. Skip the
+ # forward when auto, matching offline end2end.py img2img.
+ if not size_was_auto:
+ if width is not None:
+ extra_body["width"] = width
+ if height is not None:
+ extra_body["height"] = height
if negative_prompt is not None:
extra_body["negative_prompt"] = negative_prompt
if num_inference_steps is not None:
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index a5ca494c89e..022b5d2e95d 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2265,16 +2265,16 @@ def _build_multistage_generation_inputs(
build_prompt_tokens,
)
+ num_images = len(reference_images) if reference_images else 1
prompt_token_ids: list[int] | None = None
system_prompt_type: str | None = None
if tokenizer is not None:
- result = build_prompt_tokens(prompt, tokenizer, task=bot_task)
+ result = build_prompt_tokens(prompt, tokenizer, task=bot_task, num_images=num_images)
prompt_token_ids = result.token_ids
system_prompt_type = result.system_prompt_type
else:
- prompt = build_prompt(prompt, task=bot_task)
+ prompt = build_prompt(prompt, task=bot_task, num_images=num_images)
engine_prompt["prompt"] = prompt
-
if reference_images and len(reference_images) == 1:
engine_prompt_data = {"image": reference_images[0]}
modalities = ["image"]
From ca830c851b63b9d2deea3d08b53b8315b4a4b5b4 Mon Sep 17 00:00:00 2001
From: TaffyOfficial
Date: Mon, 11 May 2026 15:26:07 +0800
Subject: [PATCH 18/43] fix(hunyuan_image3): online IT2I HF byte-equivalent
prompt path
Follow-up to 815ac732 (online IT2I multi-image + size=auto). Online
still passed the prompt as a string and let the engine BPE-tokenize
the full chat template at once, while offline end2end.py img2img
feeds prompt_token_ids built segment-by-segment via build_prompt_tokens
(mirrors HF apply_chat_template). The two paths produced different
AR input token sequences for the same user inputs:
- offline (build_prompt_tokens): AR 661 tokens / 1118 chars cot
- online (build_prompt string): AR 706 tokens / 1190 chars cot
The mismatch silently pushed AR's input off its training distribution
(cross-segment BPE merges, e.g. "。\n\n" -> single id, vs HF's
[1811, 271]). AR produced different cot_text and DiT produced a visually
different image even with the same seed/prompt/reference images.
This patch threads the comprehension stage's tokenizer through
generate_diffusion_images -> _build_multistage_generation_inputs.
When a tokenizer is available (multi-stage AR-driven path), the helper:
1. Calls build_prompt_tokens(prompt, tokenizer, task=bot_task,
num_images=N) and writes engine_prompt[prompt_token_ids];
engine_prompt[prompt] stays as the raw user text so ar2diffusion
can hand it through to DiT.
2. Sets engine_prompt[use_system_prompt] = resolve_sys_type(think)
-> en_unified, matching offline end2end.py img2img which always
forwards an explicit use_system_prompt.
Falls back to the original build_prompt string path when no tokenizer
is plumbed (legacy callers / unit tests), so existing flows still work.
E2E smoke (4x L20X, HunyuanImage-3.0-Instruct, 2 ref images, curl
/v1/images/edits with size=auto, seed=42, steps=50, guidance=5.0):
- before: AR 706 / 1190, brushed-metal yin-yang (BPE merges diverged)
- after: AR 660 / 1148, canvas background restored (1 token / 30 char
delta vs offline 661 / 1118 is within sampling noise; same en_unified
sys prompt + trigger on both sides).
Tests:
- test_build_multistage_generation_inputs_tokenizer_path_emits_prompt_token_ids
pins:
(a) engine_prompt[prompt_token_ids] set when tokenizer is passed,
(b) engine_prompt[prompt] preserved as raw user text,
(c) engine_prompt[use_system_prompt] == en_unified,
(d) N
token ids in prompt_token_ids for N=1,2,3.
Follow-ups (separate patches):
- Public API surface for task / bot_task separation (online callers
currently pass bot_task in extra_body but the value semantically
means task; needed to express think_recaption / recaption / vanilla).
- HF byte-for-byte parity assertion across offline and online once the
API split lands.
Signed-off-by: TaffyOfficial
---
...test_serving_chat_multistage_generation.py | 81 +++++++++++++++++++
vllm_omni/entrypoints/openai/serving_chat.py | 37 ++++++---
2 files changed, 108 insertions(+), 10 deletions(-)
diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index 618c2573078..b0871732f6a 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -135,3 +135,84 @@ def test_build_multistage_generation_inputs_multi_image_emits_n_img_placeholders
assert prompt_str.count("
") == n, (
f"N={n}: expected {n}
placeholders, got {prompt_str.count(IMG)} -- prompt: {prompt_str!r}"
)
+
+
+def test_build_multistage_generation_inputs_tokenizer_path_emits_prompt_token_ids(serving_chat):
+ """When a tokenizer is provided, the helper must emit HF byte-for-byte
+ prompt_token_ids and forward use_system_prompt to the engine prompt.
+
+ Regression: prior to the HF-byte-equivalent fix, online IT2I always
+ passed the prompt as a single string. The engine then BPE-merged across
+ chat-template segment boundaries (e.g. user_prompt-ending punctuation
+ plus the trailing \n\n before \"Assistant: \") producing a token
+ sequence that differs from HF apply_chat_template / offline
+ end2end.py. AR generated different cot_text (706 tokens / 1190 chars
+ vs offline 661 / 1118 for the same inputs) and DiT produced a visually
+ different image (yin-yang on brushed-metal vs three-blue swirl on
+ canvas) under the same seed.
+
+ Pins:
+ 1. engine_prompt[\"prompt_token_ids\"] is set when tokenizer is passed.
+ 2. engine_prompt[\"prompt\"] stays as the raw user prompt -- the DiT
+ side rebuilds its own system prefix via use_system_prompt.
+ 3. engine_prompt[\"use_system_prompt\"] == \"en_unified\" so
+ ar2diffusion forwards the matching system prompt to DiT.
+ 4. N reference images emit N
token ids in the AR sequence.
+ """
+ from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+ # Minimal FakeTokenizer mirroring tests/diffusion/.../test_hunyuan_image3_it2i_multi_image.py
+ class FakeTokenizer:
+ SPECIAL = {
+ "<|startoftext|>": 1,
+ "
": 2,
+ "": 3,
+ "": 4,
+ }
+
+ def convert_tokens_to_ids(self, tok: str) -> int:
+ return self.SPECIAL.get(tok, 0)
+
+ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
+ return list(range(100, 100 + len(text)))
+
+ engine = SimpleNamespace(
+ stage_configs=[
+ SimpleNamespace(stage_type="llm", is_comprehension=True),
+ SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+ ],
+ default_sampling_params_list=[
+ SamplingParams(temperature=0.0),
+ OmniDiffusionSamplingParams(),
+ ],
+ )
+ PROMPT_KEY = "prompt"
+ USP_KEY = "use_system_prompt"
+ images = [Image.new("RGB", (32, 32), color="red") for _ in range(3)]
+
+ for n in (1, 2, 3):
+ tok = FakeTokenizer()
+ engine_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="edit me",
+ extra_body={"bot_task": "it2i"},
+ reference_images=images[:n],
+ gen_params=OmniDiffusionSamplingParams(),
+ tokenizer=tok,
+ )
+ # (1) prompt_token_ids must be set and non-empty
+ assert "prompt_token_ids" in engine_prompt, f"N={n}: prompt_token_ids missing"
+ token_ids = engine_prompt["prompt_token_ids"]
+ assert isinstance(token_ids, list) and len(token_ids) > 0, f"N={n}: prompt_token_ids empty"
+ # (2) raw prompt preserved (DiT bridge needs raw user text)
+ assert engine_prompt["prompt"] == "edit me", (
+ f"N={n}: prompt must stay raw user text, got {engine_prompt[PROMPT_KEY]!r}"
+ )
+ # (3) use_system_prompt forwarded for ar2diffusion bridge
+ assert engine_prompt.get("use_system_prompt") == "en_unified", (
+ f"N={n}: use_system_prompt must be en_unified, got {engine_prompt.get(USP_KEY)!r}"
+ )
+ # (4) N
token ids (id=2 in FakeTokenizer)
+ img_count = token_ids.count(2)
+ assert img_count == n, f"N={n}: expected {n}
token ids in prompt_token_ids, got {img_count}"
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 022b5d2e95d..2738f648e09 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2258,7 +2258,8 @@ def _build_multistage_generation_inputs(
else:
engine_prompt_data = {"image": reference_images}
- engine_prompt: OmniTextPrompt = {"prompt": prompt}
+ prompt_token_ids: list[int] | None = None
+ system_prompt_type: str | None = None
if bot_task:
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
build_prompt,
@@ -2266,23 +2267,35 @@ def _build_multistage_generation_inputs(
)
num_images = len(reference_images) if reference_images else 1
- prompt_token_ids: list[int] | None = None
- system_prompt_type: str | None = None
if tokenizer is not None:
- result = build_prompt_tokens(prompt, tokenizer, task=bot_task, num_images=num_images)
+ # HF byte-for-byte path: feed segment-tokenized prompt_token_ids
+ # so AR sees the same template-tokenization HF apply_chat_template
+ # produces. Without this, the engine BPE-merges across template
+ # segment boundaries (e.g. "。\n\n" -> single id) and AR
+ # diverges from training distribution -- different cot_text,
+ # different DiT input, different final image. Mirrors offline
+ # examples/.../end2end.py img2img which always feeds
+ # prompt_token_ids. See prompt_utils.build_prompt NOTE.
+ result = build_prompt_tokens(
+ prompt,
+ tokenizer,
+ task=bot_task,
+ num_images=num_images,
+ )
prompt_token_ids = result.token_ids
system_prompt_type = result.system_prompt_type
else:
+ # Legacy string path (e.g. unit tests with no tokenizer plumbed).
prompt = build_prompt(prompt, task=bot_task, num_images=num_images)
- engine_prompt["prompt"] = prompt
if reference_images and len(reference_images) == 1:
engine_prompt_data = {"image": reference_images[0]}
modalities = ["image"]
- if prompt_token_ids is not None:
- engine_prompt["prompt_token_ids"] = prompt_token_ids
- if system_prompt_type is not None:
- engine_prompt["use_system_prompt"] = system_prompt_type
+ engine_prompt: OmniTextPrompt = {"prompt": prompt}
+ if prompt_token_ids is not None:
+ engine_prompt["prompt_token_ids"] = prompt_token_ids
+ if system_prompt_type is not None:
+ engine_prompt["use_system_prompt"] = system_prompt_type
engine_prompt["modalities"] = modalities
if negative_prompt is not None:
engine_prompt["negative_prompt"] = negative_prompt
@@ -2456,13 +2469,17 @@ async def generate_diffusion_images(
diffusion_engine = cast(AsyncOmni, engine)
stage_configs = getattr(diffusion_engine, "stage_configs", None) or []
if len(stage_configs) > 1:
+ # Pull tokenizer from the comprehension (AR) stage so we can
+ # build HF byte-for-byte prompt_token_ids in the helper. If
+ # the engine doesn"t expose one, fall back to the legacy
+ # string-prompt path (engine re-tokenizes).
tokenizer = None
get_tok = getattr(diffusion_engine, "get_tokenizer", None)
if get_tok is not None:
try:
tokenizer = await get_tok()
except Exception as exc:
- logger.warning("get_tokenizer failed: %s", exc)
+ logger.warning("get_tokenizer failed; falling back to string prompt path: %s", exc)
engine_prompt, sampling_params_list = self._build_multistage_generation_inputs(
engine=diffusion_engine,
prompt=prompt,
From c2ea079927380256fc5424cf513d25d721577f6b Mon Sep 17 00:00:00 2001
From: TaffyOfficial
Date: Mon, 11 May 2026 16:03:25 +0800
Subject: [PATCH 19/43] fix(hunyuan_image3): align DiT tokenization with
AR-sampled token IDs
Follow-up to 94830bdd (HF byte-equivalent prompt on AR side). DiT side
was still re-encoding the AR-decoded cot text via tokenizer.encode,
which is not lossless when AR-sampled tokens decode to text whose BPE
re-merges differ from AR's original token sequence -- e.g. Chinese
punctuation, escaped quotes, and multi-byte UTF-8 boundaries silently
shift the token count by N for the same content.
For KV-reuse-enabled requests this is fatal: AR caches K/V at AR-tok
positions (length L_ar), but DiT computes positive_reuse_len from
think_recaption_end_pos in its OWN tokenizer_output (length L_dit !=
L_ar). inject_ar_kv_into_layers then silently slices k[:positive_reuse_len]
from a shorter tensor (Python slice tolerates out-of-bounds) and
_cache_prompt_kv's assert q_len + ar_kv_len == seq_len fires with
ar_kv_len = L_ar while seq_len was computed with positive_reuse_len = L_dit.
User-observed: q_len(4105) + ar_kv_len(6740) != seq_len(10854), off by 9
on a Chinese-heavy IT2I prompt.
For non-KV-reuse requests the same drift exists but is silently
absorbed: AR sees its training-distribution tokens, DiT sees a
different prefix, output image quality subtly diverges (the
3-magnet vs 1-magnet pattern in the earlier P0 e2e smoke).
ar2diffusion bridge already forwards extra.ar_token_ids alongside
extra.ar_generated_text since the multi-image PR landed -- this patch
just teaches DiT to consume it.
Surgery points:
1. hunyuan_image3_tokenizer.py: get_cot_sections_from_token_ids
Mirror of get_cot_sections but splits at / marker
token IDs in AR-sampled space instead of text-split. Emits sections
carrying pre-tokenized tokens=[...] which encode_text already
consumes verbatim (line 152-154: if isinstance(text, str): encode;
else: use as-is).
2. hunyuan_image3_tokenizer.py: apply_chat_template adds optional
batch_cot_token_ids: list[Any] | None param. When provided per
batch item, the assistant message is built with context_type=token_ids
(vs str). Backward compatible: callers passing only batch_cot_text
keep working.
3. hunyuan_image3_tokenizer.py: process_successive_message handles
context_type==token_ids for assistant role -- splits on marker IDs
when both + or + tokens are
present, otherwise wraps the full ID sequence as a single text
section with tokens=... .
4. pipeline_hunyuan_image3.py: forward() extracts extra.ar_token_ids
alongside extra.ar_generated_text from each prompt and threads
cot_token_ids through prepare_model_inputs ->
apply_chat_template.batch_cot_token_ids. Prefer ID path when
available; fall back to text path otherwise (back-compat for
non-AR-driven flows that don't set ar_token_ids).
E2E smoke (4x L20X, HunyuanImage-3.0-Instruct, two ref images, curl
/v1/images/edits, size=auto, seed=42, steps=50, guidance=5.0,
non-KV-reuse stage configs): HTTP 200, 1280x720 PNG, AR 641 tokens /
1107 chars cot. No regression in existing flows (149 unit tests pass).
KV-reuse e2e validation in this run was blocked by an orthogonal
environment issue (gpu_memory_utilization=0.95 in user yaml + post-load
FusedMoeRunner workspace allocation overshoots) rather than a code
defect; the byte-aligned ar_token_ids path is what the assertion
requires, verified via unit tests.
Tests:
- tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py
* test_get_cot_sections_from_token_ids_round_trips_ar_ids
pins lossless splitting at AR-tok / markers (no
re-encode).
* test_apply_chat_template_batch_cot_token_ids_preserves_ar_ids
pins end-to-end contract that apply_chat_template emits the
AR-sampled ID sequence verbatim in the final encoded output.
Signed-off-by: TaffyOfficial
---
.../hunyuan_image3/test_kvreuse_alignment.py | 135 ++++++++++++++++++
.../hunyuan_image3_tokenizer.py | 131 +++++++++++++++--
.../hunyuan_image3/pipeline_hunyuan_image3.py | 15 ++
3 files changed, 272 insertions(+), 9 deletions(-)
create mode 100644 tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py
diff --git a/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py b/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py
new file mode 100644
index 00000000000..20faf5487dc
--- /dev/null
+++ b/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py
@@ -0,0 +1,135 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Regression tests for AR-token-IDs preservation through DiT prompt building.
+
+Pins the KV-reuse alignment contract: when the AR-side stage input
+processor (`ar2diffusion`) forwards `ar_token_ids` to the diffusion
+stage, `apply_chat_template` must consume those IDs verbatim (no
+re-encode of the decoded cot text via `tokenizer.encode`) so that the
+DiT-side prompt tokenization matches AR's actually-sampled token
+sequence byte-for-byte.
+
+Why this matters: tokenize-detokenize-tokenize over the cot text is not
+lossless (BPE re-merges on multi-byte UTF-8 / punctuation boundaries),
+and the resulting length drift breaks AR KV position alignment --
+DiT's `positive_reuse_len` (computed from `tokenizer.encode(cot_text)`)
+ends up larger than the actual cached AR KV length, and
+`inject_ar_kv_into_layers` then silently truncates via Python slice,
+leaving `_cache_prompt_kv`'s `q_len + ar_kv_len == seq_len` assert off
+by N (hard 500 on KV-reuse-enabled requests; see
+`pipeline_hunyuan_image3.py:_cache_prompt_kv`).
+"""
+
+from __future__ import annotations
+
+import os
+
+import pytest
+
+pytestmark = [pytest.mark.core_model]
+
+
+def _hf_cached(model_id: str) -> bool:
+ hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
+ snap_dir = os.path.join(hf_home, "hub", f"models--{model_id.replace('/', '--')}", "snapshots")
+ return os.path.isdir(snap_dir) and any(os.scandir(snap_dir))
+
+
+_HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct"
+
+
+@pytest.mark.skipif(
+ not _hf_cached(_HUNYUAN_MODEL_ID),
+ reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache",
+)
+def test_get_cot_sections_from_token_ids_round_trips_ar_ids():
+ """`get_cot_sections_from_token_ids` must split AR-sampled IDs at the
+ `` / `` token-id positions and emit sections whose
+ concatenated tokens equal the input (no re-encode).
+
+ Catches the failure mode where DiT re-encodes the decoded cot text
+ and the BPE merges differ from AR's sampled tokens (length drift).
+ """
+ from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer import (
+ TokenizerWrapper,
+ )
+
+ tkw = TokenizerWrapper(_HUNYUAN_MODEL_ID)
+
+ think_id = tkw.tokenizer.convert_tokens_to_ids("")
+ end_think_id = tkw.end_think_token_id
+
+ # Fabricate an AR-style id sequence: arbitrary "thought" payload tokens
+ # surrounded by / markers, plus some leading + trailing
+ # tokens (e.g. / tail that gets truncated upstream).
+ thought_payload = [1000, 1001, 1002, 1003, 1004]
+ leading = [2000, 2001]
+ trailing = [3000]
+ ar_token_ids = leading + [think_id] + thought_payload + [end_think_id] + trailing
+
+ sections = tkw.get_cot_sections_from_token_ids(
+ ar_token_ids,
+ uncond_kwargs={},
+ drop_think=False,
+ )
+
+ # Sections concatenated must equal the input verbatim.
+ out: list[int] = []
+ for sec in sections:
+ assert sec["type"] == "text", f"unexpected section type: {sec}"
+ toks = sec.get("tokens")
+ assert toks is not None, f"section missing 'tokens' field: {sec}"
+ out.extend(toks)
+ assert out == ar_token_ids, (
+ f"split-by-token-id must be lossless; got {len(out)} ids vs {len(ar_token_ids)} input; "
+ f"diff at first mismatch index = {next((i for i, (a, b) in enumerate(zip(out, ar_token_ids)) if a != b), None)}"
+ )
+
+
+@pytest.mark.skipif(
+ not _hf_cached(_HUNYUAN_MODEL_ID),
+ reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache",
+)
+def test_apply_chat_template_batch_cot_token_ids_preserves_ar_ids():
+ """When `batch_cot_token_ids` is passed, the assistant section in the
+ final encoded token sequence must contain the AR-sampled token ids
+ verbatim -- no `tokenizer.encode(cot_text)` round-trip.
+
+ Pins the end-to-end contract that KV-reuse alignment relies on.
+ """
+ from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer import (
+ TokenizerWrapper,
+ )
+
+ tkw = TokenizerWrapper(_HUNYUAN_MODEL_ID)
+ think_id = tkw.tokenizer.convert_tokens_to_ids("")
+ end_think_id = tkw.end_think_token_id
+
+ # Construct a synthetic AR cot id sequence. Use mid-range vocab ids
+ # that are very unlikely to collide with any chat-template specials.
+ payload = [55001, 55002, 55003]
+ ar_token_ids = [think_id] + payload + [end_think_id]
+
+ out_with_ids = tkw.apply_chat_template(
+ batch_prompt=["draw a robot"],
+ batch_system_prompt=[None],
+ batch_cot_token_ids=[ar_token_ids],
+ mode="gen_text",
+ sequence_template="instruct",
+ )
+ tokens_with_ids = out_with_ids["output"].tokens.tolist()[0] # batched output: take batch 0
+
+ # The exact AR payload must appear as a contiguous subsequence in the
+ # encoded output, sandwiched by the think markers we forwarded.
+ def _find_subseq(haystack: list[int], needle: list[int]) -> int:
+ n = len(needle)
+ for i in range(len(haystack) - n + 1):
+ if haystack[i : i + n] == needle:
+ return i
+ return -1
+
+ full_cot = [think_id] + payload + [end_think_id]
+ idx = _find_subseq(tokens_with_ids, full_cot)
+ assert idx >= 0, (
+ f"AR cot ids {full_cot} not found as contiguous subseq in encoded output; "
+ f"means apply_chat_template did NOT respect batch_cot_token_ids and re-encoded cot text instead"
+ )
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
index 751bfb21af8..e6e0c9db346 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
@@ -903,6 +903,75 @@ def get_cot_sections(self, cot_text, uncond_kwargs, cot_max_length=None, drop_th
dict(type="text", text=cot_text, **uncond_kwargs),
]
+ def get_cot_sections_from_token_ids(
+ self,
+ token_ids,
+ uncond_kwargs,
+ cot_max_length=None,
+ drop_think=False,
+ ):
+ """Split AR-sampled token IDs at think/recaption markers without re-encoding.
+
+ Functional mirror of `get_cot_sections` but operates on AR sampled IDs.
+ Used by KV-reuse-aware callers: tokenize-detokenize-tokenize over the AR
+ cot text is not lossless (BPE re-merges across multi-byte UTF-8 and
+ punctuation boundaries). The resulting length drift breaks AR KV
+ position alignment (`positive_reuse_len` computed in DiT-tok space vs
+ the actual cached AR KV in AR-tok space, off by N tokens for prompts
+ containing Chinese + escaped quotes etc.).
+ """
+ if not token_ids:
+ return []
+ ids = list(token_ids)
+
+ think_id = self.tokenizer.convert_tokens_to_ids("")
+ end_think_id = self.end_think_token_id
+ recaption_id = self.tokenizer.convert_tokens_to_ids("")
+ end_recaption_id = self.end_recaption_token_id
+
+ def _split_at_pair(seq, start_id, end_id):
+ if start_id is None or end_id is None:
+ return None
+ try:
+ s = seq.index(start_id)
+ e = seq.index(end_id, s + 1)
+ except ValueError:
+ return None
+ return seq[:s], seq[s + 1 : e], seq[e + 1 :]
+
+ # Try ... first to mirror text-side split order.
+ split = _split_at_pair(ids, think_id, end_think_id)
+ if split is not None:
+ before, inside, after = split
+ return (
+ self.get_cot_sections_from_token_ids(before, uncond_kwargs, drop_think=drop_think)
+ + (
+ [
+ dict(type="text", tokens=[think_id]),
+ dict(type="text", tokens=inside, max_length=cot_max_length, **uncond_kwargs),
+ dict(type="text", tokens=[end_think_id]),
+ ]
+ if not drop_think
+ else []
+ )
+ + self.get_cot_sections_from_token_ids(after, uncond_kwargs, drop_think=drop_think)
+ )
+
+ split = _split_at_pair(ids, recaption_id, end_recaption_id)
+ if split is not None:
+ before, inside, after = split
+ return (
+ self.get_cot_sections_from_token_ids(before, uncond_kwargs, drop_think=drop_think)
+ + [
+ dict(type="text", tokens=[recaption_id]),
+ dict(type="text", tokens=inside, max_length=cot_max_length, **uncond_kwargs),
+ dict(type="text", tokens=[end_recaption_id]),
+ ]
+ + self.get_cot_sections_from_token_ids(after, uncond_kwargs, drop_think=drop_think)
+ )
+
+ return [dict(type="text", tokens=ids, **uncond_kwargs)]
+
def apply_general_template(
self,
message_list,
@@ -953,17 +1022,36 @@ def process_successive_message(
while _cur_message_idx < len(message_list) and _message_list[_cur_message_idx]["role"] == role:
message = _message_list[_cur_message_idx]
if message["type"] == "text":
- text = message["content"]
+ content = message["content"]
+ ctx_type = message.get("context_type", "str")
if role == "system":
- _sub_sections.append(dict(type="text", text=text))
+ _sub_sections.append(dict(type="text", text=content))
elif role == "assistant":
- if ("" in text and "" in text) or (
- "" in text and "" in text
- ):
- _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think))
+ if ctx_type == "token_ids":
+ # Pre-tokenized AR cot tokens; split on marker ids, no re-encode.
+ if hasattr(content, "tolist"):
+ content = content.tolist()
+ think_id = self.tokenizer.convert_tokens_to_ids("")
+ recaption_id = self.tokenizer.convert_tokens_to_ids("")
+ has_cot = (think_id in content and self.end_think_token_id in content) or (
+ recaption_id in content and self.end_recaption_token_id in content
+ )
+ if has_cot:
+ _sub_sections.extend(
+ self.get_cot_sections_from_token_ids(content, uncond_kwargs, drop_think=drop_think)
+ )
+ else:
+ _sub_sections.append(dict(type="text", tokens=content, **uncond_kwargs))
else:
- _sub_sections.append(dict(type="text", text=text, **uncond_kwargs))
+ text = content
+ if ("" in text and "" in text) or (
+ "" in text and "" in text
+ ):
+ _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think))
+ else:
+ _sub_sections.append(dict(type="text", text=text, **uncond_kwargs))
else:
+ text = content
_sub_sections.append(
dict(type="text", text=f"{answer_prefix}{text}{answer_suffix}", **uncond_kwargs)
)
@@ -1088,6 +1176,7 @@ def apply_chat_template(
batch_cond_image_info: list[JointImageInfo] | list[list[JointImageInfo]] | None = None,
batch_system_prompt: list[str] | None = None,
batch_cot_text: list[str] | None = None,
+ batch_cot_token_ids: list | None = None,
max_length: int | None = None,
bot_task: str = "auto", # auto/image/think/recaption/img_ratio
image_base_size: int = 1024,
@@ -1116,6 +1205,14 @@ def apply_chat_template(
)
else:
batch_cot_text = [None] * batch_size
+ # Optional per-item pre-tokenized AR cot ids (used by KV-reuse).
+ if batch_cot_token_ids is not None:
+ assert len(batch_cot_token_ids) == batch_size, (
+ f"batch_cot_token_ids should have the same length as batch_size ({batch_size}), "
+ f"but got {len(batch_cot_token_ids)}."
+ )
+ else:
+ batch_cot_token_ids = [None] * batch_size
if batch_cond_image_info is not None:
assert len(batch_cond_image_info) == batch_size, (
f"batch_cond_image_info should have the same length as batch_size ({batch_size}), "
@@ -1130,10 +1227,18 @@ def apply_chat_template(
# Convert single round materials into standard message list
batch_message_list = []
- for prompt, system_prompt, cot_text, gen_image_info, cond_image_info_list in zip(
+ for (
+ prompt,
+ system_prompt,
+ cot_text,
+ cot_token_ids,
+ gen_image_info,
+ cond_image_info_list,
+ ) in zip(
batch_prompt,
batch_system_prompt,
batch_cot_text,
+ batch_cot_token_ids,
batch_gen_image_info,
batch_cond_image_info,
):
@@ -1153,7 +1258,15 @@ def apply_chat_template(
# 2.2 text inputs
message_list.append(dict(role="user", type="text", content=prompt, context_type="str"))
# 3. assistant answer sections
- if cot_text is not None:
+ if cot_token_ids is not None:
+ # Use AR-sampled token IDs verbatim. Avoids the
+ # tokenize-detokenize-tokenize length drift that breaks KV reuse
+ # (see process_successive_message context_type="token_ids" branch
+ # and get_cot_sections_from_token_ids docstring).
+ message_list.append(
+ dict(role="assistant", type="text", content=cot_token_ids, context_type="token_ids")
+ )
+ elif cot_text is not None:
message_list.append(dict(role="assistant", type="text", content=cot_text, context_type="str"))
if mode == "gen_image":
message_list.append(
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index b1ba2687f86..5c6ddba0b64 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -750,6 +750,7 @@ def prepare_model_inputs(
mode="gen_image",
system_prompt=None,
cot_text=None,
+ cot_token_ids=None,
num_inference_steps=50,
guidance_scale=5.0,
image_size="auto",
@@ -766,6 +767,7 @@ def prepare_model_inputs(
batch_message_list = message_list
batch_prompt = prompt
batch_cot_text = cot_text
+ batch_cot_token_ids = cot_token_ids
batch_system_prompt = system_prompt
batch_gen_image_info = None
batch_cond_image_info = kwargs.pop("batch_cond_image_info", None)
@@ -844,6 +846,7 @@ def prepare_model_inputs(
batch_cond_image_info=batch_cond_image_info,
batch_system_prompt=batch_system_prompt,
batch_cot_text=batch_cot_text,
+ batch_cot_token_ids=batch_cot_token_ids,
max_length=kwargs.get("max_length"),
bot_task=bot_task,
image_base_size=self.config.image_base_size,
@@ -1376,12 +1379,23 @@ def forward(
# and ``get_cot_sections()`` can parse the think/recaption structure
# directly.
cot_text_list = []
+ cot_token_ids_list = []
for p in req.prompts:
extra = p.get("extra", {}) if isinstance(p, dict) else {}
cot_text_list.append(extra.get("ar_generated_text") or None)
+ cot_token_ids_list.append(extra.get("ar_token_ids"))
cot_text = (
[self._normalize_cot_text(t) for t in cot_text_list] if any(t is not None for t in cot_text_list) else None
)
+ # Prefer AR-sampled token IDs over the decoded cot text so DiTs prompt
+ # tokenization matches ARs actual token sequence byte-for-byte. Required
+ # when KV reuse is enabled: positive_reuse_len computed from DiT-side
+ # tokenization must equal the AR-side KV cache length, otherwise the
+ # silent slice in inject_ar_kv_into_layers leaves _cache_prompt_kvs
+ # `q_len + ar_kv_len == seq_len` assert off by N (BPE re-merge drift on
+ # multi-byte/punctuation boundaries; see get_cot_sections_from_token_ids
+ # in hunyuan_image3_tokenizer.py).
+ cot_token_ids = cot_token_ids_list if any(t is not None for t in cot_token_ids_list) else None
batch_cond_image_info: list[list[JointImageInfo]] | None = None
if any(not isinstance(p, str) for p in req.prompts):
@@ -1422,6 +1436,7 @@ def forward(
model_inputs = self.prepare_model_inputs(
prompt=prompt,
cot_text=cot_text,
+ cot_token_ids=cot_token_ids,
system_prompt=system_prompt,
mode="gen_image",
generator=generator,
From 1454f441ecf76152bcb67629f6fcb446ad9aa3f4 Mon Sep 17 00:00:00 2001
From: TaffyOfficial
Date: Mon, 11 May 2026 16:17:04 +0800
Subject: [PATCH 20/43] fix(hunyuan_image3): split task / bot_task / sys_type
at /v1/images/edits
Before P1, /v1/images/edits exposed a single bot_task Form field that
was misused: callers passed a task enum value (i2t / it2i / t2i / t2t)
under that name, and _build_multistage_generation_inputs forwarded it
as task to build_prompt with bot_task defaulted to "think". This
blocked clients from expressing:
- the bot_task semantic (think / recaption / think_recaption / vanilla)
- sys_type override (offline )
Both knobs are needed to drive the online OpenAI API 1:1 against the
offline examples/.../end2end.py img2img surface.
Changes:
1. api_server.py: edit_images Form params add task: str | None and
sys_type: str | None. Legacy bot_task= is auto
promoted to task=, bot_task=None so old clients keep working.
2. api_server.py: forward all three keys (task / bot_task / sys_type)
to extra_body instead of writing a single misleading bot_task key.
3. serving_chat.py:_build_multistage_generation_inputs reads the
triple, applies the same legacy normalization (defends against
direct chat_handler callers passing the pre-P1 shape), and threads
bot_task + sys_type through build_prompt_tokens / build_prompt.
use_system_prompt forwarded to ar2diffusion now respects the
override.
Tests (new):
- test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged
Legacy extra_body={"bot_task": "it2i"} produces a prompt byte
identical to extra_body={"task": "it2i"} (back-compat).
- test_build_multistage_generation_inputs_bot_task_semantic_changes_trigger_and_sys
bot_task=think vs bot_task=think_recaption produce different
rendered prompts (system body differs); pins that bot_task is
actually plumbed through rather than collapsed to think default.
- test_build_multistage_generation_inputs_sys_type_override
sys_type=en_unified over bot_task=think_recaption reproduces the
same prompt body as bot_task=think (offline override pattern).
Follow-up (not in this patch):
- Mirror task / bot_task / sys_type on /v1/images/generations JSON
schema (ImageGenerationRequest) for consistency across endpoints.
Signed-off-by: TaffyOfficial
---
...test_serving_chat_multistage_generation.py | 195 ++++++++++++++++++
vllm_omni/entrypoints/openai/api_server.py | 29 ++-
vllm_omni/entrypoints/openai/serving_chat.py | 38 +++-
3 files changed, 257 insertions(+), 5 deletions(-)
diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index b0871732f6a..88d15a684b6 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -216,3 +216,198 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
# (4) N
token ids (id=2 in FakeTokenizer)
img_count = token_ids.count(2)
assert img_count == n, f"N={n}: expected {n}
token ids in prompt_token_ids, got {img_count}"
+
+
+def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(serving_chat):
+ """Legacy callers passed a task-enum value (i2t/it2i/t2i/t2t) under
+ `bot_task` in extra_body. After the P1 task/bot_task split, the helper
+ must still treat that legacy form as `task=, bot_task=None`
+ (i.e. defaults bot_task semantic to "think"), so the resulting prompt
+ is identical to the pre-P1 output.
+
+ Pins the back-compat contract.
+ """
+ from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+ engine = SimpleNamespace(
+ stage_configs=[
+ SimpleNamespace(stage_type="llm", is_comprehension=True),
+ SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+ ],
+ default_sampling_params_list=[
+ SamplingParams(temperature=0.0),
+ OmniDiffusionSamplingParams(),
+ ],
+ )
+ images = [Image.new("RGB", (32, 32), color="red"), Image.new("RGB", (32, 32), color="blue")]
+
+ # Legacy form: only bot_task=.
+ legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="edit me",
+ extra_body={"bot_task": "it2i"},
+ reference_images=images,
+ gen_params=OmniDiffusionSamplingParams(),
+ )
+ # New form: explicit task=, no bot_task.
+ new_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="edit me",
+ extra_body={"task": "it2i"},
+ reference_images=images,
+ gen_params=OmniDiffusionSamplingParams(),
+ )
+ assert legacy_prompt["prompt"] == new_prompt["prompt"], (
+ f"legacy bot_task= form must produce the same prompt as task=; "
+ f"legacy={legacy_prompt['prompt']!r} new={new_prompt['prompt']!r}"
+ )
+
+
+def test_build_multistage_generation_inputs_bot_task_semantic_changes_trigger_and_sys(serving_chat):
+ """Passing bot_task=think_recaption (vs default "think") must flip the
+ resolved sys_type to en_think_recaption (and trigger tag is still
+ ). Pins that the API actually plumbs the bot_task semantic
+ through to build_prompt rather than ignoring it.
+ """
+ from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+ engine = SimpleNamespace(
+ stage_configs=[
+ SimpleNamespace(stage_type="llm", is_comprehension=True),
+ SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+ ],
+ default_sampling_params_list=[
+ SamplingParams(temperature=0.0),
+ OmniDiffusionSamplingParams(),
+ ],
+ )
+ images = [Image.new("RGB", (32, 32), color="red")]
+
+ # Default bot_task (think) -> en_unified system prompt baked into the
+ # legacy string path. Use legacy build_prompt (tokenizer=None) so the
+ # rendered prompt is a string we can grep.
+ think_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="edit me",
+ extra_body={"task": "it2i", "bot_task": "think"},
+ reference_images=images,
+ gen_params=OmniDiffusionSamplingParams(),
+ )
+ # think_recaption -> en_think_recaption system prompt (different content).
+ recap_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="edit me",
+ extra_body={"task": "it2i", "bot_task": "think_recaption"},
+ reference_images=images,
+ gen_params=OmniDiffusionSamplingParams(),
+ )
+ assert think_prompt["prompt"] != recap_prompt["prompt"], (
+ "bot_task semantic must change the rendered system prompt: "
+ f"think/think_recaption produced identical strings (len={len(think_prompt['prompt'])})"
+ )
+
+
+def test_build_multistage_generation_inputs_sys_type_override(serving_chat):
+ """Caller-supplied sys_type must override the bot_task-derived default.
+ Mirrors offline `--bot-task think_recaption --sys-type en_unified`
+ where the user wants think_recaptions trigger but the unified system
+ prompt body.
+ """
+ from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+ engine = SimpleNamespace(
+ stage_configs=[
+ SimpleNamespace(stage_type="llm", is_comprehension=True),
+ SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+ ],
+ default_sampling_params_list=[
+ SamplingParams(temperature=0.0),
+ OmniDiffusionSamplingParams(),
+ ],
+ )
+ images = [Image.new("RGB", (32, 32), color="red")]
+
+ # think_recaption defaults sys_type -> en_think_recaption.
+ default_sys, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="edit me",
+ extra_body={"task": "it2i", "bot_task": "think_recaption"},
+ reference_images=images,
+ gen_params=OmniDiffusionSamplingParams(),
+ )
+ # sys_type=en_unified overrides -> same system body as bot_task=think.
+ overridden, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="edit me",
+ extra_body={"task": "it2i", "bot_task": "think_recaption", "sys_type": "en_unified"},
+ reference_images=images,
+ gen_params=OmniDiffusionSamplingParams(),
+ )
+ plain_think, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="edit me",
+ extra_body={"task": "it2i", "bot_task": "think"},
+ reference_images=images,
+ gen_params=OmniDiffusionSamplingParams(),
+ )
+
+ # Override must (a) differ from the no-override default, and (b) equal
+ # the prompt that bot_task=think produces (both end up with
+ # en_unified system body + trigger).
+ assert overridden["prompt"] != default_sys["prompt"], (
+ "sys_type override must change the rendered prompt body vs the bot_task default"
+ )
+ assert overridden["prompt"] == plain_think["prompt"], (
+ "sys_type=en_unified + bot_task=think_recaption must produce the same prompt as "
+ "bot_task=think (both = en_unified system body + trigger)"
+ )
+
+
+def test_build_multistage_generation_inputs_custom_system_prompt(serving_chat):
+ """`extra_body["system_prompt"]` must reach build_prompt as
+ `custom_system_prompt`, enabling sys_type="custom" callers to inject
+ a verbatim system body. Without this plumbing the sys_type="custom"
+ branch in get_system_prompt() returns None and silently drops the
+ user-supplied content.
+ """
+ from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+ engine = SimpleNamespace(
+ stage_configs=[
+ SimpleNamespace(stage_type="llm", is_comprehension=True),
+ SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+ ],
+ default_sampling_params_list=[
+ SamplingParams(temperature=0.0),
+ OmniDiffusionSamplingParams(),
+ ],
+ )
+ images = [Image.new("RGB", (32, 32), color="red")]
+
+ QKEY = "prompt"
+ marker = "ZZZ_CUSTOM_SYSTEM_PROMPT_MARKER_ZZZ"
+
+ out, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="edit me",
+ extra_body={
+ "task": "it2i",
+ "bot_task": "think",
+ "sys_type": "custom",
+ "system_prompt": marker,
+ },
+ reference_images=images,
+ gen_params=OmniDiffusionSamplingParams(),
+ )
+ assert marker in out["prompt"], (
+ f"custom system_prompt content must reach the rendered prompt; "
+ f"marker {marker!r} not found in prompt of length {len(out['prompt'])}"
+ )
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 4227cff2fb6..77dc026bc97 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1701,6 +1701,14 @@ async def edit_images(
layers: int | None = Form(None),
resolution: int | None = Form(None), # See SUPPORTED_LAYERED_RESOLUTIONS
bot_task: str | None = Form(None),
+ # P1: task / sys_type / system_prompt split out from the legacy bot_task
+ # field so callers can express the full HunyuanImage-3.0 prompt template
+ # surface (task enum + bot_task semantic + sys_type override + custom
+ # system prompt body). Legacy callers that pass a task-enum value via
+ # bot_task still work (see normalization below).
+ task: str | None = Form(None),
+ sys_type: str | None = Form(None),
+ system_prompt: str | None = Form(None),
) -> ImageGenerationResponse:
"""
OpenAI-compatible image edit endpoint.
@@ -1913,8 +1921,25 @@ async def edit_images(
lora_dict = _get_lora_from_json_str(lora)
_parse_lora_request(lora_dict)
extra_body["lora"] = lora_dict
- if bot_task is not None:
- extra_body["bot_task"] = bot_task
+ # P1: normalize legacy `bot_task=` form. Callers historically
+ # passed the task enum (i2t / it2i / t2i / t2t) via the `bot_task`
+ # Form field; promote it to `task` here so the chat_handler can
+ # split task vs bot_task semantics cleanly. New callers pass both
+ # `task` and `bot_task` explicitly; we keep them separate.
+ _task = task
+ _bot_task = bot_task
+ _legacy_task_enum = {"t2t", "i2t", "it2i", "t2i"}
+ if _task is None and _bot_task in _legacy_task_enum:
+ _task = _bot_task
+ _bot_task = None
+ if _task is not None:
+ extra_body["task"] = _task
+ if _bot_task is not None:
+ extra_body["bot_task"] = _bot_task
+ if sys_type is not None:
+ extra_body["sys_type"] = sys_type
+ if system_prompt is not None:
+ extra_body["system_prompt"] = system_prompt
prompt_text = prompt.get("prompt", "")
generation_result = await chat_handler.generate_diffusion_images(
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 2738f648e09..d1b2e89ae80 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2247,7 +2247,22 @@ def _build_multistage_generation_inputs(
lora_body = extra_body.get("lora")
layers = extra_body.get("layers")
resolution = extra_body.get("resolution")
+ # P1: task / bot_task / sys_type / system_prompt quadruple. Legacy
+ # api_server callers may still pass a task-enum value (i2t / it2i /
+ # t2i / t2t) under `bot_task`; normalize it to `task` here so
+ # downstream uses the canonical split. Source the task enum from
+ # prompt_utils so this layer stays in sync with the model side.
+ from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+ available_tasks as _hunyuan3_available_tasks,
+ )
+
+ task = extra_body.get("task")
bot_task = extra_body.get("bot_task")
+ sys_type = extra_body.get("sys_type")
+ custom_system_prompt = extra_body.get("system_prompt")
+ if task is None and bot_task in set(_hunyuan3_available_tasks()):
+ task = bot_task
+ bot_task = None
engine_prompt_data: dict[str, Any] | None = None
modalities = ["image"]
@@ -2260,13 +2275,20 @@ def _build_multistage_generation_inputs(
prompt_token_ids: list[int] | None = None
system_prompt_type: str | None = None
- if bot_task:
+ if task or bot_task:
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
build_prompt,
build_prompt_tokens,
)
num_images = len(reference_images) if reference_images else 1
+ # build_prompt defaults task="it2i"; preserve that when caller
+ # only passed bot_task semantic.
+ effective_task = task if task is not None else "it2i"
+ # build_prompt defaults bot_task="think"; preserve that for legacy
+ # callers (passing bot_task=None to build_prompt explicitly gives a
+ # different (sys, trigger) than the default "think").
+ effective_bot_task = bot_task if bot_task is not None else "think"
if tokenizer is not None:
# HF byte-for-byte path: feed segment-tokenized prompt_token_ids
# so AR sees the same template-tokenization HF apply_chat_template
@@ -2279,14 +2301,24 @@ def _build_multistage_generation_inputs(
result = build_prompt_tokens(
prompt,
tokenizer,
- task=bot_task,
+ task=effective_task,
+ bot_task=effective_bot_task,
+ sys_type=sys_type,
+ custom_system_prompt=custom_system_prompt,
num_images=num_images,
)
prompt_token_ids = result.token_ids
system_prompt_type = result.system_prompt_type
else:
# Legacy string path (e.g. unit tests with no tokenizer plumbed).
- prompt = build_prompt(prompt, task=bot_task, num_images=num_images)
+ prompt = build_prompt(
+ prompt,
+ task=effective_task,
+ bot_task=effective_bot_task,
+ sys_type=sys_type,
+ custom_system_prompt=custom_system_prompt,
+ num_images=num_images,
+ )
if reference_images and len(reference_images) == 1:
engine_prompt_data = {"image": reference_images[0]}
modalities = ["image"]
From 99c5eec085b42c05de937a8e7c117155c7c0234c Mon Sep 17 00:00:00 2001
From: TaffyOfficial
Date: Tue, 12 May 2026 10:51:44 +0800
Subject: [PATCH 21/43] fix(hunyuan_image3): align online edit AR input with
offline path
Two complementary fixes that close the gap where online /v1/images/edits
systematically produced different AR cot (e.g. "3 magnets" semantic) from
offline end2end.py ("1 magnet" semantic) on the same prompt + seed +
images, even after the P0 byte-equivalent prompt_token_ids and P1
task/bot_task/sys_type API split landed.
1. RGB normalization in _load_input_images (root cause for the
systematic semantic divergence)
input_1_0.png in the demo set is RGBA with 57,671 fully-transparent
pixels. Offline `end2end.py` opens images with
`Image.open(...).convert("RGB")`, which drops the alpha channel, so the
fully-transparent pixels keep their stored RGB value (BLACK for this
image). Online had no such normalization; the Hunyuan AR image
processor receives the raw RGBA upload and alpha-composites over
WHITE. The two paths therefore fed AR two different RGB tensors at
the encoder boundary -- enough to make AR recaption diverge into
different scene interpretations even with byte-identical
prompt_token_ids.
Fix: `_load_input_images(... normalize_rgb=True)` defaults to RGB
normalization. `edit_images` opts in only when the caller passes
Hunyuan-aware prompt controls (task / bot_task / sys_type); mask
stays untouched so its alpha role is preserved. Diagnosis by
Codex; thanks.
2. Determinize cond-image VAE encode
Both AR-side `_vae_encode` (model_executor) and DiT-side cond VAE
encoding (pipeline_hunyuan_image3) called
`latent_dist.sample()` with no generator, consuming torch's global
RNG state. Fresh-process callers (offline) hit a stable post-init
RNG state every invocation so this looked deterministic; long-running
servers (online) mix per-request scheduler/UUID/etc into the global
RNG before this call, so same-seed curls got drifting cond latents
across requests. Cond image at this site is declared `t=0` clean
conditioning -- no stochasticity needed.
Fix: pass a fresh `torch.Generator(device=...).manual_seed(0)` at
both call sites. Cond latents now deterministic across runs and
across paths.
Why `.sample(seeded_gen)` instead of `.mode()`: AR-side
DiagonalGaussianDistribution has `.mode()`, but the DiT-side
counterpart in diffusion/.../autoencoder.py does not implement it.
The seeded `.sample()` works on both sides and matches HF upstream's
`latent_dist.sample(generator)` signature -- a strict improvement
over HF default (HF defaults the generator to None and inherits the
same silent non-determinism).
Related memory: `memory/feedback/painterly_silent_bugs.md` flagged
the same bug class once before; this is the cond-image-encode
incarnation.
E2E smoke (4x L20X, HunyuanImage-3.0-Instruct, two ref images, curl
/v1/images/edits with task=it2i bot_task=think_recaption
sys_type=en_unified seed=42 steps=50 guidance=5.0):
- before either fix: "3 magnets on canvas" (offline produces 1)
- after cond VAE fix only: "3 magnets on canvas" (within-run drift
reduced from 73-token to 10-token spread
but cross-path semantic still wrong)
- after both fixes: "1 magnet on canvas" -- in the same
semantic neighborhood as the offline
baseline
Tests: 153 unit tests pass, ruff clean. Surgical API-level regression
tests for the two fixes deferred (would require GPU fixtures for the
cond VAE side; the RGB side is small enough that the e2e proof is the
contract).
Signed-off-by: TaffyOfficial
---
.../hunyuan_image3/pipeline_hunyuan_image3.py | 6 +++-
vllm_omni/entrypoints/openai/api_server.py | 28 ++++++++++++++++---
.../models/hunyuan_image3/hunyuan_image3.py | 12 +++++++-
3 files changed, 40 insertions(+), 6 deletions(-)
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 5c6ddba0b64..e927f278340 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -634,7 +634,11 @@ def vae_encode(self, image, cfg_factor=1):
if isinstance(vae_encode_result, torch.Tensor):
latents = vae_encode_result
else:
- latents = vae_encode_result.latent_dist.sample()
+ # Fixed-seed Generator so cond latents are deterministic
+ # across calls; see AR-side comment in
+ # model_executor/.../hunyuan_image3.py:_vae_encode.
+ _cond_vae_gen = torch.Generator(device=image.device).manual_seed(0)
+ latents = vae_encode_result.latent_dist.sample(_cond_vae_gen)
if hasattr(config, "shift_factor") and config.shift_factor:
latents.sub_(config.shift_factor)
if hasattr(config, "scaling_factor") and config.scaling_factor:
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 77dc026bc97..b485b6a3946 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1759,16 +1759,23 @@ async def edit_images(
status_code=HTTPStatus.BAD_REQUEST.value,
detail=detail,
)
- pil_images = await _load_input_images(input_images_list)
+ # Only convert uploads to RGB when the caller opts into the
+ # Hunyuan-aware API surface (task / bot_task / sys_type). Legacy
+ # callers that send only the older bot_task= shape keep
+ # whatever PIL mode the upload arrived as, to preserve pre-existing
+ # behavior for non-Hunyuan flows.
+ normalize_edit_images_rgb = task is not None or bot_task is not None or sys_type is not None
+ pil_images = await _load_input_images(input_images_list, normalize_rgb=normalize_edit_images_rgb)
prompt["multi_modal_data"] = {}
prompt["multi_modal_data"]["image"] = pil_images
if mask_image is not None:
- loaded = await _load_input_images([mask_image])
+ # Mask role is different (alpha channel matters); never normalize.
+ loaded = await _load_input_images([mask_image], normalize_rgb=False)
prompt["multi_modal_data"]["mask_image"] = loaded[0]
if reference_image is not None:
- loaded = await _load_input_images([reference_image])
+ loaded = await _load_input_images([reference_image], normalize_rgb=normalize_edit_images_rgb)
prompt["multi_modal_data"]["reference_image"] = loaded[0]
# 3 Build sample params
@@ -2220,6 +2227,8 @@ def _extract_images_from_result(result: Any) -> list[Any]:
async def _load_input_images(
inputs: list[str],
+ *,
+ normalize_rgb: bool = True,
) -> list[Image.Image]:
"""
convert to PIL.Image.Image list
@@ -2266,7 +2275,18 @@ async def _load_input_images(
if not images:
raise ValueError("No valid input images found")
- return images
+ if not normalize_rgb:
+ return images
+
+ # Match the offline HunyuanImage3 image-edit example path, which eagerly
+ # normalizes input files with ``Image.open(...).convert("RGB")`` before
+ # they reach the AR stage. Keeping uploads as RGBA/P PIL objects makes
+ # online IT2I observe a different visual input than offline (for example
+ # transparent-logo PNGs alpha-composited over white instead of black),
+ # which is enough for HunyuanImage3 AR recaption to diverge before DiT
+ # sees the request -- root cause of the "online 3 magnets vs offline 1
+ # magnet" systematic semantic mismatch.
+ return [img.convert("RGB") for img in images]
def _choose_output_format(output_format: str | None, background: str | None) -> str:
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 756a7a27c9b..216543b9593 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -1776,7 +1776,17 @@ def _vae_encode(
images = images.to(dtype=self.vae.dtype)
vae_encode_result = self.vae.encode(images)
- latents = vae_encode_result.latent_dist.sample()
+ # Cond image encoding is supposed to be deterministic clean
+ # conditioning (the comment below declares `t=0`). `.sample()`
+ # without a generator consumes torch's global RNG, which made
+ # cond latents drift between requests on a long-running server
+ # (online) while looking deterministic for fresh-process callers
+ # (offline) -- silent path-level non-determinism. Feed a fixed
+ # generator so all callers see identical cond latents.
+ import torch as _torch # local alias to keep blast radius minimal
+
+ _cond_vae_gen = _torch.Generator(device=images.device).manual_seed(0)
+ latents = vae_encode_result.latent_dist.sample(_cond_vae_gen)
# Apply shift and scaling factors if present
if hasattr(config, "shift_factor") and config.shift_factor:
From 4d8c600391d2178cb1ad8aa446b34c7cc6b7a51f Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Tue, 12 May 2026 11:46:17 +0800
Subject: [PATCH 22/43] fix(hunyuan_image3): address PR #3444 review feedback
Apply two rounds of code review fixes on the multi-image IT2I PR:
Cond VAE determinism
Replace `latent_dist.sample()` + `manual_seed(0)` hardcoding with
`latent_dist.mode()` on both AR (`model_executor/.../hunyuan_image3.py
::_vae_encode`) and DiT (`diffusion/.../pipeline_hunyuan_image3.py`)
sides. Cond image is clean (t=0) conditioning by design; posterior mean
is deterministic by construction and matches the official cond encode
path. Adds `.mode()` to the DiT-side `DiagonalGaussianDistribution`.
Stale compound task names (two-axis API migration)
Repo-wide grep for `{t2t,i2t,it2i,t2i}x{think,recaption,think_recaption,
vanilla}` cross-product turned up residual compound names in two test
files that the initial cleanup missed:
- tests/e2e/accuracy/test_hunyuan_image3.py: task='it2i_recaption'
-> task='it2i', bot_task='recaption' (would have ValueErrored at
_resolve_preset on the new two-axis API).
- tests/diffusion/.../test_prompt_utils.py: task='t2i_think' /
task='t2i_recaption' -> (task='t2i', bot_task='think|recaption').
Custom system prompt body forwarding (producer -> consumer trace)
Online `/v1/images/edits` accepted `sys_type='custom'` + `system_prompt`
body on the AR side via `build_prompt_tokens(custom_system_prompt=...)`,
but only forwarded `use_system_prompt` to the engine_prompt. DiT's
`get_system_prompt(use, "image", body)` reads the body as the third
positional arg, so `sys_type='custom'` was silently falling back to an
empty DiT system prefix -- AR/DiT divergence under a user-visible knob.
Forward `system_prompt` through both `serving_chat` engine_prompt and
`stage_input_processors/hunyuan_image3.py::ar2diffusion` -> DiT
`diffusion_input`.
Ratio extraction simplification
Drop the regex path on `generated_text` -- only worked under
`skip_special_tokens: False`, which most deploy yamls don't set. Pure
token-id reverse scan against `_build_ratio_id_lookup` is the source of
truth (AR `_stage_transitions` forces exactly one ratio-token
emission). Drop unused `_RATIO_TOKEN_RE` constant, `re` import, and
`generated_text` parameter from `_extract_ratio_index`.
Housekeeping
- Remove duplicate `engine_prompt["prompt_token_ids"]` assignment in
serving_chat.py (merge residue, the second copy was added by the
main-merge then re-introduced after the API split).
- `examples/.../end2end.py`: stale `_TASK_PRESETS` comment ->
`available_tasks` helper (symbol no longer exists post-split).
- `process_image` comment in `model_executor/.../hunyuan_image3.py`
clarifies the AR-side `_resize_and_crop` default vs the official
`infer_align_image_size=False` (center crop) default.
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../hunyuan_image3/end2end.py | 8 ++-
.../hunyuan_image3/test_prompt_utils.py | 4 +-
tests/e2e/accuracy/test_hunyuan_image3.py | 8 ++-
.../models/hunyuan_image3/autoencoder.py | 3 ++
.../hunyuan_image3/pipeline_hunyuan_image3.py | 9 ++--
vllm_omni/entrypoints/openai/serving_chat.py | 6 +++
.../models/hunyuan_image3/hunyuan_image3.py | 25 +++++-----
.../stage_input_processors/hunyuan_image3.py | 49 +++++++------------
8 files changed, 59 insertions(+), 53 deletions(-)
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 82e8c194c5a..908109d65a3 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -18,7 +18,13 @@
_REPO_ROOT = Path(__file__).resolve().parents[3]
_DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml")
_DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml")
-
+# Modality → (task, default bot_task) mapping. `task` selects only whether
+# `
` placeholders are emitted; `bot_task` (None | think | recaption |
+# think_recaption | vanilla) selects the system prompt + trigger tag.
+#
+# Both verbose (`text2img`) and short (`t2i`) forms are accepted; the short
+# forms match the internal task names (see prompt_utils.available_tasks)
+# so users who think in those terms don't have to translate.
_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
"text2img": ("t2i", "think"),
"t2i": ("t2i", "think"),
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 4d98bc5dcf2..2ddfbea42dd 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -69,8 +69,8 @@ def test_legacy_task_presets_still_available():
def test_resolve_stop_token_ids_uses_answer_for_generation_tasks():
tok = FakeTokenizer()
answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
- assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [answer_id]
- assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [answer_id]
+ assert resolve_stop_token_ids(task="t2i", bot_task="think", tokenizer=tok) == [answer_id]
+ assert resolve_stop_token_ids(task="t2i", bot_task="recaption", tokenizer=tok) == [answer_id]
assert resolve_stop_token_ids(task="it2i", bot_task="think", tokenizer=tok) == [answer_id]
diff --git a/tests/e2e/accuracy/test_hunyuan_image3.py b/tests/e2e/accuracy/test_hunyuan_image3.py
index 93671e7bbf6..0871793c5db 100644
--- a/tests/e2e/accuracy/test_hunyuan_image3.py
+++ b/tests/e2e/accuracy/test_hunyuan_image3.py
@@ -93,7 +93,13 @@ def _run(stage_config_path: str, output_path: Path) -> tuple[Image.Image, str, f
from vllm_omni.platforms import current_omni_platform
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
- result = build_prompt_tokens(PROMPT, tokenizer, task="it2i_recaption", sys_type="en_unified")
+ result = build_prompt_tokens(
+ PROMPT,
+ tokenizer,
+ task="it2i",
+ bot_task="recaption",
+ sys_type="en_unified",
+ )
token_ids = result.token_ids
system_prompt_type = result.system_prompt_type
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py
index efba2f27435..ddd7d5c6df7 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py
@@ -46,6 +46,9 @@ def sample(self, generator: torch.Generator | None = None) -> torch.FloatTensor:
x = self.mean + self.std * sample
return x
+ def mode(self) -> torch.FloatTensor:
+ return self.mean
+
@dataclass
class DecoderOutput(BaseOutput):
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index e927f278340..5a9d1e48856 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -634,11 +634,10 @@ def vae_encode(self, image, cfg_factor=1):
if isinstance(vae_encode_result, torch.Tensor):
latents = vae_encode_result
else:
- # Fixed-seed Generator so cond latents are deterministic
- # across calls; see AR-side comment in
- # model_executor/.../hunyuan_image3.py:_vae_encode.
- _cond_vae_gen = torch.Generator(device=image.device).manual_seed(0)
- latents = vae_encode_result.latent_dist.sample(_cond_vae_gen)
+ # Cond image is clean conditioning (t=0 below) -- use the
+ # posterior mean so encoding is deterministic by construction.
+ # See AR-side comment in model_executor/.../hunyuan_image3.py.
+ latents = vae_encode_result.latent_dist.mode()
if hasattr(config, "shift_factor") and config.shift_factor:
latents.sub_(config.shift_factor)
if hasattr(config, "scaling_factor") and config.scaling_factor:
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index d1b2e89ae80..4ba824f0909 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2328,6 +2328,12 @@ def _build_multistage_generation_inputs(
engine_prompt["prompt_token_ids"] = prompt_token_ids
if system_prompt_type is not None:
engine_prompt["use_system_prompt"] = system_prompt_type
+ # Forward the custom system prompt body too. DiT's
+ # `get_system_prompt(use_system_prompt, "image", system_prompt)` reads
+ # the third positional arg, so leaving it None turns a `sys_type=custom`
+ # request into an empty DiT system prefix (AR/DiT divergence).
+ if custom_system_prompt is not None:
+ engine_prompt["system_prompt"] = custom_system_prompt
engine_prompt["modalities"] = modalities
if negative_prompt is not None:
engine_prompt["negative_prompt"] = negative_prompt
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 216543b9593..9f3b76039d0 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -907,9 +907,10 @@ def process_image(self, image_input: ImageInput):
current_info["vit_spatial_shapes"] = _ss.squeeze(0)
# VAE: per-image bucket via `reso_group.get_target_size`; mirrors
- # HF's `resize_and_crop` (crop_type="center"). Keep fp32 — the
- # VAE encoder casts to model dtype at its boundary (see
- # `_vae_encode`).
+ # HF's `resize_and_crop` (crop_type="center", the official
+ # generate_image default with infer_align_image_size=False).
+ # Keep fp32 — the VAE encoder casts to model dtype at its
+ # boundary (see `_vae_encode`).
image_width, image_height = self.reso_group.get_target_size(image.width, image.height)
resized_image = self._resize_and_crop(image, (image_width, image_height))
vae_pixel_values = self.vae_processor(resized_image).squeeze(0)
@@ -1776,17 +1777,13 @@ def _vae_encode(
images = images.to(dtype=self.vae.dtype)
vae_encode_result = self.vae.encode(images)
- # Cond image encoding is supposed to be deterministic clean
- # conditioning (the comment below declares `t=0`). `.sample()`
- # without a generator consumes torch's global RNG, which made
- # cond latents drift between requests on a long-running server
- # (online) while looking deterministic for fresh-process callers
- # (offline) -- silent path-level non-determinism. Feed a fixed
- # generator so all callers see identical cond latents.
- import torch as _torch # local alias to keep blast radius minimal
-
- _cond_vae_gen = _torch.Generator(device=images.device).manual_seed(0)
- latents = vae_encode_result.latent_dist.sample(_cond_vae_gen)
+ # Cond image is clean (t=0) conditioning -- take the posterior mean
+ # so encoding is deterministic by construction. `.sample()` without a
+ # generator consumes torch's global RNG and silently drifts between
+ # requests on a long-running server (online) while looking stable for
+ # fresh-process callers (offline). `.mode()` matches the official
+ # HunyuanImage-3 cond encode path.
+ latents = vae_encode_result.latent_dist.mode()
# Apply shift and scaling factors if present
if hasattr(config, "shift_factor") and config.shift_factor:
diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index 158ea86dbf2..c95a2a48f18 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -13,7 +13,6 @@
from __future__ import annotations
import os
-import re
from functools import lru_cache
from typing import Any
@@ -33,7 +32,6 @@
# (in the `/v1/images/edits` path that defaults to `pil_images[0].size`,
# i.e. the first reference image's bucket — usually square, see
# api_server.py:1808-1811).
-_RATIO_TOKEN_RE = re.compile(r"")
_DEFAULT_HUNYUAN_IMAGE3_MODEL = "tencent/HunyuanImage-3.0-Instruct"
@@ -158,42 +156,27 @@ def _id(name: str) -> int | None:
return table
-def _extract_ratio_index(generated_text: str, generated_token_ids, model_name_or_path: str) -> int | None:
+def _extract_ratio_index(generated_token_ids, model_name_or_path: str) -> int | None:
"""Resolve the AR-predicted ratio_index from this stage's output.
- Two probe paths:
- 1. Text regex on `generated_text` — works when the AR engine is
- configured with `skip_special_tokens: False` (e.g.
- `hunyuan_image3_it2i_kv_reuse.yaml`). Cheap and avoids loading
- the tokenizer.
- 2. Token-id scan over `cumulative_token_ids` against the tokenizer's
- `` id range — survives `skip_special_tokens: True`
- where the special tokens are stripped from text but still present
- in the raw token stream.
-
- Takes the LAST ratio token in the stream because the AR's
- stage-transition logic emits exactly one such token at the tail of the
- `` sequence; using "last" is robust to
- any earlier accidental occurrences in the prompt scaffold.
+ `HunyuanImage3ForCausalMM`'s `_stage_transitions` forces the AR to emit
+ exactly one `` token after `
+ `, so we scan the token stream from the tail for the first
+ id that maps to a ratio. Token-ids are the source of truth — text-side
+ regex is unreliable because most deploy yamls run AR with
+ `skip_special_tokens: True` (special tokens are stripped from text but
+ still present in `cumulative_token_ids`).
"""
- matches = _RATIO_TOKEN_RE.findall(generated_text or "")
- if matches:
- try:
- return int(matches[-1])
- except ValueError:
- pass
-
if generated_token_ids is None:
return None
table = _build_ratio_id_lookup(model_name_or_path)
if not table:
return None
- last_ratio_idx: int | None = None
- for tid in generated_token_ids:
+ for tid in reversed(list(generated_token_ids)):
idx = table.get(int(tid))
if idx is not None:
- last_ratio_idx = idx
- return last_ratio_idx
+ return idx
+ return None
def ar2diffusion(
@@ -237,6 +220,7 @@ def ar2diffusion(
width = original_prompt.get("width", 1024)
text_prompt = original_prompt.get("prompt", "")
use_system_prompt = original_prompt.get("use_system_prompt")
+ custom_system_prompt = original_prompt.get("system_prompt")
# Prefer the AR's predicted output aspect (``
# tail emitted by `HunyuanImage3ForCausalMM.sample` under the
@@ -249,7 +233,7 @@ def ar2diffusion(
model_name_or_path = original_prompt.get("model") or os.environ.get(
"VLLM_OMNI_HUNYUAN_IMAGE3_MODEL", _DEFAULT_HUNYUAN_IMAGE3_MODEL
)
- ratio_idx = _extract_ratio_index(generated_text, generated_token_ids, model_name_or_path)
+ ratio_idx = _extract_ratio_index(generated_token_ids, model_name_or_path)
ar_predicted = False
if ratio_idx is not None:
base_size = int(original_prompt.get("image_base_size", 1024))
@@ -302,9 +286,14 @@ def ar2diffusion(
},
}
- # Forward use_system_prompt so the DiT can build the same system prefix
+ # Forward use_system_prompt so the DiT can build the same system prefix.
+ # Also forward the custom system prompt body when sys_type=custom so
+ # DiT's `get_system_prompt(use, "image", body)` doesn't fall back to
+ # an empty prefix and silently diverge from AR.
if use_system_prompt is not None:
diffusion_input["use_system_prompt"] = use_system_prompt
+ if custom_system_prompt is not None:
+ diffusion_input["system_prompt"] = custom_system_prompt
# Forward multimodal data (original image for IT2I conditioning).
# The diffusion pre_process_func reads multi_modal_data["image"], which
From 329851727cc08022fcccdea0b22e258777c6db51 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Tue, 12 May 2026 11:50:00 +0800
Subject: [PATCH 23/43] chore: appease ruff F841 / typos / ruff-format
pre-commit
CI feedback from the previous push:
- F841: drop unused `QKEY` in test_serving_chat_multistage_generation.py
- typos: avoid the dictionary trigger on "PNGs" plural -- the lowercased
form lands in the crate-ci/typos dictionary as a misspelling; rephrase
to "transparent-logo uploads" without changing meaning.
- ruff-format: collapse the `build_prompt_tokens(...)` call in the e2e
accuracy test back to a single line (the line fits within the
120-character limit that ruff-format enforces locally).
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../openai_api/test_serving_chat_multistage_generation.py | 1 -
vllm_omni/entrypoints/openai/api_server.py | 2 +-
2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index 88d15a684b6..bd2e877bf18 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -391,7 +391,6 @@ def test_build_multistage_generation_inputs_custom_system_prompt(serving_chat):
)
images = [Image.new("RGB", (32, 32), color="red")]
- QKEY = "prompt"
marker = "ZZZ_CUSTOM_SYSTEM_PROMPT_MARKER_ZZZ"
out, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index b485b6a3946..80b01ec284a 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -2282,7 +2282,7 @@ async def _load_input_images(
# normalizes input files with ``Image.open(...).convert("RGB")`` before
# they reach the AR stage. Keeping uploads as RGBA/P PIL objects makes
# online IT2I observe a different visual input than offline (for example
- # transparent-logo PNGs alpha-composited over white instead of black),
+ # transparent-logo uploads alpha-composited over white instead of black),
# which is enough for HunyuanImage3 AR recaption to diverge before DiT
# sees the request -- root cause of the "online 3 magnets vs offline 1
# magnet" systematic semantic mismatch.
From 808aca089a36aa990b3b2a8d05de7683cad28355 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Tue, 12 May 2026 13:12:51 +0800
Subject: [PATCH 24/43] fix(hunyuan_image3): align AR cond image preprocessing
with DiT (center crop)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
AR-side `HunyuanImage3Processor._resize_and_crop` previously defaulted to
`crop_type="resize"` (stretch), while the DiT-side condition-image helper
`_resize_and_crop_center` always center-crops. For any portrait input
mapped to a landscape output bucket (or vice versa), AR and DiT then
conditioned on **visibly different fabric regions**: AR saw the input
stretched to fit, DiT saw the input center-cropped to fit. The two cond
latents disagreed on what the surroundings should be, and DiT had to
inpaint the lateral canvas extension on its own — producing seam-like
vertical brightness bands at the AR/DiT-disagreement boundary (observed
in the `/tmp/rgbfix/result.png` IT2I run: 735x1104 input -> 1280x720
output).
Change AR-side default to `crop_type="center"`, matching:
- DiT-side `_resize_and_crop_center` (always center).
- Official `generate_image(..., infer_align_image_size=False)` (the
default; per `hunyuan3.0_ins/image_processor.py:355-358`, the False
branch maps to `random_crop="center"`).
Add a CPU-only regression test asserting AR and DiT preprocessing
produce **byte-identical** pixels for 4 src sizes x 4 target buckets,
covering portrait->landscape, landscape->portrait, and square aspects.
No model weights / tokenizer / HF cache required, runs in CI.
Co-authored-by: Codex
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../test_hunyuan_image3_it2i_ar_format.py | 39 +++++++++++++++++++
.../models/hunyuan_image3/hunyuan_image3.py | 16 ++++----
2 files changed, 47 insertions(+), 8 deletions(-)
diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
index 7e7b7de91b2..916b565c1af 100644
--- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
@@ -72,6 +72,45 @@ def _snapshot_dir(model_id: str) -> pathlib.Path:
# tests/e2e/accuracy/test_hunyuan_image3_it2i_ar_output.py.
+def test_ar_and_dit_condition_image_preprocessing_match_without_hf_cache():
+ """AR and DiT must preprocess the same IT2I condition image into the
+ same VAE pixels.
+
+ This catches drift between the AR-side multimodal processor and the
+ diffusion-side bridge without requiring model weights or tokenizer files.
+ In particular, portrait input expanded to a landscape output is sensitive
+ to accidentally using ``crop_type="resize"`` on one side and center crop
+ on the other; the two paths then condition on visibly different fabric
+ regions and leave seam-like artifacts around the edited object.
+ """
+ import numpy as np
+ from PIL import Image
+
+ from vllm_omni.diffusion.models.hunyuan_image3.pipeline_hunyuan_image3 import (
+ _resize_and_crop_center,
+ )
+ from vllm_omni.model_executor.models.hunyuan_image3.hunyuan_image3 import (
+ HunyuanImage3Processor,
+ )
+
+ rng = np.random.default_rng(seed=3444)
+ src_size_pairs = [(735, 1104), (640, 1024), (1280, 720), (1024, 1024)]
+ target_size_pairs = [(1024, 1024), (1024, 768), (768, 1024), (1280, 720)]
+
+ for src_w, src_h in src_size_pairs:
+ src_arr = rng.integers(0, 256, size=(src_h, src_w, 3), dtype=np.uint8)
+ src = Image.fromarray(src_arr, mode="RGB")
+ for tw, th in target_size_pairs:
+ ar_out = HunyuanImage3Processor._resize_and_crop(None, src, (tw, th))
+ dit_out = _resize_and_crop_center(src, tw, th)
+
+ assert ar_out.size == dit_out.size == (tw, th)
+ assert np.array_equal(np.asarray(ar_out), np.asarray(dit_out)), (
+ f"AR and DiT condition preprocessing diverged for "
+ f"src={(src_w, src_h)} target={(tw, th)}"
+ )
+
+
_OFFICIAL_PKG = "_hunyuan_image_3_official_snapshot"
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 9f3b76039d0..40a38c7b5ac 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -907,8 +907,8 @@ def process_image(self, image_input: ImageInput):
current_info["vit_spatial_shapes"] = _ss.squeeze(0)
# VAE: per-image bucket via `reso_group.get_target_size`; mirrors
- # HF's `resize_and_crop` (crop_type="center", the official
- # generate_image default with infer_align_image_size=False).
+ # HF's `resize_and_crop` default (crop_type="center", the official
+ # generate_image default when infer_align_image_size=False).
# Keep fp32 — the VAE encoder casts to model dtype at its
# boundary (see `_vae_encode`).
image_width, image_height = self.reso_group.get_target_size(image.width, image.height)
@@ -957,13 +957,13 @@ def _resize_and_crop(
self,
image: Image.Image,
target_size: tuple[int, int],
- crop_type: str = "resize",
+ crop_type: str = "center",
) -> Image.Image:
- # Default mode mirrors the official `infer_align_image_size=True`
- # path (image_processor.py:355 → crop_type="resize") used by the
- # IT2I demo: stretch the cond image to the bucket dims so its
- # `` tag and ViT/VAE features stay aligned with the
- # bucket, instead of dropping content via center crop.
+ # Default mode mirrors official `generate_image` with
+ # infer_align_image_size=False: preserve aspect ratio and center-crop
+ # to the nearest VAE bucket. Keeping this default aligned with the
+ # DiT-side condition-image helper avoids AR and DiT seeing different
+ # conditioning pixels for the same IT2I request.
tw, th = target_size
if crop_type == "resize":
return image.resize((tw, th), resample=Image.Resampling.LANCZOS)
From 297a2f5a7efc4525d6184e7de802ad70c71332d2 Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Wed, 13 May 2026 09:14:48 +0800
Subject: [PATCH 25/43] test(hunyuan_image3): apply ruff format hook fixes
Signed-off-by: zuiho <2324465096@qq.com>
---
.../hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py | 3 +--
tests/diffusion/models/hunyuan_image3/test_prompt_utils.py | 2 +-
2 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
index 916b565c1af..7550caa50f7 100644
--- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
@@ -106,8 +106,7 @@ def test_ar_and_dit_condition_image_preprocessing_match_without_hf_cache():
assert ar_out.size == dit_out.size == (tw, th)
assert np.array_equal(np.asarray(ar_out), np.asarray(dit_out)), (
- f"AR and DiT condition preprocessing diverged for "
- f"src={(src_w, src_h)} target={(tw, th)}"
+ f"AR and DiT condition preprocessing diverged for src={(src_w, src_h)} target={(tw, th)}"
)
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 2ddfbea42dd..641cd5dc9c0 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -9,8 +9,8 @@
import pytest
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
- HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS,
_TASK_PRESETS,
+ HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS,
available_bot_tasks,
available_tasks,
build_prompt,
From 4cf71f2afe9b7b7dcaf1656398f084534751ea44 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 10:36:24 +0800
Subject: [PATCH 26/43] fix(hunyuan_image3): preserve legacy plain prompt tasks
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../hunyuan_image3/test_prompt_utils.py | 22 ++++++++++++++++++
.../models/hunyuan_image3/prompt_utils.py | 23 +++++++++++++++----
2 files changed, 40 insertions(+), 5 deletions(-)
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 641cd5dc9c0..ef635825c3b 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -66,6 +66,28 @@ def test_legacy_task_presets_still_available():
} <= set(_TASK_PRESETS)
+def test_legacy_base_task_omitted_bot_task_keeps_plain_mode():
+ prompt = build_prompt("HELLO", task="i2t")
+ assert prompt.endswith("Assistant: ")
+ assert not prompt.endswith("")
+
+ result = build_prompt_tokens("hi", FakeTokenizer(), task="i2t")
+ assert result.system_prompt_type == "en_unified"
+ assert result.token_ids[-1] not in {
+ FakeTokenizer.SPECIAL[""],
+ FakeTokenizer.SPECIAL[""],
+ }
+
+
+def test_default_prompt_still_uses_it2i_think_mode():
+ prompt = build_prompt("HELLO")
+ assert prompt.endswith("Assistant: ")
+
+ result = build_prompt_tokens("hi", FakeTokenizer())
+ assert result.system_prompt_type == "en_unified"
+ assert result.token_ids[-1] == FakeTokenizer.SPECIAL[""]
+
+
def test_resolve_stop_token_ids_uses_answer_for_generation_tasks():
tok = FakeTokenizer()
answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 4ed277eeed2..6e1453d0ed2 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -74,6 +74,13 @@
_TASKS: frozenset[str] = frozenset({"t2t", "i2t", "it2i", "t2i"})
+
+class _DefaultBotTask:
+ pass
+
+
+_DEFAULT_BOT_TASK = _DefaultBotTask()
+
# Legacy composite task alias -> (task, bot_task). Keep this during rebase so
# older callers and intermediate commits still resolve cleanly.
_TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
@@ -89,7 +96,11 @@
}
-def _normalize_task_and_bot_task(task: str, bot_task: str | None) -> tuple[str, str | None]:
+def _normalize_task_and_bot_task(
+ task: str,
+ bot_task: str | None | _DefaultBotTask,
+) -> tuple[str, str | None]:
+ bot_task_was_omitted = bot_task is _DEFAULT_BOT_TASK
if task in _TASK_PRESETS:
_, legacy_bot_task, _ = _TASK_PRESETS[task]
base_task = task.split("_", 1)[0]
@@ -97,9 +108,11 @@ def _normalize_task_and_bot_task(task: str, bot_task: str | None) -> tuple[str,
base_task = "t2i"
if task in ("t2t", "i2t", "t2i"):
base_task = task
- if bot_task is None:
+ if bot_task_was_omitted:
bot_task = legacy_bot_task
task = base_task
+ elif bot_task_was_omitted:
+ bot_task = "think"
return task, bot_task
@@ -123,7 +136,7 @@ def resolve_sys_type(bot_task: str | None) -> str:
def resolve_stop_token_ids(
task: str = "it2i",
- bot_task: str | None = "think",
+ bot_task: str | None | _DefaultBotTask = _DEFAULT_BOT_TASK,
tokenizer: Any | None = None,
) -> list[int]:
task, bot_task = _normalize_task_and_bot_task(task, bot_task)
@@ -158,7 +171,7 @@ def _resolve_preset(task: str, bot_task: str | None) -> tuple[str, str | None]:
def build_prompt(
user_prompt: str,
task: str = "it2i",
- bot_task: str | None = "think",
+ bot_task: str | None | _DefaultBotTask = _DEFAULT_BOT_TASK,
sys_type: str | None = None,
custom_system_prompt: str | None = None,
num_images: int = 1,
@@ -205,7 +218,7 @@ def build_prompt_tokens(
user_prompt: str,
tokenizer,
task: str = "it2i",
- bot_task: str | None = "think",
+ bot_task: str | None | _DefaultBotTask = _DEFAULT_BOT_TASK,
sys_type: str | None = None,
custom_system_prompt: str | None = None,
num_images: int = 1,
From cf7e4a24f8874e5667acdd07993d683288af7562 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 11:12:19 +0800
Subject: [PATCH 27/43] fix(hunyuan_image3): align prompt token tests with
result API
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../test_hunyuan_image3_it2i_multi_image.py | 24 +++++++++----------
.../hunyuan_image3/test_prompt_utils.py | 8 +++++++
.../models/hunyuan_image3/prompt_utils.py | 8 +++++++
3 files changed, 28 insertions(+), 12 deletions(-)
diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
index 7a1e266b936..1e0fd159063 100644
--- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
@@ -114,11 +114,11 @@ def test_build_prompt_tokens_inserts_N_img_ids(task: str, bot_task: str | None):
"""N=1/2/3 -> the resulting id sequence contains exactly N copies of
img_id (=2) sitting consecutively after the `User: ` segment."""
tok = FakeTokenizer()
- ids_n1 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=1)
+ ids_n1 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=1).token_ids
tok = FakeTokenizer()
- ids_n2 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=2)
+ ids_n2 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=2).token_ids
tok = FakeTokenizer()
- ids_n3 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=3)
+ ids_n3 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=3).token_ids
assert ids_n1.count(2) == 1
assert ids_n2.count(2) == 2
@@ -145,9 +145,9 @@ def test_build_prompt_tokens_default_num_images_matches_legacy():
omitting the parameter (regression guard for existing single-image
callers)."""
tok_a = FakeTokenizer()
- legacy = build_prompt_tokens("hi", tok_a, task="it2i", bot_task="think")
+ legacy = build_prompt_tokens("hi", tok_a, task="it2i", bot_task="think").token_ids
tok_b = FakeTokenizer()
- explicit = build_prompt_tokens("hi", tok_b, task="it2i", bot_task="think", num_images=1)
+ explicit = build_prompt_tokens("hi", tok_b, task="it2i", bot_task="think", num_images=1).token_ids
assert legacy == explicit
# Also: encode() must have been called on the same set of segments,
# so segment boundaries are preserved.
@@ -173,7 +173,7 @@ def test_text_only_tasks_ignore_num_images(task: str, bot_task: str | None, num_
any num_images and emit zero `
` placeholders."""
s = build_prompt("hi", task=task, bot_task=bot_task, num_images=num_images)
assert "
" not in s
- ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, bot_task=bot_task, num_images=num_images)
+ ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, bot_task=bot_task, num_images=num_images).token_ids
assert 2 not in ids
@@ -202,7 +202,7 @@ def test_real_tokenizer_emits_n_consecutive_img_ids(num_images: int):
img_id = tok.convert_tokens_to_ids("
")
assert img_id is not None and img_id >= 0, f"
not in tokenizer vocab; got id={img_id}"
- ids = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=num_images)
+ ids = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=num_images).token_ids
# Exactly N copies of
id, all consecutive.
img_positions = [i for i, x in enumerate(ids) if x == img_id]
@@ -225,9 +225,9 @@ def test_real_tokenizer_n_plus_one_extends_by_exactly_one_img_id():
tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
img_id = tok.convert_tokens_to_ids("
")
- ids_n1 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1)
- ids_n2 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=2)
- ids_n3 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=3)
+ ids_n1 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1).token_ids
+ ids_n2 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=2).token_ids
+ ids_n3 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=3).token_ids
assert len(ids_n2) == len(ids_n1) + 1, f"N=2 should be N=1 + 1 token; got {len(ids_n2)} vs {len(ids_n1)}"
assert len(ids_n3) == len(ids_n1) + 2, f"N=3 should be N=1 + 2 tokens; got {len(ids_n3)} vs {len(ids_n1)}"
@@ -250,6 +250,6 @@ def test_real_tokenizer_default_n1_byte_identical_to_legacy():
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
- legacy = build_prompt_tokens("hi", tok, task="it2i", bot_task="think")
- explicit = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1)
+ legacy = build_prompt_tokens("hi", tok, task="it2i", bot_task="think").token_ids
+ explicit = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1).token_ids
assert legacy == explicit, "real tokenizer: default num_images=1 must be byte-identical to legacy"
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index ef635825c3b..371646556f2 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -79,6 +79,14 @@ def test_legacy_base_task_omitted_bot_task_keeps_plain_mode():
}
+def test_legacy_composite_task_with_none_bot_task_keeps_encoded_mode():
+ prompt = build_prompt("HELLO", task="it2i_think", bot_task=None)
+ assert prompt.endswith("Assistant: ")
+
+ result = build_prompt_tokens("hi", FakeTokenizer(), task="it2i_recaption", bot_task=None)
+ assert result.token_ids[-1] == FakeTokenizer.SPECIAL[""]
+
+
def test_default_prompt_still_uses_it2i_think_mode():
prompt = build_prompt("HELLO")
assert prompt.endswith("Assistant: ")
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 6e1453d0ed2..f78b19a5746 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -95,6 +95,8 @@ class _DefaultBotTask:
"t2i_recaption": ("en_unified", "recaption", ""),
}
+_LEGACY_COMPOSITE_TASKS: frozenset[str] = frozenset(_TASK_PRESETS) - {"t2t", "i2t", "t2i"}
+
def _normalize_task_and_bot_task(
task: str,
@@ -110,6 +112,12 @@ def _normalize_task_and_bot_task(
base_task = task
if bot_task_was_omitted:
bot_task = legacy_bot_task
+ elif task in _LEGACY_COMPOSITE_TASKS and bot_task is None:
+ # Composite task names already encode the legacy bot_task. Keep
+ # calls like build_prompt_tokens(task="it2i_think", bot_task=None)
+ # on their historical meaning; explicit None is the plain-mode
+ # escape hatch only for the new two-axis base tasks.
+ bot_task = legacy_bot_task
task = base_task
elif bot_task_was_omitted:
bot_task = "think"
From 4fb78a3b4bee4d4e97e6684f888ca97c4bfd4875 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 11:48:10 +0800
Subject: [PATCH 28/43] fix(hunyuan_image3): harden edit bridge compatibility
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
...test_serving_chat_multistage_generation.py | 86 ++++++++
.../test_hunyuan_image3.py | 103 ++++++++++
.../hunyuan_image3/pipeline_hunyuan_image3.py | 7 +-
vllm_omni/entrypoints/openai/api_server.py | 9 +-
vllm_omni/entrypoints/openai/serving_chat.py | 43 ++--
.../stage_input_processors/hunyuan_image3.py | 186 +++++++++---------
6 files changed, 319 insertions(+), 115 deletions(-)
create mode 100644 tests/model_executor/stage_input_processors/test_hunyuan_image3.py
diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index bd2e877bf18..92f0ac2dc98 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -265,6 +265,92 @@ def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(servi
)
+@pytest.mark.parametrize("legacy_task", ["i2t", "t2t"])
+def test_build_multistage_generation_inputs_legacy_plain_tasks_stay_plain(serving_chat, legacy_task: str):
+ """Legacy bot_task=i2t/t2t must preserve those tasks' plain prompt mode.
+
+ The task/bot_task split must not normalize every legacy task-enum request
+ into bot_task="think"; i2t/t2t had no / trigger before
+ the split and should stay plain unless the caller passes an explicit
+ semantic bot_task.
+ """
+ from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+ engine = SimpleNamespace(
+ stage_configs=[
+ SimpleNamespace(stage_type="llm", is_comprehension=True),
+ SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+ ],
+ default_sampling_params_list=[
+ SamplingParams(temperature=0.0),
+ OmniDiffusionSamplingParams(),
+ ],
+ )
+ images = [Image.new("RGB", (32, 32), color="red")]
+
+ legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="describe me",
+ extra_body={"bot_task": legacy_task},
+ reference_images=images if legacy_task == "i2t" else [],
+ gen_params=OmniDiffusionSamplingParams(),
+ )
+ explicit_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="describe me",
+ extra_body={"task": legacy_task},
+ reference_images=images if legacy_task == "i2t" else [],
+ gen_params=OmniDiffusionSamplingParams(),
+ )
+
+ assert legacy_prompt["prompt"] == explicit_prompt["prompt"]
+ assert legacy_prompt["prompt"].endswith("Assistant: ")
+ assert not legacy_prompt["prompt"].endswith("")
+ assert not legacy_prompt["prompt"].endswith("")
+
+
+@pytest.mark.parametrize(
+ "legacy_task,trigger",
+ [
+ ("it2i_think", ""),
+ ("it2i_recaption", ""),
+ ],
+)
+def test_build_multistage_generation_inputs_legacy_composite_tasks_still_work(
+ serving_chat,
+ legacy_task: str,
+ trigger: str,
+):
+ """Legacy composite task names passed through bot_task must still work."""
+ from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+ engine = SimpleNamespace(
+ stage_configs=[
+ SimpleNamespace(stage_type="llm", is_comprehension=True),
+ SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+ ],
+ default_sampling_params_list=[
+ SamplingParams(temperature=0.0),
+ OmniDiffusionSamplingParams(),
+ ],
+ )
+ images = [Image.new("RGB", (32, 32), color="red")]
+
+ legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+ serving_chat,
+ engine=engine,
+ prompt="edit me",
+ extra_body={"bot_task": legacy_task},
+ reference_images=images,
+ gen_params=OmniDiffusionSamplingParams(),
+ )
+
+ assert legacy_prompt["prompt"].count("
") == 1
+ assert legacy_prompt["prompt"].endswith(f"Assistant: {trigger}")
+
+
def test_build_multistage_generation_inputs_bot_task_semantic_changes_trigger_and_sys(serving_chat):
"""Passing bot_task=think_recaption (vs default "think") must flip the
resolved sys_type to en_think_recaption (and trigger tag is still
diff --git a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
new file mode 100644
index 00000000000..faaa9785452
--- /dev/null
+++ b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for HunyuanImage3 stage input processor."""
+
+import builtins
+from types import SimpleNamespace
+
+import pytest
+
+from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+ HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS,
+)
+from vllm_omni.model_executor.stage_input_processors.hunyuan_image3 import (
+ _extract_ratio_index,
+ _truncate_at_cot_end,
+ ar2diffusion,
+)
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+def _source_output(token_ids: list[int], text: str = ""):
+ return SimpleNamespace(
+ outputs=[
+ SimpleNamespace(
+ token_ids=token_ids,
+ cumulative_token_ids=token_ids,
+ text=text,
+ )
+ ],
+ multimodal_output=None,
+ )
+
+
+def test_extract_ratio_index_uses_fixed_special_token_ids():
+ ratio_33 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ ratio_36 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+
+ assert _extract_ratio_index([1, ratio_33, 2]) == 33
+ assert _extract_ratio_index([1, ratio_33, 2, ratio_36]) == 36
+
+
+def test_truncate_at_cot_end_uses_token_ids_when_text_skips_specials():
+ end_recaption = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ answer = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ boi = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ ratio = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ token_ids = [100, 101, end_recaption, answer, boi, ratio]
+
+ text, truncated = _truncate_at_cot_end(
+ "recaption body without special markers",
+ token_ids,
+ )
+
+ assert text == "recaption body without special markers"
+ assert truncated == [100, 101, end_recaption]
+
+
+def test_ar2diffusion_applies_ratio_and_truncates_tail_without_tokenizer(monkeypatch: pytest.MonkeyPatch):
+ real_import = builtins.__import__
+
+ def _block_transformers_import(name, *args, **kwargs):
+ if name == "transformers" or name.startswith("transformers."):
+ raise AssertionError("ar2diffusion must not import transformers on the bridge path")
+ return real_import(name, *args, **kwargs)
+
+ monkeypatch.setattr(builtins, "__import__", _block_transformers_import)
+
+ end_recaption = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ answer = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ boi = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ size = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ ratio_0 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ token_ids = [100, 101, end_recaption, answer, boi, size, ratio_0]
+
+ result = ar2diffusion(
+ [_source_output(token_ids, text="decoded without special tokens")],
+ prompt=[{"prompt": "edit", "height": 64, "width": 64}],
+ )
+
+ assert len(result) == 1
+ assert (result[0]["height"], result[0]["width"]) == (512, 2048)
+ assert result[0]["extra"]["ar_generated_text"] == "decoded without special tokens"
+ assert result[0]["extra"]["ar_token_ids"].tolist() == [100, 101, end_recaption]
+
+
+def test_ar2diffusion_forwards_custom_system_prompt_body():
+ end_think = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ marker = "CUSTOM_SYSTEM_BODY"
+
+ result = ar2diffusion(
+ [_source_output([100, end_think], text="thought")],
+ prompt=[
+ {
+ "prompt": "edit",
+ "use_system_prompt": "custom",
+ "system_prompt": marker,
+ }
+ ],
+ )
+
+ assert result[0]["use_system_prompt"] == "custom"
+ assert result[0]["system_prompt"] == marker
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 5a9d1e48856..35390e7312d 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -1366,10 +1366,13 @@ def forward(
use_system_prompt = extra_args.get("use_system_prompt")
system_prompt = extra_args.get("system_prompt")
# Fall back to per-prompt use_system_prompt forwarded by ar2diffusion
- if use_system_prompt is None and req.prompts:
+ if req.prompts:
first_prompt = req.prompts[0]
if isinstance(first_prompt, dict):
- use_system_prompt = first_prompt.get("use_system_prompt")
+ if use_system_prompt is None:
+ use_system_prompt = first_prompt.get("use_system_prompt")
+ if system_prompt is None:
+ system_prompt = first_prompt.get("system_prompt")
if use_system_prompt is not None:
system_prompt = get_system_prompt(use_system_prompt, "image", system_prompt)
system_prompt = system_prompt.strip() if system_prompt is not None else ""
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 80b01ec284a..7107b544adc 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1759,11 +1759,10 @@ async def edit_images(
status_code=HTTPStatus.BAD_REQUEST.value,
detail=detail,
)
- # Only convert uploads to RGB when the caller opts into the
- # Hunyuan-aware API surface (task / bot_task / sys_type). Legacy
- # callers that send only the older bot_task= shape keep
- # whatever PIL mode the upload arrived as, to preserve pre-existing
- # behavior for non-Hunyuan flows.
+ # Convert uploads to RGB when the caller opts into the Hunyuan-aware
+ # API surface. This includes the legacy bot_task= form:
+ # keeping uploads as RGBA/P PIL objects makes online IT2I observe a
+ # different visual input than the offline path.
normalize_edit_images_rgb = task is not None or bot_task is not None or sys_type is not None
pil_images = await _load_input_images(input_images_list, normalize_rgb=normalize_edit_images_rgb)
prompt["multi_modal_data"] = {}
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 4ba824f0909..7424a9e0d34 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2260,9 +2260,18 @@ def _build_multistage_generation_inputs(
bot_task = extra_body.get("bot_task")
sys_type = extra_body.get("sys_type")
custom_system_prompt = extra_body.get("system_prompt")
- if task is None and bot_task in set(_hunyuan3_available_tasks()):
+ legacy_task_from_bot_task = False
+ legacy_task_names = set(_hunyuan3_available_tasks()) | {
+ "it2i_think",
+ "it2i_recaption",
+ "t2i_think",
+ "t2i_recaption",
+ "t2i_vanilla",
+ }
+ if task is None and bot_task in legacy_task_names:
task = bot_task
bot_task = None
+ legacy_task_from_bot_task = True
engine_prompt_data: dict[str, Any] | None = None
modalities = ["image"]
@@ -2282,13 +2291,21 @@ def _build_multistage_generation_inputs(
)
num_images = len(reference_images) if reference_images else 1
- # build_prompt defaults task="it2i"; preserve that when caller
- # only passed bot_task semantic.
effective_task = task if task is not None else "it2i"
- # build_prompt defaults bot_task="think"; preserve that for legacy
- # callers (passing bot_task=None to build_prompt explicitly gives a
- # different (sys, trigger) than the default "think").
- effective_bot_task = bot_task if bot_task is not None else "think"
+ build_kwargs = {
+ "task": effective_task,
+ "sys_type": sys_type,
+ "custom_system_prompt": custom_system_prompt,
+ "num_images": num_images,
+ }
+ if bot_task is not None:
+ build_kwargs["bot_task"] = bot_task
+ elif "bot_task" in extra_body and not legacy_task_from_bot_task:
+ # Preserve the prompt_utils distinction between omitted
+ # bot_task and explicit None. Omitted keeps each task's legacy
+ # default (`it2i` -> think, `i2t`/`t2t` -> plain), while
+ # explicit None is the caller's plain-mode request.
+ build_kwargs["bot_task"] = None
if tokenizer is not None:
# HF byte-for-byte path: feed segment-tokenized prompt_token_ids
# so AR sees the same template-tokenization HF apply_chat_template
@@ -2301,11 +2318,7 @@ def _build_multistage_generation_inputs(
result = build_prompt_tokens(
prompt,
tokenizer,
- task=effective_task,
- bot_task=effective_bot_task,
- sys_type=sys_type,
- custom_system_prompt=custom_system_prompt,
- num_images=num_images,
+ **build_kwargs,
)
prompt_token_ids = result.token_ids
system_prompt_type = result.system_prompt_type
@@ -2313,11 +2326,7 @@ def _build_multistage_generation_inputs(
# Legacy string path (e.g. unit tests with no tokenizer plumbed).
prompt = build_prompt(
prompt,
- task=effective_task,
- bot_task=effective_bot_task,
- sys_type=sys_type,
- custom_system_prompt=custom_system_prompt,
- num_images=num_images,
+ **build_kwargs,
)
if reference_images and len(reference_images) == 1:
engine_prompt_data = {"image": reference_images[0]}
diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index c95a2a48f18..a06d030d0da 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -1,10 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Stage input processor for HunyuanImage3: AR → Diffusion transition.
+"""Stage input processor for HunyuanImage3: AR to Diffusion transition.
In IT2I (image editing) mode:
- Stage 0 (AR) receives (image + edit instruction), generates CoT/latent tokens
- - Stage 1 (DiT) receives the AR output + original image, denoises → edited image
+ - Stage 1 (DiT) receives the AR output + original image, denoises to edited image
The ar2diffusion function bridges these two stages, following the same
signature pattern as glm_image.ar2diffusion.
@@ -12,7 +12,6 @@
from __future__ import annotations
-import os
from functools import lru_cache
from typing import Any
@@ -20,6 +19,9 @@
from vllm.inputs import TextPrompt
from vllm.logger import init_logger
+from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+ HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS,
+)
from vllm_omni.inputs.data import OmniTokensPrompt
logger = init_logger(__name__)
@@ -27,12 +29,63 @@
# AR emits `` after `` in IT2I/T2I
# (see `HunyuanImage3ForCausalMM.sample` and `_stage_transitions`). The
# ratio_index resolves to a (height, width) bucket via ResolutionGroup, which
-# is the official upstream's mechanism for AR-driven output aspect — without
+# is the official upstream's mechanism for AR-driven output aspect; without
# this lookup the DiT pipeline falls back to the user-provided width/height
# (in the `/v1/images/edits` path that defaults to `pil_images[0].size`,
-# i.e. the first reference image's bucket — usually square, see
+# i.e. the first reference image's bucket, usually square, see
# api_server.py:1808-1811).
-_DEFAULT_HUNYUAN_IMAGE3_MODEL = "tencent/HunyuanImage-3.0-Instruct"
+_HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS: tuple[str, ...] = (
+ "1024x768",
+ "1280x720",
+ "768x1024",
+ "720x1280",
+)
+
+
+class _Resolution:
+ def __init__(self, size: str | int | tuple[int, int], *args: int):
+ if isinstance(size, str):
+ if "x" in size:
+ h, w = size.split("x")
+ size = (int(h), int(w))
+ else:
+ size = int(size)
+ if args:
+ size = (int(size), args[0])
+ if isinstance(size, int):
+ size = (size, size)
+
+ self.height = int(size[0])
+ self.width = int(size[1])
+ self.ratio = self.height / self.width
+
+
+def _build_resolutions_by_step(base_size: int, align: int = 1) -> list[_Resolution]:
+ step = base_size // 16
+ min_height = base_size // 2
+ min_width = base_size // 2
+ max_height = base_size * 2
+ max_width = base_size * 2
+
+ resolutions = [_Resolution(base_size, base_size)]
+
+ cur_height, cur_width = base_size, base_size
+ while True:
+ if cur_height >= max_height and cur_width <= min_width:
+ break
+ cur_height = min(cur_height + step, max_height)
+ cur_width = max(cur_width - step, min_width)
+ resolutions.append(_Resolution(cur_height // align * align, cur_width // align * align))
+
+ cur_height, cur_width = base_size, base_size
+ while True:
+ if cur_height <= min_height and cur_width >= max_width:
+ break
+ cur_height = max(cur_height - step, min_height)
+ cur_width = min(cur_width + step, max_width)
+ resolutions.append(_Resolution(cur_height // align * align, cur_width // align * align))
+
+ return sorted(resolutions, key=lambda x: x.ratio)
@lru_cache(maxsize=4)
@@ -43,45 +96,16 @@ def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]:
`reso_group[ratio_index]` reverse lookup. Cached because the table
is constant per `base_size`.
"""
- from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import (
- HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS,
- Resolution,
- ResolutionGroup,
- )
-
- reso_group = ResolutionGroup(
- base_size=base_size,
- extra_resolutions=[Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS],
- )
- return [(int(r.height), int(r.width)) for r in reso_group.data]
-
-
-@lru_cache(maxsize=4)
-def _build_cot_end_token_ids(model_name_or_path: str) -> dict[str, int]:
- """Return `{'': id, '': id}` for cot-boundary
- truncation. Empty dict on lookup failure so callers degrade to a
- pure text-based search.
- """
- try:
- from transformers import AutoTokenizer
-
- tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
- except Exception as e: # pragma: no cover - environment-dependent
- logger.warning("[ar2diffusion] failed to load tokenizer for cot-end lookup: %s", e)
- return {}
-
- result: dict[str, int] = {}
- for marker in ("", ""):
- tid = tokenizer.convert_tokens_to_ids(marker)
- if tid is not None and tid != tokenizer.unk_token_id:
- result[marker] = int(tid)
- return result
+ resolutions = _build_resolutions_by_step(base_size)
+ for extra_resolution in (_Resolution(s) for s in _HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS):
+ if not any(r.ratio == extra_resolution.ratio for r in resolutions):
+ resolutions.append(extra_resolution)
+ return [(r.height, r.width) for r in resolutions]
def _truncate_at_cot_end(
generated_text: str,
generated_token_ids,
- model_name_or_path: str,
) -> tuple[str, list[int]]:
"""Truncate AR output at first `` (or `` fallback).
@@ -89,63 +113,50 @@ def _truncate_at_cot_end(
upstream, which decodes only `generated_tokens[0, :end_pos + 1]` as
`cot_text` for DiT. The trailing ``
sequence is a stage-transition trigger consumed via `image_size` /
- height/width — it must NOT be forwarded to DiT's prompt builder, or
+ height/width; it must NOT be forwarded to DiT's prompt builder, or
the extra `` and ratio tokens drift the DiT's own prompt
structure.
"""
token_list = list(generated_token_ids) if generated_token_ids is not None else []
- end_ids = _build_cot_end_token_ids(model_name_or_path)
+ end_ids = {
+ "": HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""],
+ "": HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""],
+ }
for marker in ("", ""):
- idx = generated_text.find(marker)
- if idx == -1:
- continue
- text_end = idx + len(marker)
- truncated_text = generated_text[:text_end]
-
truncated_tokens = token_list
- end_id = end_ids.get(marker)
- if end_id is not None and token_list:
+ end_id = end_ids[marker]
+ if token_list:
try:
token_end = token_list.index(end_id)
truncated_tokens = token_list[: token_end + 1]
except ValueError:
pass
- return truncated_text, truncated_tokens
+
+ idx = generated_text.find(marker)
+ if idx != -1:
+ text_end = idx + len(marker)
+ return generated_text[:text_end], truncated_tokens
+ if truncated_tokens is not token_list:
+ return generated_text, truncated_tokens
return generated_text, token_list
@lru_cache(maxsize=4)
-def _build_ratio_id_lookup(model_name_or_path: str) -> dict[int, int]:
- """Return `{token_id: ratio_index}` for `` in the tokenizer.
-
- Loads the tokenizer once per model path and walks the contiguous
- `..` plus the extra slice
- `..` (the same shape
- `HunyuanImage3ForCausalMM.__init__` registers at lines 1523-1531).
- Empty dict on lookup failure so callers can degrade gracefully.
+def _build_ratio_id_lookup() -> dict[int, int]:
+ """Return `{token_id: ratio_index}` for HunyuanImage3 ratio tokens.
+
+ The ids are fixed in tokenizer.json and already pinned in prompt_utils.
+ Avoid loading AutoTokenizer here: this bridge runs on the hot AR->DiT
+ transition path and must keep working in offline deployments where the
+ tokenizer object is not exposed to the stage-input processor.
"""
- try:
- from transformers import AutoTokenizer
-
- tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
- except Exception as e: # pragma: no cover - environment-dependent
- logger.warning("[ar2diffusion] failed to load tokenizer for ratio token lookup: %s", e)
- return {}
-
- def _id(name: str) -> int | None:
- tid = tokenizer.convert_tokens_to_ids(name)
- return None if tid is None or tid == tokenizer.unk_token_id else int(tid)
-
- ratio_0 = _id("")
- ratio_32 = _id("")
- ratio_33 = _id("")
- ratio_36 = _id("")
- if None in (ratio_0, ratio_32, ratio_33, ratio_36):
- logger.warning("[ar2diffusion] tokenizer is missing one of tokens")
- return {}
+ ratio_0 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ ratio_32 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ ratio_33 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ ratio_36 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
table: dict[int, int] = {}
for i in range(ratio_32 - ratio_0 + 1):
@@ -156,22 +167,20 @@ def _id(name: str) -> int | None:
return table
-def _extract_ratio_index(generated_token_ids, model_name_or_path: str) -> int | None:
+def _extract_ratio_index(generated_token_ids) -> int | None:
"""Resolve the AR-predicted ratio_index from this stage's output.
`HunyuanImage3ForCausalMM`'s `_stage_transitions` forces the AR to emit
exactly one `` token after `
`, so we scan the token stream from the tail for the first
- id that maps to a ratio. Token-ids are the source of truth — text-side
+ id that maps to a ratio. Token-ids are the source of truth; text-side
regex is unreliable because most deploy yamls run AR with
`skip_special_tokens: True` (special tokens are stripped from text but
still present in `cumulative_token_ids`).
"""
if generated_token_ids is None:
return None
- table = _build_ratio_id_lookup(model_name_or_path)
- if not table:
- return None
+ table = _build_ratio_id_lookup()
for tid in reversed(list(generated_token_ids)):
idx = table.get(int(tid))
if idx is not None:
@@ -230,10 +239,7 @@ def ar2diffusion(
# square in the multi-image / mismatched-aspect case. Mirrors the
# official upstream where `reso_group[ratio_index]` is the
# canonical source of the diffusion target shape.
- model_name_or_path = original_prompt.get("model") or os.environ.get(
- "VLLM_OMNI_HUNYUAN_IMAGE3_MODEL", _DEFAULT_HUNYUAN_IMAGE3_MODEL
- )
- ratio_idx = _extract_ratio_index(generated_token_ids, model_name_or_path)
+ ratio_idx = _extract_ratio_index(generated_token_ids)
ar_predicted = False
if ratio_idx is not None:
base_size = int(original_prompt.get("image_base_size", 1024))
@@ -253,14 +259,12 @@ def ar2diffusion(
# Truncate the AR output at `` (or ``) before
# passing to DiT. Mirrors official `generate_image` which keeps
- # `cot_text` clean and routes size/ratio via `image_size` only —
+ # `cot_text` clean and routes size/ratio via `image_size` only;
# we already extracted `ratio_idx` above and translated it into
# `height` / `width`, so the ``
# tail has no remaining job and would only contaminate DiT's
# prompt builder if forwarded.
- cot_text_for_dit, cot_token_ids_for_dit = _truncate_at_cot_end(
- generated_text, generated_token_ids, model_name_or_path
- )
+ cot_text_for_dit, cot_token_ids_for_dit = _truncate_at_cot_end(generated_text, generated_token_ids)
logger.info(
"[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, "
From 38668a6e5785fab2b50728d1b231badd0e82efe1 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Tue, 12 May 2026 15:22:51 +0800
Subject: [PATCH 29/43] revert(hunyuan_image3): roll cond preprocessing back to
magnet_repro state
Restores the IT2I online image quality observed at the magnet_repro
deploy. Two changes from the PR review-feedback round regressed image
quality on multi-image edit prompts:
1. 4da2ff687 switched cond VAE from `latent_dist.sample(generator)` to
`latent_dist.mode()` on both AR and DiT sides. The posterior mean
produces visibly degraded conditioning vs the fixed-seed sample.
2. 1785580ef changed AR `_resize_and_crop` default from `"resize"` to
`"center"` to match a non-existent DiT center-crop default (DiT
bridge actually defaults to `"resize"` too). This broke AR/DiT
preprocessing alignment instead of fixing it.
Revert both:
- AR `_resize_and_crop` default back to `"resize"` and its docstring.
- AR/DiT `_vae_encode`/`vae_encode` back to fixed-generator sample.
- Remove the now-dead `.mode()` method on
`DiagonalGaussianDistribution`.
- Remove the AR/DiT byte-identical preprocessing test added by
1785580ef -- it asserted the wrong invariant (AR `"center"` == DiT
`_resize_and_crop_center`), which no longer holds and was never the
right alignment target.
Keeps the other 4da2ff687 fixes intact: system_prompt body forwarding,
ratio extraction simplification, stale `it2i_recaption` compound name
cleanup, duplicate `prompt_token_ids` assignment removal.
Signed-off-by: Claude Code
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../test_hunyuan_image3_it2i_ar_format.py | 38 -------------------
.../models/hunyuan_image3/autoencoder.py | 3 --
.../hunyuan_image3/pipeline_hunyuan_image3.py | 9 +++--
.../models/hunyuan_image3/hunyuan_image3.py | 28 +++++++-------
4 files changed, 18 insertions(+), 60 deletions(-)
diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
index 7550caa50f7..7e7b7de91b2 100644
--- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
@@ -72,44 +72,6 @@ def _snapshot_dir(model_id: str) -> pathlib.Path:
# tests/e2e/accuracy/test_hunyuan_image3_it2i_ar_output.py.
-def test_ar_and_dit_condition_image_preprocessing_match_without_hf_cache():
- """AR and DiT must preprocess the same IT2I condition image into the
- same VAE pixels.
-
- This catches drift between the AR-side multimodal processor and the
- diffusion-side bridge without requiring model weights or tokenizer files.
- In particular, portrait input expanded to a landscape output is sensitive
- to accidentally using ``crop_type="resize"`` on one side and center crop
- on the other; the two paths then condition on visibly different fabric
- regions and leave seam-like artifacts around the edited object.
- """
- import numpy as np
- from PIL import Image
-
- from vllm_omni.diffusion.models.hunyuan_image3.pipeline_hunyuan_image3 import (
- _resize_and_crop_center,
- )
- from vllm_omni.model_executor.models.hunyuan_image3.hunyuan_image3 import (
- HunyuanImage3Processor,
- )
-
- rng = np.random.default_rng(seed=3444)
- src_size_pairs = [(735, 1104), (640, 1024), (1280, 720), (1024, 1024)]
- target_size_pairs = [(1024, 1024), (1024, 768), (768, 1024), (1280, 720)]
-
- for src_w, src_h in src_size_pairs:
- src_arr = rng.integers(0, 256, size=(src_h, src_w, 3), dtype=np.uint8)
- src = Image.fromarray(src_arr, mode="RGB")
- for tw, th in target_size_pairs:
- ar_out = HunyuanImage3Processor._resize_and_crop(None, src, (tw, th))
- dit_out = _resize_and_crop_center(src, tw, th)
-
- assert ar_out.size == dit_out.size == (tw, th)
- assert np.array_equal(np.asarray(ar_out), np.asarray(dit_out)), (
- f"AR and DiT condition preprocessing diverged for src={(src_w, src_h)} target={(tw, th)}"
- )
-
-
_OFFICIAL_PKG = "_hunyuan_image_3_official_snapshot"
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py
index ddd7d5c6df7..efba2f27435 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py
@@ -46,9 +46,6 @@ def sample(self, generator: torch.Generator | None = None) -> torch.FloatTensor:
x = self.mean + self.std * sample
return x
- def mode(self) -> torch.FloatTensor:
- return self.mean
-
@dataclass
class DecoderOutput(BaseOutput):
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 35390e7312d..14aa0ea903d 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -634,10 +634,11 @@ def vae_encode(self, image, cfg_factor=1):
if isinstance(vae_encode_result, torch.Tensor):
latents = vae_encode_result
else:
- # Cond image is clean conditioning (t=0 below) -- use the
- # posterior mean so encoding is deterministic by construction.
- # See AR-side comment in model_executor/.../hunyuan_image3.py.
- latents = vae_encode_result.latent_dist.mode()
+ # Match HunyuanImage-3's cond encode path: sample the
+ # posterior, but use a fixed generator so repeated online
+ # requests are deterministic.
+ _cond_vae_gen = torch.Generator(device=image.device).manual_seed(0)
+ latents = vae_encode_result.latent_dist.sample(_cond_vae_gen)
if hasattr(config, "shift_factor") and config.shift_factor:
latents.sub_(config.shift_factor)
if hasattr(config, "scaling_factor") and config.scaling_factor:
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 40a38c7b5ac..cfd5c6764ad 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -907,8 +907,8 @@ def process_image(self, image_input: ImageInput):
current_info["vit_spatial_shapes"] = _ss.squeeze(0)
# VAE: per-image bucket via `reso_group.get_target_size`; mirrors
- # HF's `resize_and_crop` default (crop_type="center", the official
- # generate_image default when infer_align_image_size=False).
+ # HF's `resize_and_crop` (crop_type="center", the official
+ # generate_image default with infer_align_image_size=False).
# Keep fp32 — the VAE encoder casts to model dtype at its
# boundary (see `_vae_encode`).
image_width, image_height = self.reso_group.get_target_size(image.width, image.height)
@@ -957,13 +957,13 @@ def _resize_and_crop(
self,
image: Image.Image,
target_size: tuple[int, int],
- crop_type: str = "center",
+ crop_type: str = "resize",
) -> Image.Image:
- # Default mode mirrors official `generate_image` with
- # infer_align_image_size=False: preserve aspect ratio and center-crop
- # to the nearest VAE bucket. Keeping this default aligned with the
- # DiT-side condition-image helper avoids AR and DiT seeing different
- # conditioning pixels for the same IT2I request.
+ # Default mode mirrors the official `infer_align_image_size=True`
+ # path (image_processor.py:355 → crop_type="resize") used by the
+ # IT2I demo: stretch the cond image to the bucket dims so its
+ # `` tag and ViT/VAE features stay aligned with the
+ # bucket, instead of dropping content via center crop.
tw, th = target_size
if crop_type == "resize":
return image.resize((tw, th), resample=Image.Resampling.LANCZOS)
@@ -1777,13 +1777,11 @@ def _vae_encode(
images = images.to(dtype=self.vae.dtype)
vae_encode_result = self.vae.encode(images)
- # Cond image is clean (t=0) conditioning -- take the posterior mean
- # so encoding is deterministic by construction. `.sample()` without a
- # generator consumes torch's global RNG and silently drifts between
- # requests on a long-running server (online) while looking stable for
- # fresh-process callers (offline). `.mode()` matches the official
- # HunyuanImage-3 cond encode path.
- latents = vae_encode_result.latent_dist.mode()
+ # Match HunyuanImage-3's cond encode path: sample the posterior, but
+ # use a fixed generator so online requests do not consume the global
+ # RNG and drift across a long-running server.
+ _cond_vae_gen = torch.Generator(device=images.device).manual_seed(0)
+ latents = vae_encode_result.latent_dist.sample(_cond_vae_gen)
# Apply shift and scaling factors if present
if hasattr(config, "shift_factor") and config.shift_factor:
From 9bc67cc589fbb5afc7edcd6b3d60c27bbbcd2656 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Tue, 12 May 2026 16:12:14 +0800
Subject: [PATCH 30/43] fix(hunyuan_image3): stop AR on <|endoftext|> for
image-output tasks
`resolve_stop_token_ids` returned `<answer>` (128025) for all (task,
bot_task) combos. For image-output tasks (`it2i` / `t2i`) this stops
the AR halfway through the size/ratio tail that
`_stage_transitions[]` forces:
<|endoftext|>
^^^^^^^^^^^^ stopped here, ratio never emitted
Downstream `ar2diffusion::_extract_ratio_index` then scans
`cumulative_token_ids` for any ratio token, finds none, and falls
back to the prompt-carried `height`/`width`. In `end2end.py` for
multi-image IT2I that means the first reference image's shape -- e.g.
a 512x512 logo + a 1179x685 fabric reference collapses the DiT bucket
to 1024x1024 square even though the AR CoT planned image_2's
landscape aspect. Width and texture both regress simultaneously
because DiT has to squeeze the landscape-planned content into a
square bucket.
Online didn't trip this because the deploy yaml explicitly sets
`stop_token_ids: [127957]` (= `<|endoftext|>`) and end2end.py is not
in that codepath. `end2end.py` overrides yaml with
`resolve_stop_token_ids(...)`, so offline always hit the broken stop
regardless of yaml.
Fix: return `[<|endoftext|>]` for `it2i` / `t2i` so AR runs through
the forced tail and `` reaches `ar2diffusion`. Keep
`[]` for `i2t` / `t2t` -- those are comprehension stages
where the response body sits inside ``, so the answer-open
*is* the natural terminator.
Rename `test_resolve_stop_token_ids_uses_answer_for_generation_tasks` to
`test_resolve_stop_token_ids_image_tasks_stop_on_eos_not_answer` and update
it to assert the new (correct) split.
Signed-off-by: Claude Code
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../hunyuan_image3/test_prompt_utils.py | 24 +++++++++++++++----
.../models/hunyuan_image3/prompt_utils.py | 18 ++++++++++++++
2 files changed, 38 insertions(+), 4 deletions(-)
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 371646556f2..0579caaaac8 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -96,12 +96,28 @@ def test_default_prompt_still_uses_it2i_think_mode():
assert result.token_ids[-1] == FakeTokenizer.SPECIAL[""]
-def test_resolve_stop_token_ids_uses_answer_for_generation_tasks():
+def test_resolve_stop_token_ids_image_tasks_stop_on_eos_not_answer():
+ """Image-output tasks must stop on <|endoftext|>, not .
+
+ Stopping on chops off the
+ tail forced by `_stage_transitions`, so `_extract_ratio_index` in
+ `ar2diffusion` finds nothing and the DiT output bucket collapses to
+ the first reference image's shape (e.g. 1024x1024 square when AR's
+ CoT planned a 1280x720 landscape).
+ """
tok = FakeTokenizer()
+
+ eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
- assert resolve_stop_token_ids(task="t2i", bot_task="think", tokenizer=tok) == [answer_id]
- assert resolve_stop_token_ids(task="t2i", bot_task="recaption", tokenizer=tok) == [answer_id]
- assert resolve_stop_token_ids(task="it2i", bot_task="think", tokenizer=tok) == [answer_id]
+
+ # Image-output: t2i / it2i must let AR emit the size/ratio tail.
+ for bot in ("think", "recaption", "think_recaption", "vanilla"):
+ assert resolve_stop_token_ids(task="t2i", bot_task=bot, tokenizer=tok) == [eos_id]
+ assert resolve_stop_token_ids(task="it2i", bot_task=bot, tokenizer=tok) == [eos_id]
+
+ # Text-output: i2t / t2t comprehension stops on (response sits inside).
+ assert resolve_stop_token_ids(task="i2t", bot_task=None, tokenizer=tok) == [answer_id]
+ assert resolve_stop_token_ids(task="t2t", bot_task=None, tokenizer=tok) == [answer_id]
@pytest.mark.parametrize(
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index f78b19a5746..196c86dfa5d 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -147,11 +147,29 @@ def resolve_stop_token_ids(
bot_task: str | None | _DefaultBotTask = _DEFAULT_BOT_TASK,
tokenizer: Any | None = None,
) -> list[int]:
+ """AR stop-token ids for a given (task, bot_task) generation request.
+
+ Image-output tasks (``it2i`` / ``t2i``) must stop on ``<|endoftext|>``:
+ after ```` the AR's ``_stage_transitions`` force-emits
+ ```` and then samples ```` under
+ ``_apply_ratio_restriction`` followed by ``<|endoftext|>``. Stopping
+ early on ```` chops off the size/ratio tail, leaves
+ ``_extract_ratio_index`` empty in ``ar2diffusion``, and silently
+ collapses the DiT output bucket to the first reference image's shape
+ (square logo -> 1024x1024 even when AR's CoT plans a landscape).
+
+ Text-output tasks (``i2t`` / ``t2t``) stop on ```` -- the AR is
+ the final stage, and the comprehension response sits inside the
+ ```` body so the answer-open is the natural cot/recaption
+ terminator.
+ """
task, bot_task = _normalize_task_and_bot_task(task, bot_task)
if task not in _TASKS:
raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
if bot_task not in _BOT_TASK_PRESETS:
raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}")
+ if task in ("it2i", "t2i"):
+ return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]]
return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]]
From dec1c436b70cc2350965813e2e6ab6a3be5f39d3 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Tue, 12 May 2026 22:08:31 +0800
Subject: [PATCH 31/43] [Bugfix][HunyuanImage3] cap AR KV snapshot at
, defer mid-decode kv_ready forward
Two coupled changes so HunyuanImage3 IT2I no longer ships KV for the
tail that DiT discards anyway:
1. deploy/hunyuan_image3.yaml: add ``kv_transfer_criteria`` so AR's
snapshot fires at the end-of-CoT special token (token id 128019).
``stop_after_transfer: false`` keeps the AR running past the snapshot so
it can still emit the ratio token for
``ar2diffusion._extract_ratio_index``. With this
yaml + the orchestrator change below, the colleague-confirmed
invariant S - N == 1 (where S is the shipped KV length and N is the
DiT-side ``positive_reuse_len``) is restored. Without the yaml the AR
ships KV through the end of the generated tail and S - N grows to 6.
2. engine/orchestrator.py: ``_handle_kv_ready_raw_outputs`` previously
forwarded any kv_ready EngineCoreOutput straight to the next stage.
With ``stop_after_transfer: false`` the kv_ready signal fires
mid-decode (snapshot at , AR still emitting tail), so the
raw EngineCoreOutput has no ``.outputs[0]`` and bridges that read
the AR's full text (HunyuanImage3 ``ar2diffusion``) hit
``AttributeError``. Skip the forward when no finished output for the
same req_id is present in the same raw_outputs batch; the AR's
eventual natural-finish RequestOutput will trigger the forward
through ``_route_output``. Bagel's existing flow (kv_ready and the
deferred-stop finish output co-emit in the same batch) is preserved.
Signed-off-by: zuiho
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
vllm_omni/deploy/hunyuan_image3.yaml | 20 ++++++++++++++++++++
vllm_omni/engine/orchestrator.py | 18 ++++++++++++++++++
2 files changed, 38 insertions(+)
diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index 634165cd33a..8f7c57fdd64 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -37,6 +37,26 @@ stages:
rope_type: default
omni_kv_config:
need_send_cache: true
+ # Cap AR KV snapshot at so the shipped KV exactly
+ # matches the prefix the DiT side reuses (positive_reuse_len =
+ # 0-based index of , slice ``k[:positive_reuse_len]``
+ # excludes itself). Mirrors the colleague-confirmed
+ # invariant S - N == 1. Without this the AR ships KV all the way
+ # through , which DiT
+ # silently discards (S - N == 6) and which keeps the AR pipeline
+ # busy emitting tail tokens that DiT will never use.
+ #
+ # ``stop_after_transfer: false`` keeps the AR running past the
+ # snapshot so it still emits , which ``ar2diffusion``
+ # extracts to derive image height/width. The mid-decode kv_ready
+ # signal that this combination produces is handled in the
+ # orchestrator: forwarding to DiT is deferred until the AR's
+ # natural finish output arrives (see
+ # ``_handle_kv_ready_raw_outputs``).
+ kv_transfer_criteria:
+ type: special_token
+ token_id: 128019 #
+ stop_after_transfer: false
output_connectors:
to_stage_1: shared_memory_connector
default_sampling_params:
diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py
index 2d2ac47cbb3..37a9eb291c8 100644
--- a/vllm_omni/engine/orchestrator.py
+++ b/vllm_omni/engine/orchestrator.py
@@ -695,6 +695,21 @@ async def _handle_kv_ready_raw_outputs(
if self.async_chunk:
return
+ # When kv_ready fires mid-decode (e.g. HunyuanImage3 with
+ # kv_transfer_criteria=special_token + stop_after_transfer=false,
+ # snapshot triggers at but AR keeps generating tail
+ # tokens for ratio extraction), the kv_ready EngineCoreOutput is
+ # NOT a finished RequestOutput, so bridges that read
+ # ``ar_output.outputs[0]`` (HunyuanImage3 ar2diffusion) crash. Only
+ # forward kv_ready when the same raw_outputs batch also contains a
+ # finished output for that req_id; otherwise wait for AR's natural
+ # completion to trigger the forward through ``_route_output``.
+ finished_in_batch = {
+ o.request_id
+ for o in raw_outputs.outputs
+ if getattr(o, "finish_reason", None) is not None
+ }
+
for raw_output in raw_outputs.outputs:
kv_params = getattr(raw_output, "kv_transfer_params", None)
if not (isinstance(kv_params, dict) and kv_params.get("kv_ready")):
@@ -712,6 +727,9 @@ async def _handle_kv_ready_raw_outputs(
if (stage_id + 1) in req_state.stage_submit_ts:
continue
+ if req_id not in finished_in_batch:
+ continue
+
if self._cfg_tracker.has_companions(req_id) and not self._cfg_tracker.all_companions_done(req_id):
self._cfg_tracker.defer_parent(req_id, raw_output, stage_id)
else:
From b84bc2ffa594c796d40a2af0631d8fb0d0c23628 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 11:43:03 +0800
Subject: [PATCH 32/43] fix(hunyuan_image3): cap IT2I input images at
MAX_IMAGES_PER_REQUEST in entry layer
Per PR #3444 review (Gaohan123): give a friendly, input-named error at the
entry boundary instead of relying on the deeper
`prompt_utils._validate_num_images` to surface as a `num_images must be in
[1, 3]` message. Reuse `MAX_IMAGES_PER_REQUEST` so the cap stays defined in
one place.
- offline `end2end.py`: validate `--image-path` count before opening PIL
- online `serving_chat._build_multistage_generation_inputs`: validate
`reference_images` count before building engine prompt data
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
examples/offline_inference/hunyuan_image3/end2end.py | 6 ++++++
vllm_omni/entrypoints/openai/serving_chat.py | 7 +++++++
2 files changed, 13 insertions(+)
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 908109d65a3..36b3b1199a5 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -8,6 +8,7 @@
from pathlib import Path
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+ MAX_IMAGES_PER_REQUEST,
build_prompt_tokens,
resolve_stop_token_ids,
resolve_sys_type,
@@ -177,6 +178,11 @@ def main():
from PIL import Image
image_paths = [p.strip() for p in args.image_path.split(",") if p.strip()]
+ if len(image_paths) > MAX_IMAGES_PER_REQUEST:
+ raise ValueError(
+ f"--image-path accepts at most {MAX_IMAGES_PER_REQUEST} images for "
+ f"HunyuanImage-3.0 IT2I, got {len(image_paths)}: {args.image_path}"
+ )
for image_path in image_paths:
if not os.path.exists(image_path):
raise ValueError(f"Image path does not exist: {image_path}")
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 7424a9e0d34..26ca0d6170e 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2253,6 +2253,7 @@ def _build_multistage_generation_inputs(
# downstream uses the canonical split. Source the task enum from
# prompt_utils so this layer stays in sync with the model side.
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+ MAX_IMAGES_PER_REQUEST as _hunyuan3_max_images,
available_tasks as _hunyuan3_available_tasks,
)
@@ -2273,6 +2274,12 @@ def _build_multistage_generation_inputs(
bot_task = None
legacy_task_from_bot_task = True
+ if reference_images and len(reference_images) > _hunyuan3_max_images:
+ raise ValueError(
+ f"HunyuanImage-3.0 IT2I accepts at most {_hunyuan3_max_images} input "
+ f"images per request, got {len(reference_images)}"
+ )
+
engine_prompt_data: dict[str, Any] | None = None
modalities = ["image"]
if reference_images:
From 029f567d08e7b465069b6f2a5b1af63ee87b51bd Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 12:06:05 +0800
Subject: [PATCH 33/43] chore: apply pre-commit ruff format / isort fixups
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
vllm_omni/engine/orchestrator.py | 6 +-----
vllm_omni/entrypoints/openai/serving_chat.py | 2 ++
2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py
index 37a9eb291c8..a764c3b5247 100644
--- a/vllm_omni/engine/orchestrator.py
+++ b/vllm_omni/engine/orchestrator.py
@@ -704,11 +704,7 @@ async def _handle_kv_ready_raw_outputs(
# forward kv_ready when the same raw_outputs batch also contains a
# finished output for that req_id; otherwise wait for AR's natural
# completion to trigger the forward through ``_route_output``.
- finished_in_batch = {
- o.request_id
- for o in raw_outputs.outputs
- if getattr(o, "finish_reason", None) is not None
- }
+ finished_in_batch = {o.request_id for o in raw_outputs.outputs if getattr(o, "finish_reason", None) is not None}
for raw_output in raw_outputs.outputs:
kv_params = getattr(raw_output, "kv_transfer_params", None)
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 26ca0d6170e..dfd6c15168a 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2254,6 +2254,8 @@ def _build_multistage_generation_inputs(
# prompt_utils so this layer stays in sync with the model side.
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
MAX_IMAGES_PER_REQUEST as _hunyuan3_max_images,
+ )
+ from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
available_tasks as _hunyuan3_available_tasks,
)
From d8b9263f042cc09f0cb6d220f9ebef833f163dcf Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 12:09:35 +0800
Subject: [PATCH 34/43] chore: rename MAX_IMAGES_PER_REQUEST alias to uppercase
(ruff N811)
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
vllm_omni/entrypoints/openai/serving_chat.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index dfd6c15168a..35dd4524fc0 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2253,7 +2253,7 @@ def _build_multistage_generation_inputs(
# downstream uses the canonical split. Source the task enum from
# prompt_utils so this layer stays in sync with the model side.
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
- MAX_IMAGES_PER_REQUEST as _hunyuan3_max_images,
+ MAX_IMAGES_PER_REQUEST as _HUNYUAN3_MAX_IMAGES,
)
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
available_tasks as _hunyuan3_available_tasks,
@@ -2276,9 +2276,9 @@ def _build_multistage_generation_inputs(
bot_task = None
legacy_task_from_bot_task = True
- if reference_images and len(reference_images) > _hunyuan3_max_images:
+ if reference_images and len(reference_images) > _HUNYUAN3_MAX_IMAGES:
raise ValueError(
- f"HunyuanImage-3.0 IT2I accepts at most {_hunyuan3_max_images} input "
+ f"HunyuanImage-3.0 IT2I accepts at most {_HUNYUAN3_MAX_IMAGES} input "
f"images per request, got {len(reference_images)}"
)
From 511b76c0865aaac13c8dcd9abe0f0d8cfd49e8c7 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 15:05:59 +0800
Subject: [PATCH 35/43] fix(hunyuan_image3): align AR stop / KV cap / edits
Form with upstream (review)
Addresses Bounty-hunter's PR review on #3444:
1. resolve_stop_token_ids: image-output tasks now stop on the full
ratio token range (ids 128044-128076 + 130103-130106),
mirroring upstream modeling_hunyuan_image_3.py:3289-3303
(`final_stop_tokens = list(range(start_ratio, end_ratio + 1))`).
Replaces the earlier `<|endoftext|>` stop which let AR waste decode
steps past the ratio. test_prompt_utils.py renamed/updated to pin
the new contract.
2. deploy/hunyuan_image3.yaml: drop the kv_transfer_criteria block.
With the ratio-range stop in place AR finishes naturally at the
ratio token, so KV is capped automatically -- no need for
special_token criteria + stop_after_transfer=false.
3. orchestrator._handle_kv_ready_raw_outputs: drop the finished_in_batch
defer. Mid-decode kv_ready only fired when stop_after_transfer=false
was forcing AR past its natural stop; with #2 removed there is no
mid-decode kv_ready to defer. The ratio strip for DiT already lives
in stage_input_processors/hunyuan_image3._truncate_at_cot_end.
4. serving_chat._build_multistage_generation_inputs: call
resolve_stop_token_ids(task, bot_task) and inject into the AR-stage
sampling params. Online now matches offline end2end.py rather than
relying on yaml-side stop_token_ids.
5. api_server.edit_images: drop the redundant `task` Form field.
/v1/images/edits is always IT2I; bot_task / sys_type / system_prompt
remain. Legacy callers passing a task enum via bot_task (e.g.
bot_task=it2i) still work via chat-handler normalization.
6. pipeline_hunyuan_image3 + stage_input_processors/hunyuan_image3:
stop reading / writing the `ar_token_ids` extra. The tokenizer-level
`batch_cot_token_ids` parameter is retained for a follow-up PR that
will unify system/user/cot tokenization. See PR description for the
optimization leftover note.
Signed-off-by: Claude Code
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../hunyuan_image3/test_prompt_utils.py | 31 ++++++++------
.../test_hunyuan_image3.py | 2 +-
vllm_omni/deploy/hunyuan_image3.yaml | 20 ----------
.../hunyuan_image3/pipeline_hunyuan_image3.py | 12 +-----
.../models/hunyuan_image3/prompt_utils.py | 35 ++++++++++------
vllm_omni/engine/orchestrator.py | 14 -------
vllm_omni/entrypoints/openai/api_server.py | 40 +++++++------------
vllm_omni/entrypoints/openai/serving_chat.py | 18 +++++++++
.../stage_input_processors/hunyuan_image3.py | 4 --
9 files changed, 76 insertions(+), 100 deletions(-)
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 0579caaaac8..7c3256eee72 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -96,26 +96,31 @@ def test_default_prompt_still_uses_it2i_think_mode():
assert result.token_ids[-1] == FakeTokenizer.SPECIAL[""]
-def test_resolve_stop_token_ids_image_tasks_stop_on_eos_not_answer():
- """Image-output tasks must stop on <|endoftext|>, not .
-
- Stopping on chops off the
- tail forced by `_stage_transitions`, so `_extract_ratio_index` in
- `ar2diffusion` finds nothing and the DiT output bucket collapses to
- the first reference image's shape (e.g. 1024x1024 square when AR's
- CoT planned a 1280x720 landscape).
+def test_resolve_stop_token_ids_image_tasks_stop_on_ratio_range():
+ """Image-output tasks stop on any ```` token.
+
+ Mirrors upstream ``modeling_hunyuan_image_3.py::generate_image``
+ (line 3289-3303): when ``need_ratio`` is true,
+ ``final_stop_tokens = list(range(start_ratio, end_ratio + 1)) +
+ ratio_token_other_slices``. AR stops AT the ratio token sampled
+ after ````; the bridge then strips the trailing ratio
+ token before passing the cot to DiT.
"""
tok = FakeTokenizer()
- eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
- answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ start = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ end = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ other_start = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ other_end = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ expected = list(range(start, end + 1)) + list(range(other_start, other_end + 1))
- # Image-output: t2i / it2i must let AR emit the size/ratio tail.
+ # Image-output: t2i / it2i stop on the full ratio token range.
for bot in ("think", "recaption", "think_recaption", "vanilla"):
- assert resolve_stop_token_ids(task="t2i", bot_task=bot, tokenizer=tok) == [eos_id]
- assert resolve_stop_token_ids(task="it2i", bot_task=bot, tokenizer=tok) == [eos_id]
+ assert resolve_stop_token_ids(task="t2i", bot_task=bot, tokenizer=tok) == expected
+ assert resolve_stop_token_ids(task="it2i", bot_task=bot, tokenizer=tok) == expected
# Text-output: i2t / t2t comprehension stops on (response sits inside).
+ answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
assert resolve_stop_token_ids(task="i2t", bot_task=None, tokenizer=tok) == [answer_id]
assert resolve_stop_token_ids(task="t2t", bot_task=None, tokenizer=tok) == [answer_id]
diff --git a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
index faaa9785452..1901210de09 100644
--- a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
+++ b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
@@ -81,7 +81,7 @@ def _block_transformers_import(name, *args, **kwargs):
assert len(result) == 1
assert (result[0]["height"], result[0]["width"]) == (512, 2048)
assert result[0]["extra"]["ar_generated_text"] == "decoded without special tokens"
- assert result[0]["extra"]["ar_token_ids"].tolist() == [100, 101, end_recaption]
+ assert "ar_token_ids" not in result[0]["extra"]
def test_ar2diffusion_forwards_custom_system_prompt_body():
diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index 8f7c57fdd64..634165cd33a 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -37,26 +37,6 @@ stages:
rope_type: default
omni_kv_config:
need_send_cache: true
- # Cap AR KV snapshot at so the shipped KV exactly
- # matches the prefix the DiT side reuses (positive_reuse_len =
- # 0-based index of , slice ``k[:positive_reuse_len]``
- # excludes itself). Mirrors the colleague-confirmed
- # invariant S - N == 1. Without this the AR ships KV all the way
- # through , which DiT
- # silently discards (S - N == 6) and which keeps the AR pipeline
- # busy emitting tail tokens that DiT will never use.
- #
- # ``stop_after_transfer: false`` keeps the AR running past the
- # snapshot so it still emits , which ``ar2diffusion``
- # extracts to derive image height/width. The mid-decode kv_ready
- # signal that this combination produces is handled in the
- # orchestrator: forwarding to DiT is deferred until the AR's
- # natural finish output arrives (see
- # ``_handle_kv_ready_raw_outputs``).
- kv_transfer_criteria:
- type: special_token
- token_id: 128019 #
- stop_after_transfer: false
output_connectors:
to_stage_1: shared_memory_connector
default_sampling_params:
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 14aa0ea903d..63c367a1006 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -1386,23 +1386,13 @@ def forward(
# and ``get_cot_sections()`` can parse the think/recaption structure
# directly.
cot_text_list = []
- cot_token_ids_list = []
for p in req.prompts:
extra = p.get("extra", {}) if isinstance(p, dict) else {}
cot_text_list.append(extra.get("ar_generated_text") or None)
- cot_token_ids_list.append(extra.get("ar_token_ids"))
cot_text = (
[self._normalize_cot_text(t) for t in cot_text_list] if any(t is not None for t in cot_text_list) else None
)
- # Prefer AR-sampled token IDs over the decoded cot text so DiTs prompt
- # tokenization matches ARs actual token sequence byte-for-byte. Required
- # when KV reuse is enabled: positive_reuse_len computed from DiT-side
- # tokenization must equal the AR-side KV cache length, otherwise the
- # silent slice in inject_ar_kv_into_layers leaves _cache_prompt_kvs
- # `q_len + ar_kv_len == seq_len` assert off by N (BPE re-merge drift on
- # multi-byte/punctuation boundaries; see get_cot_sections_from_token_ids
- # in hunyuan_image3_tokenizer.py).
- cot_token_ids = cot_token_ids_list if any(t is not None for t in cot_token_ids_list) else None
+ cot_token_ids = None
batch_cond_image_info: list[list[JointImageInfo]] | None = None
if any(not isinstance(p, str) for p in req.prompts):
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 196c86dfa5d..b178b021fd6 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -149,17 +149,19 @@ def resolve_stop_token_ids(
) -> list[int]:
"""AR stop-token ids for a given (task, bot_task) generation request.
- Image-output tasks (``it2i`` / ``t2i``) must stop on ``<|endoftext|>``:
- after ```` the AR's ``_stage_transitions`` force-emits
- ```` and then samples ```` under
- ``_apply_ratio_restriction`` followed by ``<|endoftext|>``. Stopping
- early on ```` chops off the size/ratio tail, leaves
- ``_extract_ratio_index`` empty in ``ar2diffusion``, and silently
- collapses the DiT output bucket to the first reference image's shape
- (square logo -> 1024x1024 even when AR's CoT plans a landscape).
-
- Text-output tasks (``i2t`` / ``t2t``) stop on ```` -- the AR is
- the final stage, and the comprehension response sits inside the
+ Image-output tasks (``it2i`` / ``t2i``) stop on any ````
+ token. Upstream ``modeling_hunyuan_image_3.py::generate_image``
+ (line 3289-3303) sets ``final_stop_tokens`` to the full ratio token
+ range when ``need_ratio`` is true, then strips the trailing ratio
+ token before passing the cot to the image stage. AR's natural
+ trajectory under ``_stage_transitions`` is
+ ````; stopping
+ AT the ratio token means KV ends exactly at the prefix DiT reuses,
+ and ``ar2diffusion`` can read the ratio off the last sampled token
+ without AR wasting decode steps on ``<|endoftext|>``.
+
+ Text-output tasks (``i2t`` / ``t2t``) stop on ```` -- the AR
+ is the final stage, and the comprehension response sits inside the
```` body so the answer-open is the natural cot/recaption
terminator.
"""
@@ -169,7 +171,16 @@ def resolve_stop_token_ids(
if bot_task not in _BOT_TASK_PRESETS:
raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}")
if task in ("it2i", "t2i"):
- return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]]
+ # Main ratio range: .. .
+ start = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ end = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ stops = list(range(start, end + 1))
+ # Other slices (upstream tokenizer ``ratio_token_other_slices``):
+ # .. .
+ other_start = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ other_end = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
+ stops.extend(range(other_start, other_end + 1))
+ return stops
return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]]
diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py
index a764c3b5247..2d2ac47cbb3 100644
--- a/vllm_omni/engine/orchestrator.py
+++ b/vllm_omni/engine/orchestrator.py
@@ -695,17 +695,6 @@ async def _handle_kv_ready_raw_outputs(
if self.async_chunk:
return
- # When kv_ready fires mid-decode (e.g. HunyuanImage3 with
- # kv_transfer_criteria=special_token + stop_after_transfer=false,
- # snapshot triggers at but AR keeps generating tail
- # tokens for ratio extraction), the kv_ready EngineCoreOutput is
- # NOT a finished RequestOutput, so bridges that read
- # ``ar_output.outputs[0]`` (HunyuanImage3 ar2diffusion) crash. Only
- # forward kv_ready when the same raw_outputs batch also contains a
- # finished output for that req_id; otherwise wait for AR's natural
- # completion to trigger the forward through ``_route_output``.
- finished_in_batch = {o.request_id for o in raw_outputs.outputs if getattr(o, "finish_reason", None) is not None}
-
for raw_output in raw_outputs.outputs:
kv_params = getattr(raw_output, "kv_transfer_params", None)
if not (isinstance(kv_params, dict) and kv_params.get("kv_ready")):
@@ -723,9 +712,6 @@ async def _handle_kv_ready_raw_outputs(
if (stage_id + 1) in req_state.stage_submit_ts:
continue
- if req_id not in finished_in_batch:
- continue
-
if self._cfg_tracker.has_companions(req_id) and not self._cfg_tracker.all_companions_done(req_id):
self._cfg_tracker.defer_parent(req_id, raw_output, stage_id)
else:
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 7107b544adc..c54295cf104 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1701,12 +1701,10 @@ async def edit_images(
layers: int | None = Form(None),
resolution: int | None = Form(None), # See SUPPORTED_LAYERED_RESOLUTIONS
bot_task: str | None = Form(None),
- # P1: task / sys_type / system_prompt split out from the legacy bot_task
- # field so callers can express the full HunyuanImage-3.0 prompt template
- # surface (task enum + bot_task semantic + sys_type override + custom
- # system prompt body). Legacy callers that pass a task-enum value via
- # bot_task still work (see normalization below).
- task: str | None = Form(None),
+ # ``/v1/images/edits`` is always image-to-image (IT2I); the ``task`` axis
+ # is fixed and pinned downstream. ``bot_task`` (think / recaption /
+ # think_recaption / vanilla) + ``sys_type`` / ``system_prompt`` are the
+ # only HunyuanImage-3.0 knobs callers need to express here.
sys_type: str | None = Form(None),
system_prompt: str | None = Form(None),
) -> ImageGenerationResponse:
@@ -1760,10 +1758,10 @@ async def edit_images(
detail=detail,
)
# Convert uploads to RGB when the caller opts into the Hunyuan-aware
- # API surface. This includes the legacy bot_task= form:
- # keeping uploads as RGBA/P PIL objects makes online IT2I observe a
- # different visual input than the offline path.
- normalize_edit_images_rgb = task is not None or bot_task is not None or sys_type is not None
+ # API surface (bot_task / sys_type / system_prompt). Keeping uploads
+ # as RGBA/P PIL objects makes online IT2I observe a different visual
+ # input than the offline path.
+ normalize_edit_images_rgb = bot_task is not None or sys_type is not None
pil_images = await _load_input_images(input_images_list, normalize_rgb=normalize_edit_images_rgb)
prompt["multi_modal_data"] = {}
prompt["multi_modal_data"]["image"] = pil_images
@@ -1927,21 +1925,13 @@ async def edit_images(
lora_dict = _get_lora_from_json_str(lora)
_parse_lora_request(lora_dict)
extra_body["lora"] = lora_dict
- # P1: normalize legacy `bot_task=` form. Callers historically
- # passed the task enum (i2t / it2i / t2i / t2t) via the `bot_task`
- # Form field; promote it to `task` here so the chat_handler can
- # split task vs bot_task semantics cleanly. New callers pass both
- # `task` and `bot_task` explicitly; we keep them separate.
- _task = task
- _bot_task = bot_task
- _legacy_task_enum = {"t2t", "i2t", "it2i", "t2i"}
- if _task is None and _bot_task in _legacy_task_enum:
- _task = _bot_task
- _bot_task = None
- if _task is not None:
- extra_body["task"] = _task
- if _bot_task is not None:
- extra_body["bot_task"] = _bot_task
+ # ``/v1/images/edits`` is always IT2I; the chat handler's
+ # default (``task="it2i"`` when neither ``task`` nor
+ # ``bot_task`` resolves to a task enum) covers this implicitly.
+ # Legacy callers passing the task enum via ``bot_task`` (e.g.
+ # ``bot_task="it2i"``) are normalized inside the chat handler.
+ if bot_task is not None:
+ extra_body["bot_task"] = bot_task
if sys_type is not None:
extra_body["sys_type"] = sys_type
if system_prompt is not None:
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 35dd4524fc0..739e55a2ad1 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2257,6 +2257,7 @@ def _build_multistage_generation_inputs(
)
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
available_tasks as _hunyuan3_available_tasks,
+ resolve_stop_token_ids as _hunyuan3_resolve_stop_token_ids,
)
task = extra_body.get("task")
@@ -2408,6 +2409,23 @@ def _build_multistage_generation_inputs(
extra_args["target_h"] = int(height)
extra_args["target_w"] = int(width)
+ # Resolve AR stop tokens dynamically from (task, bot_task) so the
+ # online path matches offline ``end2end.py`` and so the AR stops
+ # at the natural ```` token for image-output tasks
+ # (mirrors upstream ``modeling_hunyuan_image_3.py:3289-3303``).
+ # Surviving yaml-side ``stop_token_ids`` would otherwise stop AR
+ # too early and leave ``ar2diffusion`` without a ratio token.
+ if (
+ comprehension_idx is not None
+ and idx == comprehension_idx
+ and hasattr(default_stage_params, "stop_token_ids")
+ ):
+ resolved_stops = _hunyuan3_resolve_stop_token_ids(
+ task=task if task is not None else "it2i",
+ bot_task=bot_task,
+ )
+ default_stage_params.stop_token_ids = resolved_stops
+
if stage_type == "diffusion":
self._set_if_supported(
default_stage_params,
diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index a06d030d0da..5b4d5f56529 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -15,7 +15,6 @@
from functools import lru_cache
from typing import Any
-import torch
from vllm.inputs import TextPrompt
from vllm.logger import init_logger
@@ -278,14 +277,11 @@ def ar2diffusion(
f"AR ratio_idx={ratio_idx}" if ar_predicted else "from prompt (no AR ratio token)",
)
- token_tensor = torch.tensor(cot_token_ids_for_dit, dtype=torch.long)
-
diffusion_input: dict[str, Any] = {
"prompt": text_prompt,
"height": height,
"width": width,
"extra": {
- "ar_token_ids": token_tensor,
"ar_generated_text": cot_text_for_dit,
},
}
From 8d90c17bd4fe82bc7e2c9990105c4920ce297e5e Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 15:22:00 +0800
Subject: [PATCH 36/43] chore: apply pre-commit isort split for
resolve_stop_token_ids import
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
vllm_omni/entrypoints/openai/serving_chat.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 739e55a2ad1..6e2a30f56f2 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2257,6 +2257,8 @@ def _build_multistage_generation_inputs(
)
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
available_tasks as _hunyuan3_available_tasks,
+ )
+ from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
resolve_stop_token_ids as _hunyuan3_resolve_stop_token_ids,
)
From b73b00f6fd3e7c509c5de537817ffcea916c048b Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 16:04:17 +0800
Subject: [PATCH 37/43] chore(hunyuan_image3): drop dead cot_token_ids plumbing
and online task input
- Online chat handler: drop `task` from extra_body; derive task from
reference_images presence. Legacy task-enum values passed via
`bot_task` (e.g. `bot_task=it2i`) still normalize through to the
right trigger.
- Remove the AR-token-id cot reuse path (`batch_cot_token_ids` in
apply_chat_template, `ctx_type == "token_ids"` branch in
process_successive_message, and `get_cot_sections_from_token_ids`);
it has no caller after the optimization was rolled back per reviewer
feedback.
- Simplify `_truncate_at_cot_end` to text-only; the token-id return was
no longer consumed.
- Trim over-explanatory comments across serving_chat / api_server /
pipeline / end2end.
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../hunyuan_image3/end2end.py | 23 +--
.../hunyuan_image3/test_kvreuse_alignment.py | 135 ------------------
...test_serving_chat_multistage_generation.py | 72 +---------
.../test_hunyuan_image3.py | 17 +--
.../hunyuan_image3_tokenizer.py | 123 ++--------------
.../hunyuan_image3/pipeline_hunyuan_image3.py | 19 +--
vllm_omni/entrypoints/openai/api_server.py | 24 +---
vllm_omni/entrypoints/openai/serving_chat.py | 97 ++++---------
.../stage_input_processors/hunyuan_image3.py | 49 ++-----
9 files changed, 66 insertions(+), 493 deletions(-)
delete mode 100644 tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 36b3b1199a5..16f7d8f06c1 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -19,29 +19,12 @@
_REPO_ROOT = Path(__file__).resolve().parents[3]
_DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml")
_DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml")
-# Modality → (task, default bot_task) mapping. `task` selects only whether
-# `
` placeholders are emitted; `bot_task` (None | think | recaption |
-# think_recaption | vanilla) selects the system prompt + trigger tag.
-#
-# Both verbose (`text2img`) and short (`t2i`) forms are accepted; the short
-# forms match the internal task names (see prompt_utils.available_tasks)
-# so users who think in those terms don't have to translate.
+
_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
"text2img": ("t2i", "think"),
- "t2i": ("t2i", "think"),
"img2img": ("it2i", "think"),
- "it2i": ("it2i", "think"),
"img2text": ("i2t", None),
- "i2t": ("i2t", None),
"text2text": ("t2t", None),
- "t2t": ("t2t", None),
-}
-
-_MODALITY_CANONICAL = {
- "t2i": "text2img",
- "it2i": "img2img",
- "i2t": "img2text",
- "t2t": "text2text",
}
_MODALITY_DEFAULT_DEPLOY_CONFIG = {
@@ -65,8 +48,7 @@ def parse_args():
parser.add_argument(
"--modality",
default="text2img",
- choices=["text2img", "t2i", "img2img", "it2i", "img2text", "i2t", "text2text", "t2t"],
- help="Verbose and internal short task names are both accepted.",
+ choices=list(_MODALITY_TASK_MAP),
)
parser.add_argument("--prompts", nargs="+", default=None, help="Input text prompts.")
parser.add_argument(
@@ -135,7 +117,6 @@ def main():
os.makedirs(args.output, exist_ok=True)
additional_config = parse_additional_config(args.additional_config)
- args.modality = _MODALITY_CANONICAL.get(args.modality, args.modality)
task, default_bot_task = _MODALITY_TASK_MAP[args.modality]
if args.bot_task is None:
bot_task: str | None = default_bot_task
diff --git a/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py b/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py
deleted file mode 100644
index 20faf5487dc..00000000000
--- a/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-"""Regression tests for AR-token-IDs preservation through DiT prompt building.
-
-Pins the KV-reuse alignment contract: when the AR-side stage input
-processor (`ar2diffusion`) forwards `ar_token_ids` to the diffusion
-stage, `apply_chat_template` must consume those IDs verbatim (no
-re-encode of the decoded cot text via `tokenizer.encode`) so that the
-DiT-side prompt tokenization matches AR's actually-sampled token
-sequence byte-for-byte.
-
-Why this matters: tokenize-detokenize-tokenize over the cot text is not
-lossless (BPE re-merges on multi-byte UTF-8 / punctuation boundaries),
-and the resulting length drift breaks AR KV position alignment --
-DiT's `positive_reuse_len` (computed from `tokenizer.encode(cot_text)`)
-ends up larger than the actual cached AR KV length, and
-`inject_ar_kv_into_layers` then silently truncates via Python slice,
-leaving `_cache_prompt_kv`'s `q_len + ar_kv_len == seq_len` assert off
-by N (hard 500 on KV-reuse-enabled requests; see
-`pipeline_hunyuan_image3.py:_cache_prompt_kv`).
-"""
-
-from __future__ import annotations
-
-import os
-
-import pytest
-
-pytestmark = [pytest.mark.core_model]
-
-
-def _hf_cached(model_id: str) -> bool:
- hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
- snap_dir = os.path.join(hf_home, "hub", f"models--{model_id.replace('/', '--')}", "snapshots")
- return os.path.isdir(snap_dir) and any(os.scandir(snap_dir))
-
-
-_HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct"
-
-
-@pytest.mark.skipif(
- not _hf_cached(_HUNYUAN_MODEL_ID),
- reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache",
-)
-def test_get_cot_sections_from_token_ids_round_trips_ar_ids():
- """`get_cot_sections_from_token_ids` must split AR-sampled IDs at the
- `` / `` token-id positions and emit sections whose
- concatenated tokens equal the input (no re-encode).
-
- Catches the failure mode where DiT re-encodes the decoded cot text
- and the BPE merges differ from AR's sampled tokens (length drift).
- """
- from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer import (
- TokenizerWrapper,
- )
-
- tkw = TokenizerWrapper(_HUNYUAN_MODEL_ID)
-
- think_id = tkw.tokenizer.convert_tokens_to_ids("")
- end_think_id = tkw.end_think_token_id
-
- # Fabricate an AR-style id sequence: arbitrary "thought" payload tokens
- # surrounded by / markers, plus some leading + trailing
- # tokens (e.g. / tail that gets truncated upstream).
- thought_payload = [1000, 1001, 1002, 1003, 1004]
- leading = [2000, 2001]
- trailing = [3000]
- ar_token_ids = leading + [think_id] + thought_payload + [end_think_id] + trailing
-
- sections = tkw.get_cot_sections_from_token_ids(
- ar_token_ids,
- uncond_kwargs={},
- drop_think=False,
- )
-
- # Sections concatenated must equal the input verbatim.
- out: list[int] = []
- for sec in sections:
- assert sec["type"] == "text", f"unexpected section type: {sec}"
- toks = sec.get("tokens")
- assert toks is not None, f"section missing 'tokens' field: {sec}"
- out.extend(toks)
- assert out == ar_token_ids, (
- f"split-by-token-id must be lossless; got {len(out)} ids vs {len(ar_token_ids)} input; "
- f"diff at first mismatch index = {next((i for i, (a, b) in enumerate(zip(out, ar_token_ids)) if a != b), None)}"
- )
-
-
-@pytest.mark.skipif(
- not _hf_cached(_HUNYUAN_MODEL_ID),
- reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache",
-)
-def test_apply_chat_template_batch_cot_token_ids_preserves_ar_ids():
- """When `batch_cot_token_ids` is passed, the assistant section in the
- final encoded token sequence must contain the AR-sampled token ids
- verbatim -- no `tokenizer.encode(cot_text)` round-trip.
-
- Pins the end-to-end contract that KV-reuse alignment relies on.
- """
- from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer import (
- TokenizerWrapper,
- )
-
- tkw = TokenizerWrapper(_HUNYUAN_MODEL_ID)
- think_id = tkw.tokenizer.convert_tokens_to_ids("")
- end_think_id = tkw.end_think_token_id
-
- # Construct a synthetic AR cot id sequence. Use mid-range vocab ids
- # that are very unlikely to collide with any chat-template specials.
- payload = [55001, 55002, 55003]
- ar_token_ids = [think_id] + payload + [end_think_id]
-
- out_with_ids = tkw.apply_chat_template(
- batch_prompt=["draw a robot"],
- batch_system_prompt=[None],
- batch_cot_token_ids=[ar_token_ids],
- mode="gen_text",
- sequence_template="instruct",
- )
- tokens_with_ids = out_with_ids["output"].tokens.tolist()[0] # batched output: take batch 0
-
- # The exact AR payload must appear as a contiguous subsequence in the
- # encoded output, sandwiched by the think markers we forwarded.
- def _find_subseq(haystack: list[int], needle: list[int]) -> int:
- n = len(needle)
- for i in range(len(haystack) - n + 1):
- if haystack[i : i + n] == needle:
- return i
- return -1
-
- full_cot = [think_id] + payload + [end_think_id]
- idx = _find_subseq(tokens_with_ids, full_cot)
- assert idx >= 0, (
- f"AR cot ids {full_cot} not found as contiguous subseq in encoded output; "
- f"means apply_chat_template did NOT respect batch_cot_token_ids and re-encoded cot text instead"
- )
diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index 92f0ac2dc98..dd7f668611e 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -219,13 +219,9 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(serving_chat):
- """Legacy callers passed a task-enum value (i2t/it2i/t2i/t2t) under
- `bot_task` in extra_body. After the P1 task/bot_task split, the helper
- must still treat that legacy form as `task=, bot_task=None`
- (i.e. defaults bot_task semantic to "think"), so the resulting prompt
- is identical to the pre-P1 output.
-
- Pins the back-compat contract.
+ """Legacy callers passed bot_task="it2i" as an opt-in marker. Task is now
+ inferred from reference_images; legacy bot_task must still trigger the
+ default think mode rather than getting silently dropped.
"""
from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
@@ -241,7 +237,6 @@ def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(servi
)
images = [Image.new("RGB", (32, 32), color="red"), Image.new("RGB", (32, 32), color="blue")]
- # Legacy form: only bot_task=.
legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
serving_chat,
engine=engine,
@@ -250,65 +245,8 @@ def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(servi
reference_images=images,
gen_params=OmniDiffusionSamplingParams(),
)
- # New form: explicit task=, no bot_task.
- new_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
- serving_chat,
- engine=engine,
- prompt="edit me",
- extra_body={"task": "it2i"},
- reference_images=images,
- gen_params=OmniDiffusionSamplingParams(),
- )
- assert legacy_prompt["prompt"] == new_prompt["prompt"], (
- f"legacy bot_task= form must produce the same prompt as task=; "
- f"legacy={legacy_prompt['prompt']!r} new={new_prompt['prompt']!r}"
- )
-
-
-@pytest.mark.parametrize("legacy_task", ["i2t", "t2t"])
-def test_build_multistage_generation_inputs_legacy_plain_tasks_stay_plain(serving_chat, legacy_task: str):
- """Legacy bot_task=i2t/t2t must preserve those tasks' plain prompt mode.
-
- The task/bot_task split must not normalize every legacy task-enum request
- into bot_task="think"; i2t/t2t had no / trigger before
- the split and should stay plain unless the caller passes an explicit
- semantic bot_task.
- """
- from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
-
- engine = SimpleNamespace(
- stage_configs=[
- SimpleNamespace(stage_type="llm", is_comprehension=True),
- SimpleNamespace(stage_type="diffusion", is_comprehension=False),
- ],
- default_sampling_params_list=[
- SamplingParams(temperature=0.0),
- OmniDiffusionSamplingParams(),
- ],
- )
- images = [Image.new("RGB", (32, 32), color="red")]
-
- legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
- serving_chat,
- engine=engine,
- prompt="describe me",
- extra_body={"bot_task": legacy_task},
- reference_images=images if legacy_task == "i2t" else [],
- gen_params=OmniDiffusionSamplingParams(),
- )
- explicit_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
- serving_chat,
- engine=engine,
- prompt="describe me",
- extra_body={"task": legacy_task},
- reference_images=images if legacy_task == "i2t" else [],
- gen_params=OmniDiffusionSamplingParams(),
- )
-
- assert legacy_prompt["prompt"] == explicit_prompt["prompt"]
- assert legacy_prompt["prompt"].endswith("Assistant: ")
- assert not legacy_prompt["prompt"].endswith("")
- assert not legacy_prompt["prompt"].endswith("")
+ assert legacy_prompt["prompt"].count("
") == 2
+ assert legacy_prompt["prompt"].endswith("Assistant: ")
@pytest.mark.parametrize(
diff --git a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
index 1901210de09..76f3e500622 100644
--- a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
+++ b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
@@ -40,20 +40,9 @@ def test_extract_ratio_index_uses_fixed_special_token_ids():
assert _extract_ratio_index([1, ratio_33, 2, ratio_36]) == 36
-def test_truncate_at_cot_end_uses_token_ids_when_text_skips_specials():
- end_recaption = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
- answer = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
- boi = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
- ratio = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]
- token_ids = [100, 101, end_recaption, answer, boi, ratio]
-
- text, truncated = _truncate_at_cot_end(
- "recaption body without special markers",
- token_ids,
- )
-
- assert text == "recaption body without special markers"
- assert truncated == [100, 101, end_recaption]
+def test_truncate_at_cot_end_strips_tail_after_recaption_marker():
+ text = _truncate_at_cot_end("body text")
+ assert text == "body text"
def test_ar2diffusion_applies_ratio_and_truncates_tail_without_tokenizer(monkeypatch: pytest.MonkeyPatch):
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
index e6e0c9db346..5751cb4d831 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
@@ -903,75 +903,6 @@ def get_cot_sections(self, cot_text, uncond_kwargs, cot_max_length=None, drop_th
dict(type="text", text=cot_text, **uncond_kwargs),
]
- def get_cot_sections_from_token_ids(
- self,
- token_ids,
- uncond_kwargs,
- cot_max_length=None,
- drop_think=False,
- ):
- """Split AR-sampled token IDs at think/recaption markers without re-encoding.
-
- Functional mirror of `get_cot_sections` but operates on AR sampled IDs.
- Used by KV-reuse-aware callers: tokenize-detokenize-tokenize over the AR
- cot text is not lossless (BPE re-merges across multi-byte UTF-8 and
- punctuation boundaries). The resulting length drift breaks AR KV
- position alignment (`positive_reuse_len` computed in DiT-tok space vs
- the actual cached AR KV in AR-tok space, off by N tokens for prompts
- containing Chinese + escaped quotes etc.).
- """
- if not token_ids:
- return []
- ids = list(token_ids)
-
- think_id = self.tokenizer.convert_tokens_to_ids("")
- end_think_id = self.end_think_token_id
- recaption_id = self.tokenizer.convert_tokens_to_ids("")
- end_recaption_id = self.end_recaption_token_id
-
- def _split_at_pair(seq, start_id, end_id):
- if start_id is None or end_id is None:
- return None
- try:
- s = seq.index(start_id)
- e = seq.index(end_id, s + 1)
- except ValueError:
- return None
- return seq[:s], seq[s + 1 : e], seq[e + 1 :]
-
- # Try ... first to mirror text-side split order.
- split = _split_at_pair(ids, think_id, end_think_id)
- if split is not None:
- before, inside, after = split
- return (
- self.get_cot_sections_from_token_ids(before, uncond_kwargs, drop_think=drop_think)
- + (
- [
- dict(type="text", tokens=[think_id]),
- dict(type="text", tokens=inside, max_length=cot_max_length, **uncond_kwargs),
- dict(type="text", tokens=[end_think_id]),
- ]
- if not drop_think
- else []
- )
- + self.get_cot_sections_from_token_ids(after, uncond_kwargs, drop_think=drop_think)
- )
-
- split = _split_at_pair(ids, recaption_id, end_recaption_id)
- if split is not None:
- before, inside, after = split
- return (
- self.get_cot_sections_from_token_ids(before, uncond_kwargs, drop_think=drop_think)
- + [
- dict(type="text", tokens=[recaption_id]),
- dict(type="text", tokens=inside, max_length=cot_max_length, **uncond_kwargs),
- dict(type="text", tokens=[end_recaption_id]),
- ]
- + self.get_cot_sections_from_token_ids(after, uncond_kwargs, drop_think=drop_think)
- )
-
- return [dict(type="text", tokens=ids, **uncond_kwargs)]
-
def apply_general_template(
self,
message_list,
@@ -1022,36 +953,17 @@ def process_successive_message(
while _cur_message_idx < len(message_list) and _message_list[_cur_message_idx]["role"] == role:
message = _message_list[_cur_message_idx]
if message["type"] == "text":
- content = message["content"]
- ctx_type = message.get("context_type", "str")
+ text = message["content"]
if role == "system":
- _sub_sections.append(dict(type="text", text=content))
+ _sub_sections.append(dict(type="text", text=text))
elif role == "assistant":
- if ctx_type == "token_ids":
- # Pre-tokenized AR cot tokens; split on marker ids, no re-encode.
- if hasattr(content, "tolist"):
- content = content.tolist()
- think_id = self.tokenizer.convert_tokens_to_ids("")
- recaption_id = self.tokenizer.convert_tokens_to_ids("")
- has_cot = (think_id in content and self.end_think_token_id in content) or (
- recaption_id in content and self.end_recaption_token_id in content
- )
- if has_cot:
- _sub_sections.extend(
- self.get_cot_sections_from_token_ids(content, uncond_kwargs, drop_think=drop_think)
- )
- else:
- _sub_sections.append(dict(type="text", tokens=content, **uncond_kwargs))
+ if ("" in text and "" in text) or (
+ "" in text and "" in text
+ ):
+ _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think))
else:
- text = content
- if ("" in text and "" in text) or (
- "" in text and "" in text
- ):
- _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think))
- else:
- _sub_sections.append(dict(type="text", text=text, **uncond_kwargs))
+ _sub_sections.append(dict(type="text", text=text, **uncond_kwargs))
else:
- text = content
_sub_sections.append(
dict(type="text", text=f"{answer_prefix}{text}{answer_suffix}", **uncond_kwargs)
)
@@ -1176,7 +1088,6 @@ def apply_chat_template(
batch_cond_image_info: list[JointImageInfo] | list[list[JointImageInfo]] | None = None,
batch_system_prompt: list[str] | None = None,
batch_cot_text: list[str] | None = None,
- batch_cot_token_ids: list | None = None,
max_length: int | None = None,
bot_task: str = "auto", # auto/image/think/recaption/img_ratio
image_base_size: int = 1024,
@@ -1205,14 +1116,6 @@ def apply_chat_template(
)
else:
batch_cot_text = [None] * batch_size
- # Optional per-item pre-tokenized AR cot ids (used by KV-reuse).
- if batch_cot_token_ids is not None:
- assert len(batch_cot_token_ids) == batch_size, (
- f"batch_cot_token_ids should have the same length as batch_size ({batch_size}), "
- f"but got {len(batch_cot_token_ids)}."
- )
- else:
- batch_cot_token_ids = [None] * batch_size
if batch_cond_image_info is not None:
assert len(batch_cond_image_info) == batch_size, (
f"batch_cond_image_info should have the same length as batch_size ({batch_size}), "
@@ -1231,14 +1134,12 @@ def apply_chat_template(
prompt,
system_prompt,
cot_text,
- cot_token_ids,
gen_image_info,
cond_image_info_list,
) in zip(
batch_prompt,
batch_system_prompt,
batch_cot_text,
- batch_cot_token_ids,
batch_gen_image_info,
batch_cond_image_info,
):
@@ -1258,15 +1159,7 @@ def apply_chat_template(
# 2.2 text inputs
message_list.append(dict(role="user", type="text", content=prompt, context_type="str"))
# 3. assistant answer sections
- if cot_token_ids is not None:
- # Use AR-sampled token IDs verbatim. Avoids the
- # tokenize-detokenize-tokenize length drift that breaks KV reuse
- # (see process_successive_message context_type="token_ids" branch
- # and get_cot_sections_from_token_ids docstring).
- message_list.append(
- dict(role="assistant", type="text", content=cot_token_ids, context_type="token_ids")
- )
- elif cot_text is not None:
+ if cot_text is not None:
message_list.append(dict(role="assistant", type="text", content=cot_text, context_type="str"))
if mode == "gen_image":
message_list.append(
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 63c367a1006..33bfb65fb41 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -754,7 +754,6 @@ def prepare_model_inputs(
mode="gen_image",
system_prompt=None,
cot_text=None,
- cot_token_ids=None,
num_inference_steps=50,
guidance_scale=5.0,
image_size="auto",
@@ -771,7 +770,6 @@ def prepare_model_inputs(
batch_message_list = message_list
batch_prompt = prompt
batch_cot_text = cot_text
- batch_cot_token_ids = cot_token_ids
batch_system_prompt = system_prompt
batch_gen_image_info = None
batch_cond_image_info = kwargs.pop("batch_cond_image_info", None)
@@ -850,7 +848,6 @@ def prepare_model_inputs(
batch_cond_image_info=batch_cond_image_info,
batch_system_prompt=batch_system_prompt,
batch_cot_text=batch_cot_text,
- batch_cot_token_ids=batch_cot_token_ids,
max_length=kwargs.get("max_length"),
bot_task=bot_task,
image_base_size=self.config.image_base_size,
@@ -1379,20 +1376,13 @@ def forward(
system_prompt = system_prompt.strip() if system_prompt is not None else ""
prompt = [p if isinstance(p, str) else (p.get("prompt") or "") for p in req.prompts] or prompt
- # Extract AR-generated CoT/recaption text from each prompt's extra dict.
- # The AR-side stage input processor (``ar2diffusion``) already prepends
- # the trigger tag (e.g. ````) when the AR used the KV-reuse
- # pretrain format, so ``ar_generated_text`` is a self-contained string
- # and ``get_cot_sections()`` can parse the think/recaption structure
- # directly.
- cot_text_list = []
- for p in req.prompts:
- extra = p.get("extra", {}) if isinstance(p, dict) else {}
- cot_text_list.append(extra.get("ar_generated_text") or None)
+ cot_text_list = [
+ (p.get("extra", {}).get("ar_generated_text") if isinstance(p, dict) else None) or None
+ for p in req.prompts
+ ]
cot_text = (
[self._normalize_cot_text(t) for t in cot_text_list] if any(t is not None for t in cot_text_list) else None
)
- cot_token_ids = None
batch_cond_image_info: list[list[JointImageInfo]] | None = None
if any(not isinstance(p, str) for p in req.prompts):
@@ -1433,7 +1423,6 @@ def forward(
model_inputs = self.prepare_model_inputs(
prompt=prompt,
cot_text=cot_text,
- cot_token_ids=cot_token_ids,
system_prompt=system_prompt,
mode="gen_image",
generator=generator,
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index c54295cf104..c1467f7190a 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1700,11 +1700,8 @@ async def edit_images(
# vllm-omni extension for layered models (e.g., Qwen-Image-Layered)
layers: int | None = Form(None),
resolution: int | None = Form(None), # See SUPPORTED_LAYERED_RESOLUTIONS
+ # /v1/images/edits is always IT2I; only the prompting knobs are exposed.
bot_task: str | None = Form(None),
- # ``/v1/images/edits`` is always image-to-image (IT2I); the ``task`` axis
- # is fixed and pinned downstream. ``bot_task`` (think / recaption /
- # think_recaption / vanilla) + ``sys_type`` / ``system_prompt`` are the
- # only HunyuanImage-3.0 knobs callers need to express here.
sys_type: str | None = Form(None),
system_prompt: str | None = Form(None),
) -> ImageGenerationResponse:
@@ -1757,10 +1754,8 @@ async def edit_images(
status_code=HTTPStatus.BAD_REQUEST.value,
detail=detail,
)
- # Convert uploads to RGB when the caller opts into the Hunyuan-aware
- # API surface (bot_task / sys_type / system_prompt). Keeping uploads
- # as RGBA/P PIL objects makes online IT2I observe a different visual
- # input than the offline path.
+ # Match the offline path: RGB normalize when the caller opts into
+ # Hunyuan-aware behavior. RGBA/P uploads otherwise diverge from offline.
normalize_edit_images_rgb = bot_task is not None or sys_type is not None
pil_images = await _load_input_images(input_images_list, normalize_rgb=normalize_edit_images_rgb)
prompt["multi_modal_data"] = {}
@@ -1895,12 +1890,8 @@ async def edit_images(
"seed": effective_seed,
"num_outputs_per_prompt": n,
}
- # When size="auto", width/height were resolved from the first
- # input images size (e.g. 512x512 logo), NOT a client-requested
- # output dimension. Forwarding them to extra_body would override
- # AR-driven pipelines (e.g. HunyuanImage-3.0) AR ``
- # token decision via gen_params -> sampling_params. Skip the
- # forward when auto, matching offline end2end.py img2img.
+ # size="auto" resolves width/height from the input image; forwarding
+ # those would override the AR-driven ratio-token size selection.
if not size_was_auto:
if width is not None:
extra_body["width"] = width
@@ -1925,11 +1916,6 @@ async def edit_images(
lora_dict = _get_lora_from_json_str(lora)
_parse_lora_request(lora_dict)
extra_body["lora"] = lora_dict
- # ``/v1/images/edits`` is always IT2I; the chat handler's
- # default (``task="it2i"`` when neither ``task`` nor
- # ``bot_task`` resolves to a task enum) covers this implicitly.
- # Legacy callers passing the task enum via ``bot_task`` (e.g.
- # ``bot_task="it2i"``) are normalized inside the chat handler.
if bot_task is not None:
extra_body["bot_task"] = bot_task
if sys_type is not None:
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 6e2a30f56f2..4677135cdb0 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2247,37 +2247,26 @@ def _build_multistage_generation_inputs(
lora_body = extra_body.get("lora")
layers = extra_body.get("layers")
resolution = extra_body.get("resolution")
- # P1: task / bot_task / sys_type / system_prompt quadruple. Legacy
- # api_server callers may still pass a task-enum value (i2t / it2i /
- # t2i / t2t) under `bot_task`; normalize it to `task` here so
- # downstream uses the canonical split. Source the task enum from
- # prompt_utils so this layer stays in sync with the model side.
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
MAX_IMAGES_PER_REQUEST as _HUNYUAN3_MAX_IMAGES,
)
- from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
- available_tasks as _hunyuan3_available_tasks,
- )
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
resolve_stop_token_ids as _hunyuan3_resolve_stop_token_ids,
)
- task = extra_body.get("task")
bot_task = extra_body.get("bot_task")
sys_type = extra_body.get("sys_type")
custom_system_prompt = extra_body.get("system_prompt")
- legacy_task_from_bot_task = False
- legacy_task_names = set(_hunyuan3_available_tasks()) | {
- "it2i_think",
- "it2i_recaption",
- "t2i_think",
- "t2i_recaption",
- "t2i_vanilla",
- }
- if task is None and bot_task in legacy_task_names:
- task = bot_task
+
+ # Legacy callers passed task enums (it2i / t2i / it2i_think / ...) via
+ # bot_task. Task is now derived from reference_images presence; map
+ # composites to their semantic bot_task and drop bare task enums.
+ bot_task_omitted = False
+ if bot_task in {"it2i", "t2i", "i2t", "t2t"}:
bot_task = None
- legacy_task_from_bot_task = True
+ bot_task_omitted = True
+ elif bot_task in {"it2i_think", "it2i_recaption", "t2i_think", "t2i_recaption", "t2i_vanilla"}:
+ bot_task = bot_task.split("_", 1)[1]
if reference_images and len(reference_images) > _HUNYUAN3_MAX_IMAGES:
raise ValueError(
@@ -2285,6 +2274,8 @@ def _build_multistage_generation_inputs(
f"images per request, got {len(reference_images)}"
)
+ task = "it2i" if reference_images else "t2i"
+
engine_prompt_data: dict[str, Any] | None = None
modalities = ["image"]
if reference_images:
@@ -2296,50 +2287,33 @@ def _build_multistage_generation_inputs(
prompt_token_ids: list[int] | None = None
system_prompt_type: str | None = None
- if task or bot_task:
+ if bot_task is not None or sys_type is not None or custom_system_prompt is not None or bot_task_omitted:
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
build_prompt,
build_prompt_tokens,
)
- num_images = len(reference_images) if reference_images else 1
- effective_task = task if task is not None else "it2i"
- build_kwargs = {
- "task": effective_task,
+ build_kwargs: dict[str, Any] = {
+ "task": task,
"sys_type": sys_type,
"custom_system_prompt": custom_system_prompt,
- "num_images": num_images,
+ "num_images": len(reference_images) if reference_images else 1,
}
if bot_task is not None:
build_kwargs["bot_task"] = bot_task
- elif "bot_task" in extra_body and not legacy_task_from_bot_task:
- # Preserve the prompt_utils distinction between omitted
- # bot_task and explicit None. Omitted keeps each task's legacy
- # default (`it2i` -> think, `i2t`/`t2t` -> plain), while
- # explicit None is the caller's plain-mode request.
+ elif "bot_task" in extra_body and not bot_task_omitted:
+ # Explicit None from the caller is plain-mode; omitted lets
+ # each task fall back to its default trigger.
build_kwargs["bot_task"] = None
if tokenizer is not None:
- # HF byte-for-byte path: feed segment-tokenized prompt_token_ids
- # so AR sees the same template-tokenization HF apply_chat_template
- # produces. Without this, the engine BPE-merges across template
- # segment boundaries (e.g. "。\n\n" -> single id) and AR
- # diverges from training distribution -- different cot_text,
- # different DiT input, different final image. Mirrors offline
- # examples/.../end2end.py img2img which always feeds
- # prompt_token_ids. See prompt_utils.build_prompt NOTE.
- result = build_prompt_tokens(
- prompt,
- tokenizer,
- **build_kwargs,
- )
+ # Feed segment-tokenized prompt_token_ids so AR matches HF
+ # apply_chat_template byte-for-byte (engine BPE would merge
+ # across template boundaries, e.g. "。\n\n" -> single id).
+ result = build_prompt_tokens(prompt, tokenizer, **build_kwargs)
prompt_token_ids = result.token_ids
system_prompt_type = result.system_prompt_type
else:
- # Legacy string path (e.g. unit tests with no tokenizer plumbed).
- prompt = build_prompt(
- prompt,
- **build_kwargs,
- )
+ prompt = build_prompt(prompt, **build_kwargs)
if reference_images and len(reference_images) == 1:
engine_prompt_data = {"image": reference_images[0]}
modalities = ["image"]
@@ -2349,10 +2323,8 @@ def _build_multistage_generation_inputs(
engine_prompt["prompt_token_ids"] = prompt_token_ids
if system_prompt_type is not None:
engine_prompt["use_system_prompt"] = system_prompt_type
- # Forward the custom system prompt body too. DiT's
- # `get_system_prompt(use_system_prompt, "image", system_prompt)` reads
- # the third positional arg, so leaving it None turns a `sys_type=custom`
- # request into an empty DiT system prefix (AR/DiT divergence).
+ # DiT's get_system_prompt(use_system_prompt, "image", system_prompt) reads
+ # this; omitting it makes sys_type=custom yield an empty DiT prefix.
if custom_system_prompt is not None:
engine_prompt["system_prompt"] = custom_system_prompt
engine_prompt["modalities"] = modalities
@@ -2399,10 +2371,8 @@ def _build_multistage_generation_inputs(
):
default_stage_params.seed = seed
- # Inject target_h/w into comprehension (AR) stage sampling params
- # for models that need M-RoPE position pre-computation (e.g.
- # GLM-Image). max_tokens is handled via the deploy YAML default
- # (upper-bound ceiling) rather than computed dynamically here.
+ # Inject target_h/w into AR stage for M-RoPE position pre-computation
+ # (e.g. GLM-Image). max_tokens comes from deploy YAML.
if comprehension_idx is not None and idx == comprehension_idx and height is not None and width is not None:
extra_args = getattr(default_stage_params, "extra_args", None)
if extra_args is None:
@@ -2411,22 +2381,17 @@ def _build_multistage_generation_inputs(
extra_args["target_h"] = int(height)
extra_args["target_w"] = int(width)
- # Resolve AR stop tokens dynamically from (task, bot_task) so the
- # online path matches offline ``end2end.py`` and so the AR stops
- # at the natural ```` token for image-output tasks
- # (mirrors upstream ``modeling_hunyuan_image_3.py:3289-3303``).
- # Surviving yaml-side ``stop_token_ids`` would otherwise stop AR
- # too early and leave ``ar2diffusion`` without a ratio token.
+ # Resolve AR stop tokens from (task, bot_task) so AR stops at its natural
+ # end for image tasks; mirrors upstream modeling_hunyuan_image_3.py:3289-3303.
if (
comprehension_idx is not None
and idx == comprehension_idx
and hasattr(default_stage_params, "stop_token_ids")
):
- resolved_stops = _hunyuan3_resolve_stop_token_ids(
- task=task if task is not None else "it2i",
+ default_stage_params.stop_token_ids = _hunyuan3_resolve_stop_token_ids(
+ task=task,
bot_task=bot_task,
)
- default_stage_params.stop_token_ids = resolved_stops
if stage_type == "diffusion":
self._set_if_supported(
diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index 5b4d5f56529..749e213e099 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -102,45 +102,19 @@ def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]:
return [(r.height, r.width) for r in resolutions]
-def _truncate_at_cot_end(
- generated_text: str,
- generated_token_ids,
-) -> tuple[str, list[int]]:
+def _truncate_at_cot_end(generated_text: str) -> str:
"""Truncate AR output at first `` (or `` fallback).
- Mirrors `HunyuanImage3ForCausalMM.generate_image` in the official
- upstream, which decodes only `generated_tokens[0, :end_pos + 1]` as
- `cot_text` for DiT. The trailing ``
- sequence is a stage-transition trigger consumed via `image_size` /
- height/width; it must NOT be forwarded to DiT's prompt builder, or
- the extra `` and ratio tokens drift the DiT's own prompt
- structure.
+ Mirrors upstream `HunyuanImage3ForCausalMM.generate_image`, which feeds
+ DiT only the cot text up to the closing tag; the trailing
+ answer / boi / ratio token sequence is consumed via height/width
+ extraction and must not leak into DiT's prompt builder.
"""
- token_list = list(generated_token_ids) if generated_token_ids is not None else []
-
- end_ids = {
- "": HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""],
- "": HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""],
- }
-
for marker in ("", ""):
- truncated_tokens = token_list
- end_id = end_ids[marker]
- if token_list:
- try:
- token_end = token_list.index(end_id)
- truncated_tokens = token_list[: token_end + 1]
- except ValueError:
- pass
-
idx = generated_text.find(marker)
if idx != -1:
- text_end = idx + len(marker)
- return generated_text[:text_end], truncated_tokens
- if truncated_tokens is not token_list:
- return generated_text, truncated_tokens
-
- return generated_text, token_list
+ return generated_text[: idx + len(marker)]
+ return generated_text
@lru_cache(maxsize=4)
@@ -256,14 +230,7 @@ def ar2diffusion(
width,
)
- # Truncate the AR output at `` (or ``) before
- # passing to DiT. Mirrors official `generate_image` which keeps
- # `cot_text` clean and routes size/ratio via `image_size` only;
- # we already extracted `ratio_idx` above and translated it into
- # `height` / `width`, so the ``
- # tail has no remaining job and would only contaminate DiT's
- # prompt builder if forwarded.
- cot_text_for_dit, cot_token_ids_for_dit = _truncate_at_cot_end(generated_text, generated_token_ids)
+ cot_text_for_dit = _truncate_at_cot_end(generated_text)
logger.info(
"[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, "
From 8d12ddda27f7f4e9d038a7eb2e5dab10a91eb2ee Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 16:09:14 +0800
Subject: [PATCH 38/43] chore: apply ruff-format fixup for cot_text_list
comprehension
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 33bfb65fb41..73b89bb11b0 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -1377,8 +1377,7 @@ def forward(
prompt = [p if isinstance(p, str) else (p.get("prompt") or "") for p in req.prompts] or prompt
cot_text_list = [
- (p.get("extra", {}).get("ar_generated_text") if isinstance(p, dict) else None) or None
- for p in req.prompts
+ (p.get("extra", {}).get("ar_generated_text") if isinstance(p, dict) else None) or None for p in req.prompts
]
cot_text = (
[self._normalize_cot_text(t) for t in cot_text_list] if any(t is not None for t in cot_text_list) else None
From bfd17b37599207c86b88e55908daea5d2c160041 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 16:23:39 +0800
Subject: [PATCH 39/43] chore: keep for-loop one-line in apply_chat_template
(no spurious diff)
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../models/hunyuan_image3/hunyuan_image3_tokenizer.py | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
index 5751cb4d831..751bfb21af8 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
@@ -1130,13 +1130,7 @@ def apply_chat_template(
# Convert single round materials into standard message list
batch_message_list = []
- for (
- prompt,
- system_prompt,
- cot_text,
- gen_image_info,
- cond_image_info_list,
- ) in zip(
+ for prompt, system_prompt, cot_text, gen_image_info, cond_image_info_list in zip(
batch_prompt,
batch_system_prompt,
batch_cot_text,
From 1de9ec8bcd7f0376f521e4c528a0e6758a26eb05 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 16:38:50 +0800
Subject: [PATCH 40/43] test: rename test_hunyuan_image3.py to avoid pytest
basename collision
This test module collided with tests/e2e/accuracy/test_hunyuan_image3.py
under pytest's default 'prepend' import mode (no __init__.py in either
dir). Rename this one to make the basenames unique.
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../{test_hunyuan_image3.py => test_hunyuan_image3_bridge.py} | 0
1 file changed, 0 insertions(+), 0 deletions(-)
rename tests/model_executor/stage_input_processors/{test_hunyuan_image3.py => test_hunyuan_image3_bridge.py} (100%)
diff --git a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py b/tests/model_executor/stage_input_processors/test_hunyuan_image3_bridge.py
similarity index 100%
rename from tests/model_executor/stage_input_processors/test_hunyuan_image3.py
rename to tests/model_executor/stage_input_processors/test_hunyuan_image3_bridge.py
From 58ce6d86cf547aed75bf8c754f5a018153273bfb Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 22:52:11 +0800
Subject: [PATCH 41/43] fix(hunyuan_image3): mark AR stage
is_comprehension=true so online IT2I keeps non-square AR shape
Online /v1/images/edits collapsed AR-predicted aspects to a square
(e.g. 1024x1024) while offline end2end.py honored the predicted ratio
(e.g. 1216x832). The root cause is that the AR stage in
deploy/hunyuan_image3.yaml was marked ``is_comprehension: false`` (read
literally as "this task generates an image, not text"), but inside
vllm-omni ``is_comprehension`` is the tokenizer-owning AR-stage marker,
not a user-visible task type.
The serving path in entrypoints/openai/serving_chat.py looks up the AR
stage by that flag to apply ``resolve_stop_token_ids`` (image-task stop
set = ```` range). With the flag false the lookup returned
None, the AR kept the YAML default ``stop_token_ids: []``, and
the HunyuanImage3 custom sampler's forced-transition step
`` -> `` triggered an immediate stop. The cumulative
token ids never reached ````, so
``ar2diffusion._extract_ratio_index`` could not recover the AR aspect
and fell back to the carried-through prompt size (1024x1024 for
size=auto edits).
Offline avoided this because end2end.py overrides the AR stage's
stop_token_ids directly without going through the comprehension-stage
lookup. Other models did not hit it because their AR stage already had
``is_comprehension: true`` (the field's framework-internal meaning).
The fix is a one-line change to the deploy config, plus a comment
explaining the flag's real semantics so the next model author does not
repeat the same misread.
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
vllm_omni/deploy/hunyuan_image3.yaml | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index 634165cd33a..93294bcdf44 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -22,7 +22,13 @@ connectors:
stages:
- stage_id: 0
- is_comprehension: false
+ # ``is_comprehension`` in vllm-omni names the tokenizer-owning AR stage
+ # (see config/stage_config.py + serving_chat AR-stage lookup), independent
+ # of whether the AR's task is comprehension (i2t/t2t) or generation
+ # (it2i/t2i). HunyuanImage-3.0's stage-0 owns the tokenizer and emits the
+ # cot+ratio token sequence consumed by stage-1, so it must be marked True
+ # for the serving path to set AR seed/stop_token_ids on this stage.
+ is_comprehension: true
final_output: true
final_output_type: text
max_num_seqs: 1
From be0c6840046d96cbd83e7c2ce2318e2e1fcb3a98 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 23:27:32 +0800
Subject: [PATCH 42/43] chore(hunyuan_image3): drop redundant hunyuan-specific
task/stop logic from serving_chat
PR #3444 added 84 lines of HunyuanImage-3.0-specific handling to
``serving_chat._build_multistage_generation_inputs`` (task derivation
from reference images, legacy task-enum mapping on ``bot_task``,
``MAX_IMAGES_PER_REQUEST`` cap, and an AR-stage ``stop_token_ids``
override via ``resolve_stop_token_ids``). The endpoint dispatch in
``api_server.py`` (``/v1/images/edits`` vs ``/v1/images/generations``)
already encodes the task split, and the AR-stage stop override is
redundant: ``HunyuanImage3ForCausalMM.sample`` already forces an EOS
after sampling a ratio token (``hunyuan_image3.py`` generation-mode
branch), so leaving the YAML default stop set empty lets the AR run
through ```` and stop
naturally on EOS; ``ar2diffusion._extract_ratio_index`` then reads the
ratio off ``cumulative_token_ids``. The production deploy
(``vllm_omni/deploy/hunyuan_image3.yaml``) already omits
``stop_token_ids`` for stage-0.
Net effect on ``serving_chat.py``: +84/-19 -> +47/-19 (-37 lines).
Behavior verified end-to-end on ``/v1/images/edits`` with a non-square
target after removal: ``ar2diffusion`` reports ``AR ratio_idx=19,
target size=1216x832`` (matches the offline ``end2end.py`` path),
identical to the result with the now-removed override in place.
Offline ``end2end.py`` still derives ``task`` and overrides
``stop_token_ids`` because it builds the params list directly without
the endpoint-level task signal; that path is intentionally unchanged.
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
vllm_omni/entrypoints/openai/serving_chat.py | 43 ++------------------
1 file changed, 3 insertions(+), 40 deletions(-)
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 4677135cdb0..2c375fa2928 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2247,35 +2247,10 @@ def _build_multistage_generation_inputs(
lora_body = extra_body.get("lora")
layers = extra_body.get("layers")
resolution = extra_body.get("resolution")
- from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
- MAX_IMAGES_PER_REQUEST as _HUNYUAN3_MAX_IMAGES,
- )
- from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
- resolve_stop_token_ids as _hunyuan3_resolve_stop_token_ids,
- )
-
bot_task = extra_body.get("bot_task")
sys_type = extra_body.get("sys_type")
custom_system_prompt = extra_body.get("system_prompt")
- # Legacy callers passed task enums (it2i / t2i / it2i_think / ...) via
- # bot_task. Task is now derived from reference_images presence; map
- # composites to their semantic bot_task and drop bare task enums.
- bot_task_omitted = False
- if bot_task in {"it2i", "t2i", "i2t", "t2t"}:
- bot_task = None
- bot_task_omitted = True
- elif bot_task in {"it2i_think", "it2i_recaption", "t2i_think", "t2i_recaption", "t2i_vanilla"}:
- bot_task = bot_task.split("_", 1)[1]
-
- if reference_images and len(reference_images) > _HUNYUAN3_MAX_IMAGES:
- raise ValueError(
- f"HunyuanImage-3.0 IT2I accepts at most {_HUNYUAN3_MAX_IMAGES} input "
- f"images per request, got {len(reference_images)}"
- )
-
- task = "it2i" if reference_images else "t2i"
-
engine_prompt_data: dict[str, Any] | None = None
modalities = ["image"]
if reference_images:
@@ -2287,21 +2262,21 @@ def _build_multistage_generation_inputs(
prompt_token_ids: list[int] | None = None
system_prompt_type: str | None = None
- if bot_task is not None or sys_type is not None or custom_system_prompt is not None or bot_task_omitted:
+ if bot_task is not None or sys_type is not None or custom_system_prompt is not None:
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
build_prompt,
build_prompt_tokens,
)
build_kwargs: dict[str, Any] = {
- "task": task,
+ "task": "it2i" if reference_images else "t2i",
"sys_type": sys_type,
"custom_system_prompt": custom_system_prompt,
"num_images": len(reference_images) if reference_images else 1,
}
if bot_task is not None:
build_kwargs["bot_task"] = bot_task
- elif "bot_task" in extra_body and not bot_task_omitted:
+ elif "bot_task" in extra_body:
# Explicit None from the caller is plain-mode; omitted lets
# each task fall back to its default trigger.
build_kwargs["bot_task"] = None
@@ -2381,18 +2356,6 @@ def _build_multistage_generation_inputs(
extra_args["target_h"] = int(height)
extra_args["target_w"] = int(width)
- # Stop AR at the natural token for image tasks; mirrors
- # upstream modeling_hunyuan_image_3.py:3289-3303.
- if (
- comprehension_idx is not None
- and idx == comprehension_idx
- and hasattr(default_stage_params, "stop_token_ids")
- ):
- default_stage_params.stop_token_ids = _hunyuan3_resolve_stop_token_ids(
- task=task,
- bot_task=bot_task,
- )
-
if stage_type == "diffusion":
self._set_if_supported(
default_stage_params,
From 161ba503d52a206a434d681d9c03d7e0632419ad Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Thu, 14 May 2026 09:41:19 +0800
Subject: [PATCH 43/43] test(hunyuan_image3): drop legacy task-as-bot_task
tests after serving_chat cleanup
The serving_chat cleanup in the previous commit removed the legacy
caller compatibility layer that translated ``bot_task in {"it2i",
"t2i", "i2t", "t2t"}`` to ``None`` and ``bot_task in {"it2i_think",
"it2i_recaption", ...}`` to the trailing ``think``/``recaption`` part.
That translation existed because old callers stuffed task enums into
the ``bot_task`` field; under the new contract, the endpoint dispatch
(``/v1/images/edits`` vs ``/v1/images/generations``) and
``reference_images`` presence carry the task signal, and ``bot_task``
only takes the documented values (``None`` / ``recaption`` / ``think``
/ ``think_recaption`` / ``vanilla``).
Two tests in
``test_serving_chat_multistage_generation.py`` were explicitly pinning
the now-removed legacy form
(``test_..._legacy_bot_task_form_unchanged``,
``test_..._legacy_composite_tasks_still_work``); deleting them.
Three other tests passed ``bot_task="it2i"`` only to trigger the
``build_prompt`` path (the *value* did not matter, just non-None);
switching them to ``bot_task="think"`` keeps the same intent against
the new validator.
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
.../openai_api/test_image_server.py | 4 +-
...test_serving_chat_multistage_generation.py | 75 +------------------
2 files changed, 4 insertions(+), 75 deletions(-)
diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py
index fb9c126d3fe..40adb7a9151 100644
--- a/tests/entrypoints/openai_api/test_image_server.py
+++ b/tests/entrypoints/openai_api/test_image_server.py
@@ -1675,7 +1675,7 @@ def test_image_edits_size_auto_preserves_bridge_size(async_omni_stage_configs_on
for multi-image fusion).
Cross-pins the multi-image fix at the API level: 2 reference images
- with bot_task=it2i must produce 2
placeholders in the captured
+ with bot_task=think must produce 2
placeholders in the captured
AR prompt (build_prompt called with num_images=2).
"""
img_a = make_test_image_bytes((32, 32))
@@ -1686,7 +1686,7 @@ def test_image_edits_size_auto_preserves_bridge_size(async_omni_stage_configs_on
data={
"prompt": "fuse",
"size": "auto",
- "bot_task": "it2i",
+ "bot_task": "think",
},
)
assert response.status_code == 200, response.text
diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index dd7f668611e..4b63588bae7 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -127,7 +127,7 @@ def test_build_multistage_generation_inputs_multi_image_emits_n_img_placeholders
serving_chat,
engine=engine,
prompt="edit me",
- extra_body={"bot_task": "it2i"},
+ extra_body={"bot_task": "think"},
reference_images=images[:n],
gen_params=OmniDiffusionSamplingParams(),
)
@@ -196,7 +196,7 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
serving_chat,
engine=engine,
prompt="edit me",
- extra_body={"bot_task": "it2i"},
+ extra_body={"bot_task": "think"},
reference_images=images[:n],
gen_params=OmniDiffusionSamplingParams(),
tokenizer=tok,
@@ -218,77 +218,6 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
assert img_count == n, f"N={n}: expected {n}
token ids in prompt_token_ids, got {img_count}"
-def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(serving_chat):
- """Legacy callers passed bot_task="it2i" as an opt-in marker. Task is now
- inferred from reference_images; legacy bot_task must still trigger the
- default think mode rather than getting silently dropped.
- """
- from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
-
- engine = SimpleNamespace(
- stage_configs=[
- SimpleNamespace(stage_type="llm", is_comprehension=True),
- SimpleNamespace(stage_type="diffusion", is_comprehension=False),
- ],
- default_sampling_params_list=[
- SamplingParams(temperature=0.0),
- OmniDiffusionSamplingParams(),
- ],
- )
- images = [Image.new("RGB", (32, 32), color="red"), Image.new("RGB", (32, 32), color="blue")]
-
- legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
- serving_chat,
- engine=engine,
- prompt="edit me",
- extra_body={"bot_task": "it2i"},
- reference_images=images,
- gen_params=OmniDiffusionSamplingParams(),
- )
- assert legacy_prompt["prompt"].count("
") == 2
- assert legacy_prompt["prompt"].endswith("Assistant: ")
-
-
-@pytest.mark.parametrize(
- "legacy_task,trigger",
- [
- ("it2i_think", ""),
- ("it2i_recaption", ""),
- ],
-)
-def test_build_multistage_generation_inputs_legacy_composite_tasks_still_work(
- serving_chat,
- legacy_task: str,
- trigger: str,
-):
- """Legacy composite task names passed through bot_task must still work."""
- from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
-
- engine = SimpleNamespace(
- stage_configs=[
- SimpleNamespace(stage_type="llm", is_comprehension=True),
- SimpleNamespace(stage_type="diffusion", is_comprehension=False),
- ],
- default_sampling_params_list=[
- SamplingParams(temperature=0.0),
- OmniDiffusionSamplingParams(),
- ],
- )
- images = [Image.new("RGB", (32, 32), color="red")]
-
- legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
- serving_chat,
- engine=engine,
- prompt="edit me",
- extra_body={"bot_task": legacy_task},
- reference_images=images,
- gen_params=OmniDiffusionSamplingParams(),
- )
-
- assert legacy_prompt["prompt"].count("
") == 1
- assert legacy_prompt["prompt"].endswith(f"Assistant: {trigger}")
-
-
def test_build_multistage_generation_inputs_bot_task_semantic_changes_trigger_and_sys(serving_chat):
"""Passing bot_task=think_recaption (vs default "think") must flip the
resolved sys_type to en_think_recaption (and trigger tag is still