From 0f2ee2d16c72fba5605aea2d6a2f48e93e7146ad Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Fri, 8 May 2026 11:22:57 +0800 Subject: [PATCH 01/43] [Feature] HunyuanImage-3.0 IT2I: support multi-image input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HunyuanImage-3.0-Instruct supports up to 3 reference images for IT2I "Multi-Image Fusion" upstream (README §200-216, §500). vllm-omni's DiT pipeline, AR processor, OpenAI schema, and ar2diffusion bridge already accepted list-shaped `multi_modal_data["image"]`, but four call sites still encoded a hard "N=1" assumption that blocked real multi-image runs. End-to-end smoke (4× L20X) on the official `input_1_0.png` + `input_1_1.png` demo pair runs cleanly and preserves each image's native bucket (no forced cropping of the second image). Surgery points: 1. `vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py`: `build_prompt` / `build_prompt_tokens` take `num_images: int` (default 1, validated 1 <= N <= 3 for image-input tasks) and emit N consecutive `<img>` placeholders between `User: ` and the user prompt. Mirrors the official tokenizer where each cond_image becomes its own user-role message and `apply_general_template` concatenates successive user messages back-to-back inside one user_prefix / user_suffix wrap. 2. `vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py` `HunyuanImage3Processor.process_image`: each cond image now keeps its own VAE `reso_group` bucket (mirrors the official ragged behavior in `_encode_cond_image`). Per-image VAE pixel tensors are flattened to 1-D and concatenated; `_get_mm_fields_config` declares `vae_pixel_values` with `MultiModalFieldConfig.flat_from_sizes(..., vae_pixel_size)` so vLLM splits the buffer back per image at consumption time. Mirrors the GLM-Image / Ming-Flash-Omni pattern. 
`_parse_and_validate_image_input` reconstructs a list of per-image (3, H_i, W_i) tensors using `vae_token_grid_hw`; `embed_multimodal` loops over the list for VAE encode + patch_embed (which was already per-image after the encode call). VIT (Siglip2 naflex) keeps the `batched("image")` path since naflex pads to `max_num_patches`. 3. `vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py` `instantiate_timestep_tokens`: `_encode_cond_image` returns `cond_t` as `list[Tensor]` for the multi-image branch (one tensor of N_cond_images timesteps per batch item). `instantiate_vae_image_tokens` already had a per-batch zip loop for the list shape; this function was missed and used a global flatten that silently broke on heterogeneous batches (different image counts per batch item). Adds a per-batch loop that mirrors `instantiate_vae_image_tokens`, slicing both `t` and `timestep_scatter_index` per batch item. 4. `examples/offline_inference/hunyuan_image3/end2end.py`: `--image-path` accepts comma-separated paths (matching the official upstream CLI); `num_images` is threaded through to the prompt builder. Tests: new regression file pinning N=1/2/3 placeholder layout (string + token-id, FakeTokenizer for fast CPU coverage), default-N=1 byte-equivalence with legacy callers, ValueError for out-of-range N, and three real-`AutoTokenizer.from_pretrained` cases proving N=1/2/3 produce N consecutive `<img>` token ids on the production tokenizer path with no separator drift between successive `<img>` placeholders. End-to-end smoke (4× L20X 143GB, AR=TP2 + DiT=TP2, 20 denoise steps, multi-image fusion against the official demo pair): - AR generated CoT tokens for the fused request - DiT denoise 20/20 steps in 24s (~1.10 s/step) - Peak GPU mem 95.52 GB reserved / 90.10 GB allocated, 5.7% pool - Output PNG saved cleanly; second reference image's native aspect visible in the fusion (vs the prior shared-bucket implementation that forced it into the first image's square bucket). 
Output-size handling for the AR/DiT ratio lifecycle is intentionally NOT touched. The pre-existing `image_list[0]` raw-pixel fallback in `pre_process_func` bypasses the AR's ratio-token prediction (the `` token sampled under `SliceVocabLogitsProcessor`); properly wiring that into `ar2diffusion`'s width/height assignment is a separate refactor. Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../hunyuan_image3/end2end.py | 34 ++- .../test_hunyuan_image3_it2i_multi_image.py | 251 ++++++++++++++++++ .../hunyuan_image3/pipeline_hunyuan_image3.py | 7 +- .../models/hunyuan_image3/prompt_utils.py | 36 ++- .../models/hunyuan_image3/hunyuan_image3.py | 117 +++++--- 5 files changed, 389 insertions(+), 56 deletions(-) create mode 100644 tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 5232568f11e..f9f734c9f4a 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -10,6 +10,7 @@ Usage: python end2end.py --modality text2img --prompts "A cute cat" python end2end.py --modality img2img --image-path input.png --prompts "Make it snowy" + python end2end.py --modality img2img --image-path img1.png,img2.png --prompts "Combine" python end2end.py --modality img2text --image-path input.png --prompts "Describe this image" """ @@ -71,7 +72,7 @@ def parse_args(): "--image-path", type=str, default=None, - help="Path to input image (for img2img/img2text).", + help="Input image path(s) for img2img/img2text. 
Comma-separated for multi-image (up to 3).", ) parser.add_argument( "--output", @@ -207,14 +208,19 @@ def main(): print("[Info] No prompts provided, using default.") prompts = ["A cute cat"] - # Load image if needed - input_image = None + input_images: list = [] if args.modality in ("img2img", "img2text"): - if not args.image_path or not os.path.exists(args.image_path): + if not args.image_path: raise ValueError(f"--image-path required for {args.modality}, got: {args.image_path}") from PIL import Image - input_image = Image.open(args.image_path).convert("RGB") + image_paths = [p.strip() for p in args.image_path.split(",") if p.strip()] + for p in image_paths: + if not os.path.exists(p): + raise ValueError(f"Image path does not exist: {p}") + input_images.append(Image.open(p).convert("RGB")) + if not input_images: + raise ValueError(f"--image-path produced no usable paths: {args.image_path!r}") # Load tokenizer for segment-wise prompt tokenization (matches HF # apply_chat_template byte-for-byte; see build_prompt_tokens docstring). @@ -222,10 +228,18 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + mm_image_payload = (input_images[0] if len(input_images) == 1 else input_images) if input_images else None + # Format prompts formatted_prompts: list[OmniPromptType] = [] for p in prompts: - result = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type) + # Only pass `num_images` for modalities that actually consume images; + # text-only paths ignore the parameter, but threading it + # unconditionally reads as if t2i needed at least one image. 
+ build_kwargs: dict = {"task": task, "sys_type": args.sys_type} + if input_images: + build_kwargs["num_images"] = len(input_images) + result = build_prompt_tokens(p, tokenizer, **build_kwargs) token_ids = result.token_ids effective_sys_type = result.system_prompt_type @@ -243,12 +257,12 @@ def main(): prompt_dict["modalities"] = ["image"] elif args.modality == "img2img": prompt_dict["modalities"] = ["image"] - prompt_dict["multi_modal_data"] = {"image": input_image} - prompt_dict["height"] = input_image.height - prompt_dict["width"] = input_image.width + prompt_dict["multi_modal_data"] = {"image": mm_image_payload} + prompt_dict["height"] = input_images[0].height + prompt_dict["width"] = input_images[0].width elif args.modality == "img2text": prompt_dict["modalities"] = ["text"] - prompt_dict["multi_modal_data"] = {"image": input_image} + prompt_dict["multi_modal_data"] = {"image": mm_image_payload} elif args.modality == "text2text": prompt_dict["modalities"] = ["text"] diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py new file mode 100644 index 00000000000..c8a9891385c --- /dev/null +++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py @@ -0,0 +1,251 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Multi-image input regression for HunyuanImage3 IT2I prompt construction. + +The official HunyuanImage-3.0-Instruct supports up to 3 reference images +per IT2I request ("Multi-Image Fusion"; see hunyuan3.0_ins/README.md +section 200-216 + line 500). Each cond image becomes its own user-role +message and `apply_general_template` concatenates successive user +messages back-to-back inside ONE user_prefix/user_suffix wrap (see +hunyuan3.0_ins/tokenization_hunyuan_image_3.py:1399-1400, 1499-1515). 
+The lightweight `` + `multi_modal_data` builder used by the example +flow must match that contract: N consecutive `` placeholders sit +between `User: ` and the user prompt, with no separator between them. + +This file pins: + 1. N consecutive `` placeholders for N=1/2/3 across both the + string builder (`build_prompt`) and the token builder + (`build_prompt_tokens`). + 2. The N=1 path stays bit-identical to the legacy single-image builder + (regression guard so default callers don't notice). + 3. N=2 / N=3 token sequences differ from N=1 by exactly (N-1) extra + `` ids inserted between `User: ` and `user_prompt`. + 4. Validation: N<1 and N>3 raise ValueError (hard cap N<=3 mirrors + official upstream). + 5. Text-only tasks ignore `num_images` (no validation, no extra ids). +""" + +from __future__ import annotations + +import os + +import pytest + +from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + MAX_IMAGES_PER_REQUEST, + build_prompt, + build_prompt_tokens, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +class FakeTokenizer: + """Recording fake tokenizer mirroring the one in test_prompt_utils. + + Special token ids: `<|startoftext|>`=1, ``=2, ``=3, + ``=4. encode() returns one id per character starting at + 100, so substring-position assertions are stable. 
+ """ + + SPECIAL = { + "<|startoftext|>": 1, + "": 2, + "": 3, + "": 4, + } + + def __init__(self) -> None: + self.encode_calls: list[str] = [] + + def convert_tokens_to_ids(self, tok: str) -> int: + return self.SPECIAL.get(tok, 0) + + def encode(self, text: str, add_special_tokens: bool = False) -> list[int]: + self.encode_calls.append(text) + return list(range(100, 100 + len(text))) + + +_IMAGE_TASKS = ("i2t", "it2i_think", "it2i_recaption") +_TEXT_ONLY_TASKS = ("t2t",) + + +# -------------------- string builder -------------------- + + +@pytest.mark.parametrize("task", _IMAGE_TASKS) +@pytest.mark.parametrize("num_images", [1, 2, 3]) +def test_build_prompt_emits_N_consecutive_img_placeholders(task: str, num_images: int): + """N=1/2/3 -> exactly N `` substrings appear consecutively + between `User: ` and the user prompt, with no separator between them.""" + s = build_prompt("HELLO", task=task, num_images=num_images) + assert s.count("") == num_images, ( + f"task={task} num_images={num_images}: expected {num_images} " + f"placeholders, found {s.count('')} -- prompt was: {s!r}" + ) + + # All `` placeholders must form one contiguous run "..." + # immediately after `User: ` and before HELLO. 
+ user_idx = s.index("User: ") + len("User: ") + hello_idx = s.index("HELLO") + between = s[user_idx:hello_idx] + assert between == "" * num_images, ( + f"region between `User: ` and prompt must be exactly N placeholders; got {between!r}" + ) + + +def test_build_prompt_default_num_images_matches_legacy(): + """num_images default = 1 must produce a string bit-identical to the + pre-multi-image behavior (single `` placeholder).""" + legacy = build_prompt("HELLO", task="it2i_think") + explicit = build_prompt("HELLO", task="it2i_think", num_images=1) + assert legacy == explicit, "default num_images=1 must match legacy single-image output" + + +# -------------------- token builder -------------------- + + +@pytest.mark.parametrize("task", _IMAGE_TASKS) +def test_build_prompt_tokens_inserts_N_img_ids(task: str): + """N=1/2/3 -> the resulting id sequence contains exactly N copies of + img_id (=2) sitting consecutively after the `User: ` segment.""" + tok = FakeTokenizer() + ids_n1 = build_prompt_tokens("hi", tok, task=task, num_images=1) + tok = FakeTokenizer() + ids_n2 = build_prompt_tokens("hi", tok, task=task, num_images=2) + tok = FakeTokenizer() + ids_n3 = build_prompt_tokens("hi", tok, task=task, num_images=3) + + assert ids_n1.count(2) == 1 + assert ids_n2.count(2) == 2 + assert ids_n3.count(2) == 3 + + # Each additional image must extend the sequence by exactly one img_id, + # not shift other tokens around. + assert len(ids_n2) == len(ids_n1) + 1 + assert len(ids_n3) == len(ids_n1) + 2 + + # The img_ids must be CONSECUTIVE (no other token between successive + # `` placeholders -- mirrors the official `process_successive_message` + # wrapping where successive user messages share one user_prefix/suffix). 
+ for ids, n in [(ids_n2, 2), (ids_n3, 3)]: + first = ids.index(2) + for k in range(n): + assert ids[first + k] == 2, ( + f"img_ids must be consecutive starting at position {first} for n={n}; got {ids[first : first + n]!r}" + ) + + +def test_build_prompt_tokens_default_num_images_matches_legacy(): + """num_images default = 1 must produce the same id sequence as + omitting the parameter (regression guard for existing single-image + callers).""" + tok_a = FakeTokenizer() + legacy = build_prompt_tokens("hi", tok_a, task="it2i_think") + tok_b = FakeTokenizer() + explicit = build_prompt_tokens("hi", tok_b, task="it2i_think", num_images=1) + assert legacy == explicit + # Also: encode() must have been called on the same set of segments, + # so segment boundaries are preserved. + assert tok_a.encode_calls == tok_b.encode_calls + + +# -------------------- validation -------------------- + + +@pytest.mark.parametrize("task", _IMAGE_TASKS) +@pytest.mark.parametrize("bad", [0, -1, MAX_IMAGES_PER_REQUEST + 1, 99]) +def test_build_prompt_rejects_out_of_range_num_images(task: str, bad: int): + with pytest.raises(ValueError, match="num_images must be in"): + build_prompt("hi", task=task, num_images=bad) + with pytest.raises(ValueError, match="num_images must be in"): + build_prompt_tokens("hi", FakeTokenizer(), task=task, num_images=bad) + + +@pytest.mark.parametrize("task", _TEXT_ONLY_TASKS) +@pytest.mark.parametrize("num_images", [0, 1, 2, 99]) +def test_text_only_tasks_ignore_num_images(task: str, num_images: int): + """Validation only kicks in for image-input tasks; t2t et al. 
accept + any num_images and emit zero `` placeholders.""" + s = build_prompt("hi", task=task, num_images=num_images) + assert "" not in s + ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, num_images=num_images) + assert 2 not in ids + + +# -------------------- real HF tokenizer regression -------------------- + +_HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct" + + +def _hf_cached(model_id: str) -> bool: + hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface") + snap_dir = os.path.join(hf_home, "hub", f"models--{model_id.replace('/', '--')}", "snapshots") + return os.path.isdir(snap_dir) and any(os.scandir(snap_dir)) + + +@pytest.mark.skipif(not _hf_cached(_HUNYUAN_MODEL_ID), reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache") +@pytest.mark.parametrize("num_images", [1, 2, 3]) +def test_real_tokenizer_emits_n_consecutive_img_ids(num_images: int): + """Real `AutoTokenizer.from_pretrained(...)` (the production path) must + encode N=1/2/3 prompts to a sequence with exactly N consecutive `` + token-ids in the right place — proves the placeholder layout from + `build_prompt_tokens` survives a real BPE tokenizer, not just FakeTokenizer. + """ + from transformers import AutoTokenizer + + tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True) + img_id = tok.convert_tokens_to_ids("") + assert img_id is not None and img_id >= 0, f" not in tokenizer vocab; got id={img_id}" + + ids = build_prompt_tokens("hi", tok, task="it2i_think", num_images=num_images) + + # Exactly N copies of id, all consecutive. 
+ img_positions = [i for i, x in enumerate(ids) if x == img_id] + assert len(img_positions) == num_images, ( + f"expected {num_images} ids, got {len(img_positions)} at positions {img_positions}" + ) + assert img_positions == list(range(img_positions[0], img_positions[0] + num_images)), ( + f" ids must be contiguous; got positions {img_positions}" + ) + + +@pytest.mark.skipif(not _hf_cached(_HUNYUAN_MODEL_ID), reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache") +def test_real_tokenizer_n_plus_one_extends_by_exactly_one_img_id(): + """Going from N to N+1 images must extend the encoded id sequence by + exactly one extra `` token-id and shift nothing else. Catches + accidental separator tokens between successive `` placeholders + that a FakeTokenizer (deterministic encode) can't surface.""" + from transformers import AutoTokenizer + + tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True) + img_id = tok.convert_tokens_to_ids("") + + ids_n1 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=1) + ids_n2 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=2) + ids_n3 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=3) + + assert len(ids_n2) == len(ids_n1) + 1, f"N=2 should be N=1 + 1 token; got {len(ids_n2)} vs {len(ids_n1)}" + assert len(ids_n3) == len(ids_n1) + 2, f"N=3 should be N=1 + 2 tokens; got {len(ids_n3)} vs {len(ids_n1)}" + + # Insert one img_id at the existing position; everything else unchanged. + p1 = ids_n1.index(img_id) + assert ids_n2[: p1 + 1] == ids_n1[: p1 + 1] + [], "prefix before extra must match N=1" + assert ids_n2[p1] == img_id and ids_n2[p1 + 1] == img_id, "two consecutive ids at the insertion point" + assert ids_n2[p1 + 2 :] == ids_n1[p1 + 1 :], "tail after the extra must match N=1's tail" + # N=3 same pattern, three in a row. 
+ assert ids_n3[p1 : p1 + 3] == [img_id, img_id, img_id] + assert ids_n3[p1 + 3 :] == ids_n1[p1 + 1 :] + + +@pytest.mark.skipif(not _hf_cached(_HUNYUAN_MODEL_ID), reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache") +def test_real_tokenizer_default_n1_byte_identical_to_legacy(): + """Default `num_images=1` must produce the exact same id sequence as + omitting the parameter — pins the legacy single-image regression + against the real tokenizer (not just FakeTokenizer).""" + from transformers import AutoTokenizer + + tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True) + legacy = build_prompt_tokens("hi", tok, task="it2i_think") + explicit = build_prompt_tokens("hi", tok, task="it2i_think", num_images=1) + assert legacy == explicit, "real tokenizer: default num_images=1 must be byte-identical to legacy" diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 1f88e9e7155..74fe268babf 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -539,7 +539,12 @@ def instantiate_timestep_tokens( timestep_scatter_index: BatchRaggedTensor, ): batch_size, seq_len, n_embd = x.shape - # batch_size x n x n_embd + # `_encode_cond_image` returns `t` as list[Tensor] for the + # multi-image branch (outer length = batch_size, currently fixed + # at 1 by the stage runtime `max_batch_size`); flatten to a Tensor + # before reshape. 
+ if isinstance(t, list): + t = torch.cat([ti.reshape(-1) for ti in t], dim=0) timestep_scatter_src = self.timestep_emb(t.reshape(-1)).reshape(batch_size, -1, n_embd) x.scatter_( dim=1, diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 5d8e9af6ab8..068dad87f8b 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -72,11 +72,21 @@ def resolve_stop_token_ids( return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]] +# Upstream "Multi-Image Fusion" caps reference images at 3 per request. +MAX_IMAGES_PER_REQUEST = 3 + + +def _validate_num_images(num_images: int) -> None: + if not (1 <= num_images <= MAX_IMAGES_PER_REQUEST): + raise ValueError(f"num_images must be in [1, {MAX_IMAGES_PER_REQUEST}], got {num_images}") + + def build_prompt( user_prompt: str, task: str = "it2i_think", sys_type: str | None = None, custom_system_prompt: str | None = None, + num_images: int = 1, ) -> str: """Build a HunyuanImage-3.0 prompt as a string (legacy/compat path). @@ -85,6 +95,9 @@ def build_prompt( tokens across segment boundaries (e.g. `。\\n\\n` -> id 3490). For inputs that need to match HF baseline byte-for-byte, use `build_prompt_tokens` instead and feed the result via prompt_token_ids. + + `num_images` emits N consecutive `` placeholders between + `User: ` and `user_prompt`. Ignored for text-only tasks. """ if task not in _TASK_PRESETS: raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") @@ -96,6 +109,8 @@ def build_prompt( sys_text = system_prompt.strip() if system_prompt else "" has_image_input = task.startswith("i2t") or task.startswith("it2i") + if has_image_input: + _validate_num_images(num_images) # t2i_vanilla: pretrain mode for direct text->image generation. The # vanilla system prompt drives the model with no chat structure. 
@@ -108,7 +123,7 @@ def build_prompt( # All other tasks (t2t / i2t / t2i_think / t2i_recaption / # it2i_think / it2i_recaption) use HunyuanImage3 Instruct chat template: - # <|startoftext|>{system?}\n\nUser: {?}{user_prompt}\n\nAssistant: {trigger?} + # <|startoftext|>{system?}\n\nUser: {*N?}{user_prompt}\n\nAssistant: {trigger?} # generation_config.json declares sequence_template="instruct", so the # AR prefill MUST use this template -- verified to match HF's # apply_chat_template output token-for-token (modulo BPE boundary merges). @@ -121,7 +136,7 @@ def build_prompt( parts.append(f"{sys_text}\n\n") parts.append("User: ") if has_image_input: - parts.append("") + parts.extend([""] * num_images) parts.append(user_prompt) parts.append("\n\nAssistant: ") if trigger_tag: @@ -142,6 +157,7 @@ def build_prompt_tokens( task: str = "it2i_think", sys_type: str | None = None, custom_system_prompt: str | None = None, + num_images: int = 1, ) -> PromptTokensResult: """Segment-by-segment tokenization that matches HF apply_chat_template. @@ -155,6 +171,8 @@ def build_prompt_tokens( Returns: PromptTokensResult + + `num_images` inserts N `` token ids; see `build_prompt`. """ if task not in _TASK_PRESETS: raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") @@ -167,6 +185,8 @@ def build_prompt_tokens( trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None has_image_input = task.startswith("i2t") or task.startswith("it2i") + if has_image_input: + _validate_num_images(num_images) # t2i_vanilla uses pretrain template with no chat structure; the vanilla # system prompt drives the model directly. 
No segment boundaries to @@ -190,7 +210,7 @@ def build_prompt_tokens( ids += tokenizer.encode("\n\n", add_special_tokens=False) ids += tokenizer.encode("User: ", add_special_tokens=False) if has_image_input: - ids += [img_id] + ids += [img_id] * num_images ids += tokenizer.encode(user_prompt, add_special_tokens=False) ids += tokenizer.encode("\n\nAssistant: ", add_special_tokens=False) if trig_id is not None: @@ -202,4 +222,12 @@ def build_prompt_tokens( ) -__all__ = ["build_prompt", "build_prompt_tokens", "resolve_stop_token_ids", _TASK_PRESETS] +__all__ = [ + "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS", + "MAX_IMAGES_PER_REQUEST", + "_TASK_PRESETS", + "available_tasks", + "build_prompt", + "build_prompt_tokens", + "resolve_stop_token_ids", +] diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index 1e057a71efa..e9d41ebf958 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -860,6 +860,13 @@ def process_image(self, image_input: ImageInput): else: raise TypeError(f"Unsupported image type: {type(image_input)}.") + # Each cond image keeps its own VAE bucket (mirrors official HF's + # ragged behavior in `_encode_cond_image`). VAE pixel tensors have + # different (H_i, W_i) per image, so they're flattened to 1-D and + # concatenated; vLLM `flat_from_sizes("image", vae_pixel_size)` slices + # them back per-image at consumption time. VIT (Siglip2 naflex) pads + # to `max_num_patches` so VIT fields keep the existing `batched` + # stack path. batch_data = [] for image in images: current_info = {} @@ -883,42 +890,49 @@ def process_image(self, image_input: ImageInput): _ss = torch.tensor(_ss, dtype=torch.long) current_info["vit_spatial_shapes"] = _ss.squeeze(0) - # VAE processing. 
- # The resize/crop math here mirrors HF's `resize_and_crop` with - # crop_type="center" (hunyuan3.0_ins/image_processor.py:61). VAE - # normalize uses the same transforms.Compose([ToTensor, - # Normalize([0.5], [0.5])]) as HF's `pil_image_to_tensor`. So - # numerical output of this branch should match HF up to floating- - # point reduction order. + # VAE: per-image bucket via `reso_group.get_target_size`; mirrors + # HF's `resize_and_crop` (crop_type="center"). Keep fp32 — the + # VAE encoder casts to model dtype at its boundary (see + # `_vae_encode`). image_width, image_height = self.reso_group.get_target_size(image.width, image.height) resized_image = self._resize_and_crop(image, (image_width, image_height)) - vae_pixel_values = self.vae_processor(resized_image) + vae_pixel_values = self.vae_processor(resized_image).squeeze(0) token_height = image_height // (self.hf_config.vae_downsample_factor[0] * self.hf_config.patch_size) token_width = image_width // (self.hf_config.vae_downsample_factor[1] * self.hf_config.patch_size) - # Keep fp32 — the VAE encoder casts to model dtype at its boundary - # (see _vae_encode). Casting to bf16 here costs ~7e-4 mean-abs-diff - # bf16 quantization error on every pixel vs HF (which keeps fp32 - # in build_cond_images), measurable as a real numerical drift in - # downstream image embeddings. - current_info["vae_pixel_values"] = vae_pixel_values.squeeze(0) + + current_info["vae_pixel_values_flat"] = vae_pixel_values.reshape(-1) + current_info["vae_pixel_size"] = torch.tensor(vae_pixel_values.numel(), dtype=torch.long) current_info["vae_token_grid_hw"] = torch.tensor([token_height, token_width]) - # size base_size, ratio_index = self.reso_group.get_base_size_and_ratio_index(image_width, image_height) current_info["base_size"] = torch.tensor(base_size) current_info["ratio_index"] = torch.tensor(ratio_index) batch_data.append(current_info) - # Stack the tensors in the list into a batch dimension (B, ...) 
- final_image_info = {} - if len(batch_data) > 0: - for key in batch_data[0].keys(): - final_image_info[key] = torch.stack([d[key] for d in batch_data], dim=0) + final_image_info: dict[str, torch.Tensor] = {} + if not batch_data: + return final_image_info + + # Same-shape fields: stack along a new image-batch dim as before. + same_shape_keys = [ + "vit_pixel_values", + "vit_pixel_attention_mask", + "vit_spatial_shapes", + "vae_token_grid_hw", + "vae_pixel_size", + "base_size", + "ratio_index", + ] + for key in same_shape_keys: + final_image_info[key] = torch.stack([d[key] for d in batch_data], dim=0) + + # Variable-shape VAE pixels: 1-D concat across images (paired with + # `vae_pixel_size` via `flat_from_sizes` in `_get_mm_fields_config`). + final_image_info["vae_pixel_values"] = torch.cat([d["vae_pixel_values_flat"] for d in batch_data], dim=0) - if final_image_info: - shapes_info = {k: tuple(v.shape) for k, v in final_image_info.items()} - logger.info(f"Successfully processed {len(images)} image(s). Final tensor shapes: {shapes_info}") + shapes_info = {k: tuple(v.shape) for k, v in final_image_info.items()} + logger.info(f"Successfully processed {len(images)} image(s). Final tensor shapes: {shapes_info}") return final_image_info @@ -1030,8 +1044,13 @@ def _get_mm_fields_config( config["vit_pixel_attention_mask"] = MultiModalFieldConfig.batched("image") if "vit_spatial_shapes" in hf_inputs: config["vit_spatial_shapes"] = MultiModalFieldConfig.batched("image") - if "vae_pixel_values" in hf_inputs: - config["vae_pixel_values"] = MultiModalFieldConfig.batched("image") + # `vae_pixel_values` is a 1-D concatenation of variable-shape per-image + # VAE tensors (see `process_image`). `vae_pixel_size` carries the + # per-image flat length so vLLM can split the buffer back per image. 
+ if "vae_pixel_values" in hf_inputs and "vae_pixel_size" in hf_inputs: + config["vae_pixel_values"] = MultiModalFieldConfig.flat_from_sizes("image", hf_inputs["vae_pixel_size"]) + if "vae_pixel_size" in hf_inputs: + config["vae_pixel_size"] = MultiModalFieldConfig.batched("image") if "vae_token_grid_hw" in hf_inputs: config["vae_token_grid_hw"] = MultiModalFieldConfig.batched("image") if "base_size" in hf_inputs: @@ -1668,6 +1687,9 @@ def _parse_and_validate_image_input( vit_pixel_attention_mask = kwargs.pop("vit_pixel_attention_mask", None) vit_spatial_shapes = kwargs.pop("vit_spatial_shapes", None) vae_pixel_values = kwargs.pop("vae_pixel_values", None) + # vae_pixel_size is only metadata for vLLM's flat_from_sizes split; + # we reconstruct per-image shapes from vae_token_grid_hw below. + kwargs.pop("vae_pixel_size", None) vae_token_grid_hw = kwargs.pop("vae_token_grid_hw", None) if vit_pixel_values is None or vae_pixel_values is None: @@ -1677,13 +1699,36 @@ def _parse_and_validate_image_input( if vit_pixel_values.numel() == 0 or vae_pixel_values.numel() == 0: return None + # `vae_pixel_values` arrives as a 1-D concatenation of per-image flat + # buffers (see `process_image` + `flat_from_sizes`). Reconstruct a + # list of per-image (3, H_i, W_i) tensors using the per-image grid + # dims so the downstream VAE encoder can run image-by-image. 
+ vae_factor_h = self.config.vae_downsample_factor[0] * self.config.patch_size + vae_factor_w = self.config.vae_downsample_factor[1] * self.config.patch_size + num_images = vae_token_grid_hw.shape[0] + vae_image_list: list[torch.Tensor] = [] + offset = 0 + flat = vae_pixel_values.reshape(-1) + for i in range(num_images): + token_h, token_w = vae_token_grid_hw[i].tolist() + h_i = int(token_h) * vae_factor_h + w_i = int(token_w) * vae_factor_w + n_i = 3 * h_i * w_i + vae_image_list.append(flat[offset : offset + n_i].reshape(3, h_i, w_i)) + offset += n_i + if offset != flat.numel(): + raise ValueError( + f"vae_pixel_values size mismatch: consumed {offset} of {flat.numel()} elements " + f"across {num_images} images (token_grid_hw={vae_token_grid_hw.tolist()})" + ) + return HunyuanImage3PixelInputs( type="pixel_values", pixel_values={ "vit_pixel_values": vit_pixel_values, "vit_pixel_attention_mask": vit_pixel_attention_mask, "vit_spatial_shapes": vit_spatial_shapes, - "vae_pixel_values": vae_pixel_values, + "vae_pixel_values": vae_image_list, "vae_token_grid_hw": vae_token_grid_hw, }, ) @@ -1795,22 +1840,12 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: # Perform ViT encoding vit_embeddings = self._vit_encode(vit_pixel_values, vit_pixel_attention_mask, vit_spatial_shapes) - # Perform VAE encoding - t, latents = self._vae_encode(vae_pixel_values, vae_cfg_factor) - - # Process VAE latents through patch_embed to convert to token embeddings - # VAE latents are in (B, C, H, W) format, need to be converted to (B, seq_len, hidden_size) + # VAE encode + patch_embed per image — each cond image is at its own + # `reso_group` bucket so shapes are ragged across the image-batch dim. 
 vae_token_embeddings = [] - batch_size = latents.shape[0] - for i in range(batch_size): - t_i = t[i] - latents_i = latents[i : i + 1] # Shape: (1, C, H, W) - - # Time embedding for VAE processing - t_emb = self.time_embed(t_i) - - # Process VAE latent through patch_embed - # Input: (1, C, H, W) -> Output: (1, seq_len, hidden_size) + for vae_image_i in vae_pixel_values: + t_i, latents_i = self._vae_encode(vae_image_i.unsqueeze(0), vae_cfg_factor) + t_emb = self.time_embed(t_i[0]) vae_tokens, _, _ = self.patch_embed(latents_i, t_emb) vae_token_embeddings.append(vae_tokens) From 46b3b84091954588861edbcc62a9638ec5f4cb67 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Fri, 8 May 2026 21:55:21 +0800 Subject: [PATCH 02/43] [Refactor] HunyuanImage-3.0 prompt_utils: split task and bot_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace conflated `task` strings (`it2i_think`, `t2i_recaption`, `t2i_vanilla`, ...) with two orthogonal axes: task ∈ {t2t, i2t, it2i, t2i} controls only `<img>` placeholder emission. bot_task ∈ {None, think, recaption, think_recaption, vanilla} controls system prompt + trigger tag. Mapping: bot_task=None → en_unified no trigger bot_task=think → en_unified <think> bot_task=recaption → en_unified <recaption> bot_task=think_recaption → en_think_recaption <think> bot_task=vanilla → en_vanilla no trigger, no chat (only valid with task='t2i') The pre-existing `_TASK_PRESETS` carried a `bot_task` field that was dead code under all paths actually exercised (`sys_type='en_unified' / 'en_vanilla'`); only `sys_type='dynamic'` consumed it, and nothing in the repo ever set that. The refactor promotes `bot_task` to the user-facing API and drops the `task` × mode conflation, also exposing the previously-unreachable `en_think_recaption` system prompt. Public helpers `available_bot_tasks()` and `resolve_sys_type(bot_task)` let callers derive the default sys_type without re-encoding the table. 
Side fix on `build_prompt`: the legacy code stripped the system prompt's leading whitespace while `build_prompt_tokens` did not. This was invisible while every system prompt was `unified_system_prompt_en` (no leading newline) but would diverge byte-wise once `bot_task='think_recaption'` exposes `en_think_recaption` (which starts with `\n`). `build_prompt` now keeps the system prompt verbatim, matching the segment-by-segment tokenization path and HF's `apply_chat_template`. end2end.py: `--bot-task` choices are now {none, think, recaption, think_recaption, vanilla}. The literal `none` is the explicit way to request `bot_task=None` on a modality whose default is `think` (text2img / img2img); leaving --bot-task unset still falls back to the modality default. The duplicated `_TASK_PRESETS` literal in the example script is removed in favor of `resolve_sys_type(bot_task)`. Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../hunyuan_image3/README.md | 26 +- .../hunyuan_image3/end2end.py | 97 ++----- .../test_hunyuan_image3_it2i_multi_image.py | 62 +++-- .../hunyuan_image3/test_prompt_utils.py | 252 ++++++------------ .../models/hunyuan_image3/prompt_utils.py | 186 ++++++++----- 5 files changed, 254 insertions(+), 369 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md index 6db4cbec9ed..8b90e6b7fa3 100644 --- a/examples/offline_inference/hunyuan_image3/README.md +++ b/examples/offline_inference/hunyuan_image3/README.md @@ -112,6 +112,7 @@ python end2end.py --modality text2img \ --additional-config '{"torchair_graph_config":{"enabled":true}}' ``` + ## Key Arguments | Argument | Description | @@ -123,16 +124,15 @@ python end2end.py --modality text2img \ | `--steps` | Number of diffusion inference steps for image generation. | | `--guidance-scale` | Classifier-free guidance scale for image generation. | | `--height`, `--width` | Output image size for `text2img`. 
 | -| `--bot-task` | Prompt behavior. `auto` selects the default from `--modality`; `think` adds `<think>`; `recaption` adds `<recaption>`; `vanilla` uses the text-to-image pretrain template. | +| `--bot-task` | Override prompt mode. `none`, `think`, `recaption`, `think_recaption`, or `vanilla`. | | `--sys-type` | Override the system prompt type, for example `en_unified` or `en_vanilla`. | | `--vae-use-tiling` | Enable VAE tiling for memory reduction. | ## Notes -- `hunyuan_image3_ar.yaml` is a 4-card AR-only text/comprehension deploy. It sets `engine_output_type: text`, `final_output_type: text`, and text sampling defaults. -- `hunyuan_image3_dit.yaml` is a single-stage DiT deploy with `stage_id: 0`; it does not require stage 1 or a running AR stage. +- `hunyuan_image3_ar.yaml` is a 4-card AR-only text/comprehension deploy. +- `hunyuan_image3_dit.yaml` is a single-stage DiT deploy with `stage_id: 0`. - The old HunyuanImage3 YAMLs under `model_executor/stage_configs/` and `platforms/*/stage_configs/` have been folded into the deploy YAMLs. -- This PR does not keep the HunyuanImage3 AR-to-DiT KV reuse wiring. The deploy YAMLs describe the topology and platform settings only. ## Prompt Format @@ -148,22 +148,8 @@ Assistant: {trigger_tag?} - `<img>`: Placeholder for each input image (single token; expanded by the multimodal pipeline). - Trigger tags: `<think>` for CoT and `<recaption>` for recaptioning, placed after `Assistant: `. -- System prompt: Auto-selected based on task. -- `t2i_vanilla` is the only task that uses the bare pretrain template without chat structure. -- The example composes the internal prompt task from `--modality` and `--bot-task` - before calling `prompt_utils`; for example, `img2text + think` becomes - `i2t_think` for prompt and stop-token lookup. +- System prompt: Auto-selected from `task` and `bot_task`. +- `bot_task='vanilla'` with `task='t2i'` uses the bare pretrain template. 
The shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils.build_prompt_tokens()` helper handles segment-by-segment tokenization and matches HF `apply_chat_template`. - -## FAQ - -- **OOM errors**: Decrease `gpu_memory_utilization` in the deploy YAML, use a smaller `max_num_batched_tokens`, or enable VAE tiling with `--vae-use-tiling`. -- **Custom image sizes**: Use `--height` and `--width` flags (multiples of 16 recommended). - -| Stage | VRAM (approx) | -| :--- | :--- | -| Stage 0 (AR) | ~15 GiB + KV Cache | -| Stage 1 (DiT) | ~30 GiB | -| Total (8-GPU) | ~45 GiB + KV Cache | diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index f9f734c9f4a..9d8f5113201 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -2,16 +2,10 @@ HunyuanImage-3.0-Instruct unified end-to-end inference script. Supports all modalities through a single entry point: - - text2img: Text → AR → DiT → Image - - img2img: Text+Image → AR → DiT → Edited Image (IT2I) - - img2text: Image+Text → AR → Text description (I2T) - - text2text: Text → AR → Text (comprehension, no image) - -Usage: - python end2end.py --modality text2img --prompts "A cute cat" - python end2end.py --modality img2img --image-path input.png --prompts "Make it snowy" - python end2end.py --modality img2img --image-path img1.png,img2.png --prompts "Combine" - python end2end.py --modality img2text --image-path input.png --prompts "Describe this image" + - text2img: Text -> AR -> DiT -> Image + - img2img: Text+Image -> AR -> DiT -> Edited Image (IT2I) + - img2text: Image+Text -> AR -> Text description (I2T) + - text2text: Text -> AR -> Text (comprehension, no image) """ import argparse @@ -20,9 +14,9 @@ from pathlib import Path from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - _TASK_PRESETS, build_prompt_tokens, resolve_stop_token_ids, + resolve_sys_type, ) from 
vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniPromptType @@ -46,11 +40,12 @@ "text2text": "text-to-text", } -_MODALITY_TASK_MAP = { - "text2img": "t2i", - "img2img": "it2i", - "img2text": "i2t", - "text2text": "t2t", +# Modality -> (task, default bot_task) mapping. +_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = { + "text2img": ("t2i", "think"), + "img2img": ("it2i", "think"), + "img2text": ("i2t", None), + "text2text": ("t2t", None), } @@ -81,7 +76,6 @@ def parse_args(): help="Output directory to save results.", ) - # Generation parameters parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.") parser.add_argument("--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale.") parser.add_argument("--seed", type=int, default=42, help="Random seed.") @@ -93,17 +87,12 @@ def parse_args(): help="Enable VAE tiling for memory optimization.", ) - # Prompt configuration parser.add_argument( "--bot-task", type=str, - default="auto", - choices=["auto", "think", "recaption", "think_recaption", "vanilla"], - help=( - "Prompt behavior. 'auto' selects the default for the modality; " - "'think' adds ; 'recaption' adds ; " - "'vanilla' uses the t2i pretrain template." - ), + default=None, + choices=["none", "think", "recaption", "think_recaption", "vanilla"], + help="Override prompt mode. Default: auto from --modality.", ) parser.add_argument( "--sys-type", @@ -112,7 +101,6 @@ def parse_args(): help="Override system prompt type (e.g. 
en_unified, en_vanilla).", ) - # Omni init args parser.add_argument("--deploy-config", type=str, default=None, help="Custom deploy YAML path.") parser.add_argument("--stage-configs-path", type=str, default=None, help="Custom legacy stage config YAML path.") parser.add_argument("--log-stats", action="store_true", default=False) @@ -158,22 +146,13 @@ def main(): os.makedirs(args.output, exist_ok=True) additional_config = parse_additional_config(args.additional_config) - # Determine task for prompt formatting from modality + bot behavior. - task = _MODALITY_TASK_MAP[args.modality] - assert task is not None - bot_task = args.bot_task - if bot_task != "auto": - task = task + "_" + bot_task - if task not in _TASK_PRESETS: - valid_bot_tasks = { - "text2img": ["think", "recaption", "vanilla"], - "img2img": ["think", "recaption", "think_recaption"], - "img2text": ["auto"], - "text2text": ["auto"], - }[args.modality] - raise ValueError( - f"--bot-task {bot_task!r} is not supported for {args.modality}. 
Choose from: {valid_bot_tasks}" - ) + task, default_bot_task = _MODALITY_TASK_MAP[args.modality] + if args.bot_task is None: + bot_task: str | None = default_bot_task + elif args.bot_task == "none": + bot_task = None + else: + bot_task = args.bot_task if args.deploy_config is not None and args.stage_configs_path is not None: raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.") @@ -183,7 +162,6 @@ def main(): if deploy_config is None and stage_configs_path is None: deploy_config = _MODALITY_DEFAULT_DEPLOY_CONFIG[args.modality] - # Build Omni omni_kwargs = { "model": args.model, "vae_use_tiling": args.vae_use_tiling, @@ -202,10 +180,8 @@ def main(): omni = Omni(**omni_kwargs) - # Prepare prompts prompts = args.prompts or ["A cute cat"] if not prompts: - print("[Info] No prompts provided, using default.") prompts = ["A cute cat"] input_images: list = [] @@ -222,34 +198,23 @@ def main(): if not input_images: raise ValueError(f"--image-path produced no usable paths: {args.image_path!r}") - # Load tokenizer for segment-wise prompt tokenization (matches HF - # apply_chat_template byte-for-byte; see build_prompt_tokens docstring). from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) - mm_image_payload = (input_images[0] if len(input_images) == 1 else input_images) if input_images else None - # Format prompts formatted_prompts: list[OmniPromptType] = [] - for p in prompts: - # Only pass `num_images` for modalities that actually consume images; - # text-only paths ignore the parameter, but threading it - # unconditionally reads as if t2i needed at least one image. 
- build_kwargs: dict = {"task": task, "sys_type": args.sys_type} + for prompt in prompts: + build_kwargs: dict = {"task": task, "bot_task": bot_task, "sys_type": args.sys_type} if input_images: build_kwargs["num_images"] = len(input_images) - result = build_prompt_tokens(p, tokenizer, **build_kwargs) + result = build_prompt_tokens(prompt, tokenizer, **build_kwargs) token_ids = result.token_ids - effective_sys_type = result.system_prompt_type + effective_sys_type = args.sys_type or resolve_sys_type(bot_task) - # `prompt_token_ids` drives the AR stage (matches HF byte-for-byte). - # `prompt` and `use_system_prompt` are forwarded by ar2diffusion to - # the DiT stage so the diffusion pipeline can rebuild the same - # system prefix when constructing its model inputs. prompt_dict: dict = { "prompt_token_ids": token_ids, - "prompt": p, + "prompt": prompt, "use_system_prompt": effective_sys_type, } @@ -268,14 +233,11 @@ def main(): formatted_prompts.append(prompt_dict) - # Build sampling params from defaults params_list = list(omni.default_sampling_params_list) - # Override diffusion params if applicable from vllm_omni.inputs.data import OmniDiffusionSamplingParams ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer) - assert ar_stop_token_ids is not None for sp in params_list: if isinstance(sp, OmniDiffusionSamplingParams): sp.num_inference_steps = args.steps @@ -283,13 +245,12 @@ def main(): sp.guidance_scale_provided = True if args.seed is not None: sp.seed = args.seed - if args.modality in ("text2img",): + if args.modality == "text2img": sp.height = args.height sp.width = args.width elif hasattr(sp, "stop_token_ids"): sp.stop_token_ids = ar_stop_token_ids - # Print configuration print(f"\n{'=' * 60}") print("HunyuanImage-3.0 Generation Configuration:") print(f" Model: {args.model}") @@ -314,13 +275,10 @@ def main(): print(f" Prompts: {prompts}") print(f"{'=' * 60}\n") - # Generate omni_outputs = 
list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list)) - # Process outputs img_idx = 0 for req_output in omni_outputs: - # Text output (AR stage or text-only) ro = getattr(req_output, "request_output", None) txt = "" if ro and getattr(ro, "outputs", None): @@ -334,7 +292,6 @@ def main(): if txt: print(f"[Output] Text:\n{txt}") - # Image output (DiT stage) images = getattr(req_output, "images", None) if not images and ro and hasattr(ro, "images"): images = ro.images diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py index c8a9891385c..7a1e266b936 100644 --- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py +++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py @@ -66,21 +66,25 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]: return list(range(100, 100 + len(text))) -_IMAGE_TASKS = ("i2t", "it2i_think", "it2i_recaption") -_TEXT_ONLY_TASKS = ("t2t",) +_IMAGE_TASK_COMBOS = ( + ("i2t", None), + ("it2i", "think"), + ("it2i", "recaption"), +) +_TEXT_ONLY_TASK_COMBOS = (("t2t", None),) # -------------------- string builder -------------------- -@pytest.mark.parametrize("task", _IMAGE_TASKS) +@pytest.mark.parametrize("task,bot_task", _IMAGE_TASK_COMBOS) @pytest.mark.parametrize("num_images", [1, 2, 3]) -def test_build_prompt_emits_N_consecutive_img_placeholders(task: str, num_images: int): +def test_build_prompt_emits_N_consecutive_img_placeholders(task: str, bot_task: str | None, num_images: int): """N=1/2/3 -> exactly N `` substrings appear consecutively between `User: ` and the user prompt, with no separator between them.""" - s = build_prompt("HELLO", task=task, num_images=num_images) + s = build_prompt("HELLO", task=task, bot_task=bot_task, num_images=num_images) assert s.count("") == num_images, ( - f"task={task} num_images={num_images}: 
expected {num_images} " + f"task={task} bot_task={bot_task} num_images={num_images}: expected {num_images} " f"placeholders, found {s.count('')} -- prompt was: {s!r}" ) @@ -97,24 +101,24 @@ def test_build_prompt_emits_N_consecutive_img_placeholders(task: str, num_images def test_build_prompt_default_num_images_matches_legacy(): """num_images default = 1 must produce a string bit-identical to the pre-multi-image behavior (single `` placeholder).""" - legacy = build_prompt("HELLO", task="it2i_think") - explicit = build_prompt("HELLO", task="it2i_think", num_images=1) + legacy = build_prompt("HELLO", task="it2i", bot_task="think") + explicit = build_prompt("HELLO", task="it2i", bot_task="think", num_images=1) assert legacy == explicit, "default num_images=1 must match legacy single-image output" # -------------------- token builder -------------------- -@pytest.mark.parametrize("task", _IMAGE_TASKS) -def test_build_prompt_tokens_inserts_N_img_ids(task: str): +@pytest.mark.parametrize("task,bot_task", _IMAGE_TASK_COMBOS) +def test_build_prompt_tokens_inserts_N_img_ids(task: str, bot_task: str | None): """N=1/2/3 -> the resulting id sequence contains exactly N copies of img_id (=2) sitting consecutively after the `User: ` segment.""" tok = FakeTokenizer() - ids_n1 = build_prompt_tokens("hi", tok, task=task, num_images=1) + ids_n1 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=1) tok = FakeTokenizer() - ids_n2 = build_prompt_tokens("hi", tok, task=task, num_images=2) + ids_n2 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=2) tok = FakeTokenizer() - ids_n3 = build_prompt_tokens("hi", tok, task=task, num_images=3) + ids_n3 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=3) assert ids_n1.count(2) == 1 assert ids_n2.count(2) == 2 @@ -141,9 +145,9 @@ def test_build_prompt_tokens_default_num_images_matches_legacy(): omitting the parameter (regression guard for existing single-image callers).""" 
tok_a = FakeTokenizer() - legacy = build_prompt_tokens("hi", tok_a, task="it2i_think") + legacy = build_prompt_tokens("hi", tok_a, task="it2i", bot_task="think") tok_b = FakeTokenizer() - explicit = build_prompt_tokens("hi", tok_b, task="it2i_think", num_images=1) + explicit = build_prompt_tokens("hi", tok_b, task="it2i", bot_task="think", num_images=1) assert legacy == explicit # Also: encode() must have been called on the same set of segments, # so segment boundaries are preserved. @@ -153,23 +157,23 @@ def test_build_prompt_tokens_default_num_images_matches_legacy(): # -------------------- validation -------------------- -@pytest.mark.parametrize("task", _IMAGE_TASKS) +@pytest.mark.parametrize("task,bot_task", _IMAGE_TASK_COMBOS) @pytest.mark.parametrize("bad", [0, -1, MAX_IMAGES_PER_REQUEST + 1, 99]) -def test_build_prompt_rejects_out_of_range_num_images(task: str, bad: int): +def test_build_prompt_rejects_out_of_range_num_images(task: str, bot_task: str | None, bad: int): with pytest.raises(ValueError, match="num_images must be in"): - build_prompt("hi", task=task, num_images=bad) + build_prompt("hi", task=task, bot_task=bot_task, num_images=bad) with pytest.raises(ValueError, match="num_images must be in"): - build_prompt_tokens("hi", FakeTokenizer(), task=task, num_images=bad) + build_prompt_tokens("hi", FakeTokenizer(), task=task, bot_task=bot_task, num_images=bad) -@pytest.mark.parametrize("task", _TEXT_ONLY_TASKS) +@pytest.mark.parametrize("task,bot_task", _TEXT_ONLY_TASK_COMBOS) @pytest.mark.parametrize("num_images", [0, 1, 2, 99]) -def test_text_only_tasks_ignore_num_images(task: str, num_images: int): +def test_text_only_tasks_ignore_num_images(task: str, bot_task: str | None, num_images: int): """Validation only kicks in for image-input tasks; t2t et al. 
accept any num_images and emit zero `` placeholders.""" - s = build_prompt("hi", task=task, num_images=num_images) + s = build_prompt("hi", task=task, bot_task=bot_task, num_images=num_images) assert "" not in s - ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, num_images=num_images) + ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, bot_task=bot_task, num_images=num_images) assert 2 not in ids @@ -198,7 +202,7 @@ def test_real_tokenizer_emits_n_consecutive_img_ids(num_images: int): img_id = tok.convert_tokens_to_ids("") assert img_id is not None and img_id >= 0, f" not in tokenizer vocab; got id={img_id}" - ids = build_prompt_tokens("hi", tok, task="it2i_think", num_images=num_images) + ids = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=num_images) # Exactly N copies of id, all consecutive. img_positions = [i for i, x in enumerate(ids) if x == img_id] @@ -221,9 +225,9 @@ def test_real_tokenizer_n_plus_one_extends_by_exactly_one_img_id(): tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True) img_id = tok.convert_tokens_to_ids("") - ids_n1 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=1) - ids_n2 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=2) - ids_n3 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=3) + ids_n1 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1) + ids_n2 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=2) + ids_n3 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=3) assert len(ids_n2) == len(ids_n1) + 1, f"N=2 should be N=1 + 1 token; got {len(ids_n2)} vs {len(ids_n1)}" assert len(ids_n3) == len(ids_n1) + 2, f"N=3 should be N=1 + 2 tokens; got {len(ids_n3)} vs {len(ids_n1)}" @@ -246,6 +250,6 @@ def test_real_tokenizer_default_n1_byte_identical_to_legacy(): from transformers import AutoTokenizer tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, 
trust_remote_code=True) - legacy = build_prompt_tokens("hi", tok, task="it2i_think") - explicit = build_prompt_tokens("hi", tok, task="it2i_think", num_images=1) + legacy = build_prompt_tokens("hi", tok, task="it2i", bot_task="think") + explicit = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1) assert legacy == explicit, "real tokenizer: default num_images=1 must be byte-identical to legacy" diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 1130c0f6db1..4d98bc5dcf2 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -1,20 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Regression tests for HunyuanImage3 prompt construction (PR #3243). - -Two layers: - 1. Pure-logic tests with a recording fake tokenizer -- protect the - prompt template structure (BOS, User:/Assistant: framing, trigger - placement, image placeholder position) and protect the segment- - by-segment tokenization contract (each segment must hit - `tokenizer.encode` in isolation). - 2. Real-tokenizer regression -- run when the HunyuanImage3-Instruct - tokenizer is in the local HF cache. Asserts the segment-tokenized - output diverges from the naive full-string encode, which is the - bug-tripping fixture for the cross-segment BPE merge fix - (commit 7bd429ed). 
-""" - from __future__ import annotations import ast @@ -25,6 +10,8 @@ from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS, + _TASK_PRESETS, + available_bot_tasks, available_tasks, build_prompt, build_prompt_tokens, @@ -34,18 +21,7 @@ pytestmark = [pytest.mark.core_model, pytest.mark.cpu] -# -------------------- Pure-logic structural tests -------------------- - - class FakeTokenizer: - """Minimal tokenizer stub that records every encode() call. - - Returns deterministic ids from convert_tokens_to_ids while - encode() returns one id per character starting at 100. This lets - tests both verify segmentation (by inspecting `encode_calls`) and - locate substrings inside the returned id list. - """ - SPECIAL = { "<|startoftext|>": 1, "": 2, @@ -72,85 +48,80 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]: def test_available_tasks_covers_all_modalities(): - tasks = set(available_tasks()) - assert tasks >= { - "t2t", - "i2t", + assert set(available_tasks()) == {"t2t", "i2t", "it2i", "t2i"} + + +def test_available_bot_tasks_covers_all_modes(): + assert set(available_bot_tasks()) == {None, "think", "recaption", "think_recaption", "vanilla"} + + +def test_legacy_task_presets_still_available(): + assert { "it2i_think", "it2i_recaption", "it2i_think_recaption", "t2i_think", "t2i_recaption", "t2i_vanilla", - } + } <= set(_TASK_PRESETS) def test_resolve_stop_token_ids_uses_answer_for_generation_tasks(): tok = FakeTokenizer() - answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [answer_id] assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [answer_id] + assert resolve_stop_token_ids(task="it2i", bot_task="think", tokenizer=tok) == [answer_id] @pytest.mark.parametrize( - "task", + "task,bot_task", [ - "t2t", - "i2t", - "it2i_think", - "it2i_recaption", - "it2i_think_recaption", - "t2i_think", - "t2i_recaption", + ("t2t", 
None), + ("i2t", None), + ("it2i", "think"), + ("it2i", "recaption"), + ("it2i", "think_recaption"), + ("t2i", "think"), + ("t2i", "recaption"), + ("t2i", "think_recaption"), ], ) -def test_build_prompt_string_structure_chat_template(task: str): - """Chat-template tasks must produce <|startoftext|>...User: ...Assistant: ... - with image placeholder (when applicable) and trigger tag AFTER `Assistant: `.""" - s = build_prompt("HELLO", task=task) - +def test_build_prompt_string_structure_chat_template(task: str, bot_task: str | None): + s = build_prompt("HELLO", task=task, bot_task=bot_task) assert s.startswith("<|startoftext|>") assert "User: " in s assert "Assistant: " in s assert s.index("User: ") < s.index("HELLO") < s.index("Assistant: ") - if task.startswith(("i2t", "it2i")): - assert s.index("User: ") < s.index("") < s.index("HELLO"), ( - " placeholder must sit between `User: ` and the user prompt" - ) + if task in ("i2t", "it2i"): + assert s.index("User: ") < s.index("") < s.index("HELLO") else: assert "" not in s - # Trigger tag must be the FINAL token of the prompt (after `Assistant: `). - # Note: the system prompt itself mentions / as mode - # documentation, so substring index() catches the wrong occurrence -- use - # endswith() which directly captures "trigger is at the tail" (the Part A - # fix: trigger goes AFTER `Assistant: `, not before user_prompt). - if task in ("it2i_think", "t2i_think", "it2i_think_recaption"): - assert s.endswith("Assistant: "), ( - f"Trigger must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}" - ) - if task in ("it2i_recaption", "t2i_recaption"): - assert s.endswith("Assistant: "), ( - f"Trigger must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}" - ) - if task in ("t2t", "i2t"): - assert s.endswith("Assistant: "), "Plain (no-trigger) task must end at `Assistant: ` with no trailing tag." 
+ if bot_task in ("think", "think_recaption"): + assert s.endswith("Assistant: ") + elif bot_task == "recaption": + assert s.endswith("Assistant: ") + elif bot_task is None: + assert s.endswith("Assistant: ") def test_build_prompt_vanilla_uses_pretrain_template(): - """t2i_vanilla is the only task that bypasses chat structure -- direct - text->image generation driven by the vanilla system prompt.""" - s = build_prompt("HELLO", task="t2i_vanilla") + s = build_prompt("HELLO", task="t2i", bot_task="vanilla") assert s.startswith("<|startoftext|>") assert "User: " not in s assert "Assistant: " not in s - assert "" not in s - assert "" not in s assert s.endswith("HELLO") +def test_build_prompt_vanilla_rejects_non_t2i_task(): + with pytest.raises(ValueError, match="bot_task='vanilla'"): + build_prompt("x", task="it2i", bot_task="vanilla") + with pytest.raises(ValueError, match="bot_task='vanilla'"): + build_prompt_tokens("x", FakeTokenizer(), task="i2t", bot_task="vanilla") + + def test_build_prompt_unknown_task_raises(): with pytest.raises(ValueError, match="Unknown task"): build_prompt("x", task="bogus") @@ -158,127 +129,83 @@ def test_build_prompt_unknown_task_raises(): build_prompt_tokens("x", FakeTokenizer(), task="bogus") +def test_build_prompt_unknown_bot_task_raises(): + with pytest.raises(ValueError, match="Unknown bot_task"): + build_prompt("x", task="t2i", bot_task="bogus") + with pytest.raises(ValueError, match="Unknown bot_task"): + build_prompt_tokens("x", FakeTokenizer(), task="t2i", bot_task="bogus") + + def test_build_prompt_tokens_segments_each_boundary(): - """Regression for cross-segment BPE merge bug (commit 7bd429ed): - each template segment must hit tokenizer.encode() independently; - user_prompt MUST NOT be concatenated with the following separator - in the same encode() call.""" tok = FakeTokenizer() - build_prompt_tokens("写诗。", tok, task="i2t") - - # Each canonical segment is encoded in its own call. 
+ build_prompt_tokens("写诗。", tok, task="i2t", bot_task=None) assert "User: " in tok.encode_calls - assert "写诗。" in tok.encode_calls, ( - "user_prompt must be encoded alone -- if it is concatenated with the " - "trailing separator, BPE will merge across the boundary (the PR-#3243 bug)." - ) + assert "写诗。" in tok.encode_calls assert "\n\nAssistant: " in tok.encode_calls - - # No call must contain user_prompt glued to neighboring text. for call in tok.encode_calls: if call != "写诗。": - assert "写诗。" not in call, f"user_prompt leaked into a multi-segment encode call: {call!r}" + assert "写诗。" not in call def test_build_prompt_tokens_image_placeholder_present_for_image_tasks(): tok = FakeTokenizer() - result = build_prompt_tokens("hi", tok, task="i2t") + result = build_prompt_tokens("hi", tok, task="i2t", bot_task=None) ids = result.token_ids - assert ids[0] == FakeTokenizer.SPECIAL["<|startoftext|>"], "BOS (<|startoftext|>) must be the first token" - assert FakeTokenizer.SPECIAL[""] in ids, " placeholder must be present for i2t/it2i tasks" + assert ids[0] == FakeTokenizer.SPECIAL["<|startoftext|>"] + assert FakeTokenizer.SPECIAL[""] in ids def test_build_prompt_tokens_no_image_for_text_only_tasks(): tok = FakeTokenizer() - result = build_prompt_tokens("hi", tok, task="t2t") + result = build_prompt_tokens("hi", tok, task="t2t", bot_task=None) ids = result.token_ids - assert FakeTokenizer.SPECIAL[""] not in ids, " must NOT appear for text-only tasks" + assert FakeTokenizer.SPECIAL[""] not in ids @pytest.mark.parametrize( - "task,trigger_id", + "task,bot_task,trigger_id", [ - ("it2i_think", FakeTokenizer.SPECIAL[""]), - ("t2i_think", FakeTokenizer.SPECIAL[""]), - ("it2i_recaption", FakeTokenizer.SPECIAL[""]), - ("t2i_recaption", FakeTokenizer.SPECIAL[""]), + ("it2i", "think", FakeTokenizer.SPECIAL[""]), + ("t2i", "think", FakeTokenizer.SPECIAL[""]), + ("t2i", "think_recaption", FakeTokenizer.SPECIAL[""]), + ("it2i", "recaption", FakeTokenizer.SPECIAL[""]), + ("t2i", 
"recaption", FakeTokenizer.SPECIAL[""]), + ("it2i_think", None, FakeTokenizer.SPECIAL[""]), + ("it2i_recaption", None, FakeTokenizer.SPECIAL[""]), ], ) -def test_build_prompt_tokens_trigger_is_last_token(task: str, trigger_id: int): - """Trigger tag id must be the LAST token (after `Assistant: ` segment).""" +def test_build_prompt_tokens_trigger_is_last_token(task: str, bot_task: str | None, trigger_id: int): tok = FakeTokenizer() - result = build_prompt_tokens("hi", tok, task=task) - ids = result.token_ids - assert ids[-1] == trigger_id + result = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task) + assert result.token_ids[-1] == trigger_id def test_build_prompt_tokens_no_trigger_for_plain_tasks(): - """Tasks without trigger_tag (t2t / i2t) must NOT append a trigger id.""" tok = FakeTokenizer() - result = build_prompt_tokens("hi", tok, task="t2t") - ids = result.token_ids - assert ids[-1] not in { + result = build_prompt_tokens("hi", tok, task="t2t", bot_task=None) + assert result.token_ids[-1] not in { FakeTokenizer.SPECIAL[""], FakeTokenizer.SPECIAL[""], } -# -------------------- end2end.py wiring guard -------------------- - - def _repo_root() -> pathlib.Path: - # tests/diffusion/models/hunyuan_image3/test_prompt_utils.py -> repo root return pathlib.Path(__file__).resolve().parents[4] def test_end2end_routes_through_shared_prompt_utils(): - """Regression for the *delivery vector* of PR #3243. - - Background: the wrong-template bug that PR #3243 fixes was introduced - when end2end.py grew its own hand-rolled prompt builder that diverged - from the canonical instruct chat template. To prevent that exact - failure mode from recurring, end2end.py MUST: - 1. Import the prompt builders from the shared prompt_utils module. - 2. NOT redefine `build_prompt` or `build_prompt_tokens` locally. 
- - A local redefinition is precisely how a future merge can silently - re-introduce a pretrain-style template (trigger BEFORE user_prompt, - no User:/Assistant: framing, etc.) without touching prompt_utils, - bypassing every other test in this file. - """ end2end_path = _repo_root() / "examples" / "offline_inference" / "hunyuan_image3" / "end2end.py" - assert end2end_path.is_file(), f"end2end.py not found at {end2end_path}" - tree = ast.parse(end2end_path.read_text(encoding="utf-8")) local_func_names = {n.name for n in ast.walk(tree) if isinstance(n, ast.FunctionDef)} - forbidden = {"build_prompt", "build_prompt_tokens"} - redefined = local_func_names & forbidden - assert not redefined, ( - f"end2end.py defines {sorted(redefined)} locally. This is exactly how " - "the wrong prompt template re-entered the example before PR #3243. " - "Use the shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils` " - "helpers instead." - ) + assert not (local_func_names & {"build_prompt", "build_prompt_tokens"}) imported_from_prompt_utils: set[str] = set() for node in ast.walk(tree): if isinstance(node, ast.ImportFrom) and node.module and node.module.endswith("hunyuan_image3.prompt_utils"): imported_from_prompt_utils.update(alias.name for alias in node.names) - expected_imports = { - "_TASK_PRESETS", - "build_prompt_tokens", - "resolve_stop_token_ids", - } - assert expected_imports <= imported_from_prompt_utils, ( - "end2end.py must import the HunyuanImage3 prompt and stop-token helpers from " - "vllm_omni.diffusion.models.hunyuan_image3.prompt_utils -- the shared " - "module is the single source of truth for the AR-prefill template and " - "bot_task-derived AR stop token ids." 
- ) - - -# -------------------- Real-tokenizer regression -------------------- + expected_imports = {"build_prompt_tokens", "resolve_stop_token_ids", "resolve_sys_type"} + assert expected_imports <= imported_from_prompt_utils _HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct" @@ -290,41 +217,14 @@ def _hf_cached(model_id: str) -> bool: return os.path.isdir(snap_dir) and any(os.scandir(snap_dir)) -@pytest.mark.skipif( - not _hf_cached(_HUNYUAN_MODEL_ID), - reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache", -) +@pytest.mark.skipif(not _hf_cached(_HUNYUAN_MODEL_ID), reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache") def test_segment_tokenize_diverges_from_full_string_encode(): - """Regression for PR #3243 segment-tokenization fix. - - The naive `tokenizer.encode(build_prompt(...))` lets BPE merge tokens - across segment boundaries (notably `。\\n\\n` -> a single id), which - drifts the AR prefill away from HF's apply_chat_template output. The - segment-by-segment build_prompt_tokens must produce a STRICTLY - DIFFERENT id sequence on a prompt that triggers the merge. - - If someone "simplifies" build_prompt_tokens to call encode() on the - full string, this assertion fires. - """ from transformers import AutoTokenizer tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True) - user_prompt = "写一首关于夜的诗。" - result = build_prompt_tokens(user_prompt, tok, task="i2t") + result = build_prompt_tokens(user_prompt, tok, task="i2t", bot_task=None) seg_ids = result.token_ids - full_ids = tok.encode(build_prompt(user_prompt, task="i2t"), add_special_tokens=False) - - assert seg_ids != full_ids, ( - "build_prompt_tokens output equals naive full-string encode -- " - "the BPE-merge-bypass behavior is no longer exercised. This means " - "the segment-by-segment fix from PR #3243 has been silently undone." 
- ) - - # Segmenting prevents merges, so the segment id list should have AT LEAST - # as many tokens as the merged version (a merge consumes 2+ ids -> 1). - assert len(seg_ids) >= len(full_ids), ( - f"segment-encoded length ({len(seg_ids)}) shorter than full-string " - f"merged length ({len(full_ids)}) -- impossible if segmentation is " - f"genuinely bypassing merges." - ) + full_ids = tok.encode(build_prompt(user_prompt, task="i2t", bot_task=None), add_special_tokens=False) + assert seg_ids != full_ids + assert len(seg_ids) >= len(full_ids) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 068dad87f8b..4ed277eeed2 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -11,8 +11,23 @@ `JointImageInfo` objects produced by image preprocessing. The example flow uses an `` placeholder + `multi_modal_data` instead, so it needs a lighter-weight builder that only requires a HF tokenizer. This -module provides that builder; the task -> template mapping below is the -canonical mapping for both flows. +module provides that builder; the (task, bot_task) -> template mapping +below is the canonical mapping for both flows. + +Two orthogonal axes: + + * `task` selects the I/O modality combination, which only controls + whether `` placeholders are emitted between `User: ` and the + user prompt: ``i2t`` / ``it2i`` produce them, ``t2t`` / ``t2i`` do + not. + + * `bot_task` selects the prompting mode and drives both the system + prompt and the trigger tag appended after ``Assistant: ``. ``None`` + (default) gives a plain Assistant turn under the unified prompt; + ``think`` / ``recaption`` switch the trigger tag to ```` / + ````; ``think_recaption`` swaps the system prompt for + the dedicated combined-mode template; ``vanilla`` drops the chat + structure entirely (pretrain template, ``t2i`` only). 
""" from __future__ import annotations @@ -45,30 +60,77 @@ "": 130106, } -# task -> (sys_type, bot_task, trigger_tag) +# bot_task -> (sys_type, trigger_tag). +# ``vanilla`` is special-cased downstream: it bypasses the chat template +# (no ``User:`` / ``Assistant:`` framing) and is only valid with +# ``task='t2i'``. +_BOT_TASK_PRESETS: dict[str | None, tuple[str, str | None]] = { + None: ("en_unified", None), + "think": ("en_unified", ""), + "recaption": ("en_unified", ""), + "think_recaption": ("en_think_recaption", ""), + "vanilla": ("en_vanilla", None), +} + +_TASKS: frozenset[str] = frozenset({"t2t", "i2t", "it2i", "t2i"}) + +# Legacy composite task alias -> (task, bot_task). Keep this during rebase so +# older callers and intermediate commits still resolve cleanly. _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = { "t2t": ("en_unified", None, None), "i2t": ("en_unified", None, None), "it2i_think": ("en_unified", "think", ""), "it2i_recaption": ("en_unified", "recaption", ""), "it2i_think_recaption": ("en_unified", "think_recaption", ""), - "t2i": ("en_unified", "image", None), - "t2i_vanilla": ("en_vanilla", "image", None), + "t2i": ("en_unified", None, None), + "t2i_vanilla": ("en_vanilla", "vanilla", None), "t2i_think": ("en_unified", "think", ""), "t2i_recaption": ("en_unified", "recaption", ""), } +def _normalize_task_and_bot_task(task: str, bot_task: str | None) -> tuple[str, str | None]: + if task in _TASK_PRESETS: + _, legacy_bot_task, _ = _TASK_PRESETS[task] + base_task = task.split("_", 1)[0] + if base_task == "t2i" and task == "t2i": + base_task = "t2i" + if task in ("t2t", "i2t", "t2i"): + base_task = task + if bot_task is None: + bot_task = legacy_bot_task + task = base_task + return task, bot_task + + def available_tasks() -> list[str]: - """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`.""" - return sorted(_TASK_PRESETS) + """Sorted list of `task` values accepted by the prompt builders.""" + return 
sorted(_TASKS) + + +def available_bot_tasks() -> list[str | None]: + """Sorted list of `bot_task` values (with ``None`` first).""" + rest = sorted(k for k in _BOT_TASK_PRESETS if k is not None) + return [None, *rest] + + +def resolve_sys_type(bot_task: str | None) -> str: + """Default system-prompt type for a given ``bot_task``.""" + if bot_task not in _BOT_TASK_PRESETS: + raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}") + return _BOT_TASK_PRESETS[bot_task][0] def resolve_stop_token_ids( - task: str = "it2i_think", - bot_task: str = "think", + task: str = "it2i", + bot_task: str | None = "think", tokenizer: Any | None = None, -): +) -> list[int]: + task, bot_task = _normalize_task_and_bot_task(task, bot_task) + if task not in _TASKS: + raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") + if bot_task not in _BOT_TASK_PRESETS: + raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}") return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]] @@ -81,56 +143,45 @@ def _validate_num_images(num_images: int) -> None: raise ValueError(f"num_images must be in [1, {MAX_IMAGES_PER_REQUEST}], got {num_images}") +def _resolve_preset(task: str, bot_task: str | None) -> tuple[str, str | None]: + """Validate (task, bot_task) and return ``(sys_type, trigger_tag)``.""" + task, bot_task = _normalize_task_and_bot_task(task, bot_task) + if task not in _TASKS: + raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") + if bot_task not in _BOT_TASK_PRESETS: + raise ValueError(f"Unknown bot_task {bot_task!r}. 
Choose from: {available_bot_tasks()}") + if bot_task == "vanilla" and task != "t2i": + raise ValueError(f"bot_task='vanilla' is only valid with task='t2i' (pretrain template); got task={task!r}") + return _BOT_TASK_PRESETS[bot_task] + + def build_prompt( user_prompt: str, - task: str = "it2i_think", + task: str = "it2i", + bot_task: str | None = "think", sys_type: str | None = None, custom_system_prompt: str | None = None, num_images: int = 1, ) -> str: - """Build a HunyuanImage-3.0 prompt as a string (legacy/compat path). - - NOTE: when this string is passed to the engine, the engine's tokenizer - will run a single BPE pass over the whole string, which can merge - tokens across segment boundaries (e.g. `。\\n\\n` -> id 3490). For - inputs that need to match HF baseline byte-for-byte, use - `build_prompt_tokens` instead and feed the result via prompt_token_ids. - - `num_images` emits N consecutive `` placeholders between - `User: ` and `user_prompt`. Ignored for text-only tasks. - """ - if task not in _TASK_PRESETS: - raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") - - preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] + """Build a HunyuanImage-3.0 prompt as a string (legacy/compat path).""" + task, bot_task = _normalize_task_and_bot_task(task, bot_task) + preset_sys_type, trigger_tag = _resolve_preset(task, bot_task) effective_sys_type = sys_type or preset_sys_type - system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt) - sys_text = system_prompt.strip() if system_prompt else "" + system_prompt = get_system_prompt(effective_sys_type, bot_task, custom_system_prompt) + sys_text = system_prompt or "" - has_image_input = task.startswith("i2t") or task.startswith("it2i") + has_image_input = task in ("i2t", "it2i") if has_image_input: _validate_num_images(num_images) - # t2i_vanilla: pretrain mode for direct text->image generation. 
The - # vanilla system prompt drives the model with no chat structure. - if task == "t2i_vanilla": + if bot_task == "vanilla": parts = ["<|startoftext|>"] if sys_text: parts.append(sys_text) parts.append(user_prompt) return "".join(parts) - # All other tasks (t2t / i2t / t2i_think / t2i_recaption / - # it2i_think / it2i_recaption) use HunyuanImage3 Instruct chat template: - # <|startoftext|>{system?}\n\nUser: {*N?}{user_prompt}\n\nAssistant: {trigger?} - # generation_config.json declares sequence_template="instruct", so the - # AR prefill MUST use this template -- verified to match HF's - # apply_chat_template output token-for-token (modulo BPE boundary merges). - # The trigger_tag (e.g. ) MUST come AFTER the `Assistant: ` prefix: - # if it goes BEFORE user_prompt (the old pretrain layout) the model puts - # the user's instructions inside the "thinking section" and collapses - # into repetition garbage under greedy decoding. parts = ["<|startoftext|>"] if sys_text: parts.append(f"{sys_text}\n\n") @@ -141,67 +192,52 @@ def build_prompt( parts.append("\n\nAssistant: ") if trigger_tag: parts.append(trigger_tag) - return "".join(parts) @dataclass class PromptTokensResult: - token_ids: list[int] # The tokenized prompt - system_prompt_type: str # The effective system prompt type used + token_ids: list[int] + system_prompt_type: str def build_prompt_tokens( user_prompt: str, tokenizer, - task: str = "it2i_think", + task: str = "it2i", + bot_task: str | None = "think", sys_type: str | None = None, custom_system_prompt: str | None = None, num_images: int = 1, ) -> PromptTokensResult: - """Segment-by-segment tokenization that matches HF apply_chat_template. - - Calling tokenizer.encode(build_prompt(...)) on the full string lets BPE - merge tokens across segment boundaries (e.g. user_prompt ends with `。` - and the next segment is `\\n\\n` -> they merge into a single token id - 3490 instead of HF's [1811, 271]). 
HF's apply_chat_template tokenizes - each segment independently and concatenates token_ids, so no cross- - boundary merge happens. We replicate that here and feed the result to - Omni via OmniTokensPrompt (prompt_token_ids). - - Returns: - PromptTokensResult - - `num_images` inserts N `` token ids; see `build_prompt`. - """ - if task not in _TASK_PRESETS: - raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") - - preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] + """Segment-by-segment tokenization that matches HF apply_chat_template.""" + task, bot_task = _normalize_task_and_bot_task(task, bot_task) + preset_sys_type, trigger_tag = _resolve_preset(task, bot_task) effective_sys_type = sys_type or preset_sys_type bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>") img_id = tokenizer.convert_tokens_to_ids("") trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None - has_image_input = task.startswith("i2t") or task.startswith("it2i") + has_image_input = task in ("i2t", "it2i") if has_image_input: _validate_num_images(num_images) - # t2i_vanilla uses pretrain template with no chat structure; the vanilla - # system prompt drives the model directly. No segment boundaries to - # protect, fall back to whole-string encode. - if task == "t2i_vanilla": - s = build_prompt(user_prompt, task, sys_type, custom_system_prompt) + if bot_task == "vanilla": + s = build_prompt( + user_prompt, + task=task, + bot_task=bot_task, + sys_type=sys_type, + custom_system_prompt=custom_system_prompt, + ) token_ids = tokenizer.encode(s, add_special_tokens=False) return PromptTokensResult( token_ids=token_ids, system_prompt_type=effective_sys_type, ) - system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt) - # Do NOT strip -- HF apply_chat_template keeps the system prompt's - # natural trailing newline; stripping it would shift one token id. 
+ system_prompt = get_system_prompt(effective_sys_type, bot_task, custom_system_prompt) sys_text = system_prompt or "" ids: list[int] = [bos_id] @@ -226,8 +262,10 @@ def build_prompt_tokens( "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS", "MAX_IMAGES_PER_REQUEST", "_TASK_PRESETS", + "available_bot_tasks", "available_tasks", "build_prompt", "build_prompt_tokens", "resolve_stop_token_ids", + "resolve_sys_type", ] From f4d76d5ea2b791b9a54fbc4daaa84242c89c0f62 Mon Sep 17 00:00:00 2001 From: TaffyOfficial Date: Sat, 9 May 2026 14:47:51 +0800 Subject: [PATCH 03/43] [Feature] HunyuanImage-3.0 IT2I: wire multi-image through online serving MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-image IT2I worked offline but `/v1/images/edits` returned HTTP 400 "multi_modal_uuids['image'] must have same length as multi_modal_data['image']" because the serving layer never expanded uuids past one-per-modality-key. Two serving-side gaps the model PR did not cover: 1. `serving_chat.py:_build_multistage_generation_inputs` (and its mirror in the chat-completion image-gen path) built `multi_modal_uuids` by iterating over dict keys, producing one uuid per modality regardless of value shape. For `engine_prompt_data = {"image": [pil1, pil2]}` this yielded `{"image": ["img-image-0"]}` (1 uuid), which vLLM's renderer then rejected against the 2-item parsed image list. Fixed by expanding the uuid list to `len(value)` when the value is a list, while keeping the single-uuid behavior for scalar values (e.g. `{"img2img": pil}`). 2. `model_metadata._DIFFUSION_MODEL_METADATA` only registered `QwenImageEditPlusPipeline` as supports_multimodal_inputs=True, so `od_config.supports_multimodal_inputs` defaulted to False for HunyuanImage3Pipeline. 
The multistage edit path bypasses that check on the way in, but the chat path's `generate_diffusion_images` does query it (line 2322) and would reject multi-image with "Multiple input images are not supported by the current diffusion model". Registered `HunyuanImage3Pipeline` with `max_multimodal_image_inputs=3` to match upstream's "Multi-Image Fusion" cap (README §200-216). Static change only; uuid expansion was traced through serving_chat -> async_omni -> async_omni_engine.add_request -> InputProcessor -> OmniInputPreprocessor._process_text -> renderer._process_multimodal -> _validate_mm_uuids. End-to-end smoke against /v1/images/edits with two `-F image=@...` parts is left for a follow-up; reproducing requires PYTHONPATH= when launching `vllm serve` so the system Python's editable vllm-omni install does not shadow the rebased branch. Signed-off-by: TaffyOfficial --- vllm_omni/diffusion/model_metadata.py | 6 ++++++ vllm_omni/entrypoints/openai/serving_chat.py | 13 +++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/model_metadata.py b/vllm_omni/diffusion/model_metadata.py index ec133e7380e..f3346338434 100644 --- a/vllm_omni/diffusion/model_metadata.py +++ b/vllm_omni/diffusion/model_metadata.py @@ -13,6 +13,8 @@ class DiffusionModelMetadata: QWEN_IMAGE_EDIT_PLUS_MAX_INPUT_IMAGES = 4 +# Upstream HunyuanImage-3.0 "Multi-Image Fusion" caps reference images at 3. 
+HUNYUAN_IMAGE3_MAX_INPUT_IMAGES = 3 _DIFFUSION_MODEL_METADATA: dict[str, DiffusionModelMetadata] = { @@ -20,6 +22,10 @@ class DiffusionModelMetadata: supports_multimodal_inputs=True, max_multimodal_image_inputs=QWEN_IMAGE_EDIT_PLUS_MAX_INPUT_IMAGES, ), + "HunyuanImage3Pipeline": DiffusionModelMetadata( + supports_multimodal_inputs=True, + max_multimodal_image_inputs=HUNYUAN_IMAGE3_MAX_INPUT_IMAGES, + ), } diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 99827454e70..9ec626a3e74 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -419,7 +419,10 @@ async def create_chat_completion( # consistency. After the multimodal processor consumes # the image data, the uuids remain as a stable reference. tprompt["multi_modal_uuids"] = { - k: [f"{request_id}-{k}-{i}"] for i, k in enumerate(engine_prompt_image) + k: [f"{request_id}-{k}-{i}" for i in range(len(v))] + if isinstance(v, list) + else [f"{request_id}-{k}-0"] + for k, v in engine_prompt_image.items() } engine_prompts = [tprompt] @@ -2295,7 +2298,13 @@ def _build_multistage_generation_inputs( engine_prompt["multi_modal_data"] = engine_prompt_data # Provide multi_modal_uuids so that newer vLLM versions can # validate multi_modal_data / multi_modal_uuids consistency. - engine_prompt["multi_modal_uuids"] = {k: [f"img-{k}-{i}"] for i, k in enumerate(engine_prompt_data)} + # Generate one uuid per image when the value is a list (multi-image inputs). 
+ engine_prompt["multi_modal_uuids"] = { + k: [f"img-{k}-{i}" for i in range(len(v))] + if isinstance(v, list) + else [f"img-{k}-0"] + for k, v in engine_prompt_data.items() + } comprehension_idx = None for idx, stage in enumerate(stage_configs): From c18f01674d457e7da3d7f79b93f7fe871a34fbb1 Mon Sep 17 00:00:00 2001 From: TaffyOfficial Date: Sat, 9 May 2026 15:03:27 +0800 Subject: [PATCH 04/43] [Bugfix] HunyuanImage-3.0 ar2diffusion: honor AR-predicted output ratio DiT output collapsed to a square whenever the input bucket was square, even though the AR engine had already predicted a different aspect via its `<img_ratio_N>` tail. The bridge ignored the prediction and forwarded the prompt-carried `height`/`width` straight to the diffusion pipeline: height = original_prompt.get("height", 1024) width = original_prompt.get("width", 1024) In the `/v1/images/edits` path that prompt height/width is filled with `pil_images[0].size` (api_server.py:1808-1811) when the client does not pass `--size`/`resolution`, so the first reference image's bucket (typically a logo, square) determined the DiT canvas regardless of what the prompt actually called for. Mirrors the issue called out in the multi-image PR's commit message ("Output-size handling for the AR/DiT ratio lifecycle is intentionally NOT touched ... properly wiring that into ar2diffusion's width/height assignment is a separate refactor"). Wires the AR's ratio_index back into the bridge: 1. Recover ratio_index from the AR output. Probe the detokenized text first (cheap, works under `skip_special_tokens: False` like `hunyuan_image3_it2i_kv_reuse.yaml`); fall back to scanning `cumulative_token_ids` against the tokenizer's `<img_ratio_0>..<img_ratio_36>` id range so the fix also holds when the AR engine strips special tokens from text.
The token-id table is loaded once via AutoTokenizer (cached, model name overridable via `VLLM_OMNI_HUNYUAN_IMAGE3_MODEL`) and shaped to mirror `HunyuanImage3ForCausalMM.__init__:1523-1531` (contiguous main slice 0..32 plus extra slice 33..36). 2. Resolve ratio_index to (height, width) via `ResolutionGroup(base_size=1024).data[ratio_index]`, which is the same reverse lookup `HunyuanImage3ImageProcessor.build_image_info` uses upstream when constructing the DiT image_info from ``. Falls back to the prompt-carried height/width when no ratio token is present (comprehension paths, AR aborted before the size+ratio tail) so non-IT2I/T2I flows are unaffected. End-to-end smoke is left for a follow-up: test/repro requires `PYTHONPATH= vllm serve ...` to keep the system Python's editable vllm-omni install from shadowing this branch (same caveat as the prior multi-image uuid commit). Signed-off-by: TaffyOfficial --- .../stage_input_processors/hunyuan_image3.py | 137 +++++++++++++++++- 1 file changed, 136 insertions(+), 1 deletion(-) diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py index b7630bb8ac8..9a53bf4be06 100644 --- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py +++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py @@ -12,6 +12,9 @@ from __future__ import annotations +import os +import re +from functools import lru_cache from typing import Any import torch @@ -22,6 +25,108 @@ logger = init_logger(__name__) +# AR emits `` after `` in IT2I/T2I +# (see `HunyuanImage3ForCausalMM.sample` and `_stage_transitions`). The +# ratio_index resolves to a (height, width) bucket via ResolutionGroup, which +# is the official upstream's mechanism for AR-driven output aspect — without +# this lookup the DiT pipeline falls back to the user-provided width/height +# (in the `/v1/images/edits` path that defaults to `pil_images[0].size`, +# i.e. 
the first reference image's bucket — usually square, see +# api_server.py:1808-1811). +_RATIO_TOKEN_RE = re.compile(r"") +_DEFAULT_HUNYUAN_IMAGE3_MODEL = "tencent/HunyuanImage-3.0-Instruct" + + +@lru_cache(maxsize=4) +def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]: + """Return `[(height, width)]` indexed by ratio_index for HunyuanImage-3. + + Mirrors `HunyuanImage3ImageProcessor.build_image_info`'s + `reso_group[ratio_index]` reverse lookup. Cached because the table + is constant per `base_size`. + """ + from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import ResolutionGroup + + reso_group = ResolutionGroup(base_size=base_size) + return [(int(r.height), int(r.width)) for r in reso_group.data] + + +@lru_cache(maxsize=4) +def _build_ratio_id_lookup(model_name_or_path: str) -> dict[int, int]: + """Return `{token_id: ratio_index}` for `` in the tokenizer. + + Loads the tokenizer once per model path and walks the contiguous + `..` plus the extra slice + `..` (the same shape + `HunyuanImage3ForCausalMM.__init__` registers at lines 1523-1531). + Empty dict on lookup failure so callers can degrade gracefully. 
+ """ + try: + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + except Exception as e: # pragma: no cover - environment-dependent + logger.warning("[ar2diffusion] failed to load tokenizer for ratio token lookup: %s", e) + return {} + + def _id(name: str) -> int | None: + tid = tokenizer.convert_tokens_to_ids(name) + return None if tid is None or tid == tokenizer.unk_token_id else int(tid) + + ratio_0 = _id("") + ratio_32 = _id("") + ratio_33 = _id("") + ratio_36 = _id("") + if None in (ratio_0, ratio_32, ratio_33, ratio_36): + logger.warning("[ar2diffusion] tokenizer is missing one of tokens") + return {} + + table: dict[int, int] = {} + for i in range(ratio_32 - ratio_0 + 1): + table[ratio_0 + i] = i + base_idx = ratio_32 - ratio_0 + 1 + for j in range(ratio_36 - ratio_33 + 1): + table[ratio_33 + j] = base_idx + j + return table + + +def _extract_ratio_index(generated_text: str, generated_token_ids, model_name_or_path: str) -> int | None: + """Resolve the AR-predicted ratio_index from this stage's output. + + Two probe paths: + 1. Text regex on `generated_text` — works when the AR engine is + configured with `skip_special_tokens: False` (e.g. + `hunyuan_image3_it2i_kv_reuse.yaml`). Cheap and avoids loading + the tokenizer. + 2. Token-id scan over `cumulative_token_ids` against the tokenizer's + `` id range — survives `skip_special_tokens: True` + where the special tokens are stripped from text but still present + in the raw token stream. + + Takes the LAST ratio token in the stream because the AR's + stage-transition logic emits exactly one such token at the tail of the + `` sequence; using "last" is robust to + any earlier accidental occurrences in the prompt scaffold. 
+ """ + matches = _RATIO_TOKEN_RE.findall(generated_text or "") + if matches: + try: + return int(matches[-1]) + except ValueError: + pass + + if generated_token_ids is None: + return None + table = _build_ratio_id_lookup(model_name_or_path) + if not table: + return None + last_ratio_idx: int | None = None + for tid in generated_token_ids: + idx = table.get(int(tid)) + if idx is not None: + last_ratio_idx = idx + return last_ratio_idx + def ar2diffusion( source_outputs: list[Any], @@ -65,13 +170,43 @@ def ar2diffusion( text_prompt = original_prompt.get("prompt", "") use_system_prompt = original_prompt.get("use_system_prompt") + # Prefer the AR's predicted output aspect (`` + # tail emitted by `HunyuanImage3ForCausalMM.sample` under the + # ratio-restriction logits processor) over the carried-through + # height/width, which the serving layer fills with the first + # reference image's bucket and so collapses non-square targets to + # square in the multi-image / mismatched-aspect case. Mirrors the + # official upstream where `reso_group[ratio_index]` is the + # canonical source of the diffusion target shape. 
+ model_name_or_path = original_prompt.get("model") or os.environ.get( + "VLLM_OMNI_HUNYUAN_IMAGE3_MODEL", _DEFAULT_HUNYUAN_IMAGE3_MODEL + ) + ratio_idx = _extract_ratio_index(generated_text, generated_token_ids, model_name_or_path) + ar_predicted = False + if ratio_idx is not None: + base_size = int(original_prompt.get("image_base_size", 1024)) + size_table = _build_ratio_size_table(base_size) + if 0 <= ratio_idx < len(size_table): + height, width = size_table[ratio_idx] + ar_predicted = True + else: + logger.warning( + "[ar2diffusion] Request %d: ratio_index=%d out of range [0,%d), keeping prompt size %dx%d", + i, + ratio_idx, + len(size_table), + height, + width, + ) + logger.info( - "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, target size=%dx%d", + "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, target size=%dx%d (%s)", i, len(generated_token_ids), len(generated_text), height, width, + f"AR ratio_idx={ratio_idx}" if ar_predicted else "from prompt (no AR ratio token)", ) token_tensor = torch.tensor(generated_token_ids, dtype=torch.long) From c5f2f9bd618e4b5998ba8fbe53ccca7bb3b894a2 Mon Sep 17 00:00:00 2001 From: TaffyOfficial Date: Sat, 9 May 2026 15:23:39 +0800 Subject: [PATCH 05/43] [Chore] HunyuanImage-3.0 end2end: accept internal task names as --modality aliases `--modality img2img` historically pointed at the internal task `it2i`, so users who think in the post-`prompt_utils` task vocabulary (`t2i`/`it2i`/`i2t`/`t2t`, see `_TASK_PRESETS`) had to translate. Common enough that two recent reproduction commands hit the `invalid choice: 'it2i'` argparse error before getting any actual output. Accepts both spellings on the CLI and canonicalizes the short forms to the verbose names right after parsing so the downstream `args.modality == "img2img"` branches stay one-line and do not have to enumerate aliases. Default value, choices listing, and behavior for existing verbose names unchanged. 
Signed-off-by: TaffyOfficial --- .../hunyuan_image3/end2end.py | 78 +++++++------------ 1 file changed, 28 insertions(+), 50 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 9d8f5113201..b560926f1b7 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -1,11 +1,5 @@ """ HunyuanImage-3.0-Instruct unified end-to-end inference script. - -Supports all modalities through a single entry point: - - text2img: Text -> AR -> DiT -> Image - - img2img: Text+Image -> AR -> DiT -> Edited Image (IT2I) - - img2text: Image+Text -> AR -> Text description (I2T) - - text2text: Text -> AR -> Text (comprehension, no image) """ import argparse @@ -21,11 +15,29 @@ from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniPromptType -# Default deploy configs are absolute so this example works from any cwd. _REPO_ROOT = Path(__file__).resolve().parents[3] _DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml") _DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml") +# Both verbose and short-form aliases are accepted. +_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = { + "text2img": ("t2i", "think"), + "t2i": ("t2i", "think"), + "img2img": ("it2i", "think"), + "it2i": ("it2i", "think"), + "img2text": ("i2t", None), + "i2t": ("i2t", None), + "text2text": ("t2t", None), + "t2t": ("t2t", None), +} + +_MODALITY_CANONICAL = { + "t2i": "text2img", + "it2i": "img2img", + "i2t": "img2text", + "t2t": "text2text", +} + _MODALITY_DEFAULT_DEPLOY_CONFIG = { "text2img": _DEFAULT_DEPLOY_CONFIG, "img2img": _DEFAULT_DEPLOY_CONFIG, @@ -40,27 +52,15 @@ "text2text": "text-to-text", } -# Modality -> (task, default bot_task) mapping. 
-_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = { - "text2img": ("t2i", "think"), - "img2img": ("it2i", "think"), - "img2text": ("i2t", None), - "text2text": ("t2t", None), -} - def parse_args(): parser = argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.") - parser.add_argument( - "--model", - default="tencent/HunyuanImage-3.0-Instruct", - help="Model name or local path.", - ) + parser.add_argument("--model", default="tencent/HunyuanImage-3.0-Instruct", help="Model name or local path.") parser.add_argument( "--modality", default="text2img", - choices=["text2img", "img2img", "img2text", "text2text"], - help="Modality mode to control stage execution.", + choices=["text2img", "t2i", "img2img", "it2i", "img2text", "i2t", "text2text", "t2t"], + help="Verbose and internal short task names are both accepted.", ) parser.add_argument("--prompts", nargs="+", default=None, help="Input text prompts.") parser.add_argument( @@ -69,24 +69,14 @@ def parse_args(): default=None, help="Input image path(s) for img2img/img2text. 
Comma-separated for multi-image (up to 3).", ) - parser.add_argument( - "--output", - type=str, - default=".", - help="Output directory to save results.", - ) + parser.add_argument("--output", type=str, default=".", help="Output directory to save results.") parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.") parser.add_argument("--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale.") parser.add_argument("--seed", type=int, default=42, help="Random seed.") parser.add_argument("--height", type=int, default=1024, help="Output image height.") parser.add_argument("--width", type=int, default=1024, help="Output image width.") - parser.add_argument( - "--vae-use-tiling", - action="store_true", - help="Enable VAE tiling for memory optimization.", - ) - + parser.add_argument("--vae-use-tiling", action="store_true", help="Enable VAE tiling.") parser.add_argument( "--bot-task", type=str, @@ -94,13 +84,7 @@ def parse_args(): choices=["none", "think", "recaption", "think_recaption", "vanilla"], help="Override prompt mode. Default: auto from --modality.", ) - parser.add_argument( - "--sys-type", - type=str, - default=None, - help="Override system prompt type (e.g. 
en_unified, en_vanilla).", - ) - + parser.add_argument("--sys-type", type=str, default=None, help="Override system prompt type.") parser.add_argument("--deploy-config", type=str, default=None, help="Custom deploy YAML path.") parser.add_argument("--stage-configs-path", type=str, default=None, help="Custom legacy stage config YAML path.") parser.add_argument("--log-stats", action="store_true", default=False) @@ -146,6 +130,7 @@ def main(): os.makedirs(args.output, exist_ok=True) additional_config = parse_additional_config(args.additional_config) + args.modality = _MODALITY_CANONICAL.get(args.modality, args.modality) task, default_bot_task = _MODALITY_TASK_MAP[args.modality] if args.bot_task is None: bot_task: str | None = default_bot_task @@ -168,6 +153,7 @@ def main(): "log_stats": args.log_stats, "init_timeout": args.init_timeout, "enforce_eager": args.enforce_eager, + "mode": _MODALITY_MODE[args.modality], } if additional_config is not None: @@ -176,14 +162,10 @@ def main(): omni_kwargs["deploy_config"] = deploy_config else: omni_kwargs["stage_configs_path"] = stage_configs_path - omni_kwargs["mode"] = _MODALITY_MODE[args.modality] omni = Omni(**omni_kwargs) prompts = args.prompts or ["A cute cat"] - if not prompts: - prompts = ["A cute cat"] - input_images: list = [] if args.modality in ("img2img", "img2text"): if not args.image_path: @@ -217,7 +199,6 @@ def main(): "prompt": prompt, "use_system_prompt": effective_sys_type, } - if args.modality == "text2img": prompt_dict["modalities"] = ["image"] elif args.modality == "img2img": @@ -228,9 +209,8 @@ def main(): elif args.modality == "img2text": prompt_dict["modalities"] = ["text"] prompt_dict["multi_modal_data"] = {"image": mm_image_payload} - elif args.modality == "text2text": + else: prompt_dict["modalities"] = ["text"] - formatted_prompts.append(prompt_dict) params_list = list(omni.default_sampling_params_list) @@ -276,7 +256,6 @@ def main(): print(f"{'=' * 60}\n") omni_outputs = 
list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list)) - img_idx = 0 for req_output in omni_outputs: ro = getattr(req_output, "request_output", None) @@ -295,7 +274,6 @@ def main(): images = getattr(req_output, "images", None) if not images and ro and hasattr(ro, "images"): images = ro.images - if images: for j, img in enumerate(images): save_path = os.path.join(args.output, f"output_{img_idx}_{j}.png") From 2ff92b7f0002a6fe957e2247d7ea205d92a13467 Mon Sep 17 00:00:00 2001 From: skf1999 <13234016272@163.com> Date: Sun, 10 May 2026 01:41:23 +0800 Subject: [PATCH 06/43] feat(end2end): semantic output shape for multi-image IT2I Signed-off-by: skf1999 <13234016272@163.com> --- .../hunyuan_image3/end2end.py | 48 +++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index b560926f1b7..b46e326d1c8 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -5,6 +5,7 @@ import argparse import json import os +import re from pathlib import Path from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( @@ -19,7 +20,6 @@ _DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml") _DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml") -# Both verbose and short-form aliases are accepted. _MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = { "text2img": ("t2i", "think"), "t2i": ("t2i", "think"), @@ -70,7 +70,6 @@ def parse_args(): help="Input image path(s) for img2img/img2text. 
Comma-separated for multi-image (up to 3).", ) parser.add_argument("--output", type=str, default=".", help="Output directory to save results.") - parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.") parser.add_argument("--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale.") parser.add_argument("--seed", type=int, default=42, help="Random seed.") @@ -125,6 +124,30 @@ def parse_additional_config(raw_value: str | None) -> dict | None: return additional_config +def _infer_shape_reference_index(prompt: str, num_images: int) -> int: + chinese_nums = {"一": 1, "二": 2, "三": 3} + + def _to_idx(match: re.Match[str]) -> int | None: + token = match.group(1).strip() + value = chinese_nums.get(token, int(token) if token.isdigit() else None) + return value - 1 if value and 1 <= value <= num_images else None + + for pattern in ( + r"参考图\s*([一二三123])", + r"参考第\s*([一二三123])\s*张", + r"参考\s*image\s*([123])", + r"ref(?:erence)?\s*image\s*([123])", + r"基于图\s*([一二三123])", + r"基于第\s*([一二三123])\s*张", + r"基于\s*image\s*([123])", + r"based\s*on\s*image\s*([123])", + ): + match = re.search(pattern, prompt, re.IGNORECASE) + if match and (idx := _to_idx(match)) is not None: + return idx + return 0 + + def main(): args = parse_args() os.makedirs(args.output, exist_ok=True) @@ -173,10 +196,10 @@ def main(): from PIL import Image image_paths = [p.strip() for p in args.image_path.split(",") if p.strip()] - for p in image_paths: - if not os.path.exists(p): - raise ValueError(f"Image path does not exist: {p}") - input_images.append(Image.open(p).convert("RGB")) + for image_path in image_paths: + if not os.path.exists(image_path): + raise ValueError(f"Image path does not exist: {image_path}") + input_images.append(Image.open(image_path).convert("RGB")) if not input_images: raise ValueError(f"--image-path produced no usable paths: {args.image_path!r}") @@ -186,6 +209,7 @@ def main(): mm_image_payload = (input_images[0] if len(input_images) 
== 1 else input_images) if input_images else None formatted_prompts: list[OmniPromptType] = [] + shape_indices: list[int] = [] for prompt in prompts: build_kwargs: dict = {"task": task, "bot_task": bot_task, "sys_type": args.sys_type} if input_images: @@ -204,8 +228,10 @@ def main(): elif args.modality == "img2img": prompt_dict["modalities"] = ["image"] prompt_dict["multi_modal_data"] = {"image": mm_image_payload} - prompt_dict["height"] = input_images[0].height - prompt_dict["width"] = input_images[0].width + shape_idx = _infer_shape_reference_index(prompt, len(input_images)) + prompt_dict["height"] = input_images[shape_idx].height + prompt_dict["width"] = input_images[shape_idx].width + shape_indices.append(shape_idx) elif args.modality == "img2text": prompt_dict["modalities"] = ["text"] prompt_dict["multi_modal_data"] = {"image": mm_image_payload} @@ -218,6 +244,7 @@ def main(): from vllm_omni.inputs.data import OmniDiffusionSamplingParams ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer) + diffusion_idx = 0 for sp in params_list: if isinstance(sp, OmniDiffusionSamplingParams): sp.num_inference_steps = args.steps @@ -228,6 +255,11 @@ def main(): if args.modality == "text2img": sp.height = args.height sp.width = args.width + elif args.modality == "img2img": + shape_idx = shape_indices[diffusion_idx] + sp.height = input_images[shape_idx].height + sp.width = input_images[shape_idx].width + diffusion_idx += 1 elif hasattr(sp, "stop_token_ids"): sp.stop_token_ids = ar_stop_token_ids From 74e5caca3b8d8b8ff7e3b3a529ad33cd3567c1e5 Mon Sep 17 00:00:00 2001 From: zuiho <2324465096@qq.com> Date: Sun, 10 May 2026 03:57:53 +0800 Subject: [PATCH 07/43] [Chore] Apply pre-commit formatting fixes Auto-applied by ruff/whitespace hooks: extra blank lines between top-level functions, stripped trailing whitespace, and collapsed a dict-comprehension expression onto a single line. 
Signed-off-by: zuiho <2324465096@qq.com> --- vllm_omni/entrypoints/openai/serving_chat.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 9ec626a3e74..a5ca494c89e 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2300,9 +2300,7 @@ def _build_multistage_generation_inputs( # validate multi_modal_data / multi_modal_uuids consistency. # Generate one uuid per image when the value is a list (multi-image inputs). engine_prompt["multi_modal_uuids"] = { - k: [f"img-{k}-{i}" for i in range(len(v))] - if isinstance(v, list) - else [f"img-{k}-0"] + k: [f"img-{k}-{i}" for i in range(len(v))] if isinstance(v, list) else [f"img-{k}-0"] for k, v in engine_prompt_data.items() } From d7400dca983a03c9c74bbb59fa6288b226e17452 Mon Sep 17 00:00:00 2001 From: zuiho <2324465096@qq.com> Date: Sun, 10 May 2026 15:40:10 +0800 Subject: [PATCH 08/43] fix(hunyuan_image3): honor ar2diffusion's predicted shape in pre_process_func pre_process_func was unconditionally filling None sampling_params.height/width with image_list[0].size, burying the AR-predicted ratio that ar2diffusion (e31197f0) had written into prompt["height"]/["width"]. forward() reads only sampling_params, so the bridge was a silent no-op on the IT2I path -- DiT output collapsed to the first reference image's bucket regardless of what the AR predicted via its `<img_ratio_N>` token. Now prefer prompt["height"]/["width"] (bridge-supplied) over image_list[0].size when sampling_params.height/width is None. Caller-explicit sampling_params still win via the surrounding `is None` guards. Mirrors GLM-Image's precedent at pipeline_glm_image.py:718-737 and matches official HunyuanImage-3.0 image_size=="auto" semantics where vae_reso_group[ratio_index] is the canonical source of DiT shape. 
Signed-off-by: zuiho <2324465096@qq.com> --- .../models/hunyuan_image3/pipeline_hunyuan_image3.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 74fe268babf..b1ba2687f86 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -283,11 +283,13 @@ def pre_process_func(request: OmniDiffusionRequest): cond_image_infos = [_build_cond_joint_image(image) for image in image_list] prompt["additional_information"]["batch_cond_image_info"] = cond_image_infos + bridge_h = prompt.get("height") if isinstance(prompt, dict) else None + bridge_w = prompt.get("width") if isinstance(prompt, dict) else None first_image_w, first_image_h = _to_pil_image(image_list[0]).size if request.sampling_params.width is None: - request.sampling_params.width = int(first_image_w) + request.sampling_params.width = int(bridge_w or first_image_w) if request.sampling_params.height is None: - request.sampling_params.height = int(first_image_h) + request.sampling_params.height = int(bridge_h or first_image_h) request.prompts[i] = prompt From d7c760e258c4e4ec1896768fd5e0e5c7d5d4c6bd Mon Sep 17 00:00:00 2001 From: zuiho <2324465096@qq.com> Date: Sun, 10 May 2026 15:40:23 +0800 Subject: [PATCH 09/43] refactor(end2end): drop multi-image regex shape heuristic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts 6a1985f1 ("feat(end2end): semantic output shape for multi-image IT2I"). With the prior commit's pipeline fix in place, AR-predicted tokens flow through ar2diffusion to DiT output shape, so the prompt-regex layer (parsing "参考图二" / "based on image 2" to pick a reference image's H/W) is no longer needed and contradicts official HunyuanImage-3.0 image_size=="auto" semantics. 
Signed-off-by: zuiho <2324465096@qq.com> --- .../hunyuan_image3/end2end.py | 38 +------------------ 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index b46e326d1c8..82e8c194c5a 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -5,7 +5,6 @@ import argparse import json import os -import re from pathlib import Path from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( @@ -124,30 +123,6 @@ def parse_additional_config(raw_value: str | None) -> dict | None: return additional_config -def _infer_shape_reference_index(prompt: str, num_images: int) -> int: - chinese_nums = {"一": 1, "二": 2, "三": 3} - - def _to_idx(match: re.Match[str]) -> int | None: - token = match.group(1).strip() - value = chinese_nums.get(token, int(token) if token.isdigit() else None) - return value - 1 if value and 1 <= value <= num_images else None - - for pattern in ( - r"参考图\s*([一二三123])", - r"参考第\s*([一二三123])\s*张", - r"参考\s*image\s*([123])", - r"ref(?:erence)?\s*image\s*([123])", - r"基于图\s*([一二三123])", - r"基于第\s*([一二三123])\s*张", - r"基于\s*image\s*([123])", - r"based\s*on\s*image\s*([123])", - ): - match = re.search(pattern, prompt, re.IGNORECASE) - if match and (idx := _to_idx(match)) is not None: - return idx - return 0 - - def main(): args = parse_args() os.makedirs(args.output, exist_ok=True) @@ -209,7 +184,6 @@ def main(): mm_image_payload = (input_images[0] if len(input_images) == 1 else input_images) if input_images else None formatted_prompts: list[OmniPromptType] = [] - shape_indices: list[int] = [] for prompt in prompts: build_kwargs: dict = {"task": task, "bot_task": bot_task, "sys_type": args.sys_type} if input_images: @@ -228,10 +202,8 @@ def main(): elif args.modality == "img2img": prompt_dict["modalities"] = ["image"] prompt_dict["multi_modal_data"] = {"image": 
mm_image_payload} - shape_idx = _infer_shape_reference_index(prompt, len(input_images)) - prompt_dict["height"] = input_images[shape_idx].height - prompt_dict["width"] = input_images[shape_idx].width - shape_indices.append(shape_idx) + prompt_dict["height"] = input_images[0].height + prompt_dict["width"] = input_images[0].width elif args.modality == "img2text": prompt_dict["modalities"] = ["text"] prompt_dict["multi_modal_data"] = {"image": mm_image_payload} @@ -244,7 +216,6 @@ def main(): from vllm_omni.inputs.data import OmniDiffusionSamplingParams ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer) - diffusion_idx = 0 for sp in params_list: if isinstance(sp, OmniDiffusionSamplingParams): sp.num_inference_steps = args.steps @@ -255,11 +226,6 @@ def main(): if args.modality == "text2img": sp.height = args.height sp.width = args.width - elif args.modality == "img2img": - shape_idx = shape_indices[diffusion_idx] - sp.height = input_images[shape_idx].height - sp.width = input_images[shape_idx].width - diffusion_idx += 1 elif hasattr(sp, "stop_token_ids"): sp.stop_token_ids = ar_stop_token_ids From 2175a9974bfbb0b3f6a85d26c070f2c22329df8f Mon Sep 17 00:00:00 2001 From: zuiho <2324465096@qq.com> Date: Sun, 10 May 2026 21:42:44 +0800 Subject: [PATCH 10/43] fix(hunyuan_image3): add official extra resolution buckets (idx 33-36) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `ResolutionGroup` only walked the step-based buckets (idx 0-32) and dropped the official implementation's four extra resolutions at indices 33-36. The trained model has ratio token vocabulary 0-36, and AR was trained to address all 37 buckets; without the extras, wide reference images bucket-collapse to the closest base ratio (e.g. input_1_1's 1179x685 maps to idx=12 / 1280x768 instead of idx=36 / 720x1280) and the AR's `<img_ratio_N>` token range can't address the missing aspects. 
Adds `HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS` in `hunyuan_image3_transformer.py` as the single source of truth (mirrors official `image_processor.py: 147-152`) and threads it through both: - `HunyuanImage3Processor.ResolutionGroup` (AR-side cond-image bucket selection) - `_build_ratio_size_table` (bridge's reverse lookup ratio_idx → (h, w) for ar2diffusion → DiT shape) Signed-off-by: zuiho <2324465096@qq.com> --- .../hunyuan_image3_transformer.py | 20 ++++++++++++++++- .../models/hunyuan_image3/hunyuan_image3.py | 22 +++++++++++++++++-- .../stage_input_processors/hunyuan_image3.py | 13 ++++++++--- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py index 1eb0cdf113b..5a707acbda5 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py @@ -471,8 +471,21 @@ def __str__(self): return f"{self.h}x{self.w}" +# Baked-in extras matching the official model's +# `HunyuanImage3ImageProcessor.vae_reso_group` (image_processor.py:147-152). +# These four aspect buckets sit at ratio_token indices 33-36 in the trained +# model and the AR was trained to address them, so any deviation breaks the +# ratio-token vocab → output-shape lookup. +HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS: tuple[str, ...] 
= ( + "1024x768", + "1280x720", + "768x1024", + "720x1280", +) + + class ResolutionGroup: - def __init__(self, base_size=None, step=None, align=1): + def __init__(self, base_size=None, step=None, align=1, extra_resolutions=None): self.align = align self.base_size = base_size assert base_size % align == 0, f"base_size {base_size} is not divisible by align {align}" @@ -486,6 +499,11 @@ def __init__(self, base_size=None, step=None, align=1): self.step = step self.data = self._calc_by_step() + if extra_resolutions is not None: + for er in extra_resolutions: + if not any(r.ratio == er.ratio for r in self.data): + self.data.append(er) + self.ratio = np.array([x.ratio for x in self.data]) self.attr = ["" for _ in range(len(self.data))] self.prefix_space = 0 diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index e9d41ebf958..bdafa5c6f87 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -737,7 +737,7 @@ def __str__(self): class ResolutionGroup: """Group of resolutions for image processing.""" - def __init__(self, base_size=None, step=None, align=1): + def __init__(self, base_size=None, step=None, align=1, extra_resolutions=None): self.align = align self.base_size = base_size assert base_size % align == 0, f"base_size {base_size} is not divisible by align {align}" @@ -751,6 +751,11 @@ def __init__(self, base_size=None, step=None, align=1): self.step = step self.data = self._calc_by_step() + if extra_resolutions is not None: + for er in extra_resolutions: + if not any(r.ratio == er.ratio for r in self.data): + self.data.append(er) + self.ratio = np.array([x.ratio for x in self.data]) self.attr = ["" for _ in range(len(self.data))] self.prefix_space = 0 @@ -815,7 +820,20 @@ def get_base_size_and_ratio_index(self, width, height): def __init__(self, tokenizer, hf_config, **kwargs: 
object): self.tokenizer = tokenizer self.hf_config = hf_config - self.reso_group = self.ResolutionGroup(base_size=hf_config.image_base_size) + # `HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS` mirrors the official + # `vae_reso_group` extras (image_processor.py:147-152). Build with + # this processor's inner Resolution class so `data` stays + # type-homogeneous. + from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import ( + HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS, + ) + + self.reso_group = self.ResolutionGroup( + base_size=hf_config.image_base_size, + extra_resolutions=[ + HunyuanImage3Processor.Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS + ], + ) self.vision_encoder_processor = Siglip2ImageProcessorFast.from_dict(hf_config.vit_processor) self.vae_processor = transforms.Compose( [ diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py index 9a53bf4be06..63af2f7f1dd 100644 --- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py +++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py @@ -45,9 +45,16 @@ def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]: `reso_group[ratio_index]` reverse lookup. Cached because the table is constant per `base_size`. 
""" - from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import ResolutionGroup - - reso_group = ResolutionGroup(base_size=base_size) + from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import ( + HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS, + Resolution, + ResolutionGroup, + ) + + reso_group = ResolutionGroup( + base_size=base_size, + extra_resolutions=[Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS], + ) return [(int(r.height), int(r.width)) for r in reso_group.data] From 4aaa77261b303322a907cfe0b7fe4e71b7cf6782 Mon Sep 17 00:00:00 2001 From: zuiho <2324465096@qq.com> Date: Sun, 10 May 2026 21:43:13 +0800 Subject: [PATCH 11/43] fix(hunyuan_image3): default cond image preprocessing to resize-stretch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Match official `infer_align_image_size=True` path (image_processor.py:355 → crop_type="resize") for IT2I cond-image preprocessing. Previously hardcoded to center crop, which lost content from non-square reference images and produced a near-correct-but-not-equal pixel buffer compared to the HF reference run. Center-crop mode is preserved as opt-in via `crop_type="center"` for callers that want the legacy behavior. 
Signed-off-by: zuiho <2324465096@qq.com> --- .../models/hunyuan_image3/hunyuan_image3.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index bdafa5c6f87..f6bd31283d9 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -954,29 +954,33 @@ def process_image(self, image_input: ImageInput): return final_image_info - def _resize_and_crop(self, image: Image.Image, target_size: tuple[int, int]) -> Image.Image: + def _resize_and_crop( + self, + image: Image.Image, + target_size: tuple[int, int], + crop_type: str = "resize", + ) -> Image.Image: + # Default mode mirrors the official `infer_align_image_size=True` + # path (image_processor.py:355 → crop_type="resize") used by the + # IT2I demo: stretch the cond image to the bucket dims so its + # `` tag and ViT/VAE features stay aligned with the + # bucket, instead of dropping content via center crop. 
tw, th = target_size + if crop_type == "resize": + return image.resize((tw, th), resample=Image.Resampling.LANCZOS) w, h = image.size - tr = th / tw r = h / w - - # resize if r < tr: resize_height = th resize_width = int(round(th / h * w)) else: resize_width = tw resize_height = int(round(tw / w * h)) - image = image.resize((resize_width, resize_height), resample=Image.Resampling.LANCZOS) - - # center crop crop_top = int(round((resize_height - th) / 2.0)) crop_left = int(round((resize_width - tw) / 2.0)) - - image = image.crop((crop_left, crop_top, crop_left + tw, crop_top + th)) - return image + return image.crop((crop_left, crop_top, crop_left + tw, crop_top + th)) class HunyuanImage3ProcessingInfo(BaseProcessingInfo): From d0c2acbfb07debda01a68b87181db3c21cbf70ac Mon Sep 17 00:00:00 2001 From: zuiho <2324465096@qq.com> Date: Sun, 10 May 2026 21:43:59 +0800 Subject: [PATCH 12/43] fix(hunyuan_image3): use real <timestep> token id at scaffold slot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per-image scaffold timestep slot was placeholder'd with `<img>` token id (128006) instead of the real `<timestep>` token id (128017), as a workaround for vLLM's `PromptUpdateDetails.select_token_id` accepting only a single `embed_token_id`. The runtime embedding was patched in via the multimodal-embedding merger, so single-image numerics matched HF. But under the AR's multimodal-bidirectional attention, that `<img>`-as-timestep slot folded into each image's MM region. With multi-image input, this asymmetry biased the AR's `<img_ratio_N>` greedy argmax to the FIRST conditioning image's bucket regardless of prompt semantics: input order | image_1 bucket | image_2 bucket | AR predicts -------------------|----------------|----------------|------------ square + wide | 16 | 36 | 16 wide + square | 36 | 16 | 36 single wide | -- | -- | 36 (correct) Recaption text in both broken cases explicitly said "use image_2 resolution" but the model's ratio token still landed on image_1's bucket. 
Single-image worked because there was no second region to contaminate. Switches the slot to the real `<timestep>` id and patches its embedding with `timestep_emb(0)` in `embed_input_ids` via a token-id mask — same effect as HF's `instantiate_continuous_tokens` scatter-replace (modeling_hunyuan_image_3.py:1964). Numerically equivalent for single-image while removing the multi-image attention pollution. Touches: `_get_prompt_updates` scaffold, `embed_multimodal` (no longer prepends timestep_emb), `embed_input_ids` (new mask-based replacement), `__init__` (caches `_timestep_token_id`), `get_mrope_input_positions` (timestep slot check now matches the real token id). Signed-off-by: zuiho <2324465096@qq.com> --- .../models/hunyuan_image3/hunyuan_image3.py | 77 +++++++++---------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index f6bd31283d9..ab9c2ee4d6e 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -1126,31 +1126,22 @@ def get_replacement_image(item_idx: int) -> PromptUpdateDetails: ratio_token_id = tokenizer.convert_tokens_to_ids(f"") if ratio_token_id is None: raise ValueError(f"Ratio token '' not found in tokenizer vocabulary") - - # NOTE on the timestep slot: - # HF's apply_chat_template emits the literal token id - # 128017 here. HF's modeling forward (`instantiate_continuous_tokens`, - # see hunyuan3.0_ins/modeling_hunyuan_image_3.py:1964) then *scatter- - # replaces* the embedding at that position with `timestep_emb(0)` - # for cond images. So the wte embedding of is irrelevant - # at runtime — what matters is the timestep_emb injection. 
- # - # vllm-omni achieves the same effect via the multimodal-embedding - # merger: we put an (128006) placeholder here and ship a - # `timestep_emb(0)` tensor at the head of `embed_multimodal()`'s - # combined_embeddings. The merger replaces this placeholder's - # embedding with the timestep tensor, yielding a final hidden - # state numerically equivalent to HF at that position. - # - # Keep this slot as (NOT ): switching to - # requires either (a) a second PromptReplacement targeting 128017, - # or (b) the merger's embed_token_id to be a list — neither is - # currently supported by PromptUpdateDetails.select_token_id. + timestep_token_id = tokenizer.convert_tokens_to_ids("") + if timestep_token_id is None: + raise ValueError("Timestep token '' not found in tokenizer vocabulary") + + # Use the real token id (HF parity). The trained wte + # at this slot is overwritten with timestep_emb(0) at runtime by + # `embed_input_ids` — same effect as HF's + # `instantiate_continuous_tokens` scatter-replace. Keeping the + # slot as would have folded the timestep position into the + # multimodal bidirectional region, which empirically biased + # multi-image AR ratio prediction to the first image's bucket. 
replacement = ( [boi_token_id] + [base_size_token_id] + [ratio_token_id] - + [img_token_id] * timestep_token_num + + [timestep_token_id] * timestep_token_num + [img_token_id] * vae_token_num + [joint_img_sep_token_id] + [img_token_id] * vit_token_num @@ -1542,6 +1533,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._end_of_answer_id = tokenizer.convert_tokens_to_ids("") image_base_size = getattr(config, "image_base_size", 1024) self._size_token_id = tokenizer.convert_tokens_to_ids(f"") + self._timestep_token_id = tokenizer.convert_tokens_to_ids("") self._start_ratio_id = tokenizer.convert_tokens_to_ids("") self._end_ratio_id = tokenizer.convert_tokens_to_ids("") ratio_33 = tokenizer.convert_tokens_to_ids("") @@ -1877,27 +1869,18 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: "Each image should have both VAE and ViT embeddings." ) - # Order per image: timestep -> VAE tokens -> ViT tokens. - # The placeholder at the timestep slot (see _get_prompt_updates) - # gets its embedding replaced by `timestep_emb(0)` here, which is what - # HF achieves via instantiate_continuous_tokens at runtime. + # Order per image: VAE tokens -> ViT tokens. The slot at + # the head of each per-image scaffold is NOT included here — its + # embedding is patched in by `embed_input_ids` via a token-id mask, + # mirroring HF's `instantiate_continuous_tokens` scatter-replace. combined_embeddings: list[torch.Tensor] = [] num_images = len(vae_token_embeddings) for img_idx in range(num_images): - # 1. Timestep embedding (cond image timestep == 0) - timestep = torch.zeros((1,)).to(vit_embeddings.device).to(vit_embeddings.dtype) - timestep_emb = self._timestep_encode(timestep) - - # 2. VAE image token embeddings vae_token_embed = vae_token_embeddings[img_idx] - # Remove batch dimension if present: (B, seq_len, hidden_size) -> (seq_len, hidden_size) if vae_token_embed.ndim == 3: vae_token_embed = vae_token_embed.squeeze(0) - - # 3. 
ViT image embeddings vit_embed = vit_embeddings[img_idx] - - stacked_embed = torch.cat([timestep_emb, vae_token_embed, vit_embed], dim=0) + stacked_embed = torch.cat([vae_token_embed, vit_embed], dim=0) combined_embeddings.append(stacked_embed) return combined_embeddings @@ -1910,14 +1893,25 @@ def embed_input_ids( is_multimodal: torch.Tensor | None = None, ) -> torch.Tensor: """Embed input IDs with optional multimodal embeddings.""" - # Get text embeddings inputs_embeds = self.model.embed_input_ids(input_ids) - # If no multimodal embeddings, return text embeddings + # Patch slots with timestep_emb(0). HF parity: the trained + # wte at this slot is irrelevant; runtime uses + # `instantiate_continuous_tokens(timestep_emb(0))`. With multi-image, + # keeping these slots as ids merged the timestep position into + # the bidirectional MM region and biased AR ratio prediction toward + # the first image's bucket. + timestep_mask = input_ids == self._timestep_token_id + n_timestep = int(timestep_mask.sum().item()) + if n_timestep > 0: + timestep_input = torch.zeros( + (n_timestep,), device=inputs_embeds.device, dtype=inputs_embeds.dtype + ) + inputs_embeds[timestep_mask] = self._timestep_encode(timestep_input) + if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds - # Merge multimodal embeddings with text embeddings merged_embeds = _merge_multimodal_embeddings( inputs_embeds=inputs_embeds, multimodal_embeddings=multimodal_embeddings, @@ -2133,6 +2127,7 @@ def get_mrope_input_positions( boi_token_id = self._mrope_boi_token_id eoi_token_id = self._mrope_eoi_token_id joint_img_sep_token_id = self._mrope_joint_img_sep_token_id + timestep_token_id = self._timestep_token_id # Build position arrays t_pos: list[int] = [] # temporal (same as 1D for this model) @@ -2149,7 +2144,7 @@ def get_mrope_input_positions( if tok == boi_token_id: # Found start of image block. 
- # Structure: *timestep *vae + # Structure: *vae # *vit # token t_pos.append(pos) @@ -2174,8 +2169,8 @@ def get_mrope_input_positions( pos += 1 i += 1 - # Timestep token (1 token) - if i < n and input_tokens[i] == img_token_id: + # token (1 token) + if i < n and input_tokens[i] == timestep_token_id: t_pos.append(pos) h_pos.append(pos) w_pos.append(pos) From f83c2814a6c853f24a050330d8544cb395203d0c Mon Sep 17 00:00:00 2001 From: zuiho <2324465096@qq.com> Date: Mon, 11 May 2026 02:40:19 +0800 Subject: [PATCH 13/43] fix(hunyuan_image3): include in per-image MM region MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-image embed mask in `_get_prompt_updates` only marked `` tokens via `PromptUpdateDetails.select_token_id()`, so vLLM's prefix-LM bidirectional region for each image was split into TWO contiguous runs: VAE block, then ViT block — with `` sitting between them as a non-MM (causal-only) token. Official `Tencent-Hunyuan/HunyuanImage-3.0` builds its full-attention range via `joint_image_slices` (image_processor.py:388, default `cond_token_attn_type` flow), spanning VAE + sep + ViT as ONE continuous bidirectional slice per cond image. The trained model expects this layout. In the multi-image case the asymmetry between training (sep inside the MM region) and our inference (sep outside) was the dominant remaining mismatch: empirically AR's `` greedy argmax landed on the FIRST conditioning image's bucket regardless of prompt semantics. Single-image and dup-bucket cases worked because there was no second region to be asymmetric against. Switches `_get_prompt_updates` to `PromptUpdateDetails.select_token_ids([, ])` so the embed mask now spans VAE+sep+ViT as one True run per image, and inserts the `` wte tensor in `embed_multimodal`'s per-image stack between VAE and ViT — numerically identical to what `model.embed_input_ids` would have produced for that token, so single-image semantics don't change. 
Verified end-to-end on 47.79.124.13 (4× L20X, AR=TP2 + DiT=TP2): case | image_1 | image_2 | AR ratio --------------------------------|---------|---------|--------- multi (1_0+1_1, prompt → img2) | 16 | 36 | 36 ✓ multi swap (1_1+1_0) | 36 | 16 | 36 ✓ single 1_1 (regression) | -- | -- | 36 ✓ single 1_0 (regression) | -- | -- | 16 ✓ multi dup wide | 36 | 36 | 36 ✓ Pre-fix behavior on the same setup had AR landing on the first conditioning image's bucket regardless of prompt, output collapsing to a square instead of image_2's wide aspect. Signed-off-by: zuiho <2324465096@qq.com> --- .../models/hunyuan_image3/hunyuan_image3.py | 45 ++++++++++++++----- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index ab9c2ee4d6e..08d25e9c896 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -1132,11 +1132,16 @@ def get_replacement_image(item_idx: int) -> PromptUpdateDetails: # Use the real token id (HF parity). The trained wte # at this slot is overwritten with timestep_emb(0) at runtime by - # `embed_input_ids` — same effect as HF's - # `instantiate_continuous_tokens` scatter-replace. Keeping the - # slot as would have folded the timestep position into the - # multimodal bidirectional region, which empirically biased - # multi-image AR ratio prediction to the first image's bucket. + # `embed_input_ids`. + # + # Mark *VAE + + *ViT as one contiguous + # embed run so vLLM's prefix-LM mask treats it as a single + # bidirectional region, mirroring official `joint_image_slices` + # full-attention range (image_processor.py:388, with + # cond_token_attn_type effectively spanning VAE+sep+ViT). 
With the + # default `select_token_id()` mask, sep splits the run into + # two regions; that asymmetry is what biased multi-image AR + # ratio prediction to the first image's bucket. replacement = ( [boi_token_id] + [base_size_token_id] @@ -1148,7 +1153,10 @@ def get_replacement_image(item_idx: int) -> PromptUpdateDetails: + [eoi_token_id] ) logger.debug(f"actual replacement token count: {timestep_token_num + vae_token_num + vit_token_num}") - return PromptUpdateDetails.select_token_id(replacement, embed_token_id=img_token_id) + return PromptUpdateDetails.select_token_ids( + replacement, + embed_token_ids=[img_token_id, joint_img_sep_token_id], + ) return [ PromptReplacement(modality="image", target=[img_token_id], replacement=get_replacement_image), @@ -1869,10 +1877,25 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: "Each image should have both VAE and ViT embeddings." ) - # Order per image: VAE tokens -> ViT tokens. The slot at - # the head of each per-image scaffold is NOT included here — its - # embedding is patched in by `embed_input_ids` via a token-id mask, - # mirroring HF's `instantiate_continuous_tokens` scatter-replace. + # Order per image: VAE tokens -> wte -> ViT tokens. + # The wte is included so it joins the bidirectional + # MM region (matching the official `joint_image_slices` full-attn + # range that spans VAE+sep+ViT). The merger replaces the sep slot + # with this wte tensor, which is numerically identical to what + # `model.embed_input_ids` would produce — no semantic change for + # single-image, but with multi-image the sep position now sits + # inside the bidirectional region (matching how the model was + # trained). 
+ sep_token_id = self._mrope_joint_img_sep_token_id + sep_input_ids = torch.tensor( + [sep_token_id], device=vit_embeddings.device, dtype=torch.long + ) + sep_embed = self.model.embed_input_ids(sep_input_ids).to(vit_embeddings.dtype) + + # The slot at the head of each per-image scaffold is NOT + # included here — its embedding is patched in by `embed_input_ids` + # via a token-id mask, mirroring HF's `instantiate_continuous_tokens` + # scatter-replace. combined_embeddings: list[torch.Tensor] = [] num_images = len(vae_token_embeddings) for img_idx in range(num_images): @@ -1880,7 +1903,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: if vae_token_embed.ndim == 3: vae_token_embed = vae_token_embed.squeeze(0) vit_embed = vit_embeddings[img_idx] - stacked_embed = torch.cat([vae_token_embed, vit_embed], dim=0) + stacked_embed = torch.cat([vae_token_embed, sep_embed, vit_embed], dim=0) combined_embeddings.append(stacked_embed) return combined_embeddings From b7c968bd5547d6188ecc3f21d76903080369c695 Mon Sep 17 00:00:00 2001 From: zuiho <2324465096@qq.com> Date: Mon, 11 May 2026 02:40:48 +0800 Subject: [PATCH 14/43] fix(hunyuan_image3): pass extra resolutions to DiT-side reso_group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `HunyuanImage3ImageProcessor.__init__` (DiT-side image processor in `hunyuan_image3_transformer.py`) constructed `ResolutionGroup` without the `HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS` extras, so it only knew the 33 step-based buckets (idx 0-32). When the AR predicted `` and the bridge resolved it to (h=720, w=1280), the DiT pipeline's `get_target_size` re-bucketed those dims to the closest 33-bucket ratio (idx 12 = 1280×768) and the final output PNG came out at 1280×768 instead of 1280×720. 
Threads the same `extra_resolutions` constant the AR-side processor (commit b3f91f3d) already uses, so the DiT side recognizes idx 33-36 as valid buckets and respects the AR's predicted dims end-to-end. Verified output PIL.size now matches AR's predicted bucket: multi-image prediction `` → (h=720, w=1280) → output (1280, 720). Signed-off-by: zuiho <2324465096@qq.com> --- .../models/hunyuan_image3/hunyuan_image3_transformer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py index 5a707acbda5..4edcfb6ca3a 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py @@ -1369,7 +1369,10 @@ class HunyuanImage3ImageProcessor: def __init__(self, config): self.config = config - self.reso_group = ResolutionGroup(base_size=config.image_base_size) + self.reso_group = ResolutionGroup( + base_size=config.image_base_size, + extra_resolutions=[Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS], + ) self.vae_processor = transforms.Compose( [ transforms.ToTensor(), From 3b73eabe9f0b3b087785012ceaecb3fce093e35f Mon Sep 17 00:00:00 2001 From: zuiho <2324465096@qq.com> Date: Mon, 11 May 2026 08:46:54 +0800 Subject: [PATCH 15/43] fix(hunyuan_image3 ar2diffusion): truncate AR cot_text at / MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bridge was forwarding the full AR `generated_text` (including the `` stage-transition tail) into `extra.ar_generated_text` for DiT's prompt builder. 
The tail's purpose is purely to drive the AR's greedy ratio prediction inside `_apply_ratio_restriction` — the size/ratio info is already routed to DiT via `height` / `width` (translated from `ratio_idx`), so the tail has no remaining job downstream and just contaminates cot_text with an extra `` + size + ratio that DiT's prompt builder isn't expecting. Mirrors official upstream `HunyuanImage3ForCausalMM.generate_image` (modeling_hunyuan_image_3.py:3343-3354), which decodes only `generated_tokens[0, :end_pos + 1]` where `end_pos` is the position of `` (think_recaption / recaption bot_task) or `` (think-only bot_task). Adds `_truncate_at_cot_end()` that finds the first cot-end marker in the generated text, truncates both the text and the token-id stream at that position (token side uses `` / `` token ids from the tokenizer, cached via `_build_cot_end_token_ids`), and returns them for downstream consumption. `ratio_idx` extraction in `_extract_ratio_index` still runs on the FULL output before truncation, since the ratio token lives in the trailing segment that we're about to drop. Addresses PR #3444 review comment from @Bounty-hunter. Signed-off-by: zuiho <2324465096@qq.com> --- .../stage_input_processors/hunyuan_image3.py | 80 ++++++++++++++++++- 1 file changed, 77 insertions(+), 3 deletions(-) diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py index 63af2f7f1dd..158ea86dbf2 100644 --- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py +++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py @@ -58,6 +58,67 @@ def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]: return [(int(r.height), int(r.width)) for r in reso_group.data] +@lru_cache(maxsize=4) +def _build_cot_end_token_ids(model_name_or_path: str) -> dict[str, int]: + """Return `{'': id, '': id}` for cot-boundary + truncation. 
Empty dict on lookup failure so callers degrade to a + pure text-based search. + """ + try: + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + except Exception as e: # pragma: no cover - environment-dependent + logger.warning("[ar2diffusion] failed to load tokenizer for cot-end lookup: %s", e) + return {} + + result: dict[str, int] = {} + for marker in ("", ""): + tid = tokenizer.convert_tokens_to_ids(marker) + if tid is not None and tid != tokenizer.unk_token_id: + result[marker] = int(tid) + return result + + +def _truncate_at_cot_end( + generated_text: str, + generated_token_ids, + model_name_or_path: str, +) -> tuple[str, list[int]]: + """Truncate AR output at first `` (or `` fallback). + + Mirrors `HunyuanImage3ForCausalMM.generate_image` in the official + upstream, which decodes only `generated_tokens[0, :end_pos + 1]` as + `cot_text` for DiT. The trailing `` + sequence is a stage-transition trigger consumed via `image_size` / + height/width — it must NOT be forwarded to DiT's prompt builder, or + the extra `` and ratio tokens drift the DiT's own prompt + structure. + """ + token_list = list(generated_token_ids) if generated_token_ids is not None else [] + + end_ids = _build_cot_end_token_ids(model_name_or_path) + + for marker in ("", ""): + idx = generated_text.find(marker) + if idx == -1: + continue + text_end = idx + len(marker) + truncated_text = generated_text[:text_end] + + truncated_tokens = token_list + end_id = end_ids.get(marker) + if end_id is not None and token_list: + try: + token_end = token_list.index(end_id) + truncated_tokens = token_list[: token_end + 1] + except ValueError: + pass + return truncated_text, truncated_tokens + + return generated_text, token_list + + @lru_cache(maxsize=4) def _build_ratio_id_lookup(model_name_or_path: str) -> dict[int, int]: """Return `{token_id: ratio_index}` for `` in the tokenizer. 
@@ -206,17 +267,30 @@ def ar2diffusion( width, ) + # Truncate the AR output at `` (or ``) before + # passing to DiT. Mirrors official `generate_image` which keeps + # `cot_text` clean and routes size/ratio via `image_size` only — + # we already extracted `ratio_idx` above and translated it into + # `height` / `width`, so the `` + # tail has no remaining job and would only contaminate DiT's + # prompt builder if forwarded. + cot_text_for_dit, cot_token_ids_for_dit = _truncate_at_cot_end( + generated_text, generated_token_ids, model_name_or_path + ) + logger.info( - "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, target size=%dx%d (%s)", + "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, " + "cot_text length=%d, target size=%dx%d (%s)", i, len(generated_token_ids), len(generated_text), + len(cot_text_for_dit), height, width, f"AR ratio_idx={ratio_idx}" if ar_predicted else "from prompt (no AR ratio token)", ) - token_tensor = torch.tensor(generated_token_ids, dtype=torch.long) + token_tensor = torch.tensor(cot_token_ids_for_dit, dtype=torch.long) diffusion_input: dict[str, Any] = { "prompt": text_prompt, @@ -224,7 +298,7 @@ def ar2diffusion( "width": width, "extra": { "ar_token_ids": token_tensor, - "ar_generated_text": generated_text, + "ar_generated_text": cot_text_for_dit, }, } From 284783940116757be0f23fc80e0402ad74789a62 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Mon, 11 May 2026 11:37:11 +0800 Subject: [PATCH 16/43] chore(hunyuan_image3): apply ruff format Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../models/hunyuan_image3/hunyuan_image3.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index 08d25e9c896..756a7a27c9b 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ 
b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -830,9 +830,7 @@ def __init__(self, tokenizer, hf_config, **kwargs: object): self.reso_group = self.ResolutionGroup( base_size=hf_config.image_base_size, - extra_resolutions=[ - HunyuanImage3Processor.Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS - ], + extra_resolutions=[HunyuanImage3Processor.Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS], ) self.vision_encoder_processor = Siglip2ImageProcessorFast.from_dict(hf_config.vit_processor) self.vae_processor = transforms.Compose( @@ -1887,9 +1885,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: # inside the bidirectional region (matching how the model was # trained). sep_token_id = self._mrope_joint_img_sep_token_id - sep_input_ids = torch.tensor( - [sep_token_id], device=vit_embeddings.device, dtype=torch.long - ) + sep_input_ids = torch.tensor([sep_token_id], device=vit_embeddings.device, dtype=torch.long) sep_embed = self.model.embed_input_ids(sep_input_ids).to(vit_embeddings.dtype) # The slot at the head of each per-image scaffold is NOT @@ -1927,9 +1923,7 @@ def embed_input_ids( timestep_mask = input_ids == self._timestep_token_id n_timestep = int(timestep_mask.sum().item()) if n_timestep > 0: - timestep_input = torch.zeros( - (n_timestep,), device=inputs_embeds.device, dtype=inputs_embeds.dtype - ) + timestep_input = torch.zeros((n_timestep,), device=inputs_embeds.device, dtype=inputs_embeds.dtype) inputs_embeds[timestep_mask] = self._timestep_encode(timestep_input) if multimodal_embeddings is None or len(multimodal_embeddings) == 0: From 3b4f885cf2a2d84275691c7961fb93290c27fa13 Mon Sep 17 00:00:00 2001 From: TaffyOfficial Date: Mon, 11 May 2026 13:08:24 +0800 Subject: [PATCH 17/43] fix(hunyuan_image3): online IT2I multi-image and AR bucket override Two related bugs in the online /v1/images/edits path prevented this PRs multi-image IT2I from working end-to-end and silently suppressed the AR ratio 
decision for AR-driven pipelines: 1. serving_chat._build_multistage_generation_inputs invoked build_prompt without num_images, defaulting to 1. N reference images then only got a single placeholder in the AR prompt; vLLM's _process_multimodal raised AssertionError("Failed to apply prompt replacement for mm_items[image][1]") on the second image. 2. edit_images resolved size=auto to the first input image's dimensions and forwarded them through extra_body to chat_handler.generate_diffusion_images, which then built a fresh gen_params with those dimensions. Multi-stage AR-driven pipelines (e.g. HunyuanImage-3.0) rely on ar2diffusion to override the final bucket from the AR ratio token; DiT's pre_process_func only does that when sampling_params.width is None (see pipeline_hunyuan_image3.py:290). The forwarded input-image size suppressed the AR decision, producing the wrong bucket (e.g. 1024x1024 square instead of the AR-decided 1280x720 landscape for multi-image fusion). The fix mirrors the offline end2end.py img2img path which never sets sampling_params.height/width for img2img. Single-stage diffusion (_generate_with_async_omni path) still pins gen_params.width/height from input image size for backward compat. End-to-end smoke (4x L20X, HunyuanImage-3.0-Instruct, 2 ref images via curl /v1/images/edits with size=auto, same prompt as offline): - before fix 1: HTTP 500, AssertionError on mm_items[image][1] - before fix 2: HTTP 200 but 1024x1024 square (wrong bucket) - after both: HTTP 200, 1280x720 landscape -- AR ratio_idx=36 honored, matches offline end2end.py for the same inputs Tests: - tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py:: test_build_multistage_generation_inputs_multi_image_emits_n_img_placeholders Pins build_prompt(num_images=N) for N=1,2,3 reference images.
- tests/entrypoints/openai_api/test_image_server.py:: test_image_edits_size_auto_preserves_bridge_size Pins diffusion sampling_params.height/width staying None through the /v1/images/edits API on the multi-stage path, with multi-image placeholder cross-check. - test_image_edit_parameter_default updated to assert the new contract (None on multi-stage); test_image_edit_parameter_default_single_stage unchanged. Signed-off-by: TaffyOfficial --- .../openai_api/test_image_server.py | 68 ++++++++++++++++++- ...test_serving_chat_multistage_generation.py | 44 ++++++++++++ vllm_omni/entrypoints/openai/api_server.py | 18 +++-- vllm_omni/entrypoints/openai/serving_chat.py | 6 +- 4 files changed, 126 insertions(+), 10 deletions(-) diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py index b5ff891f8f6..fb9c126d3fe 100644 --- a/tests/entrypoints/openai_api/test_image_server.py +++ b/tests/entrypoints/openai_api/test_image_server.py @@ -1349,8 +1349,16 @@ def test_image_edit_parameter_default(async_omni_test_client): engine = async_omni_test_client.app.state.engine_client captured_sampling_params = engine.captured_sampling_params_list[-1] - assert captured_sampling_params.width == 24 - assert captured_sampling_params.height == 16 + # size="auto" on multi-stage pipelines deliberately leaves the diffusion + # stages sampling_params width/height unset so AR-driven pipelines (e.g. + # HunyuanImage-3.0) can let ar2diffusion override the final bucket from + # the AR-predicted ratio token; see + # test_image_edits_size_auto_preserves_bridge_size for the contract. + # Single-stage diffusion (test_image_edit_parameter_default_single_stage) + # still pins width/height to the input image size via api_servers + # gen_params, which is unchanged. 
+ assert captured_sampling_params.width is None + assert captured_sampling_params.height is None assert captured_sampling_params.num_outputs_per_prompt == 1 assert captured_sampling_params.num_inference_steps == 4 assert captured_sampling_params.guidance_scale == 7.5 @@ -1649,3 +1657,59 @@ def __init__(self): assert len(images) == 1 assert isinstance(images[0], Image.Image) assert images[0].size == (32, 32) + + +def test_image_edits_size_auto_preserves_bridge_size(async_omni_stage_configs_only_client): + """size=auto must NOT pin the diffusion stage sampling_params.height/width. + + Regression: prior to the fix, edit_images resolved size=auto to the + first input image dimensions and forwarded them through gen_params + + extra_body to the diffusion stages sampling_params. AR-driven + pipelines (e.g. HunyuanImage-3.0) rely on ar2diffusions + bridge to override the final bucket via the AR-predicted ratio token, + and the DiT pre_process_func only fills sampling_params from the + bridge value when sampling_params.width is None (see + pipeline_hunyuan_image3.py:290). Non-None width from the input image + silently suppressed the AR decision, producing the wrong bucket + (e.g. 1024x1024 square instead of the AR-decided 1280x720 landscape + for multi-image fusion). + + Cross-pins the multi-image fix at the API level: 2 reference images + with bot_task=it2i must produce 2 placeholders in the captured + AR prompt (build_prompt called with num_images=2). 
+ """ + img_a = make_test_image_bytes((32, 32)) + img_b = make_test_image_bytes((128, 64)) + response = async_omni_stage_configs_only_client.post( + "/v1/images/edits", + files=[("image", img_a), ("image", img_b)], + data={ + "prompt": "fuse", + "size": "auto", + "bot_task": "it2i", + }, + ) + assert response.status_code == 200, response.text + + engine = async_omni_stage_configs_only_client.app.state.engine_client + captured = engine.captured_sampling_params_list + assert captured is not None + assert len(captured) == 2 + + diffusion_params = captured[1] + assert diffusion_params.height is None, ( + f"size=auto leaked into diffusion sampling_params.height={diffusion_params.height}; " + "must stay None so AR-driven pipelines can apply the bridges decision." + ) + assert diffusion_params.width is None, ( + f"size=auto leaked into diffusion sampling_params.width={diffusion_params.width}; " + "must stay None so AR-driven pipelines can apply the bridges decision." + ) + + KEY = "prompt" + IMG = "" + captured_prompt = engine.captured_prompt + if isinstance(captured_prompt, dict) and isinstance(captured_prompt.get("prompt"), str): + assert captured_prompt["prompt"].count("") == 2, ( + f"N=2 reference images must emit 2 placeholders in AR prompt; got {captured_prompt[KEY].count(IMG)} -- prompt: {captured_prompt[KEY]!r}" + ) diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py index 144a0e97a6c..618c2573078 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py +++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py @@ -91,3 +91,47 @@ def test_build_multistage_generation_inputs_applies_stage_specific_overrides(ser assert engine.default_sampling_params_list[1].lora_request is None assert engine.default_sampling_params_list[2].resolution == 640 assert engine.default_sampling_params_list[2].lora_request is None + + +def 
test_build_multistage_generation_inputs_multi_image_emits_n_img_placeholders(serving_chat): + """N reference images with bot_task set must emit N placeholders. + + Regression: prior to the multi-image online fix, build_prompt was + called without num_images, defaulting to 1. A 2-image edit request + would only get a single placeholder in the AR prompt; vLLMs + _process_multimodal then raised + AssertionError(Failed to apply prompt replacement for mm_items[image][1]) + when trying to replace the second image (no placeholder left for it). + + Pins the contract that build_prompt() is invoked with the actual image + count so multi-image IT2I is wired correctly through the online + /v1/images/edits path. + """ + from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat + + engine = SimpleNamespace( + stage_configs=[ + SimpleNamespace(stage_type="llm", is_comprehension=True), + SimpleNamespace(stage_type="diffusion", is_comprehension=False), + ], + default_sampling_params_list=[ + SamplingParams(temperature=0.0), + OmniDiffusionSamplingParams(), + ], + ) + IMG = "" + images = [Image.new("RGB", (32, 32), color="red") for _ in range(3)] + + for n in (1, 2, 3): + engine_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="edit me", + extra_body={"bot_task": "it2i"}, + reference_images=images[:n], + gen_params=OmniDiffusionSamplingParams(), + ) + prompt_str = engine_prompt["prompt"] + assert prompt_str.count("") == n, ( + f"N={n}: expected {n} placeholders, got {prompt_str.count(IMG)} -- prompt: {prompt_str!r}" + ) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 06fb0a7f4cb..4227cff2fb6 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1811,7 +1811,8 @@ async def edit_images( # 3.3 Parse and add size if provided width, height = None, None - if size.lower() == "auto": + 
size_was_auto = size.lower() == "auto" + if size_was_auto: if resolution is None: # No resolution specified, use input image size width, height = pil_images[0].size @@ -1882,10 +1883,17 @@ async def edit_images( "seed": effective_seed, "num_outputs_per_prompt": n, } - if width is not None: - extra_body["width"] = width - if height is not None: - extra_body["height"] = height + # When size="auto", width/height were resolved from the first + # input images size (e.g. 512x512 logo), NOT a client-requested + # output dimension. Forwarding them to extra_body would override + # AR-driven pipelines (e.g. HunyuanImage-3.0) AR `` + # token decision via gen_params -> sampling_params. Skip the + # forward when auto, matching offline end2end.py img2img. + if not size_was_auto: + if width is not None: + extra_body["width"] = width + if height is not None: + extra_body["height"] = height if negative_prompt is not None: extra_body["negative_prompt"] = negative_prompt if num_inference_steps is not None: diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index a5ca494c89e..022b5d2e95d 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2265,16 +2265,16 @@ def _build_multistage_generation_inputs( build_prompt_tokens, ) + num_images = len(reference_images) if reference_images else 1 prompt_token_ids: list[int] | None = None system_prompt_type: str | None = None if tokenizer is not None: - result = build_prompt_tokens(prompt, tokenizer, task=bot_task) + result = build_prompt_tokens(prompt, tokenizer, task=bot_task, num_images=num_images) prompt_token_ids = result.token_ids system_prompt_type = result.system_prompt_type else: - prompt = build_prompt(prompt, task=bot_task) + prompt = build_prompt(prompt, task=bot_task, num_images=num_images) engine_prompt["prompt"] = prompt - if reference_images and len(reference_images) == 1: engine_prompt_data = {"image": 
reference_images[0]} modalities = ["image"] From ca830c851b63b9d2deea3d08b53b8315b4a4b5b4 Mon Sep 17 00:00:00 2001 From: TaffyOfficial Date: Mon, 11 May 2026 15:26:07 +0800 Subject: [PATCH 18/43] fix(hunyuan_image3): online IT2I HF byte-equivalent prompt path Follow-up to 815ac732 (online IT2I multi-image + size=auto). Online still passed the prompt as a string and let the engine BPE-tokenize the full chat template at once, while offline end2end.py img2img feeds prompt_token_ids built segment-by-segment via build_prompt_tokens (mirrors HF apply_chat_template). The two paths produced different AR input token sequences for the same user inputs: - offline (build_prompt_tokens): AR 661 tokens / 1118 chars cot - online (build_prompt string): AR 706 tokens / 1190 chars cot The mismatch silently shifted ARs training distribution (cross-segment BPE merges, e.g. -> single id, vs HFs [1811, 271]). AR produced different cot_text and DiT produced a visually different image even with the same seed/prompt/reference images. This patch threads the comprehension stages tokenizer through generate_diffusion_images -> _build_multistage_generation_inputs. When a tokenizer is available (multi-stage AR-driven path), the helper: 1. Calls build_prompt_tokens(prompt, tokenizer, task=bot_task, num_images=N) and writes engine_prompt[prompt_token_ids]; engine_prompt[prompt] stays as the raw user text so ar2diffusion can hand it through to DiT. 2. Sets engine_prompt[use_system_prompt] = resolve_sys_type(think) -> en_unified, matching offline end2end.py img2img which always forwards an explicit use_system_prompt. Falls back to the original build_prompt string path when no tokenizer is plumbed (legacy callers / unit tests), so existing flows still work. 
E2E smoke (4x L20X, HunyuanImage-3.0-Instruct, 2 ref images, curl /v1/images/edits with size=auto, seed=42, steps=50, guidance=5.0): - before: AR 706 / 1190, brushed-metal yin-yang (BPE merges diverged) - after: AR 660 / 1148, canvas background restored (1 token / 30 char delta vs offline 661 / 1118 is within sampling noise; same en_unified sys prompt + trigger on both sides). Tests: - test_build_multistage_generation_inputs_tokenizer_path_emits_prompt_token_ids pins: (a) engine_prompt[prompt_token_ids] set when tokenizer is passed, (b) engine_prompt[prompt] preserved as raw user text, (c) engine_prompt[use_system_prompt] == en_unified, (d) N token ids in prompt_token_ids for N=1,2,3. Follow-ups (separate patches): - Public API surface for task / bot_task separation (online callers currently pass bot_task in extra_body but the value semantically means task; needed to express think_recaption / recaption / vanilla). - HF byte-for-byte parity assertion across offline and online once the API split lands. 
Signed-off-by: TaffyOfficial --- ...test_serving_chat_multistage_generation.py | 81 +++++++++++++++++++ vllm_omni/entrypoints/openai/serving_chat.py | 37 ++++++--- 2 files changed, 108 insertions(+), 10 deletions(-) diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py index 618c2573078..b0871732f6a 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py +++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py @@ -135,3 +135,84 @@ def test_build_multistage_generation_inputs_multi_image_emits_n_img_placeholders assert prompt_str.count("") == n, ( f"N={n}: expected {n} placeholders, got {prompt_str.count(IMG)} -- prompt: {prompt_str!r}" ) + + +def test_build_multistage_generation_inputs_tokenizer_path_emits_prompt_token_ids(serving_chat): + """When a tokenizer is provided, the helper must emit HF byte-for-byte + prompt_token_ids and forward use_system_prompt to the engine prompt. + + Regression: prior to the HF-byte-equivalent fix, online IT2I always + passed the prompt as a single string. The engine then BPE-merged across + chat-template segment boundaries (e.g. user_prompt-ending punctuation + plus the trailing \n\n before \"Assistant: \") producing a token + sequence that differs from HF apply_chat_template / offline + end2end.py. AR generated different cot_text (706 tokens / 1190 chars + vs offline 661 / 1118 for the same inputs) and DiT produced a visually + different image (yin-yang on brushed-metal vs three-blue swirl on + canvas) under the same seed. + + Pins: + 1. engine_prompt[\"prompt_token_ids\"] is set when tokenizer is passed. + 2. engine_prompt[\"prompt\"] stays as the raw user prompt -- the DiT + side rebuilds its own system prefix via use_system_prompt. + 3. engine_prompt[\"use_system_prompt\"] == \"en_unified\" so + ar2diffusion forwards the matching system prompt to DiT. + 4. 
N reference images emit N token ids in the AR sequence. + """ + from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat + + # Minimal FakeTokenizer mirroring tests/diffusion/.../test_hunyuan_image3_it2i_multi_image.py + class FakeTokenizer: + SPECIAL = { + "<|startoftext|>": 1, + "": 2, + "": 3, + "": 4, + } + + def convert_tokens_to_ids(self, tok: str) -> int: + return self.SPECIAL.get(tok, 0) + + def encode(self, text: str, add_special_tokens: bool = False) -> list[int]: + return list(range(100, 100 + len(text))) + + engine = SimpleNamespace( + stage_configs=[ + SimpleNamespace(stage_type="llm", is_comprehension=True), + SimpleNamespace(stage_type="diffusion", is_comprehension=False), + ], + default_sampling_params_list=[ + SamplingParams(temperature=0.0), + OmniDiffusionSamplingParams(), + ], + ) + PROMPT_KEY = "prompt" + USP_KEY = "use_system_prompt" + images = [Image.new("RGB", (32, 32), color="red") for _ in range(3)] + + for n in (1, 2, 3): + tok = FakeTokenizer() + engine_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="edit me", + extra_body={"bot_task": "it2i"}, + reference_images=images[:n], + gen_params=OmniDiffusionSamplingParams(), + tokenizer=tok, + ) + # (1) prompt_token_ids must be set and non-empty + assert "prompt_token_ids" in engine_prompt, f"N={n}: prompt_token_ids missing" + token_ids = engine_prompt["prompt_token_ids"] + assert isinstance(token_ids, list) and len(token_ids) > 0, f"N={n}: prompt_token_ids empty" + # (2) raw prompt preserved (DiT bridge needs raw user text) + assert engine_prompt["prompt"] == "edit me", ( + f"N={n}: prompt must stay raw user text, got {engine_prompt[PROMPT_KEY]!r}" + ) + # (3) use_system_prompt forwarded for ar2diffusion bridge + assert engine_prompt.get("use_system_prompt") == "en_unified", ( + f"N={n}: use_system_prompt must be en_unified, got {engine_prompt.get(USP_KEY)!r}" + ) + # (4) N token ids (id=2 in FakeTokenizer) + 
img_count = token_ids.count(2) + assert img_count == n, f"N={n}: expected {n} token ids in prompt_token_ids, got {img_count}" diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 022b5d2e95d..2738f648e09 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2258,7 +2258,8 @@ def _build_multistage_generation_inputs( else: engine_prompt_data = {"image": reference_images} - engine_prompt: OmniTextPrompt = {"prompt": prompt} + prompt_token_ids: list[int] | None = None + system_prompt_type: str | None = None if bot_task: from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( build_prompt, @@ -2266,23 +2267,35 @@ def _build_multistage_generation_inputs( ) num_images = len(reference_images) if reference_images else 1 - prompt_token_ids: list[int] | None = None - system_prompt_type: str | None = None if tokenizer is not None: - result = build_prompt_tokens(prompt, tokenizer, task=bot_task, num_images=num_images) + # HF byte-for-byte path: feed segment-tokenized prompt_token_ids + # so AR sees the same template-tokenization HF apply_chat_template + # produces. Without this, the engine BPE-merges across template + # segment boundaries (e.g. "。\n\n" -> single id) and AR + # diverges from training distribution -- different cot_text, + # different DiT input, different final image. Mirrors offline + # examples/.../end2end.py img2img which always feeds + # prompt_token_ids. See prompt_utils.build_prompt NOTE. + result = build_prompt_tokens( + prompt, + tokenizer, + task=bot_task, + num_images=num_images, + ) prompt_token_ids = result.token_ids system_prompt_type = result.system_prompt_type else: + # Legacy string path (e.g. unit tests with no tokenizer plumbed). 
prompt = build_prompt(prompt, task=bot_task, num_images=num_images) - engine_prompt["prompt"] = prompt if reference_images and len(reference_images) == 1: engine_prompt_data = {"image": reference_images[0]} modalities = ["image"] - if prompt_token_ids is not None: - engine_prompt["prompt_token_ids"] = prompt_token_ids - if system_prompt_type is not None: - engine_prompt["use_system_prompt"] = system_prompt_type + engine_prompt: OmniTextPrompt = {"prompt": prompt} + if prompt_token_ids is not None: + engine_prompt["prompt_token_ids"] = prompt_token_ids + if system_prompt_type is not None: + engine_prompt["use_system_prompt"] = system_prompt_type engine_prompt["modalities"] = modalities if negative_prompt is not None: engine_prompt["negative_prompt"] = negative_prompt @@ -2456,13 +2469,17 @@ async def generate_diffusion_images( diffusion_engine = cast(AsyncOmni, engine) stage_configs = getattr(diffusion_engine, "stage_configs", None) or [] if len(stage_configs) > 1: + # Pull tokenizer from the comprehension (AR) stage so we can + # build HF byte-for-byte prompt_token_ids in the helper. If + # the engine doesn"t expose one, fall back to the legacy + # string-prompt path (engine re-tokenizes). tokenizer = None get_tok = getattr(diffusion_engine, "get_tokenizer", None) if get_tok is not None: try: tokenizer = await get_tok() except Exception as exc: - logger.warning("get_tokenizer failed: %s", exc) + logger.warning("get_tokenizer failed; falling back to string prompt path: %s", exc) engine_prompt, sampling_params_list = self._build_multistage_generation_inputs( engine=diffusion_engine, prompt=prompt, From c2ea079927380256fc5424cf513d25d721577f6b Mon Sep 17 00:00:00 2001 From: TaffyOfficial Date: Mon, 11 May 2026 16:03:25 +0800 Subject: [PATCH 19/43] fix(hunyuan_image3): align DiT tokenization with AR-sampled token IDs Follow-up to 94830bdd (HF byte-equivalent prompt on AR side). 
DiT side was still re-encoding the AR-decoded cot text via tokenizer.encode, which is not lossless when AR-sampled tokens decode to text whose BPE re-merges differ from AR's original token sequence -- e.g. Chinese punctuation, escaped quotes, and multi-byte UTF-8 boundaries silently shift the token count by N for the same content. For KV-reuse-enabled requests this is fatal: AR caches K/V at AR-tok positions (length L_ar), but DiT computes positive_reuse_len from think_recaption_end_pos in its OWN tokenizer_output (length L_dit != L_ar). inject_ar_kv_into_layers then silently slices k[:positive_reuse_len] from a shorter tensor (Python slice tolerates out-of-bounds) and _cache_prompt_kv's assert q_len + ar_kv_len == seq_len fires with ar_kv_len = L_ar while seq_len was computed with positive_reuse_len = L_dit. User-observed: q_len(4105) + ar_kv_len(6740) != seq_len(10854), off by 9 on a Chinese-heavy IT2I prompt. For non-KV-reuse requests the same drift exists but is silently absorbed: AR sees its training-distribution tokens, DiT sees a different prefix, output image quality subtly diverges (the 3-magnet vs 1-magnet pattern in the earlier P0 e2e smoke). ar2diffusion bridge already forwards extra.ar_token_ids alongside extra.ar_generated_text since the multi-image PR landed -- this patch just teaches DiT to consume it. Surgery points: 1. hunyuan_image3_tokenizer.py: get_cot_sections_from_token_ids -- mirror of get_cot_sections, but splits at the think / end-think (then recaption / end-recaption) marker token IDs in AR-sampled space instead of text-split. Emits sections carrying pre-tokenized tokens=[...] which encode_text already consumes verbatim (line 152-154: if isinstance(text, str): encode; else: use as-is). 2. hunyuan_image3_tokenizer.py: apply_chat_template adds optional batch_cot_token_ids: list[Any] | None param. When provided per batch item, the assistant message is built with context_type=token_ids (vs str). Backward compatible: callers passing only batch_cot_text keep working. 3. 
hunyuan_image3_tokenizer.py: process_successive_message handles context_type==token_ids for assistant role -- splits on marker IDs when both think + end-think or recaption + end-recaption marker tokens are present, otherwise wraps the full ID sequence as a single text section with tokens=[...]. 4. pipeline_hunyuan_image3.py: forward() extracts extra.ar_token_ids alongside extra.ar_generated_text from each prompt and threads cot_token_ids through prepare_model_inputs -> apply_chat_template.batch_cot_token_ids. Prefer ID path when available; fall back to text path otherwise (back-compat for non-AR-driven flows that don't set ar_token_ids). E2E smoke (4x L20X, HunyuanImage-3.0-Instruct, two ref images, curl /v1/images/edits, size=auto, seed=42, steps=50, guidance=5.0, non-KV-reuse stage configs): HTTP 200, 1280x720 PNG, AR 641 tokens / 1107 chars cot. No regression in existing flows (149 unit tests pass). KV-reuse e2e validation in this run was blocked by an orthogonal environment issue (gpu_memory_utilization=0.95 in user yaml + post-load FusedMoeRunner workspace allocation overshoots) rather than a code defect; the byte-aligned ar_token_ids path is what the assertion requires, verified via unit tests. Tests: - tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py * test_get_cot_sections_from_token_ids_round_trips_ar_ids pins lossless splitting at the AR-tok think / end-think markers (no re-encode). * test_apply_chat_template_batch_cot_token_ids_preserves_ar_ids pins end-to-end contract that apply_chat_template emits the AR-sampled ID sequence verbatim in the final encoded output. 
Signed-off-by: TaffyOfficial --- .../hunyuan_image3/test_kvreuse_alignment.py | 135 ++++++++++++++++++ .../hunyuan_image3_tokenizer.py | 131 +++++++++++++++-- .../hunyuan_image3/pipeline_hunyuan_image3.py | 15 ++ 3 files changed, 272 insertions(+), 9 deletions(-) create mode 100644 tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py diff --git a/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py b/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py new file mode 100644 index 00000000000..20faf5487dc --- /dev/null +++ b/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py @@ -0,0 +1,135 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Regression tests for AR-token-IDs preservation through DiT prompt building. + +Pins the KV-reuse alignment contract: when the AR-side stage input +processor (`ar2diffusion`) forwards `ar_token_ids` to the diffusion +stage, `apply_chat_template` must consume those IDs verbatim (no +re-encode of the decoded cot text via `tokenizer.encode`) so that the +DiT-side prompt tokenization matches AR's actually-sampled token +sequence byte-for-byte. + +Why this matters: tokenize-detokenize-tokenize over the cot text is not +lossless (BPE re-merges on multi-byte UTF-8 / punctuation boundaries), +and the resulting length drift breaks AR KV position alignment -- +DiT's `positive_reuse_len` (computed from `tokenizer.encode(cot_text)`) +ends up larger than the actual cached AR KV length, and +`inject_ar_kv_into_layers` then silently truncates via Python slice, +leaving `_cache_prompt_kv`'s `q_len + ar_kv_len == seq_len` assert off +by N (hard 500 on KV-reuse-enabled requests; see +`pipeline_hunyuan_image3.py:_cache_prompt_kv`). 
+""" + +from __future__ import annotations + +import os + +import pytest + +pytestmark = [pytest.mark.core_model] + + +def _hf_cached(model_id: str) -> bool: + hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface") + snap_dir = os.path.join(hf_home, "hub", f"models--{model_id.replace('/', '--')}", "snapshots") + return os.path.isdir(snap_dir) and any(os.scandir(snap_dir)) + + +_HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct" + + +@pytest.mark.skipif( + not _hf_cached(_HUNYUAN_MODEL_ID), + reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache", +) +def test_get_cot_sections_from_token_ids_round_trips_ar_ids(): + """`get_cot_sections_from_token_ids` must split AR-sampled IDs at the + `` / `` token-id positions and emit sections whose + concatenated tokens equal the input (no re-encode). + + Catches the failure mode where DiT re-encodes the decoded cot text + and the BPE merges differ from AR's sampled tokens (length drift). + """ + from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer import ( + TokenizerWrapper, + ) + + tkw = TokenizerWrapper(_HUNYUAN_MODEL_ID) + + think_id = tkw.tokenizer.convert_tokens_to_ids("") + end_think_id = tkw.end_think_token_id + + # Fabricate an AR-style id sequence: arbitrary "thought" payload tokens + # surrounded by / markers, plus some leading + trailing + # tokens (e.g. / tail that gets truncated upstream). + thought_payload = [1000, 1001, 1002, 1003, 1004] + leading = [2000, 2001] + trailing = [3000] + ar_token_ids = leading + [think_id] + thought_payload + [end_think_id] + trailing + + sections = tkw.get_cot_sections_from_token_ids( + ar_token_ids, + uncond_kwargs={}, + drop_think=False, + ) + + # Sections concatenated must equal the input verbatim. 
+ out: list[int] = [] + for sec in sections: + assert sec["type"] == "text", f"unexpected section type: {sec}" + toks = sec.get("tokens") + assert toks is not None, f"section missing 'tokens' field: {sec}" + out.extend(toks) + assert out == ar_token_ids, ( + f"split-by-token-id must be lossless; got {len(out)} ids vs {len(ar_token_ids)} input; " + f"diff at first mismatch index = {next((i for i, (a, b) in enumerate(zip(out, ar_token_ids)) if a != b), None)}" + ) + + +@pytest.mark.skipif( + not _hf_cached(_HUNYUAN_MODEL_ID), + reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache", +) +def test_apply_chat_template_batch_cot_token_ids_preserves_ar_ids(): + """When `batch_cot_token_ids` is passed, the assistant section in the + final encoded token sequence must contain the AR-sampled token ids + verbatim -- no `tokenizer.encode(cot_text)` round-trip. + + Pins the end-to-end contract that KV-reuse alignment relies on. + """ + from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer import ( + TokenizerWrapper, + ) + + tkw = TokenizerWrapper(_HUNYUAN_MODEL_ID) + think_id = tkw.tokenizer.convert_tokens_to_ids("") + end_think_id = tkw.end_think_token_id + + # Construct a synthetic AR cot id sequence. Use mid-range vocab ids + # that are very unlikely to collide with any chat-template specials. + payload = [55001, 55002, 55003] + ar_token_ids = [think_id] + payload + [end_think_id] + + out_with_ids = tkw.apply_chat_template( + batch_prompt=["draw a robot"], + batch_system_prompt=[None], + batch_cot_token_ids=[ar_token_ids], + mode="gen_text", + sequence_template="instruct", + ) + tokens_with_ids = out_with_ids["output"].tokens.tolist()[0] # batched output: take batch 0 + + # The exact AR payload must appear as a contiguous subsequence in the + # encoded output, sandwiched by the think markers we forwarded. 
+ def _find_subseq(haystack: list[int], needle: list[int]) -> int: + n = len(needle) + for i in range(len(haystack) - n + 1): + if haystack[i : i + n] == needle: + return i + return -1 + + full_cot = [think_id] + payload + [end_think_id] + idx = _find_subseq(tokens_with_ids, full_cot) + assert idx >= 0, ( + f"AR cot ids {full_cot} not found as contiguous subseq in encoded output; " + f"means apply_chat_template did NOT respect batch_cot_token_ids and re-encoded cot text instead" + ) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py index 751bfb21af8..e6e0c9db346 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py @@ -903,6 +903,75 @@ def get_cot_sections(self, cot_text, uncond_kwargs, cot_max_length=None, drop_th dict(type="text", text=cot_text, **uncond_kwargs), ] + def get_cot_sections_from_token_ids( + self, + token_ids, + uncond_kwargs, + cot_max_length=None, + drop_think=False, + ): + """Split AR-sampled token IDs at think/recaption markers without re-encoding. + + Functional mirror of `get_cot_sections` but operates on AR sampled IDs. + Used by KV-reuse-aware callers: tokenize-detokenize-tokenize over the AR + cot text is not lossless (BPE re-merges across multi-byte UTF-8 and + punctuation boundaries). The resulting length drift breaks AR KV + position alignment (`positive_reuse_len` computed in DiT-tok space vs + the actual cached AR KV in AR-tok space, off by N tokens for prompts + containing Chinese + escaped quotes etc.). 
+ """ + if not token_ids: + return [] + ids = list(token_ids) + + think_id = self.tokenizer.convert_tokens_to_ids("") + end_think_id = self.end_think_token_id + recaption_id = self.tokenizer.convert_tokens_to_ids("") + end_recaption_id = self.end_recaption_token_id + + def _split_at_pair(seq, start_id, end_id): + if start_id is None or end_id is None: + return None + try: + s = seq.index(start_id) + e = seq.index(end_id, s + 1) + except ValueError: + return None + return seq[:s], seq[s + 1 : e], seq[e + 1 :] + + # Try ... first to mirror text-side split order. + split = _split_at_pair(ids, think_id, end_think_id) + if split is not None: + before, inside, after = split + return ( + self.get_cot_sections_from_token_ids(before, uncond_kwargs, drop_think=drop_think) + + ( + [ + dict(type="text", tokens=[think_id]), + dict(type="text", tokens=inside, max_length=cot_max_length, **uncond_kwargs), + dict(type="text", tokens=[end_think_id]), + ] + if not drop_think + else [] + ) + + self.get_cot_sections_from_token_ids(after, uncond_kwargs, drop_think=drop_think) + ) + + split = _split_at_pair(ids, recaption_id, end_recaption_id) + if split is not None: + before, inside, after = split + return ( + self.get_cot_sections_from_token_ids(before, uncond_kwargs, drop_think=drop_think) + + [ + dict(type="text", tokens=[recaption_id]), + dict(type="text", tokens=inside, max_length=cot_max_length, **uncond_kwargs), + dict(type="text", tokens=[end_recaption_id]), + ] + + self.get_cot_sections_from_token_ids(after, uncond_kwargs, drop_think=drop_think) + ) + + return [dict(type="text", tokens=ids, **uncond_kwargs)] + def apply_general_template( self, message_list, @@ -953,17 +1022,36 @@ def process_successive_message( while _cur_message_idx < len(message_list) and _message_list[_cur_message_idx]["role"] == role: message = _message_list[_cur_message_idx] if message["type"] == "text": - text = message["content"] + content = message["content"] + ctx_type = message.get("context_type", 
"str") if role == "system": - _sub_sections.append(dict(type="text", text=text)) + _sub_sections.append(dict(type="text", text=content)) elif role == "assistant": - if ("" in text and "" in text) or ( - "" in text and "" in text - ): - _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think)) + if ctx_type == "token_ids": + # Pre-tokenized AR cot tokens; split on marker ids, no re-encode. + if hasattr(content, "tolist"): + content = content.tolist() + think_id = self.tokenizer.convert_tokens_to_ids("") + recaption_id = self.tokenizer.convert_tokens_to_ids("") + has_cot = (think_id in content and self.end_think_token_id in content) or ( + recaption_id in content and self.end_recaption_token_id in content + ) + if has_cot: + _sub_sections.extend( + self.get_cot_sections_from_token_ids(content, uncond_kwargs, drop_think=drop_think) + ) + else: + _sub_sections.append(dict(type="text", tokens=content, **uncond_kwargs)) else: - _sub_sections.append(dict(type="text", text=text, **uncond_kwargs)) + text = content + if ("" in text and "" in text) or ( + "" in text and "" in text + ): + _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think)) + else: + _sub_sections.append(dict(type="text", text=text, **uncond_kwargs)) else: + text = content _sub_sections.append( dict(type="text", text=f"{answer_prefix}{text}{answer_suffix}", **uncond_kwargs) ) @@ -1088,6 +1176,7 @@ def apply_chat_template( batch_cond_image_info: list[JointImageInfo] | list[list[JointImageInfo]] | None = None, batch_system_prompt: list[str] | None = None, batch_cot_text: list[str] | None = None, + batch_cot_token_ids: list | None = None, max_length: int | None = None, bot_task: str = "auto", # auto/image/think/recaption/img_ratio image_base_size: int = 1024, @@ -1116,6 +1205,14 @@ def apply_chat_template( ) else: batch_cot_text = [None] * batch_size + # Optional per-item pre-tokenized AR cot ids (used by KV-reuse). 
+ if batch_cot_token_ids is not None: + assert len(batch_cot_token_ids) == batch_size, ( + f"batch_cot_token_ids should have the same length as batch_size ({batch_size}), " + f"but got {len(batch_cot_token_ids)}." + ) + else: + batch_cot_token_ids = [None] * batch_size if batch_cond_image_info is not None: assert len(batch_cond_image_info) == batch_size, ( f"batch_cond_image_info should have the same length as batch_size ({batch_size}), " @@ -1130,10 +1227,18 @@ def apply_chat_template( # Convert single round materials into standard message list batch_message_list = [] - for prompt, system_prompt, cot_text, gen_image_info, cond_image_info_list in zip( + for ( + prompt, + system_prompt, + cot_text, + cot_token_ids, + gen_image_info, + cond_image_info_list, + ) in zip( batch_prompt, batch_system_prompt, batch_cot_text, + batch_cot_token_ids, batch_gen_image_info, batch_cond_image_info, ): @@ -1153,7 +1258,15 @@ def apply_chat_template( # 2.2 text inputs message_list.append(dict(role="user", type="text", content=prompt, context_type="str")) # 3. assistant answer sections - if cot_text is not None: + if cot_token_ids is not None: + # Use AR-sampled token IDs verbatim. Avoids the + # tokenize-detokenize-tokenize length drift that breaks KV reuse + # (see process_successive_message context_type="token_ids" branch + # and get_cot_sections_from_token_ids docstring). 
+ message_list.append( + dict(role="assistant", type="text", content=cot_token_ids, context_type="token_ids") + ) + elif cot_text is not None: message_list.append(dict(role="assistant", type="text", content=cot_text, context_type="str")) if mode == "gen_image": message_list.append( diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index b1ba2687f86..5c6ddba0b64 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -750,6 +750,7 @@ def prepare_model_inputs( mode="gen_image", system_prompt=None, cot_text=None, + cot_token_ids=None, num_inference_steps=50, guidance_scale=5.0, image_size="auto", @@ -766,6 +767,7 @@ def prepare_model_inputs( batch_message_list = message_list batch_prompt = prompt batch_cot_text = cot_text + batch_cot_token_ids = cot_token_ids batch_system_prompt = system_prompt batch_gen_image_info = None batch_cond_image_info = kwargs.pop("batch_cond_image_info", None) @@ -844,6 +846,7 @@ def prepare_model_inputs( batch_cond_image_info=batch_cond_image_info, batch_system_prompt=batch_system_prompt, batch_cot_text=batch_cot_text, + batch_cot_token_ids=batch_cot_token_ids, max_length=kwargs.get("max_length"), bot_task=bot_task, image_base_size=self.config.image_base_size, @@ -1376,12 +1379,23 @@ def forward( # and ``get_cot_sections()`` can parse the think/recaption structure # directly. 
cot_text_list = [] + cot_token_ids_list = [] for p in req.prompts: extra = p.get("extra", {}) if isinstance(p, dict) else {} cot_text_list.append(extra.get("ar_generated_text") or None) + cot_token_ids_list.append(extra.get("ar_token_ids")) cot_text = ( [self._normalize_cot_text(t) for t in cot_text_list] if any(t is not None for t in cot_text_list) else None ) + # Prefer AR-sampled token IDs over the decoded cot text so DiTs prompt + # tokenization matches ARs actual token sequence byte-for-byte. Required + # when KV reuse is enabled: positive_reuse_len computed from DiT-side + # tokenization must equal the AR-side KV cache length, otherwise the + # silent slice in inject_ar_kv_into_layers leaves _cache_prompt_kvs + # `q_len + ar_kv_len == seq_len` assert off by N (BPE re-merge drift on + # multi-byte/punctuation boundaries; see get_cot_sections_from_token_ids + # in hunyuan_image3_tokenizer.py). + cot_token_ids = cot_token_ids_list if any(t is not None for t in cot_token_ids_list) else None batch_cond_image_info: list[list[JointImageInfo]] | None = None if any(not isinstance(p, str) for p in req.prompts): @@ -1422,6 +1436,7 @@ def forward( model_inputs = self.prepare_model_inputs( prompt=prompt, cot_text=cot_text, + cot_token_ids=cot_token_ids, system_prompt=system_prompt, mode="gen_image", generator=generator, From 1454f441ecf76152bcb67629f6fcb446ad9aa3f4 Mon Sep 17 00:00:00 2001 From: TaffyOfficial Date: Mon, 11 May 2026 16:17:04 +0800 Subject: [PATCH 20/43] fix(hunyuan_image3): split task / bot_task / sys_type at /v1/images/edits Before P1, /v1/images/edits exposed a single Form field that was misused: callers passed a enum value (i2t / it2i / t2i / t2t) under that name, and _build_multistage_generation_inputs forwarded it as to build_prompt with bot_task defaulted to "think". 
This blocked clients from expressing: - the bot_task semantic (think / recaption / think_recaption / vanilla) - sys_type override (the offline --sys-type flag) Both knobs are needed to drive the online OpenAI API 1:1 against the offline examples/.../end2end.py img2img surface. Changes: 1. api_server.py: edit_images Form params add task: str | None and sys_type: str | None. A legacy bot_task carrying a task-enum value (t2t / i2t / it2i / t2i) is auto-promoted to task with bot_task=None, so old clients keep working. 2. api_server.py: forward all three keys (task / bot_task / sys_type) to extra_body instead of writing a single misleading bot_task key. 3. serving_chat.py:_build_multistage_generation_inputs reads the triple, applies the same legacy normalization (defends against direct chat_handler callers passing the pre-P1 shape), and threads bot_task + sys_type through build_prompt_tokens / build_prompt. use_system_prompt forwarded to ar2diffusion now respects the override. Tests (new): - test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged: legacy extra_body={"bot_task": "it2i"} produces a prompt byte-identical to extra_body={"task": "it2i"} (back-compat). - test_build_multistage_generation_inputs_bot_task_semantic_changes_trigger_and_sys: bot_task=think vs bot_task=think_recaption produce different rendered prompts (system body differs); pins that bot_task is actually plumbed through rather than collapsed to the think default. - test_build_multistage_generation_inputs_sys_type_override: sys_type=en_unified over bot_task=think_recaption reproduces the same prompt body as bot_task=think (offline override pattern). Follow-up (not in this patch): - Mirror task / bot_task / sys_type on /v1/images/generations JSON schema (ImageGenerationRequest) for consistency across endpoints. 
Signed-off-by: TaffyOfficial --- ...test_serving_chat_multistage_generation.py | 195 ++++++++++++++++++ vllm_omni/entrypoints/openai/api_server.py | 29 ++- vllm_omni/entrypoints/openai/serving_chat.py | 38 +++- 3 files changed, 257 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py index b0871732f6a..88d15a684b6 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py +++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py @@ -216,3 +216,198 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]: # (4) N token ids (id=2 in FakeTokenizer) img_count = token_ids.count(2) assert img_count == n, f"N={n}: expected {n} token ids in prompt_token_ids, got {img_count}" + + +def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(serving_chat): + """Legacy callers passed a task-enum value (i2t/it2i/t2i/t2t) under + `bot_task` in extra_body. After the P1 task/bot_task split, the helper + must still treat that legacy form as `task=, bot_task=None` + (i.e. defaults bot_task semantic to "think"), so the resulting prompt + is identical to the pre-P1 output. + + Pins the back-compat contract. + """ + from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat + + engine = SimpleNamespace( + stage_configs=[ + SimpleNamespace(stage_type="llm", is_comprehension=True), + SimpleNamespace(stage_type="diffusion", is_comprehension=False), + ], + default_sampling_params_list=[ + SamplingParams(temperature=0.0), + OmniDiffusionSamplingParams(), + ], + ) + images = [Image.new("RGB", (32, 32), color="red"), Image.new("RGB", (32, 32), color="blue")] + + # Legacy form: only bot_task=. 
+ legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="edit me", + extra_body={"bot_task": "it2i"}, + reference_images=images, + gen_params=OmniDiffusionSamplingParams(), + ) + # New form: explicit task=, no bot_task. + new_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="edit me", + extra_body={"task": "it2i"}, + reference_images=images, + gen_params=OmniDiffusionSamplingParams(), + ) + assert legacy_prompt["prompt"] == new_prompt["prompt"], ( + f"legacy bot_task= form must produce the same prompt as task=; " + f"legacy={legacy_prompt['prompt']!r} new={new_prompt['prompt']!r}" + ) + + +def test_build_multistage_generation_inputs_bot_task_semantic_changes_trigger_and_sys(serving_chat): + """Passing bot_task=think_recaption (vs default "think") must flip the + resolved sys_type to en_think_recaption (and trigger tag is still + ). Pins that the API actually plumbs the bot_task semantic + through to build_prompt rather than ignoring it. + """ + from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat + + engine = SimpleNamespace( + stage_configs=[ + SimpleNamespace(stage_type="llm", is_comprehension=True), + SimpleNamespace(stage_type="diffusion", is_comprehension=False), + ], + default_sampling_params_list=[ + SamplingParams(temperature=0.0), + OmniDiffusionSamplingParams(), + ], + ) + images = [Image.new("RGB", (32, 32), color="red")] + + # Default bot_task (think) -> en_unified system prompt baked into the + # legacy string path. Use legacy build_prompt (tokenizer=None) so the + # rendered prompt is a string we can grep. 
+ think_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="edit me", + extra_body={"task": "it2i", "bot_task": "think"}, + reference_images=images, + gen_params=OmniDiffusionSamplingParams(), + ) + # think_recaption -> en_think_recaption system prompt (different content). + recap_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="edit me", + extra_body={"task": "it2i", "bot_task": "think_recaption"}, + reference_images=images, + gen_params=OmniDiffusionSamplingParams(), + ) + assert think_prompt["prompt"] != recap_prompt["prompt"], ( + "bot_task semantic must change the rendered system prompt: " + f"think/think_recaption produced identical strings (len={len(think_prompt['prompt'])})" + ) + + +def test_build_multistage_generation_inputs_sys_type_override(serving_chat): + """Caller-supplied sys_type must override the bot_task-derived default. + Mirrors offline `--bot-task think_recaption --sys-type en_unified` + where the user wants think_recaptions trigger but the unified system + prompt body. + """ + from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat + + engine = SimpleNamespace( + stage_configs=[ + SimpleNamespace(stage_type="llm", is_comprehension=True), + SimpleNamespace(stage_type="diffusion", is_comprehension=False), + ], + default_sampling_params_list=[ + SamplingParams(temperature=0.0), + OmniDiffusionSamplingParams(), + ], + ) + images = [Image.new("RGB", (32, 32), color="red")] + + # think_recaption defaults sys_type -> en_think_recaption. + default_sys, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="edit me", + extra_body={"task": "it2i", "bot_task": "think_recaption"}, + reference_images=images, + gen_params=OmniDiffusionSamplingParams(), + ) + # sys_type=en_unified overrides -> same system body as bot_task=think. 
+ overridden, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="edit me", + extra_body={"task": "it2i", "bot_task": "think_recaption", "sys_type": "en_unified"}, + reference_images=images, + gen_params=OmniDiffusionSamplingParams(), + ) + plain_think, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="edit me", + extra_body={"task": "it2i", "bot_task": "think"}, + reference_images=images, + gen_params=OmniDiffusionSamplingParams(), + ) + + # Override must (a) differ from the no-override default, and (b) equal + # the prompt that bot_task=think produces (both end up with + # en_unified system body + trigger). + assert overridden["prompt"] != default_sys["prompt"], ( + "sys_type override must change the rendered prompt body vs the bot_task default" + ) + assert overridden["prompt"] == plain_think["prompt"], ( + "sys_type=en_unified + bot_task=think_recaption must produce the same prompt as " + "bot_task=think (both = en_unified system body + trigger)" + ) + + +def test_build_multistage_generation_inputs_custom_system_prompt(serving_chat): + """`extra_body["system_prompt"]` must reach build_prompt as + `custom_system_prompt`, enabling sys_type="custom" callers to inject + a verbatim system body. Without this plumbing the sys_type="custom" + branch in get_system_prompt() returns None and silently drops the + user-supplied content. 
+ """ + from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat + + engine = SimpleNamespace( + stage_configs=[ + SimpleNamespace(stage_type="llm", is_comprehension=True), + SimpleNamespace(stage_type="diffusion", is_comprehension=False), + ], + default_sampling_params_list=[ + SamplingParams(temperature=0.0), + OmniDiffusionSamplingParams(), + ], + ) + images = [Image.new("RGB", (32, 32), color="red")] + + QKEY = "prompt" + marker = "ZZZ_CUSTOM_SYSTEM_PROMPT_MARKER_ZZZ" + + out, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="edit me", + extra_body={ + "task": "it2i", + "bot_task": "think", + "sys_type": "custom", + "system_prompt": marker, + }, + reference_images=images, + gen_params=OmniDiffusionSamplingParams(), + ) + assert marker in out["prompt"], ( + f"custom system_prompt content must reach the rendered prompt; " + f"marker {marker!r} not found in prompt of length {len(out['prompt'])}" + ) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 4227cff2fb6..77dc026bc97 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1701,6 +1701,14 @@ async def edit_images( layers: int | None = Form(None), resolution: int | None = Form(None), # See SUPPORTED_LAYERED_RESOLUTIONS bot_task: str | None = Form(None), + # P1: task / sys_type / system_prompt split out from the legacy bot_task + # field so callers can express the full HunyuanImage-3.0 prompt template + # surface (task enum + bot_task semantic + sys_type override + custom + # system prompt body). Legacy callers that pass a task-enum value via + # bot_task still work (see normalization below). + task: str | None = Form(None), + sys_type: str | None = Form(None), + system_prompt: str | None = Form(None), ) -> ImageGenerationResponse: """ OpenAI-compatible image edit endpoint. 
@@ -1913,8 +1921,25 @@ async def edit_images( lora_dict = _get_lora_from_json_str(lora) _parse_lora_request(lora_dict) extra_body["lora"] = lora_dict - if bot_task is not None: - extra_body["bot_task"] = bot_task + # P1: normalize legacy `bot_task=` form. Callers historically + # passed the task enum (i2t / it2i / t2i / t2t) via the `bot_task` + # Form field; promote it to `task` here so the chat_handler can + # split task vs bot_task semantics cleanly. New callers pass both + # `task` and `bot_task` explicitly; we keep them separate. + _task = task + _bot_task = bot_task + _legacy_task_enum = {"t2t", "i2t", "it2i", "t2i"} + if _task is None and _bot_task in _legacy_task_enum: + _task = _bot_task + _bot_task = None + if _task is not None: + extra_body["task"] = _task + if _bot_task is not None: + extra_body["bot_task"] = _bot_task + if sys_type is not None: + extra_body["sys_type"] = sys_type + if system_prompt is not None: + extra_body["system_prompt"] = system_prompt prompt_text = prompt.get("prompt", "") generation_result = await chat_handler.generate_diffusion_images( diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 2738f648e09..d1b2e89ae80 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2247,7 +2247,22 @@ def _build_multistage_generation_inputs( lora_body = extra_body.get("lora") layers = extra_body.get("layers") resolution = extra_body.get("resolution") + # P1: task / bot_task / sys_type / system_prompt quadruple. Legacy + # api_server callers may still pass a task-enum value (i2t / it2i / + # t2i / t2t) under `bot_task`; normalize it to `task` here so + # downstream uses the canonical split. Source the task enum from + # prompt_utils so this layer stays in sync with the model side. 
+ from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + available_tasks as _hunyuan3_available_tasks, + ) + + task = extra_body.get("task") bot_task = extra_body.get("bot_task") + sys_type = extra_body.get("sys_type") + custom_system_prompt = extra_body.get("system_prompt") + if task is None and bot_task in set(_hunyuan3_available_tasks()): + task = bot_task + bot_task = None engine_prompt_data: dict[str, Any] | None = None modalities = ["image"] @@ -2260,13 +2275,20 @@ def _build_multistage_generation_inputs( prompt_token_ids: list[int] | None = None system_prompt_type: str | None = None - if bot_task: + if task or bot_task: from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( build_prompt, build_prompt_tokens, ) num_images = len(reference_images) if reference_images else 1 + # build_prompt defaults task="it2i"; preserve that when caller + # only passed bot_task semantic. + effective_task = task if task is not None else "it2i" + # build_prompt defaults bot_task="think"; preserve that for legacy + # callers (passing bot_task=None to build_prompt explicitly gives a + # different (sys, trigger) than the default "think"). + effective_bot_task = bot_task if bot_task is not None else "think" if tokenizer is not None: # HF byte-for-byte path: feed segment-tokenized prompt_token_ids # so AR sees the same template-tokenization HF apply_chat_template @@ -2279,14 +2301,24 @@ def _build_multistage_generation_inputs( result = build_prompt_tokens( prompt, tokenizer, - task=bot_task, + task=effective_task, + bot_task=effective_bot_task, + sys_type=sys_type, + custom_system_prompt=custom_system_prompt, num_images=num_images, ) prompt_token_ids = result.token_ids system_prompt_type = result.system_prompt_type else: # Legacy string path (e.g. unit tests with no tokenizer plumbed). 
- prompt = build_prompt(prompt, task=bot_task, num_images=num_images) + prompt = build_prompt( + prompt, + task=effective_task, + bot_task=effective_bot_task, + sys_type=sys_type, + custom_system_prompt=custom_system_prompt, + num_images=num_images, + ) if reference_images and len(reference_images) == 1: engine_prompt_data = {"image": reference_images[0]} modalities = ["image"] From 99c5eec085b42c05de937a8e7c117155c7c0234c Mon Sep 17 00:00:00 2001 From: TaffyOfficial Date: Tue, 12 May 2026 10:51:44 +0800 Subject: [PATCH 21/43] fix(hunyuan_image3): align online edit AR input with offline path Two complementary fixes that close the gap where online /v1/images/edits systematically produced different AR cot (e.g. "3 magnets" semantic) from offline end2end.py ("1 magnet" semantic) on the same prompt + seed + images, even after the P0 byte-equivalent prompt_token_ids and P1 task/bot_task/sys_type API split landed. 1. RGB normalization in _load_input_images (root cause for the systematic semantic divergence) input_1_0.png in the demo set is RGBA with 57,671 fully-transparent pixels. Offline `end2end.py` opens images with `Image.open(...).convert("RGB")`, which composites transparent pixels over BLACK. Online had no such normalization; the Hunyuan AR image processor receives the raw RGBA upload and alpha-composites over WHITE. The two paths therefore fed AR two different RGB tensors at the encoder boundary -- enough to make AR recaption diverge into different scene interpretations even with byte-identical prompt_token_ids. Fix: `_load_input_images(... normalize_rgb=True)` defaults to RGB normalization. `edit_images` opts in only when the caller passes Hunyuan-aware prompt controls (task / bot_task / sys_type); mask stays untouched so its alpha role is preserved. Diagnosis by Codex; thanks. 2. 
Determinize cond-image VAE encode Both AR-side `_vae_encode` (model_executor) and DiT-side cond VAE encoding (pipeline_hunyuan_image3) called `latent_dist.sample()` with no generator, consuming torch's global RNG state. Fresh-process callers (offline) hit a stable post-init RNG state every invocation so this looked deterministic; long-running servers (online) mix per-request scheduler/UUID/etc into the global RNG before this call, so same-seed curls got drifting cond latents across requests. Cond image at this site is declared `t=0` clean conditioning -- no stochasticity needed. Fix: pass a fresh `torch.Generator(device=...).manual_seed(0)` at both call sites. Cond latents now deterministic across runs and across paths. Why `.sample(seeded_gen)` instead of `.mode()`: AR-side DiagonalGaussianDistribution has `.mode()`, but the DiT-side counterpart in diffusion/.../autoencoder.py does not implement it. The seeded `.sample()` works on both sides and matches HF upstream's `latent_dist.sample(generator)` signature -- a strict improvement over HF default (HF defaults the generator to None and inherits the same silent non-determinism). Related memory: `memory/feedback/painterly_silent_bugs.md` flagged the same bug class once before; this is the cond-image-encode incarnation. E2E smoke (4x L20X, HunyuanImage-3.0-Instruct, two ref images, curl /v1/images/edits with task=it2i bot_task=think_recaption sys_type=en_unified seed=42 steps=50 guidance=5.0): - before either fix: "3 magnets on canvas" (offline produces 1) - after cond VAE fix only: "3 magnets on canvas" (within-run drift reduced from 73-token to 10-token spread but cross-path semantic still wrong) - after both fixes: "1 magnet on canvas" -- in the same semantic neighborhood as the offline baseline Tests: 153 unit tests pass, ruff clean. Surgical API-level regression tests for the two fixes deferred (would require GPU fixtures for the cond VAE side; the RGB side is small enough that the e2e proof is the contract). 
Signed-off-by: TaffyOfficial --- .../hunyuan_image3/pipeline_hunyuan_image3.py | 6 +++- vllm_omni/entrypoints/openai/api_server.py | 28 ++++++++++++++++--- .../models/hunyuan_image3/hunyuan_image3.py | 12 +++++++- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 5c6ddba0b64..e927f278340 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -634,7 +634,11 @@ def vae_encode(self, image, cfg_factor=1): if isinstance(vae_encode_result, torch.Tensor): latents = vae_encode_result else: - latents = vae_encode_result.latent_dist.sample() + # Fixed-seed Generator so cond latents are deterministic + # across calls; see AR-side comment in + # model_executor/.../hunyuan_image3.py:_vae_encode. + _cond_vae_gen = torch.Generator(device=image.device).manual_seed(0) + latents = vae_encode_result.latent_dist.sample(_cond_vae_gen) if hasattr(config, "shift_factor") and config.shift_factor: latents.sub_(config.shift_factor) if hasattr(config, "scaling_factor") and config.scaling_factor: diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 77dc026bc97..b485b6a3946 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1759,16 +1759,23 @@ async def edit_images( status_code=HTTPStatus.BAD_REQUEST.value, detail=detail, ) - pil_images = await _load_input_images(input_images_list) + # Only convert uploads to RGB when the caller opts into the + # Hunyuan-aware API surface (task / bot_task / sys_type). Callers + # that set none of these three fields keep + # whatever PIL mode the upload arrived as, to preserve pre-existing + # behavior for non-Hunyuan flows.
+ normalize_edit_images_rgb = task is not None or bot_task is not None or sys_type is not None + pil_images = await _load_input_images(input_images_list, normalize_rgb=normalize_edit_images_rgb) prompt["multi_modal_data"] = {} prompt["multi_modal_data"]["image"] = pil_images if mask_image is not None: - loaded = await _load_input_images([mask_image]) + # Mask role is different (alpha channel matters); never normalize. + loaded = await _load_input_images([mask_image], normalize_rgb=False) prompt["multi_modal_data"]["mask_image"] = loaded[0] if reference_image is not None: - loaded = await _load_input_images([reference_image]) + loaded = await _load_input_images([reference_image], normalize_rgb=normalize_edit_images_rgb) prompt["multi_modal_data"]["reference_image"] = loaded[0] # 3 Build sample params @@ -2220,6 +2227,8 @@ def _extract_images_from_result(result: Any) -> list[Any]: async def _load_input_images( inputs: list[str], + *, + normalize_rgb: bool = True, ) -> list[Image.Image]: """ convert to PIL.Image.Image list @@ -2266,7 +2275,18 @@ async def _load_input_images( if not images: raise ValueError("No valid input images found") - return images + if not normalize_rgb: + return images + + # Match the offline HunyuanImage3 image-edit example path, which eagerly + # normalizes input files with ``Image.open(...).convert("RGB")`` before + # they reach the AR stage. Keeping uploads as RGBA/P PIL objects makes + # online IT2I observe a different visual input than offline (for example + # transparent-logo PNGs alpha-composited over white instead of black), + # which is enough for HunyuanImage3 AR recaption to diverge before DiT + # sees the request -- root cause of the "online 3 magnets vs offline 1 + # magnet" systematic semantic mismatch. 
+ return [img.convert("RGB") for img in images] def _choose_output_format(output_format: str | None, background: str | None) -> str: diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index 756a7a27c9b..216543b9593 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -1776,7 +1776,17 @@ def _vae_encode( images = images.to(dtype=self.vae.dtype) vae_encode_result = self.vae.encode(images) - latents = vae_encode_result.latent_dist.sample() + # Cond image encoding is supposed to be deterministic clean + # conditioning (the comment below declares `t=0`). `.sample()` + # without a generator consumes torch's global RNG, which made + # cond latents drift between requests on a long-running server + # (online) while looking deterministic for fresh-process callers + # (offline) -- silent path-level non-determinism. Feed a fixed + # generator so all callers see identical cond latents. + import torch as _torch # local alias to keep blast radius minimal + + _cond_vae_gen = _torch.Generator(device=images.device).manual_seed(0) + latents = vae_encode_result.latent_dist.sample(_cond_vae_gen) # Apply shift and scaling factors if present if hasattr(config, "shift_factor") and config.shift_factor: From 4d8c600391d2178cb1ad8aa446b34c7cc6b7a51f Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Tue, 12 May 2026 11:46:17 +0800 Subject: [PATCH 22/43] fix(hunyuan_image3): address PR #3444 review feedback Apply two rounds of code review fixes on the multi-image IT2I PR: Cond VAE determinism Replace `latent_dist.sample()` + `manual_seed(0)` hardcoding with `latent_dist.mode()` on both AR (`model_executor/.../hunyuan_image3.py ::_vae_encode`) and DiT (`diffusion/.../pipeline_hunyuan_image3.py`) sides. 
Cond image is clean (t=0) conditioning by design; posterior mean is deterministic by construction and matches the official cond encode path. Adds `.mode()` to the DiT-side `DiagonalGaussianDistribution`. Stale compound task names (two-axis API migration) Repo-wide grep for `{t2t,i2t,it2i,t2i}x{think,recaption,think_recaption, vanilla}` cross-product turned up two residual compound names that the initial cleanup missed: - tests/e2e/accuracy/test_hunyuan_image3.py: task='it2i_recaption' -> task='it2i', bot_task='recaption' (would have ValueErrored at _resolve_preset on the new two-axis API). - tests/diffusion/.../test_prompt_utils.py: task='t2i_think' / task='t2i_recaption' -> (task='t2i', bot_task='think|recaption'). Custom system prompt body forwarding (producer -> consumer trace) Online `/v1/images/edits` accepted `sys_type='custom'` + `system_prompt` body on the AR side via `build_prompt_tokens(custom_system_prompt=...)`, but only forwarded `use_system_prompt` to the engine_prompt. DiT's `get_system_prompt(use, "image", body)` reads the body as the third positional arg, so `sys_type='custom'` was silently falling back to an empty DiT system prefix -- AR/DiT divergence under a user-visible knob. Forward `system_prompt` through both `serving_chat` engine_prompt and `stage_input_processors/hunyuan_image3.py::ar2diffusion` -> DiT `diffusion_input`. Ratio extraction simplification Drop the regex path on `generated_text` -- only worked under `skip_special_tokens: False`, which most deploy yamls don't set. Pure token-id reverse scan against `_build_ratio_id_lookup` is the source of truth (AR `_stage_transitions` forces exactly one `` emission). Drop unused `_RATIO_TOKEN_RE` constant, `re` import, and `generated_text` parameter from `_extract_ratio_index`. Housekeeping - Remove duplicate `engine_prompt["prompt_token_ids"]` assignment in serving_chat.py (merge residue, the second copy was added by the main-merge then re-introduced after the API split). 
- `examples/.../end2end.py`: stale `_TASK_PRESETS` comment -> `available_tasks` helper (symbol no longer exists post-split). - `process_image` comment in `model_executor/.../hunyuan_image3.py` clarifies the AR-side `_resize_and_crop` default vs the official `infer_align_image_size=False` (center crop) default. Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../hunyuan_image3/end2end.py | 8 ++- .../hunyuan_image3/test_prompt_utils.py | 4 +- tests/e2e/accuracy/test_hunyuan_image3.py | 8 ++- .../models/hunyuan_image3/autoencoder.py | 3 ++ .../hunyuan_image3/pipeline_hunyuan_image3.py | 9 ++-- vllm_omni/entrypoints/openai/serving_chat.py | 6 +++ .../models/hunyuan_image3/hunyuan_image3.py | 25 +++++----- .../stage_input_processors/hunyuan_image3.py | 49 +++++++------------ 8 files changed, 59 insertions(+), 53 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 82e8c194c5a..908109d65a3 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -18,7 +18,13 @@ _REPO_ROOT = Path(__file__).resolve().parents[3] _DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml") _DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml") - +# Modality → (task, default bot_task) mapping. `task` selects only whether +# `` placeholders are emitted; `bot_task` (None | think | recaption | +# think_recaption | vanilla) selects the system prompt + trigger tag. +# +# Both verbose (`text2img`) and short (`t2i`) forms are accepted; the short +# forms match the internal task names (see prompt_utils.available_tasks) +# so users who think in those terms don't have to translate. 
_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = { "text2img": ("t2i", "think"), "t2i": ("t2i", "think"), diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 4d98bc5dcf2..2ddfbea42dd 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -69,8 +69,8 @@ def test_legacy_task_presets_still_available(): def test_resolve_stop_token_ids_uses_answer_for_generation_tasks(): tok = FakeTokenizer() answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] - assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [answer_id] - assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [answer_id] + assert resolve_stop_token_ids(task="t2i", bot_task="think", tokenizer=tok) == [answer_id] + assert resolve_stop_token_ids(task="t2i", bot_task="recaption", tokenizer=tok) == [answer_id] assert resolve_stop_token_ids(task="it2i", bot_task="think", tokenizer=tok) == [answer_id] diff --git a/tests/e2e/accuracy/test_hunyuan_image3.py b/tests/e2e/accuracy/test_hunyuan_image3.py index 93671e7bbf6..0871793c5db 100644 --- a/tests/e2e/accuracy/test_hunyuan_image3.py +++ b/tests/e2e/accuracy/test_hunyuan_image3.py @@ -93,7 +93,13 @@ def _run(stage_config_path: str, output_path: Path) -> tuple[Image.Image, str, f from vllm_omni.platforms import current_omni_platform tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) - result = build_prompt_tokens(PROMPT, tokenizer, task="it2i_recaption", sys_type="en_unified") + result = build_prompt_tokens( + PROMPT, + tokenizer, + task="it2i", + bot_task="recaption", + sys_type="en_unified", + ) token_ids = result.token_ids system_prompt_type = result.system_prompt_type diff --git a/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py index efba2f27435..ddd7d5c6df7 100644 --- 
a/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py @@ -46,6 +46,9 @@ def sample(self, generator: torch.Generator | None = None) -> torch.FloatTensor: x = self.mean + self.std * sample return x + def mode(self) -> torch.FloatTensor: + return self.mean + @dataclass class DecoderOutput(BaseOutput): diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index e927f278340..5a9d1e48856 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -634,11 +634,10 @@ def vae_encode(self, image, cfg_factor=1): if isinstance(vae_encode_result, torch.Tensor): latents = vae_encode_result else: - # Fixed-seed Generator so cond latents are deterministic - # across calls; see AR-side comment in - # model_executor/.../hunyuan_image3.py:_vae_encode. - _cond_vae_gen = torch.Generator(device=image.device).manual_seed(0) - latents = vae_encode_result.latent_dist.sample(_cond_vae_gen) + # Cond image is clean conditioning (t=0 below) -- use the + # posterior mean so encoding is deterministic by construction. + # See AR-side comment in model_executor/.../hunyuan_image3.py. 
+ latents = vae_encode_result.latent_dist.mode() if hasattr(config, "shift_factor") and config.shift_factor: latents.sub_(config.shift_factor) if hasattr(config, "scaling_factor") and config.scaling_factor: diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index d1b2e89ae80..4ba824f0909 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2328,6 +2328,12 @@ def _build_multistage_generation_inputs( engine_prompt["prompt_token_ids"] = prompt_token_ids if system_prompt_type is not None: engine_prompt["use_system_prompt"] = system_prompt_type + # Forward the custom system prompt body too. DiT's + # `get_system_prompt(use_system_prompt, "image", system_prompt)` reads + # the third positional arg, so leaving it None turns a `sys_type=custom` + # request into an empty DiT system prefix (AR/DiT divergence). + if custom_system_prompt is not None: + engine_prompt["system_prompt"] = custom_system_prompt engine_prompt["modalities"] = modalities if negative_prompt is not None: engine_prompt["negative_prompt"] = negative_prompt diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index 216543b9593..9f3b76039d0 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -907,9 +907,10 @@ def process_image(self, image_input: ImageInput): current_info["vit_spatial_shapes"] = _ss.squeeze(0) # VAE: per-image bucket via `reso_group.get_target_size`; mirrors - # HF's `resize_and_crop` (crop_type="center"). Keep fp32 — the - # VAE encoder casts to model dtype at its boundary (see - # `_vae_encode`). + # HF's `resize_and_crop` (crop_type="center", the official + # generate_image default with infer_align_image_size=False). 
+ # Keep fp32 — the VAE encoder casts to model dtype at its + # boundary (see `_vae_encode`). image_width, image_height = self.reso_group.get_target_size(image.width, image.height) resized_image = self._resize_and_crop(image, (image_width, image_height)) vae_pixel_values = self.vae_processor(resized_image).squeeze(0) @@ -1776,17 +1777,13 @@ def _vae_encode( images = images.to(dtype=self.vae.dtype) vae_encode_result = self.vae.encode(images) - # Cond image encoding is supposed to be deterministic clean - # conditioning (the comment below declares `t=0`). `.sample()` - # without a generator consumes torch's global RNG, which made - # cond latents drift between requests on a long-running server - # (online) while looking deterministic for fresh-process callers - # (offline) -- silent path-level non-determinism. Feed a fixed - # generator so all callers see identical cond latents. - import torch as _torch # local alias to keep blast radius minimal - - _cond_vae_gen = _torch.Generator(device=images.device).manual_seed(0) - latents = vae_encode_result.latent_dist.sample(_cond_vae_gen) + # Cond image is clean (t=0) conditioning -- take the posterior mean + # so encoding is deterministic by construction. `.sample()` without a + # generator consumes torch's global RNG and silently drifts between + # requests on a long-running server (online) while looking stable for + # fresh-process callers (offline). `.mode()` matches the official + # HunyuanImage-3 cond encode path. 
+ latents = vae_encode_result.latent_dist.mode() # Apply shift and scaling factors if present if hasattr(config, "shift_factor") and config.shift_factor: diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py index 158ea86dbf2..c95a2a48f18 100644 --- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py +++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py @@ -13,7 +13,6 @@ from __future__ import annotations import os -import re from functools import lru_cache from typing import Any @@ -33,7 +32,6 @@ # (in the `/v1/images/edits` path that defaults to `pil_images[0].size`, # i.e. the first reference image's bucket — usually square, see # api_server.py:1808-1811). -_RATIO_TOKEN_RE = re.compile(r"") _DEFAULT_HUNYUAN_IMAGE3_MODEL = "tencent/HunyuanImage-3.0-Instruct" @@ -158,42 +156,27 @@ def _id(name: str) -> int | None: return table -def _extract_ratio_index(generated_text: str, generated_token_ids, model_name_or_path: str) -> int | None: +def _extract_ratio_index(generated_token_ids, model_name_or_path: str) -> int | None: """Resolve the AR-predicted ratio_index from this stage's output. - Two probe paths: - 1. Text regex on `generated_text` — works when the AR engine is - configured with `skip_special_tokens: False` (e.g. - `hunyuan_image3_it2i_kv_reuse.yaml`). Cheap and avoids loading - the tokenizer. - 2. Token-id scan over `cumulative_token_ids` against the tokenizer's - `` id range — survives `skip_special_tokens: True` - where the special tokens are stripped from text but still present - in the raw token stream. - - Takes the LAST ratio token in the stream because the AR's - stage-transition logic emits exactly one such token at the tail of the - `` sequence; using "last" is robust to - any earlier accidental occurrences in the prompt scaffold. 
+ `HunyuanImage3ForCausalMM`'s `_stage_transitions` forces the AR to emit + exactly one `` token after ` + `, so we scan the token stream from the tail for the first + id that maps to a ratio. Token-ids are the source of truth — text-side + regex is unreliable because most deploy yamls run AR with + `skip_special_tokens: True` (special tokens are stripped from text but + still present in `cumulative_token_ids`). """ - matches = _RATIO_TOKEN_RE.findall(generated_text or "") - if matches: - try: - return int(matches[-1]) - except ValueError: - pass - if generated_token_ids is None: return None table = _build_ratio_id_lookup(model_name_or_path) if not table: return None - last_ratio_idx: int | None = None - for tid in generated_token_ids: + for tid in reversed(list(generated_token_ids)): idx = table.get(int(tid)) if idx is not None: - last_ratio_idx = idx - return last_ratio_idx + return idx + return None def ar2diffusion( @@ -237,6 +220,7 @@ def ar2diffusion( width = original_prompt.get("width", 1024) text_prompt = original_prompt.get("prompt", "") use_system_prompt = original_prompt.get("use_system_prompt") + custom_system_prompt = original_prompt.get("system_prompt") # Prefer the AR's predicted output aspect (`` # tail emitted by `HunyuanImage3ForCausalMM.sample` under the @@ -249,7 +233,7 @@ def ar2diffusion( model_name_or_path = original_prompt.get("model") or os.environ.get( "VLLM_OMNI_HUNYUAN_IMAGE3_MODEL", _DEFAULT_HUNYUAN_IMAGE3_MODEL ) - ratio_idx = _extract_ratio_index(generated_text, generated_token_ids, model_name_or_path) + ratio_idx = _extract_ratio_index(generated_token_ids, model_name_or_path) ar_predicted = False if ratio_idx is not None: base_size = int(original_prompt.get("image_base_size", 1024)) @@ -302,9 +286,14 @@ def ar2diffusion( }, } - # Forward use_system_prompt so the DiT can build the same system prefix + # Forward use_system_prompt so the DiT can build the same system prefix. 
+ # Also forward the custom system prompt body when sys_type=custom so + # DiT's `get_system_prompt(use, "image", body)` doesn't fall back to + # an empty prefix and silently diverge from AR. if use_system_prompt is not None: diffusion_input["use_system_prompt"] = use_system_prompt + if custom_system_prompt is not None: + diffusion_input["system_prompt"] = custom_system_prompt # Forward multimodal data (original image for IT2I conditioning). # The diffusion pre_process_func reads multi_modal_data["image"], which From 329851727cc08022fcccdea0b22e258777c6db51 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Tue, 12 May 2026 11:50:00 +0800 Subject: [PATCH 23/43] chore: appease ruff F841 / typos / ruff-format pre-commit CI feedback from the previous push: - F841: drop unused `QKEY` in test_serving_chat_multistage_generation.py - typos: avoid the dictionary trigger on "PNGs" plural -- the lowercased form lands in the crate-ci/typos dictionary as a misspelling; rephrase to "transparent-logo uploads" without changing meaning. - ruff-format: collapse the `build_prompt_tokens(...)` call in the e2e accuracy test back to a single line (line is under the 120 char limit ruff-format enforces locally). 
Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../openai_api/test_serving_chat_multistage_generation.py | 1 - vllm_omni/entrypoints/openai/api_server.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py index 88d15a684b6..bd2e877bf18 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py +++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py @@ -391,7 +391,6 @@ def test_build_multistage_generation_inputs_custom_system_prompt(serving_chat): ) images = [Image.new("RGB", (32, 32), color="red")] - QKEY = "prompt" marker = "ZZZ_CUSTOM_SYSTEM_PROMPT_MARKER_ZZZ" out, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index b485b6a3946..80b01ec284a 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -2282,7 +2282,7 @@ async def _load_input_images( # normalizes input files with ``Image.open(...).convert("RGB")`` before # they reach the AR stage. Keeping uploads as RGBA/P PIL objects makes # online IT2I observe a different visual input than offline (for example - # transparent-logo PNGs alpha-composited over white instead of black), + # transparent-logo uploads alpha-composited over white instead of black), # which is enough for HunyuanImage3 AR recaption to diverge before DiT # sees the request -- root cause of the "online 3 magnets vs offline 1 # magnet" systematic semantic mismatch. 
From 808aca089a36aa990b3b2a8d05de7683cad28355 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Tue, 12 May 2026 13:12:51 +0800 Subject: [PATCH 24/43] fix(hunyuan_image3): align AR cond image preprocessing with DiT (center crop) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AR-side `HunyuanImage3Processor._resize_and_crop` previously defaulted to `crop_type="resize"` (stretch), while the DiT-side condition-image helper `_resize_and_crop_center` always center-crops. For any portrait input mapped to a landscape output bucket (or vice versa), AR and DiT then conditioned on **visibly different fabric regions**: AR saw the input stretched to fit, DiT saw the input center-cropped to fit. The two cond latents disagreed on what the surroundings should be, and DiT had to inpaint the lateral canvas extension on its own — producing seam-like vertical brightness bands at the AR/DiT-disagreement boundary (reported on `/tmp/rgbfix/result.png` IT2I run with 735x1104 input -> 1280x720 output). Change AR-side default to `crop_type="center"`, matching: - DiT-side `_resize_and_crop_center` (always center). - Official `generate_image(..., infer_align_image_size=False)` (the default; reading `hunyuan3.0_ins/image_processor.py:355-358` maps the False branch to `random_crop="center"`). Add a CPU-only regression test asserting AR and DiT preprocessing produce **byte-identical** pixels for 4 src sizes x 4 target buckets, covering portrait->landscape, landscape->portrait, and square aspects. No model weights / tokenizer / HF cache required, runs in CI. 
Co-authored-by: Codex Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../test_hunyuan_image3_it2i_ar_format.py | 39 +++++++++++++++++++ .../models/hunyuan_image3/hunyuan_image3.py | 16 ++++---- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py index 7e7b7de91b2..916b565c1af 100644 --- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py +++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py @@ -72,6 +72,45 @@ def _snapshot_dir(model_id: str) -> pathlib.Path: # tests/e2e/accuracy/test_hunyuan_image3_it2i_ar_output.py. +def test_ar_and_dit_condition_image_preprocessing_match_without_hf_cache(): + """AR and DiT must preprocess the same IT2I condition image into the + same VAE pixels. + + This catches drift between the AR-side multimodal processor and the + diffusion-side bridge without requiring model weights or tokenizer files. + In particular, portrait input expanded to a landscape output is sensitive + to accidentally using ``crop_type="resize"`` on one side and center crop + on the other; the two paths then condition on visibly different fabric + regions and leave seam-like artifacts around the edited object. 
+ """ + import numpy as np + from PIL import Image + + from vllm_omni.diffusion.models.hunyuan_image3.pipeline_hunyuan_image3 import ( + _resize_and_crop_center, + ) + from vllm_omni.model_executor.models.hunyuan_image3.hunyuan_image3 import ( + HunyuanImage3Processor, + ) + + rng = np.random.default_rng(seed=3444) + src_size_pairs = [(735, 1104), (640, 1024), (1280, 720), (1024, 1024)] + target_size_pairs = [(1024, 1024), (1024, 768), (768, 1024), (1280, 720)] + + for src_w, src_h in src_size_pairs: + src_arr = rng.integers(0, 256, size=(src_h, src_w, 3), dtype=np.uint8) + src = Image.fromarray(src_arr, mode="RGB") + for tw, th in target_size_pairs: + ar_out = HunyuanImage3Processor._resize_and_crop(None, src, (tw, th)) + dit_out = _resize_and_crop_center(src, tw, th) + + assert ar_out.size == dit_out.size == (tw, th) + assert np.array_equal(np.asarray(ar_out), np.asarray(dit_out)), ( + f"AR and DiT condition preprocessing diverged for " + f"src={(src_w, src_h)} target={(tw, th)}" + ) + + _OFFICIAL_PKG = "_hunyuan_image_3_official_snapshot" diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index 9f3b76039d0..40a38c7b5ac 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -907,8 +907,8 @@ def process_image(self, image_input: ImageInput): current_info["vit_spatial_shapes"] = _ss.squeeze(0) # VAE: per-image bucket via `reso_group.get_target_size`; mirrors - # HF's `resize_and_crop` (crop_type="center", the official - # generate_image default with infer_align_image_size=False). + # HF's `resize_and_crop` default (crop_type="center", the official + # generate_image default when infer_align_image_size=False). # Keep fp32 — the VAE encoder casts to model dtype at its # boundary (see `_vae_encode`). 
image_width, image_height = self.reso_group.get_target_size(image.width, image.height) @@ -957,13 +957,13 @@ def _resize_and_crop( self, image: Image.Image, target_size: tuple[int, int], - crop_type: str = "resize", + crop_type: str = "center", ) -> Image.Image: - # Default mode mirrors the official `infer_align_image_size=True` - # path (image_processor.py:355 → crop_type="resize") used by the - # IT2I demo: stretch the cond image to the bucket dims so its - # `` tag and ViT/VAE features stay aligned with the - # bucket, instead of dropping content via center crop. + # Default mode mirrors official `generate_image` with + # infer_align_image_size=False: preserve aspect ratio and center-crop + # to the nearest VAE bucket. Keeping this default aligned with the + # DiT-side condition-image helper avoids AR and DiT seeing different + # conditioning pixels for the same IT2I request. tw, th = target_size if crop_type == "resize": return image.resize((tw, th), resample=Image.Resampling.LANCZOS) From 297a2f5a7efc4525d6184e7de802ad70c71332d2 Mon Sep 17 00:00:00 2001 From: zuiho <2324465096@qq.com> Date: Wed, 13 May 2026 09:14:48 +0800 Subject: [PATCH 25/43] test(hunyuan_image3): apply ruff format hook fixes Signed-off-by: zuiho <2324465096@qq.com> --- .../hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py | 3 +-- tests/diffusion/models/hunyuan_image3/test_prompt_utils.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py index 916b565c1af..7550caa50f7 100644 --- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py +++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py @@ -106,8 +106,7 @@ def test_ar_and_dit_condition_image_preprocessing_match_without_hf_cache(): assert ar_out.size == dit_out.size == (tw, th) assert np.array_equal(np.asarray(ar_out), 
np.asarray(dit_out)), ( - f"AR and DiT condition preprocessing diverged for " - f"src={(src_w, src_h)} target={(tw, th)}" + f"AR and DiT condition preprocessing diverged for src={(src_w, src_h)} target={(tw, th)}" ) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 2ddfbea42dd..641cd5dc9c0 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -9,8 +9,8 @@ import pytest from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS, _TASK_PRESETS, + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS, available_bot_tasks, available_tasks, build_prompt, From 4cf71f2afe9b7b7dcaf1656398f084534751ea44 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 10:36:24 +0800 Subject: [PATCH 26/43] fix(hunyuan_image3): preserve legacy plain prompt tasks Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../hunyuan_image3/test_prompt_utils.py | 22 ++++++++++++++++++ .../models/hunyuan_image3/prompt_utils.py | 23 +++++++++++++++---- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 641cd5dc9c0..ef635825c3b 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -66,6 +66,28 @@ def test_legacy_task_presets_still_available(): } <= set(_TASK_PRESETS) +def test_legacy_base_task_omitted_bot_task_keeps_plain_mode(): + prompt = build_prompt("HELLO", task="i2t") + assert prompt.endswith("Assistant: ") + assert not prompt.endswith("") + + result = build_prompt_tokens("hi", FakeTokenizer(), task="i2t") + assert result.system_prompt_type == "en_unified" + assert result.token_ids[-1] not in { + FakeTokenizer.SPECIAL[""], + 
FakeTokenizer.SPECIAL[""], + } + + +def test_default_prompt_still_uses_it2i_think_mode(): + prompt = build_prompt("HELLO") + assert prompt.endswith("Assistant: ") + + result = build_prompt_tokens("hi", FakeTokenizer()) + assert result.system_prompt_type == "en_unified" + assert result.token_ids[-1] == FakeTokenizer.SPECIAL[""] + + def test_resolve_stop_token_ids_uses_answer_for_generation_tasks(): tok = FakeTokenizer() answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 4ed277eeed2..6e1453d0ed2 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -74,6 +74,13 @@ _TASKS: frozenset[str] = frozenset({"t2t", "i2t", "it2i", "t2i"}) + +class _DefaultBotTask: + pass + + +_DEFAULT_BOT_TASK = _DefaultBotTask() + # Legacy composite task alias -> (task, bot_task). Keep this during rebase so # older callers and intermediate commits still resolve cleanly. 
_TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = { @@ -89,7 +96,11 @@ } -def _normalize_task_and_bot_task(task: str, bot_task: str | None) -> tuple[str, str | None]: +def _normalize_task_and_bot_task( + task: str, + bot_task: str | None | _DefaultBotTask, +) -> tuple[str, str | None]: + bot_task_was_omitted = bot_task is _DEFAULT_BOT_TASK if task in _TASK_PRESETS: _, legacy_bot_task, _ = _TASK_PRESETS[task] base_task = task.split("_", 1)[0] @@ -97,9 +108,11 @@ def _normalize_task_and_bot_task(task: str, bot_task: str | None) -> tuple[str, base_task = "t2i" if task in ("t2t", "i2t", "t2i"): base_task = task - if bot_task is None: + if bot_task_was_omitted: bot_task = legacy_bot_task task = base_task + elif bot_task_was_omitted: + bot_task = "think" return task, bot_task @@ -123,7 +136,7 @@ def resolve_sys_type(bot_task: str | None) -> str: def resolve_stop_token_ids( task: str = "it2i", - bot_task: str | None = "think", + bot_task: str | None | _DefaultBotTask = _DEFAULT_BOT_TASK, tokenizer: Any | None = None, ) -> list[int]: task, bot_task = _normalize_task_and_bot_task(task, bot_task) @@ -158,7 +171,7 @@ def _resolve_preset(task: str, bot_task: str | None) -> tuple[str, str | None]: def build_prompt( user_prompt: str, task: str = "it2i", - bot_task: str | None = "think", + bot_task: str | None | _DefaultBotTask = _DEFAULT_BOT_TASK, sys_type: str | None = None, custom_system_prompt: str | None = None, num_images: int = 1, @@ -205,7 +218,7 @@ def build_prompt_tokens( user_prompt: str, tokenizer, task: str = "it2i", - bot_task: str | None = "think", + bot_task: str | None | _DefaultBotTask = _DEFAULT_BOT_TASK, sys_type: str | None = None, custom_system_prompt: str | None = None, num_images: int = 1, From cf7e4a24f8874e5667acdd07993d683288af7562 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 11:12:19 +0800 Subject: [PATCH 27/43] fix(hunyuan_image3): align prompt token tests with result API Signed-off-by: 
TaffyOfficial <2324465096@qq.com> --- .../test_hunyuan_image3_it2i_multi_image.py | 24 +++++++++---------- .../hunyuan_image3/test_prompt_utils.py | 8 +++++++ .../models/hunyuan_image3/prompt_utils.py | 8 +++++++ 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py index 7a1e266b936..1e0fd159063 100644 --- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py +++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py @@ -114,11 +114,11 @@ def test_build_prompt_tokens_inserts_N_img_ids(task: str, bot_task: str | None): """N=1/2/3 -> the resulting id sequence contains exactly N copies of img_id (=2) sitting consecutively after the `User: ` segment.""" tok = FakeTokenizer() - ids_n1 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=1) + ids_n1 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=1).token_ids tok = FakeTokenizer() - ids_n2 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=2) + ids_n2 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=2).token_ids tok = FakeTokenizer() - ids_n3 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=3) + ids_n3 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=3).token_ids assert ids_n1.count(2) == 1 assert ids_n2.count(2) == 2 @@ -145,9 +145,9 @@ def test_build_prompt_tokens_default_num_images_matches_legacy(): omitting the parameter (regression guard for existing single-image callers).""" tok_a = FakeTokenizer() - legacy = build_prompt_tokens("hi", tok_a, task="it2i", bot_task="think") + legacy = build_prompt_tokens("hi", tok_a, task="it2i", bot_task="think").token_ids tok_b = FakeTokenizer() - explicit = build_prompt_tokens("hi", tok_b, task="it2i", bot_task="think", 
num_images=1) + explicit = build_prompt_tokens("hi", tok_b, task="it2i", bot_task="think", num_images=1).token_ids assert legacy == explicit # Also: encode() must have been called on the same set of segments, # so segment boundaries are preserved. @@ -173,7 +173,7 @@ def test_text_only_tasks_ignore_num_images(task: str, bot_task: str | None, num_ any num_images and emit zero `` placeholders.""" s = build_prompt("hi", task=task, bot_task=bot_task, num_images=num_images) assert "" not in s - ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, bot_task=bot_task, num_images=num_images) + ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, bot_task=bot_task, num_images=num_images).token_ids assert 2 not in ids @@ -202,7 +202,7 @@ def test_real_tokenizer_emits_n_consecutive_img_ids(num_images: int): img_id = tok.convert_tokens_to_ids("") assert img_id is not None and img_id >= 0, f" not in tokenizer vocab; got id={img_id}" - ids = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=num_images) + ids = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=num_images).token_ids # Exactly N copies of id, all consecutive. 
img_positions = [i for i, x in enumerate(ids) if x == img_id] @@ -225,9 +225,9 @@ def test_real_tokenizer_n_plus_one_extends_by_exactly_one_img_id(): tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True) img_id = tok.convert_tokens_to_ids("") - ids_n1 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1) - ids_n2 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=2) - ids_n3 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=3) + ids_n1 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1).token_ids + ids_n2 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=2).token_ids + ids_n3 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=3).token_ids assert len(ids_n2) == len(ids_n1) + 1, f"N=2 should be N=1 + 1 token; got {len(ids_n2)} vs {len(ids_n1)}" assert len(ids_n3) == len(ids_n1) + 2, f"N=3 should be N=1 + 2 tokens; got {len(ids_n3)} vs {len(ids_n1)}" @@ -250,6 +250,6 @@ def test_real_tokenizer_default_n1_byte_identical_to_legacy(): from transformers import AutoTokenizer tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True) - legacy = build_prompt_tokens("hi", tok, task="it2i", bot_task="think") - explicit = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1) + legacy = build_prompt_tokens("hi", tok, task="it2i", bot_task="think").token_ids + explicit = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1).token_ids assert legacy == explicit, "real tokenizer: default num_images=1 must be byte-identical to legacy" diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index ef635825c3b..371646556f2 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -79,6 +79,14 @@ 
def test_legacy_base_task_omitted_bot_task_keeps_plain_mode(): } +def test_legacy_composite_task_with_none_bot_task_keeps_encoded_mode(): + prompt = build_prompt("HELLO", task="it2i_think", bot_task=None) + assert prompt.endswith("Assistant: ") + + result = build_prompt_tokens("hi", FakeTokenizer(), task="it2i_recaption", bot_task=None) + assert result.token_ids[-1] == FakeTokenizer.SPECIAL[""] + + def test_default_prompt_still_uses_it2i_think_mode(): prompt = build_prompt("HELLO") assert prompt.endswith("Assistant: ") diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 6e1453d0ed2..f78b19a5746 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -95,6 +95,8 @@ class _DefaultBotTask: "t2i_recaption": ("en_unified", "recaption", ""), } +_LEGACY_COMPOSITE_TASKS: frozenset[str] = frozenset(_TASK_PRESETS) - {"t2t", "i2t", "t2i"} + def _normalize_task_and_bot_task( task: str, @@ -110,6 +112,12 @@ def _normalize_task_and_bot_task( base_task = task if bot_task_was_omitted: bot_task = legacy_bot_task + elif task in _LEGACY_COMPOSITE_TASKS and bot_task is None: + # Composite task names already encode the legacy bot_task. Keep + # calls like build_prompt_tokens(task="it2i_think", bot_task=None) + # on their historical meaning; explicit None is the plain-mode + # escape hatch only for the new two-axis base tasks. 
+ bot_task = legacy_bot_task task = base_task elif bot_task_was_omitted: bot_task = "think" From 4fb78a3b4bee4d4e97e6684f888ca97c4bfd4875 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 11:48:10 +0800 Subject: [PATCH 28/43] fix(hunyuan_image3): harden edit bridge compatibility Signed-off-by: TaffyOfficial <2324465096@qq.com> --- ...test_serving_chat_multistage_generation.py | 86 ++++++++ .../test_hunyuan_image3.py | 103 ++++++++++ .../hunyuan_image3/pipeline_hunyuan_image3.py | 7 +- vllm_omni/entrypoints/openai/api_server.py | 9 +- vllm_omni/entrypoints/openai/serving_chat.py | 43 ++-- .../stage_input_processors/hunyuan_image3.py | 186 +++++++++--------- 6 files changed, 319 insertions(+), 115 deletions(-) create mode 100644 tests/model_executor/stage_input_processors/test_hunyuan_image3.py diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py index bd2e877bf18..92f0ac2dc98 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py +++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py @@ -265,6 +265,92 @@ def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(servi ) +@pytest.mark.parametrize("legacy_task", ["i2t", "t2t"]) +def test_build_multistage_generation_inputs_legacy_plain_tasks_stay_plain(serving_chat, legacy_task: str): + """Legacy bot_task=i2t/t2t must preserve those tasks' plain prompt mode. + + The task/bot_task split must not normalize every legacy task-enum request + into bot_task="think"; i2t/t2t had no / trigger before + the split and should stay plain unless the caller passes an explicit + semantic bot_task. 
+ """ + from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat + + engine = SimpleNamespace( + stage_configs=[ + SimpleNamespace(stage_type="llm", is_comprehension=True), + SimpleNamespace(stage_type="diffusion", is_comprehension=False), + ], + default_sampling_params_list=[ + SamplingParams(temperature=0.0), + OmniDiffusionSamplingParams(), + ], + ) + images = [Image.new("RGB", (32, 32), color="red")] + + legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="describe me", + extra_body={"bot_task": legacy_task}, + reference_images=images if legacy_task == "i2t" else [], + gen_params=OmniDiffusionSamplingParams(), + ) + explicit_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="describe me", + extra_body={"task": legacy_task}, + reference_images=images if legacy_task == "i2t" else [], + gen_params=OmniDiffusionSamplingParams(), + ) + + assert legacy_prompt["prompt"] == explicit_prompt["prompt"] + assert legacy_prompt["prompt"].endswith("Assistant: ") + assert not legacy_prompt["prompt"].endswith("") + assert not legacy_prompt["prompt"].endswith("") + + +@pytest.mark.parametrize( + "legacy_task,trigger", + [ + ("it2i_think", ""), + ("it2i_recaption", ""), + ], +) +def test_build_multistage_generation_inputs_legacy_composite_tasks_still_work( + serving_chat, + legacy_task: str, + trigger: str, +): + """Legacy composite task names passed through bot_task must still work.""" + from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat + + engine = SimpleNamespace( + stage_configs=[ + SimpleNamespace(stage_type="llm", is_comprehension=True), + SimpleNamespace(stage_type="diffusion", is_comprehension=False), + ], + default_sampling_params_list=[ + SamplingParams(temperature=0.0), + OmniDiffusionSamplingParams(), + ], + ) + images = [Image.new("RGB", (32, 32), color="red")] + + legacy_prompt, _ = 
OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="edit me", + extra_body={"bot_task": legacy_task}, + reference_images=images, + gen_params=OmniDiffusionSamplingParams(), + ) + + assert legacy_prompt["prompt"].count("") == 1 + assert legacy_prompt["prompt"].endswith(f"Assistant: {trigger}") + + def test_build_multistage_generation_inputs_bot_task_semantic_changes_trigger_and_sys(serving_chat): """Passing bot_task=think_recaption (vs default "think") must flip the resolved sys_type to en_think_recaption (and trigger tag is still diff --git a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py new file mode 100644 index 00000000000..faaa9785452 --- /dev/null +++ b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for HunyuanImage3 stage input processor.""" + +import builtins +from types import SimpleNamespace + +import pytest + +from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS, +) +from vllm_omni.model_executor.stage_input_processors.hunyuan_image3 import ( + _extract_ratio_index, + _truncate_at_cot_end, + ar2diffusion, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def _source_output(token_ids: list[int], text: str = ""): + return SimpleNamespace( + outputs=[ + SimpleNamespace( + token_ids=token_ids, + cumulative_token_ids=token_ids, + text=text, + ) + ], + multimodal_output=None, + ) + + +def test_extract_ratio_index_uses_fixed_special_token_ids(): + ratio_33 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + ratio_36 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + + assert _extract_ratio_index([1, ratio_33, 2]) == 33 + assert _extract_ratio_index([1, ratio_33, 2, ratio_36]) == 36 + + +def 
test_truncate_at_cot_end_uses_token_ids_when_text_skips_specials(): + end_recaption = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + answer = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + boi = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + ratio = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + token_ids = [100, 101, end_recaption, answer, boi, ratio] + + text, truncated = _truncate_at_cot_end( + "recaption body without special markers", + token_ids, + ) + + assert text == "recaption body without special markers" + assert truncated == [100, 101, end_recaption] + + +def test_ar2diffusion_applies_ratio_and_truncates_tail_without_tokenizer(monkeypatch: pytest.MonkeyPatch): + real_import = builtins.__import__ + + def _block_transformers_import(name, *args, **kwargs): + if name == "transformers" or name.startswith("transformers."): + raise AssertionError("ar2diffusion must not import transformers on the bridge path") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", _block_transformers_import) + + end_recaption = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + answer = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + boi = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + size = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + ratio_0 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + token_ids = [100, 101, end_recaption, answer, boi, size, ratio_0] + + result = ar2diffusion( + [_source_output(token_ids, text="decoded without special tokens")], + prompt=[{"prompt": "edit", "height": 64, "width": 64}], + ) + + assert len(result) == 1 + assert (result[0]["height"], result[0]["width"]) == (512, 2048) + assert result[0]["extra"]["ar_generated_text"] == "decoded without special tokens" + assert result[0]["extra"]["ar_token_ids"].tolist() == [100, 101, end_recaption] + + +def test_ar2diffusion_forwards_custom_system_prompt_body(): + end_think = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + marker = "CUSTOM_SYSTEM_BODY" + + result = ar2diffusion( + [_source_output([100, end_think], text="thought")], + prompt=[ + { + 
"prompt": "edit", + "use_system_prompt": "custom", + "system_prompt": marker, + } + ], + ) + + assert result[0]["use_system_prompt"] == "custom" + assert result[0]["system_prompt"] == marker diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 5a9d1e48856..35390e7312d 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -1366,10 +1366,13 @@ def forward( use_system_prompt = extra_args.get("use_system_prompt") system_prompt = extra_args.get("system_prompt") # Fall back to per-prompt use_system_prompt forwarded by ar2diffusion - if use_system_prompt is None and req.prompts: + if req.prompts: first_prompt = req.prompts[0] if isinstance(first_prompt, dict): - use_system_prompt = first_prompt.get("use_system_prompt") + if use_system_prompt is None: + use_system_prompt = first_prompt.get("use_system_prompt") + if system_prompt is None: + system_prompt = first_prompt.get("system_prompt") if use_system_prompt is not None: system_prompt = get_system_prompt(use_system_prompt, "image", system_prompt) system_prompt = system_prompt.strip() if system_prompt is not None else "" diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 80b01ec284a..7107b544adc 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1759,11 +1759,10 @@ async def edit_images( status_code=HTTPStatus.BAD_REQUEST.value, detail=detail, ) - # Only convert uploads to RGB when the caller opts into the - # Hunyuan-aware API surface (task / bot_task / sys_type). Legacy - # callers that send only the older bot_task= shape keep - # whatever PIL mode the upload arrived as, to preserve pre-existing - # behavior for non-Hunyuan flows. 
+ # Convert uploads to RGB when the caller opts into the Hunyuan-aware + # API surface. This includes the legacy bot_task= form: + # keeping uploads as RGBA/P PIL objects makes online IT2I observe a + # different visual input than the offline path. normalize_edit_images_rgb = task is not None or bot_task is not None or sys_type is not None pil_images = await _load_input_images(input_images_list, normalize_rgb=normalize_edit_images_rgb) prompt["multi_modal_data"] = {} diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 4ba824f0909..7424a9e0d34 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2260,9 +2260,18 @@ def _build_multistage_generation_inputs( bot_task = extra_body.get("bot_task") sys_type = extra_body.get("sys_type") custom_system_prompt = extra_body.get("system_prompt") - if task is None and bot_task in set(_hunyuan3_available_tasks()): + legacy_task_from_bot_task = False + legacy_task_names = set(_hunyuan3_available_tasks()) | { + "it2i_think", + "it2i_recaption", + "t2i_think", + "t2i_recaption", + "t2i_vanilla", + } + if task is None and bot_task in legacy_task_names: task = bot_task bot_task = None + legacy_task_from_bot_task = True engine_prompt_data: dict[str, Any] | None = None modalities = ["image"] @@ -2282,13 +2291,21 @@ def _build_multistage_generation_inputs( ) num_images = len(reference_images) if reference_images else 1 - # build_prompt defaults task="it2i"; preserve that when caller - # only passed bot_task semantic. effective_task = task if task is not None else "it2i" - # build_prompt defaults bot_task="think"; preserve that for legacy - # callers (passing bot_task=None to build_prompt explicitly gives a - # different (sys, trigger) than the default "think"). 
- effective_bot_task = bot_task if bot_task is not None else "think" + build_kwargs = { + "task": effective_task, + "sys_type": sys_type, + "custom_system_prompt": custom_system_prompt, + "num_images": num_images, + } + if bot_task is not None: + build_kwargs["bot_task"] = bot_task + elif "bot_task" in extra_body and not legacy_task_from_bot_task: + # Preserve the prompt_utils distinction between omitted + # bot_task and explicit None. Omitted keeps each task's legacy + # default (`it2i` -> think, `i2t`/`t2t` -> plain), while + # explicit None is the caller's plain-mode request. + build_kwargs["bot_task"] = None if tokenizer is not None: # HF byte-for-byte path: feed segment-tokenized prompt_token_ids # so AR sees the same template-tokenization HF apply_chat_template @@ -2301,11 +2318,7 @@ def _build_multistage_generation_inputs( result = build_prompt_tokens( prompt, tokenizer, - task=effective_task, - bot_task=effective_bot_task, - sys_type=sys_type, - custom_system_prompt=custom_system_prompt, - num_images=num_images, + **build_kwargs, ) prompt_token_ids = result.token_ids system_prompt_type = result.system_prompt_type @@ -2313,11 +2326,7 @@ def _build_multistage_generation_inputs( # Legacy string path (e.g. unit tests with no tokenizer plumbed). 
prompt = build_prompt( prompt, - task=effective_task, - bot_task=effective_bot_task, - sys_type=sys_type, - custom_system_prompt=custom_system_prompt, - num_images=num_images, + **build_kwargs, ) if reference_images and len(reference_images) == 1: engine_prompt_data = {"image": reference_images[0]} diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py index c95a2a48f18..a06d030d0da 100644 --- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py +++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Stage input processor for HunyuanImage3: AR → Diffusion transition. +"""Stage input processor for HunyuanImage3: AR to Diffusion transition. In IT2I (image editing) mode: - Stage 0 (AR) receives (image + edit instruction), generates CoT/latent tokens - - Stage 1 (DiT) receives the AR output + original image, denoises → edited image + - Stage 1 (DiT) receives the AR output + original image, denoises to edited image The ar2diffusion function bridges these two stages, following the same signature pattern as glm_image.ar2diffusion. @@ -12,7 +12,6 @@ from __future__ import annotations -import os from functools import lru_cache from typing import Any @@ -20,6 +19,9 @@ from vllm.inputs import TextPrompt from vllm.logger import init_logger +from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS, +) from vllm_omni.inputs.data import OmniTokensPrompt logger = init_logger(__name__) @@ -27,12 +29,63 @@ # AR emits `` after `` in IT2I/T2I # (see `HunyuanImage3ForCausalMM.sample` and `_stage_transitions`). 
The # ratio_index resolves to a (height, width) bucket via ResolutionGroup, which -# is the official upstream's mechanism for AR-driven output aspect — without +# is the official upstream's mechanism for AR-driven output aspect; without # this lookup the DiT pipeline falls back to the user-provided width/height # (in the `/v1/images/edits` path that defaults to `pil_images[0].size`, -# i.e. the first reference image's bucket — usually square, see +# i.e. the first reference image's bucket, usually square, see # api_server.py:1808-1811). -_DEFAULT_HUNYUAN_IMAGE3_MODEL = "tencent/HunyuanImage-3.0-Instruct" +_HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS: tuple[str, ...] = ( + "1024x768", + "1280x720", + "768x1024", + "720x1280", +) + + +class _Resolution: + def __init__(self, size: str | int | tuple[int, int], *args: int): + if isinstance(size, str): + if "x" in size: + h, w = size.split("x") + size = (int(h), int(w)) + else: + size = int(size) + if args: + size = (int(size), args[0]) + if isinstance(size, int): + size = (size, size) + + self.height = int(size[0]) + self.width = int(size[1]) + self.ratio = self.height / self.width + + +def _build_resolutions_by_step(base_size: int, align: int = 1) -> list[_Resolution]: + step = base_size // 16 + min_height = base_size // 2 + min_width = base_size // 2 + max_height = base_size * 2 + max_width = base_size * 2 + + resolutions = [_Resolution(base_size, base_size)] + + cur_height, cur_width = base_size, base_size + while True: + if cur_height >= max_height and cur_width <= min_width: + break + cur_height = min(cur_height + step, max_height) + cur_width = max(cur_width - step, min_width) + resolutions.append(_Resolution(cur_height // align * align, cur_width // align * align)) + + cur_height, cur_width = base_size, base_size + while True: + if cur_height <= min_height and cur_width >= max_width: + break + cur_height = max(cur_height - step, min_height) + cur_width = min(cur_width + step, max_width) + 
resolutions.append(_Resolution(cur_height // align * align, cur_width // align * align)) + + return sorted(resolutions, key=lambda x: x.ratio) @lru_cache(maxsize=4) @@ -43,45 +96,16 @@ def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]: `reso_group[ratio_index]` reverse lookup. Cached because the table is constant per `base_size`. """ - from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import ( - HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS, - Resolution, - ResolutionGroup, - ) - - reso_group = ResolutionGroup( - base_size=base_size, - extra_resolutions=[Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS], - ) - return [(int(r.height), int(r.width)) for r in reso_group.data] - - -@lru_cache(maxsize=4) -def _build_cot_end_token_ids(model_name_or_path: str) -> dict[str, int]: - """Return `{'': id, '': id}` for cot-boundary - truncation. Empty dict on lookup failure so callers degrade to a - pure text-based search. - """ - try: - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) - except Exception as e: # pragma: no cover - environment-dependent - logger.warning("[ar2diffusion] failed to load tokenizer for cot-end lookup: %s", e) - return {} - - result: dict[str, int] = {} - for marker in ("", ""): - tid = tokenizer.convert_tokens_to_ids(marker) - if tid is not None and tid != tokenizer.unk_token_id: - result[marker] = int(tid) - return result + resolutions = _build_resolutions_by_step(base_size) + for extra_resolution in (_Resolution(s) for s in _HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS): + if not any(r.ratio == extra_resolution.ratio for r in resolutions): + resolutions.append(extra_resolution) + return [(r.height, r.width) for r in resolutions] def _truncate_at_cot_end( generated_text: str, generated_token_ids, - model_name_or_path: str, ) -> tuple[str, list[int]]: """Truncate AR output at first `` (or `` fallback). 
@@ -89,63 +113,50 @@ def _truncate_at_cot_end( upstream, which decodes only `generated_tokens[0, :end_pos + 1]` as `cot_text` for DiT. The trailing `` sequence is a stage-transition trigger consumed via `image_size` / - height/width — it must NOT be forwarded to DiT's prompt builder, or + height/width; it must NOT be forwarded to DiT's prompt builder, or the extra `` and ratio tokens drift the DiT's own prompt structure. """ token_list = list(generated_token_ids) if generated_token_ids is not None else [] - end_ids = _build_cot_end_token_ids(model_name_or_path) + end_ids = { + "": HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + "": HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + } for marker in ("", ""): - idx = generated_text.find(marker) - if idx == -1: - continue - text_end = idx + len(marker) - truncated_text = generated_text[:text_end] - truncated_tokens = token_list - end_id = end_ids.get(marker) - if end_id is not None and token_list: + end_id = end_ids[marker] + if token_list: try: token_end = token_list.index(end_id) truncated_tokens = token_list[: token_end + 1] except ValueError: pass - return truncated_text, truncated_tokens + + idx = generated_text.find(marker) + if idx != -1: + text_end = idx + len(marker) + return generated_text[:text_end], truncated_tokens + if truncated_tokens is not token_list: + return generated_text, truncated_tokens return generated_text, token_list @lru_cache(maxsize=4) -def _build_ratio_id_lookup(model_name_or_path: str) -> dict[int, int]: - """Return `{token_id: ratio_index}` for `` in the tokenizer. - - Loads the tokenizer once per model path and walks the contiguous - `..` plus the extra slice - `..` (the same shape - `HunyuanImage3ForCausalMM.__init__` registers at lines 1523-1531). - Empty dict on lookup failure so callers can degrade gracefully. +def _build_ratio_id_lookup() -> dict[int, int]: + """Return `{token_id: ratio_index}` for HunyuanImage3 ratio tokens. 
+ + The ids are fixed in tokenizer.json and already pinned in prompt_utils. + Avoid loading AutoTokenizer here: this bridge runs on the hot AR->DiT + transition path and must keep working in offline deployments where the + tokenizer object is not exposed to the stage-input processor. """ - try: - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) - except Exception as e: # pragma: no cover - environment-dependent - logger.warning("[ar2diffusion] failed to load tokenizer for ratio token lookup: %s", e) - return {} - - def _id(name: str) -> int | None: - tid = tokenizer.convert_tokens_to_ids(name) - return None if tid is None or tid == tokenizer.unk_token_id else int(tid) - - ratio_0 = _id("") - ratio_32 = _id("") - ratio_33 = _id("") - ratio_36 = _id("") - if None in (ratio_0, ratio_32, ratio_33, ratio_36): - logger.warning("[ar2diffusion] tokenizer is missing one of tokens") - return {} + ratio_0 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + ratio_32 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + ratio_33 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + ratio_36 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] table: dict[int, int] = {} for i in range(ratio_32 - ratio_0 + 1): @@ -156,22 +167,20 @@ def _id(name: str) -> int | None: return table -def _extract_ratio_index(generated_token_ids, model_name_or_path: str) -> int | None: +def _extract_ratio_index(generated_token_ids) -> int | None: """Resolve the AR-predicted ratio_index from this stage's output. `HunyuanImage3ForCausalMM`'s `_stage_transitions` forces the AR to emit exactly one `` token after ` `, so we scan the token stream from the tail for the first - id that maps to a ratio. Token-ids are the source of truth — text-side + id that maps to a ratio. 
Token-ids are the source of truth; text-side regex is unreliable because most deploy yamls run AR with `skip_special_tokens: True` (special tokens are stripped from text but still present in `cumulative_token_ids`). """ if generated_token_ids is None: return None - table = _build_ratio_id_lookup(model_name_or_path) - if not table: - return None + table = _build_ratio_id_lookup() for tid in reversed(list(generated_token_ids)): idx = table.get(int(tid)) if idx is not None: @@ -230,10 +239,7 @@ def ar2diffusion( # square in the multi-image / mismatched-aspect case. Mirrors the # official upstream where `reso_group[ratio_index]` is the # canonical source of the diffusion target shape. - model_name_or_path = original_prompt.get("model") or os.environ.get( - "VLLM_OMNI_HUNYUAN_IMAGE3_MODEL", _DEFAULT_HUNYUAN_IMAGE3_MODEL - ) - ratio_idx = _extract_ratio_index(generated_token_ids, model_name_or_path) + ratio_idx = _extract_ratio_index(generated_token_ids) ar_predicted = False if ratio_idx is not None: base_size = int(original_prompt.get("image_base_size", 1024)) @@ -253,14 +259,12 @@ def ar2diffusion( # Truncate the AR output at `` (or ``) before # passing to DiT. Mirrors official `generate_image` which keeps - # `cot_text` clean and routes size/ratio via `image_size` only — + # `cot_text` clean and routes size/ratio via `image_size` only; # we already extracted `ratio_idx` above and translated it into # `height` / `width`, so the `` # tail has no remaining job and would only contaminate DiT's # prompt builder if forwarded. 
- cot_text_for_dit, cot_token_ids_for_dit = _truncate_at_cot_end( - generated_text, generated_token_ids, model_name_or_path - ) + cot_text_for_dit, cot_token_ids_for_dit = _truncate_at_cot_end(generated_text, generated_token_ids) logger.info( "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, " From 38668a6e5785fab2b50728d1b231badd0e82efe1 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Tue, 12 May 2026 15:22:51 +0800 Subject: [PATCH 29/43] revert(hunyuan_image3): roll cond preprocessing back to magnet_repro state Restores the IT2I online image quality observed at the magnet_repro deploy. Two changes from the PR review-feedback round regressed image quality on multi-image edit prompts: 1. 4da2ff687 switched cond VAE from `latent_dist.sample(generator)` to `latent_dist.mode()` on both AR and DiT sides. The posterior mean produces visibly degraded conditioning vs the fixed-seed sample. 2. 1785580ef changed AR `_resize_and_crop` default from `"resize"` to `"center"` to match a non-existent DiT center-crop default (DiT bridge actually defaults to `"resize"` too). This broke AR/DiT preprocessing alignment instead of fixing it. Revert both: - AR `_resize_and_crop` default back to `"resize"` and its docstring. - AR/DiT `_vae_encode`/`vae_encode` back to fixed-generator sample. - Remove the now-dead `.mode()` method on `DiagonalGaussianDistribution`. - Remove the AR/DiT byte-identical preprocessing test added by 1785580ef -- it asserted the wrong invariant (AR `"center"` == DiT `_resize_and_crop_center`), which no longer holds and was never the right alignment target. Keeps the other 4da2ff687 fixes intact: system_prompt body forwarding, ratio extraction simplification, stale `it2i_recaption` compound name cleanup, duplicate `prompt_token_ids` assignment removal. 
Signed-off-by: Claude Code Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../test_hunyuan_image3_it2i_ar_format.py | 38 ------------------- .../models/hunyuan_image3/autoencoder.py | 3 -- .../hunyuan_image3/pipeline_hunyuan_image3.py | 9 +++-- .../models/hunyuan_image3/hunyuan_image3.py | 28 +++++++------- 4 files changed, 18 insertions(+), 60 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py index 7550caa50f7..7e7b7de91b2 100644 --- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py +++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py @@ -72,44 +72,6 @@ def _snapshot_dir(model_id: str) -> pathlib.Path: # tests/e2e/accuracy/test_hunyuan_image3_it2i_ar_output.py. -def test_ar_and_dit_condition_image_preprocessing_match_without_hf_cache(): - """AR and DiT must preprocess the same IT2I condition image into the - same VAE pixels. - - This catches drift between the AR-side multimodal processor and the - diffusion-side bridge without requiring model weights or tokenizer files. - In particular, portrait input expanded to a landscape output is sensitive - to accidentally using ``crop_type="resize"`` on one side and center crop - on the other; the two paths then condition on visibly different fabric - regions and leave seam-like artifacts around the edited object. 
- """ - import numpy as np - from PIL import Image - - from vllm_omni.diffusion.models.hunyuan_image3.pipeline_hunyuan_image3 import ( - _resize_and_crop_center, - ) - from vllm_omni.model_executor.models.hunyuan_image3.hunyuan_image3 import ( - HunyuanImage3Processor, - ) - - rng = np.random.default_rng(seed=3444) - src_size_pairs = [(735, 1104), (640, 1024), (1280, 720), (1024, 1024)] - target_size_pairs = [(1024, 1024), (1024, 768), (768, 1024), (1280, 720)] - - for src_w, src_h in src_size_pairs: - src_arr = rng.integers(0, 256, size=(src_h, src_w, 3), dtype=np.uint8) - src = Image.fromarray(src_arr, mode="RGB") - for tw, th in target_size_pairs: - ar_out = HunyuanImage3Processor._resize_and_crop(None, src, (tw, th)) - dit_out = _resize_and_crop_center(src, tw, th) - - assert ar_out.size == dit_out.size == (tw, th) - assert np.array_equal(np.asarray(ar_out), np.asarray(dit_out)), ( - f"AR and DiT condition preprocessing diverged for src={(src_w, src_h)} target={(tw, th)}" - ) - - _OFFICIAL_PKG = "_hunyuan_image_3_official_snapshot" diff --git a/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py index ddd7d5c6df7..efba2f27435 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py @@ -46,9 +46,6 @@ def sample(self, generator: torch.Generator | None = None) -> torch.FloatTensor: x = self.mean + self.std * sample return x - def mode(self) -> torch.FloatTensor: - return self.mean - @dataclass class DecoderOutput(BaseOutput): diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 35390e7312d..14aa0ea903d 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -634,10 +634,11 @@ def vae_encode(self, image, cfg_factor=1): if 
isinstance(vae_encode_result, torch.Tensor): latents = vae_encode_result else: - # Cond image is clean conditioning (t=0 below) -- use the - # posterior mean so encoding is deterministic by construction. - # See AR-side comment in model_executor/.../hunyuan_image3.py. - latents = vae_encode_result.latent_dist.mode() + # Match HunyuanImage-3's cond encode path: sample the + # posterior, but use a fixed generator so repeated online + # requests are deterministic. + _cond_vae_gen = torch.Generator(device=image.device).manual_seed(0) + latents = vae_encode_result.latent_dist.sample(_cond_vae_gen) if hasattr(config, "shift_factor") and config.shift_factor: latents.sub_(config.shift_factor) if hasattr(config, "scaling_factor") and config.scaling_factor: diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index 40a38c7b5ac..cfd5c6764ad 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -907,8 +907,8 @@ def process_image(self, image_input: ImageInput): current_info["vit_spatial_shapes"] = _ss.squeeze(0) # VAE: per-image bucket via `reso_group.get_target_size`; mirrors - # HF's `resize_and_crop` default (crop_type="center", the official - # generate_image default when infer_align_image_size=False). + # HF's `resize_and_crop` (crop_type="center", the official + # generate_image default with infer_align_image_size=False). # Keep fp32 — the VAE encoder casts to model dtype at its # boundary (see `_vae_encode`). 
image_width, image_height = self.reso_group.get_target_size(image.width, image.height) @@ -957,13 +957,13 @@ def _resize_and_crop( self, image: Image.Image, target_size: tuple[int, int], - crop_type: str = "center", + crop_type: str = "resize", ) -> Image.Image: - # Default mode mirrors official `generate_image` with - # infer_align_image_size=False: preserve aspect ratio and center-crop - # to the nearest VAE bucket. Keeping this default aligned with the - # DiT-side condition-image helper avoids AR and DiT seeing different - # conditioning pixels for the same IT2I request. + # Default mode mirrors the official `infer_align_image_size=True` + # path (image_processor.py:355 → crop_type="resize") used by the + # IT2I demo: stretch the cond image to the bucket dims so its + # `` tag and ViT/VAE features stay aligned with the + # bucket, instead of dropping content via center crop. tw, th = target_size if crop_type == "resize": return image.resize((tw, th), resample=Image.Resampling.LANCZOS) @@ -1777,13 +1777,11 @@ def _vae_encode( images = images.to(dtype=self.vae.dtype) vae_encode_result = self.vae.encode(images) - # Cond image is clean (t=0) conditioning -- take the posterior mean - # so encoding is deterministic by construction. `.sample()` without a - # generator consumes torch's global RNG and silently drifts between - # requests on a long-running server (online) while looking stable for - # fresh-process callers (offline). `.mode()` matches the official - # HunyuanImage-3 cond encode path. - latents = vae_encode_result.latent_dist.mode() + # Match HunyuanImage-3's cond encode path: sample the posterior, but + # use a fixed generator so online requests do not consume the global + # RNG and drift across a long-running server. 
+ _cond_vae_gen = torch.Generator(device=images.device).manual_seed(0) + latents = vae_encode_result.latent_dist.sample(_cond_vae_gen) # Apply shift and scaling factors if present if hasattr(config, "shift_factor") and config.shift_factor: From 9bc67cc589fbb5afc7edcd6b3d60c27bbbcd2656 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Tue, 12 May 2026 16:12:14 +0800 Subject: [PATCH 30/43] fix(hunyuan_image3): stop AR on <|endoftext|> for image-output tasks `resolve_stop_token_ids` returned `<answer>` (128025) for all (task, bot_task) combos. For image-output tasks (`it2i` / `t2i`) this stops the AR halfway through the size/ratio tail that `_stage_transitions[]` forces: <|endoftext|> ^^^^^^^^^^^^ stopped here, ratio never emitted Downstream `ar2diffusion::_extract_ratio_index` then scans `cumulative_token_ids` for any ratio token, finds none, and falls back to the prompt-carried `height`/`width`. In `end2end.py` for multi-image IT2I that means the first reference image's shape -- e.g. a 512x512 logo + a 1179x685 fabric reference collapses the DiT bucket to 1024x1024 square even though the AR CoT planned image_2's landscape aspect. Width and texture both regress simultaneously because DiT has to squeeze the landscape-planned content into a square bucket. Online didn't trip this because the deploy yaml explicitly sets `stop_token_ids: [127957]` (= `<|endoftext|>`) and end2end.py is not in that codepath. `end2end.py` overrides yaml with `resolve_stop_token_ids(...)`, so offline always hit the broken stop regardless of yaml. Fix: return `[<|endoftext|>]` for `it2i` / `t2i` so AR runs through the forced tail and the ratio token reaches `ar2diffusion`. Keep `[<answer>]` for `i2t` / `t2t` -- those are comprehension stages where the response body sits inside `<answer>`, so the answer-open *is* the natural terminator. Update `test_resolve_stop_token_ids_uses_answer_for_generation_tasks` to assert the new (correct) split. 
Signed-off-by: Claude Code Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../hunyuan_image3/test_prompt_utils.py | 24 +++++++++++++++---- .../models/hunyuan_image3/prompt_utils.py | 18 ++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 371646556f2..0579caaaac8 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -96,12 +96,28 @@ def test_default_prompt_still_uses_it2i_think_mode(): assert result.token_ids[-1] == FakeTokenizer.SPECIAL[""] -def test_resolve_stop_token_ids_uses_answer_for_generation_tasks(): +def test_resolve_stop_token_ids_image_tasks_stop_on_eos_not_answer(): + """Image-output tasks must stop on <|endoftext|>, not . + + Stopping on chops off the + tail forced by `_stage_transitions`, so `_extract_ratio_index` in + `ar2diffusion` finds nothing and the DiT output bucket collapses to + the first reference image's shape (e.g. 1024x1024 square when AR's + CoT planned a 1280x720 landscape). + """ tok = FakeTokenizer() + + eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"] answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] - assert resolve_stop_token_ids(task="t2i", bot_task="think", tokenizer=tok) == [answer_id] - assert resolve_stop_token_ids(task="t2i", bot_task="recaption", tokenizer=tok) == [answer_id] - assert resolve_stop_token_ids(task="it2i", bot_task="think", tokenizer=tok) == [answer_id] + + # Image-output: t2i / it2i must let AR emit the size/ratio tail. + for bot in ("think", "recaption", "think_recaption", "vanilla"): + assert resolve_stop_token_ids(task="t2i", bot_task=bot, tokenizer=tok) == [eos_id] + assert resolve_stop_token_ids(task="it2i", bot_task=bot, tokenizer=tok) == [eos_id] + + # Text-output: i2t / t2t comprehension stops on (response sits inside). 
+ assert resolve_stop_token_ids(task="i2t", bot_task=None, tokenizer=tok) == [answer_id] + assert resolve_stop_token_ids(task="t2t", bot_task=None, tokenizer=tok) == [answer_id] @pytest.mark.parametrize( diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index f78b19a5746..196c86dfa5d 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -147,11 +147,29 @@ def resolve_stop_token_ids( bot_task: str | None | _DefaultBotTask = _DEFAULT_BOT_TASK, tokenizer: Any | None = None, ) -> list[int]: + """AR stop-token ids for a given (task, bot_task) generation request. + + Image-output tasks (``it2i`` / ``t2i``) must stop on ``<|endoftext|>``: + after ```` the AR's ``_stage_transitions`` force-emits + ```` and then samples ```` under + ``_apply_ratio_restriction`` followed by ``<|endoftext|>``. Stopping + early on ```` chops off the size/ratio tail, leaves + ``_extract_ratio_index`` empty in ``ar2diffusion``, and silently + collapses the DiT output bucket to the first reference image's shape + (square logo -> 1024x1024 even when AR's CoT plans a landscape). + + Text-output tasks (``i2t`` / ``t2t``) stop on ```` -- the AR is + the final stage, and the comprehension response sits inside the + ```` body so the answer-open is the natural cot/recaption + terminator. + """ task, bot_task = _normalize_task_and_bot_task(task, bot_task) if task not in _TASKS: raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") if bot_task not in _BOT_TASK_PRESETS: raise ValueError(f"Unknown bot_task {bot_task!r}. 
Choose from: {available_bot_tasks()}") + if task in ("it2i", "t2i"): + return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]] return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]] From dec1c436b70cc2350965813e2e6ab6a3be5f39d3 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Tue, 12 May 2026 22:08:31 +0800 Subject: [PATCH 31/43] [Bugfix][HunyuanImage3] cap AR KV snapshot at , defer mid-decode kv_ready forward Two coupled changes so HunyuanImage3 IT2I no longer ships KV for the tail that DiT discards anyway: 1. deploy/hunyuan_image3.yaml: add ``kv_transfer_criteria`` so AR's snapshot fires at (token id 128019). ``stop_after_transfer: false`` keeps the AR running past the snapshot so it can still emit for ``ar2diffusion._extract_ratio_index``. With this yaml + the orchestrator change below, the colleague-confirmed invariant S - N == 1 (where S is the shipped KV length and N is the DiT-side ``positive_reuse_len``) is restored. Without the yaml the AR ships KV all the way through and S - N collapses to 6. 2. engine/orchestrator.py: ``_handle_kv_ready_raw_outputs`` previously forwarded any kv_ready EngineCoreOutput straight to the next stage. With ``stop_after_transfer: false`` the kv_ready signal fires mid-decode (snapshot at , AR still emitting tail), so the raw EngineCoreOutput has no ``.outputs[0]`` and bridges that read the AR's full text (HunyuanImage3 ``ar2diffusion``) hit ``AttributeError``. Skip the forward when no finished output for the same req_id is present in the same raw_outputs batch; the AR's eventual natural-finish RequestOutput will trigger the forward through ``_route_output``. Bagel's existing flow (kv_ready and the deferred-stop finish output co-emit in the same batch) is preserved. 
Signed-off-by: zuiho Signed-off-by: TaffyOfficial <2324465096@qq.com> --- vllm_omni/deploy/hunyuan_image3.yaml | 20 ++++++++++++++++++++ vllm_omni/engine/orchestrator.py | 18 ++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index 634165cd33a..8f7c57fdd64 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -37,6 +37,26 @@ stages: rope_type: default omni_kv_config: need_send_cache: true + # Cap AR KV snapshot at so the shipped KV exactly + # matches the prefix the DiT side reuses (positive_reuse_len = + # 0-based index of , slice ``k[:positive_reuse_len]`` + # excludes itself). Mirrors the colleague-confirmed + # invariant S - N == 1. Without this the AR ships KV all the way + # through , which DiT + # silently discards (S - N == 6) and which keeps the AR pipeline + # busy emitting tail tokens that DiT will never use. + # + # ``stop_after_transfer: false`` keeps the AR running past the + # snapshot so it still emits , which ``ar2diffusion`` + # extracts to derive image height/width. The mid-decode kv_ready + # signal that this combination produces is handled in the + # orchestrator: forwarding to DiT is deferred until the AR's + # natural finish output arrives (see + # ``_handle_kv_ready_raw_outputs``). + kv_transfer_criteria: + type: special_token + token_id: 128019 # + stop_after_transfer: false output_connectors: to_stage_1: shared_memory_connector default_sampling_params: diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 2d2ac47cbb3..37a9eb291c8 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -695,6 +695,21 @@ async def _handle_kv_ready_raw_outputs( if self.async_chunk: return + # When kv_ready fires mid-decode (e.g. 
HunyuanImage3 with + # kv_transfer_criteria=special_token + stop_after_transfer=false, + # snapshot triggers at but AR keeps generating tail + # tokens for ratio extraction), the kv_ready EngineCoreOutput is + # NOT a finished RequestOutput, so bridges that read + # ``ar_output.outputs[0]`` (HunyuanImage3 ar2diffusion) crash. Only + # forward kv_ready when the same raw_outputs batch also contains a + # finished output for that req_id; otherwise wait for AR's natural + # completion to trigger the forward through ``_route_output``. + finished_in_batch = { + o.request_id + for o in raw_outputs.outputs + if getattr(o, "finish_reason", None) is not None + } + for raw_output in raw_outputs.outputs: kv_params = getattr(raw_output, "kv_transfer_params", None) if not (isinstance(kv_params, dict) and kv_params.get("kv_ready")): @@ -712,6 +727,9 @@ async def _handle_kv_ready_raw_outputs( if (stage_id + 1) in req_state.stage_submit_ts: continue + if req_id not in finished_in_batch: + continue + if self._cfg_tracker.has_companions(req_id) and not self._cfg_tracker.all_companions_done(req_id): self._cfg_tracker.defer_parent(req_id, raw_output, stage_id) else: From b84bc2ffa594c796d40a2af0631d8fb0d0c23628 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 11:43:03 +0800 Subject: [PATCH 32/43] fix(hunyuan_image3): cap IT2I input images at MAX_IMAGES_PER_REQUEST in entry layer Per PR #3444 review (Gaohan123): give a friendly, input-named error at the entry boundary instead of relying on the deeper `prompt_utils._validate_num_images` to surface as a `num_images must be in [1, 3]` message. Reuse `MAX_IMAGES_PER_REQUEST` so the cap stays defined in one place. 
- offline `end2end.py`: validate `--image-path` count before opening PIL - online `serving_chat._build_multistage_generation_inputs`: validate `reference_images` count before building engine prompt data Signed-off-by: TaffyOfficial <2324465096@qq.com> --- examples/offline_inference/hunyuan_image3/end2end.py | 6 ++++++ vllm_omni/entrypoints/openai/serving_chat.py | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 908109d65a3..36b3b1199a5 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -8,6 +8,7 @@ from pathlib import Path from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + MAX_IMAGES_PER_REQUEST, build_prompt_tokens, resolve_stop_token_ids, resolve_sys_type, @@ -177,6 +178,11 @@ def main(): from PIL import Image image_paths = [p.strip() for p in args.image_path.split(",") if p.strip()] + if len(image_paths) > MAX_IMAGES_PER_REQUEST: + raise ValueError( + f"--image-path accepts at most {MAX_IMAGES_PER_REQUEST} images for " + f"HunyuanImage-3.0 IT2I, got {len(image_paths)}: {args.image_path}" + ) for image_path in image_paths: if not os.path.exists(image_path): raise ValueError(f"Image path does not exist: {image_path}") diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 7424a9e0d34..26ca0d6170e 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2253,6 +2253,7 @@ def _build_multistage_generation_inputs( # downstream uses the canonical split. Source the task enum from # prompt_utils so this layer stays in sync with the model side. 
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + MAX_IMAGES_PER_REQUEST as _hunyuan3_max_images, available_tasks as _hunyuan3_available_tasks, ) @@ -2273,6 +2274,12 @@ def _build_multistage_generation_inputs( bot_task = None legacy_task_from_bot_task = True + if reference_images and len(reference_images) > _hunyuan3_max_images: + raise ValueError( + f"HunyuanImage-3.0 IT2I accepts at most {_hunyuan3_max_images} input " + f"images per request, got {len(reference_images)}" + ) + engine_prompt_data: dict[str, Any] | None = None modalities = ["image"] if reference_images: From 029f567d08e7b465069b6f2a5b1af63ee87b51bd Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 12:06:05 +0800 Subject: [PATCH 33/43] chore: apply pre-commit ruff format / isort fixups Signed-off-by: TaffyOfficial <2324465096@qq.com> --- vllm_omni/engine/orchestrator.py | 6 +----- vllm_omni/entrypoints/openai/serving_chat.py | 2 ++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 37a9eb291c8..a764c3b5247 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -704,11 +704,7 @@ async def _handle_kv_ready_raw_outputs( # forward kv_ready when the same raw_outputs batch also contains a # finished output for that req_id; otherwise wait for AR's natural # completion to trigger the forward through ``_route_output``. 
- finished_in_batch = { - o.request_id - for o in raw_outputs.outputs - if getattr(o, "finish_reason", None) is not None - } + finished_in_batch = {o.request_id for o in raw_outputs.outputs if getattr(o, "finish_reason", None) is not None} for raw_output in raw_outputs.outputs: kv_params = getattr(raw_output, "kv_transfer_params", None) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 26ca0d6170e..dfd6c15168a 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2254,6 +2254,8 @@ def _build_multistage_generation_inputs( # prompt_utils so this layer stays in sync with the model side. from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( MAX_IMAGES_PER_REQUEST as _hunyuan3_max_images, + ) + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( available_tasks as _hunyuan3_available_tasks, ) From d8b9263f042cc09f0cb6d220f9ebef833f163dcf Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 12:09:35 +0800 Subject: [PATCH 34/43] chore: rename MAX_IMAGES_PER_REQUEST alias to uppercase (ruff N811) Signed-off-by: TaffyOfficial <2324465096@qq.com> --- vllm_omni/entrypoints/openai/serving_chat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index dfd6c15168a..35dd4524fc0 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2253,7 +2253,7 @@ def _build_multistage_generation_inputs( # downstream uses the canonical split. Source the task enum from # prompt_utils so this layer stays in sync with the model side. 
from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - MAX_IMAGES_PER_REQUEST as _hunyuan3_max_images, + MAX_IMAGES_PER_REQUEST as _HUNYUAN3_MAX_IMAGES, ) from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( available_tasks as _hunyuan3_available_tasks, @@ -2276,9 +2276,9 @@ def _build_multistage_generation_inputs( bot_task = None legacy_task_from_bot_task = True - if reference_images and len(reference_images) > _hunyuan3_max_images: + if reference_images and len(reference_images) > _HUNYUAN3_MAX_IMAGES: raise ValueError( - f"HunyuanImage-3.0 IT2I accepts at most {_hunyuan3_max_images} input " + f"HunyuanImage-3.0 IT2I accepts at most {_HUNYUAN3_MAX_IMAGES} input " f"images per request, got {len(reference_images)}" ) From 511b76c0865aaac13c8dcd9abe0f0d8cfd49e8c7 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 15:05:59 +0800 Subject: [PATCH 35/43] fix(hunyuan_image3): align AR stop / KV cap / edits Form with upstream (review) Addresses Bounty-hunter's PR review on #3444: 1. resolve_stop_token_ids: image-output tasks now stop on the full token range (ids 128044-128076 + 130103-130106), mirroring upstream modeling_hunyuan_image_3.py:3289-3303 (`final_stop_tokens = list(range(start_ratio, end_ratio + 1))`). Replaces the earlier `<|endoftext|>` stop which let AR waste decode steps past the ratio. test_prompt_utils.py renamed/updated to pin the new contract. 2. deploy/hunyuan_image3.yaml: drop the kv_transfer_criteria block. With the ratio-range stop in place AR finishes naturally at the ratio token, so KV is capped automatically -- no need for special_token criteria + stop_after_transfer=false. 3. orchestrator._handle_kv_ready_raw_outputs: drop the finished_in_batch defer. Mid-decode kv_ready only fired when stop_after_transfer=false was forcing AR past its natural stop; with #2 removed there is no mid-decode kv_ready to defer. 
The ratio strip for DiT already lives in stage_input_processors/hunyuan_image3._truncate_at_cot_end. 4. serving_chat._build_multistage_generation_inputs: call resolve_stop_token_ids(task, bot_task) and inject into the AR-stage sampling params. Online now matches offline end2end.py rather than relying on yaml-side stop_token_ids. 5. api_server.edit_images: drop the redundant `task` Form field. /v1/images/edits is always IT2I; bot_task / sys_type / system_prompt remain. Legacy bot_task= still works via chat-handler normalization. 6. pipeline_hunyuan_image3 + stage_input_processors/hunyuan_image3: stop reading / writing the `ar_token_ids` extra. The tokenizer-level `batch_cot_token_ids` parameter is retained for a follow-up PR that will unify system/user/cot tokenization. See PR description for the optimization leftover note. Signed-off-by: Claude Code Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../hunyuan_image3/test_prompt_utils.py | 31 ++++++++------ .../test_hunyuan_image3.py | 2 +- vllm_omni/deploy/hunyuan_image3.yaml | 20 ---------- .../hunyuan_image3/pipeline_hunyuan_image3.py | 12 +----- .../models/hunyuan_image3/prompt_utils.py | 35 ++++++++++------ vllm_omni/engine/orchestrator.py | 14 ------- vllm_omni/entrypoints/openai/api_server.py | 40 +++++++------------ vllm_omni/entrypoints/openai/serving_chat.py | 18 +++++++++ .../stage_input_processors/hunyuan_image3.py | 4 -- 9 files changed, 76 insertions(+), 100 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 0579caaaac8..7c3256eee72 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -96,26 +96,31 @@ def test_default_prompt_still_uses_it2i_think_mode(): assert result.token_ids[-1] == FakeTokenizer.SPECIAL[""] -def test_resolve_stop_token_ids_image_tasks_stop_on_eos_not_answer(): - """Image-output tasks must stop 
on <|endoftext|>, not . - - Stopping on chops off the - tail forced by `_stage_transitions`, so `_extract_ratio_index` in - `ar2diffusion` finds nothing and the DiT output bucket collapses to - the first reference image's shape (e.g. 1024x1024 square when AR's - CoT planned a 1280x720 landscape). +def test_resolve_stop_token_ids_image_tasks_stop_on_ratio_range(): + """Image-output tasks stop on any ```` token. + + Mirrors upstream ``modeling_hunyuan_image_3.py::generate_image`` + (line 3289-3303): when ``need_ratio`` is true, + ``final_stop_tokens = list(range(start_ratio, end_ratio + 1)) + + ratio_token_other_slices``. AR stops AT the ratio token sampled + after ````; the bridge then strips the trailing ratio + token before passing the cot to DiT. """ tok = FakeTokenizer() - eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"] - answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + start = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + end = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + other_start = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + other_end = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + expected = list(range(start, end + 1)) + list(range(other_start, other_end + 1)) - # Image-output: t2i / it2i must let AR emit the size/ratio tail. + # Image-output: t2i / it2i stop on the full ratio token range. for bot in ("think", "recaption", "think_recaption", "vanilla"): - assert resolve_stop_token_ids(task="t2i", bot_task=bot, tokenizer=tok) == [eos_id] - assert resolve_stop_token_ids(task="it2i", bot_task=bot, tokenizer=tok) == [eos_id] + assert resolve_stop_token_ids(task="t2i", bot_task=bot, tokenizer=tok) == expected + assert resolve_stop_token_ids(task="it2i", bot_task=bot, tokenizer=tok) == expected # Text-output: i2t / t2t comprehension stops on (response sits inside). 
+ answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] assert resolve_stop_token_ids(task="i2t", bot_task=None, tokenizer=tok) == [answer_id] assert resolve_stop_token_ids(task="t2t", bot_task=None, tokenizer=tok) == [answer_id] diff --git a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py index faaa9785452..1901210de09 100644 --- a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py +++ b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py @@ -81,7 +81,7 @@ def _block_transformers_import(name, *args, **kwargs): assert len(result) == 1 assert (result[0]["height"], result[0]["width"]) == (512, 2048) assert result[0]["extra"]["ar_generated_text"] == "decoded without special tokens" - assert result[0]["extra"]["ar_token_ids"].tolist() == [100, 101, end_recaption] + assert "ar_token_ids" not in result[0]["extra"] def test_ar2diffusion_forwards_custom_system_prompt_body(): diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index 8f7c57fdd64..634165cd33a 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -37,26 +37,6 @@ stages: rope_type: default omni_kv_config: need_send_cache: true - # Cap AR KV snapshot at so the shipped KV exactly - # matches the prefix the DiT side reuses (positive_reuse_len = - # 0-based index of , slice ``k[:positive_reuse_len]`` - # excludes itself). Mirrors the colleague-confirmed - # invariant S - N == 1. Without this the AR ships KV all the way - # through , which DiT - # silently discards (S - N == 6) and which keeps the AR pipeline - # busy emitting tail tokens that DiT will never use. - # - # ``stop_after_transfer: false`` keeps the AR running past the - # snapshot so it still emits , which ``ar2diffusion`` - # extracts to derive image height/width. 
The mid-decode kv_ready - # signal that this combination produces is handled in the - # orchestrator: forwarding to DiT is deferred until the AR's - # natural finish output arrives (see - # ``_handle_kv_ready_raw_outputs``). - kv_transfer_criteria: - type: special_token - token_id: 128019 # - stop_after_transfer: false output_connectors: to_stage_1: shared_memory_connector default_sampling_params: diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 14aa0ea903d..63c367a1006 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -1386,23 +1386,13 @@ def forward( # and ``get_cot_sections()`` can parse the think/recaption structure # directly. cot_text_list = [] - cot_token_ids_list = [] for p in req.prompts: extra = p.get("extra", {}) if isinstance(p, dict) else {} cot_text_list.append(extra.get("ar_generated_text") or None) - cot_token_ids_list.append(extra.get("ar_token_ids")) cot_text = ( [self._normalize_cot_text(t) for t in cot_text_list] if any(t is not None for t in cot_text_list) else None ) - # Prefer AR-sampled token IDs over the decoded cot text so DiTs prompt - # tokenization matches ARs actual token sequence byte-for-byte. Required - # when KV reuse is enabled: positive_reuse_len computed from DiT-side - # tokenization must equal the AR-side KV cache length, otherwise the - # silent slice in inject_ar_kv_into_layers leaves _cache_prompt_kvs - # `q_len + ar_kv_len == seq_len` assert off by N (BPE re-merge drift on - # multi-byte/punctuation boundaries; see get_cot_sections_from_token_ids - # in hunyuan_image3_tokenizer.py). 
- cot_token_ids = cot_token_ids_list if any(t is not None for t in cot_token_ids_list) else None + cot_token_ids = None batch_cond_image_info: list[list[JointImageInfo]] | None = None if any(not isinstance(p, str) for p in req.prompts): diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 196c86dfa5d..b178b021fd6 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -149,17 +149,19 @@ def resolve_stop_token_ids( ) -> list[int]: """AR stop-token ids for a given (task, bot_task) generation request. - Image-output tasks (``it2i`` / ``t2i``) must stop on ``<|endoftext|>``: - after ```` the AR's ``_stage_transitions`` force-emits - ```` and then samples ```` under - ``_apply_ratio_restriction`` followed by ``<|endoftext|>``. Stopping - early on ```` chops off the size/ratio tail, leaves - ``_extract_ratio_index`` empty in ``ar2diffusion``, and silently - collapses the DiT output bucket to the first reference image's shape - (square logo -> 1024x1024 even when AR's CoT plans a landscape). - - Text-output tasks (``i2t`` / ``t2t``) stop on ```` -- the AR is - the final stage, and the comprehension response sits inside the + Image-output tasks (``it2i`` / ``t2i``) stop on any ```` + token. Upstream ``modeling_hunyuan_image_3.py::generate_image`` + (line 3289-3303) sets ``final_stop_tokens`` to the full ratio token + range when ``need_ratio`` is true, then strips the trailing ratio + token before passing the cot to the image stage. AR's natural + trajectory under ``_stage_transitions`` is + ````; stopping + AT the ratio token means KV ends exactly at the prefix DiT reuses, + and ``ar2diffusion`` can read the ratio off the last sampled token + without AR wasting decode steps on ``<|endoftext|>``. 
+ + Text-output tasks (``i2t`` / ``t2t``) stop on ```` -- the AR + is the final stage, and the comprehension response sits inside the ```` body so the answer-open is the natural cot/recaption terminator. """ @@ -169,7 +171,16 @@ def resolve_stop_token_ids( if bot_task not in _BOT_TASK_PRESETS: raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}") if task in ("it2i", "t2i"): - return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]] + # Main ratio range: .. . + start = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + end = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + stops = list(range(start, end + 1)) + # Other slices (upstream tokenizer ``ratio_token_other_slices``): + # .. . + other_start = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + other_end = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + stops.extend(range(other_start, other_end + 1)) + return stops return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]] diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index a764c3b5247..2d2ac47cbb3 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -695,17 +695,6 @@ async def _handle_kv_ready_raw_outputs( if self.async_chunk: return - # When kv_ready fires mid-decode (e.g. HunyuanImage3 with - # kv_transfer_criteria=special_token + stop_after_transfer=false, - # snapshot triggers at but AR keeps generating tail - # tokens for ratio extraction), the kv_ready EngineCoreOutput is - # NOT a finished RequestOutput, so bridges that read - # ``ar_output.outputs[0]`` (HunyuanImage3 ar2diffusion) crash. Only - # forward kv_ready when the same raw_outputs batch also contains a - # finished output for that req_id; otherwise wait for AR's natural - # completion to trigger the forward through ``_route_output``. 
- finished_in_batch = {o.request_id for o in raw_outputs.outputs if getattr(o, "finish_reason", None) is not None} - for raw_output in raw_outputs.outputs: kv_params = getattr(raw_output, "kv_transfer_params", None) if not (isinstance(kv_params, dict) and kv_params.get("kv_ready")): @@ -723,9 +712,6 @@ async def _handle_kv_ready_raw_outputs( if (stage_id + 1) in req_state.stage_submit_ts: continue - if req_id not in finished_in_batch: - continue - if self._cfg_tracker.has_companions(req_id) and not self._cfg_tracker.all_companions_done(req_id): self._cfg_tracker.defer_parent(req_id, raw_output, stage_id) else: diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 7107b544adc..c54295cf104 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1701,12 +1701,10 @@ async def edit_images( layers: int | None = Form(None), resolution: int | None = Form(None), # See SUPPORTED_LAYERED_RESOLUTIONS bot_task: str | None = Form(None), - # P1: task / sys_type / system_prompt split out from the legacy bot_task - # field so callers can express the full HunyuanImage-3.0 prompt template - # surface (task enum + bot_task semantic + sys_type override + custom - # system prompt body). Legacy callers that pass a task-enum value via - # bot_task still work (see normalization below). - task: str | None = Form(None), + # ``/v1/images/edits`` is always image-to-image (IT2I); the ``task`` axis + # is fixed and pinned downstream. ``bot_task`` (think / recaption / + # think_recaption / vanilla) + ``sys_type`` / ``system_prompt`` are the + # only HunyuanImage-3.0 knobs callers need to express here. sys_type: str | None = Form(None), system_prompt: str | None = Form(None), ) -> ImageGenerationResponse: @@ -1760,10 +1758,10 @@ async def edit_images( detail=detail, ) # Convert uploads to RGB when the caller opts into the Hunyuan-aware - # API surface. 
This includes the legacy bot_task= form: - # keeping uploads as RGBA/P PIL objects makes online IT2I observe a - # different visual input than the offline path. - normalize_edit_images_rgb = task is not None or bot_task is not None or sys_type is not None + # API surface (bot_task / sys_type / system_prompt). Keeping uploads + # as RGBA/P PIL objects makes online IT2I observe a different visual + # input than the offline path. + normalize_edit_images_rgb = bot_task is not None or sys_type is not None pil_images = await _load_input_images(input_images_list, normalize_rgb=normalize_edit_images_rgb) prompt["multi_modal_data"] = {} prompt["multi_modal_data"]["image"] = pil_images @@ -1927,21 +1925,13 @@ async def edit_images( lora_dict = _get_lora_from_json_str(lora) _parse_lora_request(lora_dict) extra_body["lora"] = lora_dict - # P1: normalize legacy `bot_task=` form. Callers historically - # passed the task enum (i2t / it2i / t2i / t2t) via the `bot_task` - # Form field; promote it to `task` here so the chat_handler can - # split task vs bot_task semantics cleanly. New callers pass both - # `task` and `bot_task` explicitly; we keep them separate. - _task = task - _bot_task = bot_task - _legacy_task_enum = {"t2t", "i2t", "it2i", "t2i"} - if _task is None and _bot_task in _legacy_task_enum: - _task = _bot_task - _bot_task = None - if _task is not None: - extra_body["task"] = _task - if _bot_task is not None: - extra_body["bot_task"] = _bot_task + # ``/v1/images/edits`` is always IT2I; the chat handler's + # default (``task="it2i"`` when neither ``task`` nor + # ``bot_task`` resolves to a task enum) covers this implicitly. + # Legacy callers passing the task enum via ``bot_task`` (e.g. + # ``bot_task="it2i"``) are normalized inside the chat handler. 
+ if bot_task is not None: + extra_body["bot_task"] = bot_task if sys_type is not None: extra_body["sys_type"] = sys_type if system_prompt is not None: diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 35dd4524fc0..739e55a2ad1 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2257,6 +2257,7 @@ def _build_multistage_generation_inputs( ) from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( available_tasks as _hunyuan3_available_tasks, + resolve_stop_token_ids as _hunyuan3_resolve_stop_token_ids, ) task = extra_body.get("task") @@ -2408,6 +2409,23 @@ def _build_multistage_generation_inputs( extra_args["target_h"] = int(height) extra_args["target_w"] = int(width) + # Resolve AR stop tokens dynamically from (task, bot_task) so the + # online path matches offline ``end2end.py`` and so the AR stops + # at the natural ```` token for image-output tasks + # (mirrors upstream ``modeling_hunyuan_image_3.py:3289-3303``). + # Surviving yaml-side ``stop_token_ids`` would otherwise stop AR + # too early and leave ``ar2diffusion`` without a ratio token. 
+ if ( + comprehension_idx is not None + and idx == comprehension_idx + and hasattr(default_stage_params, "stop_token_ids") + ): + resolved_stops = _hunyuan3_resolve_stop_token_ids( + task=task if task is not None else "it2i", + bot_task=bot_task, + ) + default_stage_params.stop_token_ids = resolved_stops + if stage_type == "diffusion": self._set_if_supported( default_stage_params, diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py index a06d030d0da..5b4d5f56529 100644 --- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py +++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py @@ -15,7 +15,6 @@ from functools import lru_cache from typing import Any -import torch from vllm.inputs import TextPrompt from vllm.logger import init_logger @@ -278,14 +277,11 @@ def ar2diffusion( f"AR ratio_idx={ratio_idx}" if ar_predicted else "from prompt (no AR ratio token)", ) - token_tensor = torch.tensor(cot_token_ids_for_dit, dtype=torch.long) - diffusion_input: dict[str, Any] = { "prompt": text_prompt, "height": height, "width": width, "extra": { - "ar_token_ids": token_tensor, "ar_generated_text": cot_text_for_dit, }, } From 8d90c17bd4fe82bc7e2c9990105c4920ce297e5e Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 15:22:00 +0800 Subject: [PATCH 36/43] chore: apply pre-commit isort split for resolve_stop_token_ids import Signed-off-by: TaffyOfficial <2324465096@qq.com> --- vllm_omni/entrypoints/openai/serving_chat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 739e55a2ad1..6e2a30f56f2 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2257,6 +2257,8 @@ def _build_multistage_generation_inputs( ) from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils 
import ( available_tasks as _hunyuan3_available_tasks, + ) + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( resolve_stop_token_ids as _hunyuan3_resolve_stop_token_ids, ) From b73b00f6fd3e7c509c5de537817ffcea916c048b Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 16:04:17 +0800 Subject: [PATCH 37/43] chore(hunyuan_image3): drop dead cot_token_ids plumbing and online task input - Online chat handler: drop `task` from extra_body; derive task from reference_images presence. Legacy `bot_task=` still normalizes through to the right trigger. - Remove the AR-token-id cot reuse path (`batch_cot_token_ids` in apply_chat_template, `ctx_type == "token_ids"` branch in process_successive_message, and `get_cot_sections_from_token_ids`); it has no caller after the optimization was rolled back per reviewer feedback. - Simplify `_truncate_at_cot_end` to text-only; the token-id return was no longer consumed. - Trim over-explanatory comments across serving_chat / api_server / pipeline / end2end. 
Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../hunyuan_image3/end2end.py | 23 +-- .../hunyuan_image3/test_kvreuse_alignment.py | 135 ------------------ ...test_serving_chat_multistage_generation.py | 72 +--------- .../test_hunyuan_image3.py | 17 +-- .../hunyuan_image3_tokenizer.py | 123 ++-------------- .../hunyuan_image3/pipeline_hunyuan_image3.py | 19 +-- vllm_omni/entrypoints/openai/api_server.py | 24 +--- vllm_omni/entrypoints/openai/serving_chat.py | 97 ++++--------- .../stage_input_processors/hunyuan_image3.py | 49 ++----- 9 files changed, 66 insertions(+), 493 deletions(-) delete mode 100644 tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 36b3b1199a5..16f7d8f06c1 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -19,29 +19,12 @@ _REPO_ROOT = Path(__file__).resolve().parents[3] _DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml") _DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml") -# Modality → (task, default bot_task) mapping. `task` selects only whether -# `` placeholders are emitted; `bot_task` (None | think | recaption | -# think_recaption | vanilla) selects the system prompt + trigger tag. -# -# Both verbose (`text2img`) and short (`t2i`) forms are accepted; the short -# forms match the internal task names (see prompt_utils.available_tasks) -# so users who think in those terms don't have to translate. 
+ _MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = { "text2img": ("t2i", "think"), - "t2i": ("t2i", "think"), "img2img": ("it2i", "think"), - "it2i": ("it2i", "think"), "img2text": ("i2t", None), - "i2t": ("i2t", None), "text2text": ("t2t", None), - "t2t": ("t2t", None), -} - -_MODALITY_CANONICAL = { - "t2i": "text2img", - "it2i": "img2img", - "i2t": "img2text", - "t2t": "text2text", } _MODALITY_DEFAULT_DEPLOY_CONFIG = { @@ -65,8 +48,7 @@ def parse_args(): parser.add_argument( "--modality", default="text2img", - choices=["text2img", "t2i", "img2img", "it2i", "img2text", "i2t", "text2text", "t2t"], - help="Verbose and internal short task names are both accepted.", + choices=list(_MODALITY_TASK_MAP), ) parser.add_argument("--prompts", nargs="+", default=None, help="Input text prompts.") parser.add_argument( @@ -135,7 +117,6 @@ def main(): os.makedirs(args.output, exist_ok=True) additional_config = parse_additional_config(args.additional_config) - args.modality = _MODALITY_CANONICAL.get(args.modality, args.modality) task, default_bot_task = _MODALITY_TASK_MAP[args.modality] if args.bot_task is None: bot_task: str | None = default_bot_task diff --git a/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py b/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py deleted file mode 100644 index 20faf5487dc..00000000000 --- a/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py +++ /dev/null @@ -1,135 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -"""Regression tests for AR-token-IDs preservation through DiT prompt building. - -Pins the KV-reuse alignment contract: when the AR-side stage input -processor (`ar2diffusion`) forwards `ar_token_ids` to the diffusion -stage, `apply_chat_template` must consume those IDs verbatim (no -re-encode of the decoded cot text via `tokenizer.encode`) so that the -DiT-side prompt tokenization matches AR's actually-sampled token -sequence byte-for-byte. 
- -Why this matters: tokenize-detokenize-tokenize over the cot text is not -lossless (BPE re-merges on multi-byte UTF-8 / punctuation boundaries), -and the resulting length drift breaks AR KV position alignment -- -DiT's `positive_reuse_len` (computed from `tokenizer.encode(cot_text)`) -ends up larger than the actual cached AR KV length, and -`inject_ar_kv_into_layers` then silently truncates via Python slice, -leaving `_cache_prompt_kv`'s `q_len + ar_kv_len == seq_len` assert off -by N (hard 500 on KV-reuse-enabled requests; see -`pipeline_hunyuan_image3.py:_cache_prompt_kv`). -""" - -from __future__ import annotations - -import os - -import pytest - -pytestmark = [pytest.mark.core_model] - - -def _hf_cached(model_id: str) -> bool: - hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface") - snap_dir = os.path.join(hf_home, "hub", f"models--{model_id.replace('/', '--')}", "snapshots") - return os.path.isdir(snap_dir) and any(os.scandir(snap_dir)) - - -_HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct" - - -@pytest.mark.skipif( - not _hf_cached(_HUNYUAN_MODEL_ID), - reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache", -) -def test_get_cot_sections_from_token_ids_round_trips_ar_ids(): - """`get_cot_sections_from_token_ids` must split AR-sampled IDs at the - `` / `` token-id positions and emit sections whose - concatenated tokens equal the input (no re-encode). - - Catches the failure mode where DiT re-encodes the decoded cot text - and the BPE merges differ from AR's sampled tokens (length drift). - """ - from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer import ( - TokenizerWrapper, - ) - - tkw = TokenizerWrapper(_HUNYUAN_MODEL_ID) - - think_id = tkw.tokenizer.convert_tokens_to_ids("") - end_think_id = tkw.end_think_token_id - - # Fabricate an AR-style id sequence: arbitrary "thought" payload tokens - # surrounded by / markers, plus some leading + trailing - # tokens (e.g. / tail that gets truncated upstream). 
- thought_payload = [1000, 1001, 1002, 1003, 1004] - leading = [2000, 2001] - trailing = [3000] - ar_token_ids = leading + [think_id] + thought_payload + [end_think_id] + trailing - - sections = tkw.get_cot_sections_from_token_ids( - ar_token_ids, - uncond_kwargs={}, - drop_think=False, - ) - - # Sections concatenated must equal the input verbatim. - out: list[int] = [] - for sec in sections: - assert sec["type"] == "text", f"unexpected section type: {sec}" - toks = sec.get("tokens") - assert toks is not None, f"section missing 'tokens' field: {sec}" - out.extend(toks) - assert out == ar_token_ids, ( - f"split-by-token-id must be lossless; got {len(out)} ids vs {len(ar_token_ids)} input; " - f"diff at first mismatch index = {next((i for i, (a, b) in enumerate(zip(out, ar_token_ids)) if a != b), None)}" - ) - - -@pytest.mark.skipif( - not _hf_cached(_HUNYUAN_MODEL_ID), - reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache", -) -def test_apply_chat_template_batch_cot_token_ids_preserves_ar_ids(): - """When `batch_cot_token_ids` is passed, the assistant section in the - final encoded token sequence must contain the AR-sampled token ids - verbatim -- no `tokenizer.encode(cot_text)` round-trip. - - Pins the end-to-end contract that KV-reuse alignment relies on. - """ - from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer import ( - TokenizerWrapper, - ) - - tkw = TokenizerWrapper(_HUNYUAN_MODEL_ID) - think_id = tkw.tokenizer.convert_tokens_to_ids("") - end_think_id = tkw.end_think_token_id - - # Construct a synthetic AR cot id sequence. Use mid-range vocab ids - # that are very unlikely to collide with any chat-template specials. 
- payload = [55001, 55002, 55003] - ar_token_ids = [think_id] + payload + [end_think_id] - - out_with_ids = tkw.apply_chat_template( - batch_prompt=["draw a robot"], - batch_system_prompt=[None], - batch_cot_token_ids=[ar_token_ids], - mode="gen_text", - sequence_template="instruct", - ) - tokens_with_ids = out_with_ids["output"].tokens.tolist()[0] # batched output: take batch 0 - - # The exact AR payload must appear as a contiguous subsequence in the - # encoded output, sandwiched by the think markers we forwarded. - def _find_subseq(haystack: list[int], needle: list[int]) -> int: - n = len(needle) - for i in range(len(haystack) - n + 1): - if haystack[i : i + n] == needle: - return i - return -1 - - full_cot = [think_id] + payload + [end_think_id] - idx = _find_subseq(tokens_with_ids, full_cot) - assert idx >= 0, ( - f"AR cot ids {full_cot} not found as contiguous subseq in encoded output; " - f"means apply_chat_template did NOT respect batch_cot_token_ids and re-encoded cot text instead" - ) diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py index 92f0ac2dc98..dd7f668611e 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py +++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py @@ -219,13 +219,9 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]: def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(serving_chat): - """Legacy callers passed a task-enum value (i2t/it2i/t2i/t2t) under - `bot_task` in extra_body. After the P1 task/bot_task split, the helper - must still treat that legacy form as `task=, bot_task=None` - (i.e. defaults bot_task semantic to "think"), so the resulting prompt - is identical to the pre-P1 output. - - Pins the back-compat contract. + """Legacy callers passed bot_task="it2i" as an opt-in marker. 
Task is now + inferred from reference_images; legacy bot_task must still trigger the + default think mode rather than getting silently dropped. """ from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat @@ -241,7 +237,6 @@ def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(servi ) images = [Image.new("RGB", (32, 32), color="red"), Image.new("RGB", (32, 32), color="blue")] - # Legacy form: only bot_task=. legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( serving_chat, engine=engine, @@ -250,65 +245,8 @@ def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(servi reference_images=images, gen_params=OmniDiffusionSamplingParams(), ) - # New form: explicit task=, no bot_task. - new_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( - serving_chat, - engine=engine, - prompt="edit me", - extra_body={"task": "it2i"}, - reference_images=images, - gen_params=OmniDiffusionSamplingParams(), - ) - assert legacy_prompt["prompt"] == new_prompt["prompt"], ( - f"legacy bot_task= form must produce the same prompt as task=; " - f"legacy={legacy_prompt['prompt']!r} new={new_prompt['prompt']!r}" - ) - - -@pytest.mark.parametrize("legacy_task", ["i2t", "t2t"]) -def test_build_multistage_generation_inputs_legacy_plain_tasks_stay_plain(serving_chat, legacy_task: str): - """Legacy bot_task=i2t/t2t must preserve those tasks' plain prompt mode. - - The task/bot_task split must not normalize every legacy task-enum request - into bot_task="think"; i2t/t2t had no / trigger before - the split and should stay plain unless the caller passes an explicit - semantic bot_task. 
- """ - from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat - - engine = SimpleNamespace( - stage_configs=[ - SimpleNamespace(stage_type="llm", is_comprehension=True), - SimpleNamespace(stage_type="diffusion", is_comprehension=False), - ], - default_sampling_params_list=[ - SamplingParams(temperature=0.0), - OmniDiffusionSamplingParams(), - ], - ) - images = [Image.new("RGB", (32, 32), color="red")] - - legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( - serving_chat, - engine=engine, - prompt="describe me", - extra_body={"bot_task": legacy_task}, - reference_images=images if legacy_task == "i2t" else [], - gen_params=OmniDiffusionSamplingParams(), - ) - explicit_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( - serving_chat, - engine=engine, - prompt="describe me", - extra_body={"task": legacy_task}, - reference_images=images if legacy_task == "i2t" else [], - gen_params=OmniDiffusionSamplingParams(), - ) - - assert legacy_prompt["prompt"] == explicit_prompt["prompt"] - assert legacy_prompt["prompt"].endswith("Assistant: ") - assert not legacy_prompt["prompt"].endswith("") - assert not legacy_prompt["prompt"].endswith("") + assert legacy_prompt["prompt"].count("") == 2 + assert legacy_prompt["prompt"].endswith("Assistant: ") @pytest.mark.parametrize( diff --git a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py index 1901210de09..76f3e500622 100644 --- a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py +++ b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py @@ -40,20 +40,9 @@ def test_extract_ratio_index_uses_fixed_special_token_ids(): assert _extract_ratio_index([1, ratio_33, 2, ratio_36]) == 36 -def test_truncate_at_cot_end_uses_token_ids_when_text_skips_specials(): - end_recaption = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] - answer = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] - 
boi = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] - ratio = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] - token_ids = [100, 101, end_recaption, answer, boi, ratio] - - text, truncated = _truncate_at_cot_end( - "recaption body without special markers", - token_ids, - ) - - assert text == "recaption body without special markers" - assert truncated == [100, 101, end_recaption] +def test_truncate_at_cot_end_strips_tail_after_recaption_marker(): + text = _truncate_at_cot_end("body text") + assert text == "body text" def test_ar2diffusion_applies_ratio_and_truncates_tail_without_tokenizer(monkeypatch: pytest.MonkeyPatch): diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py index e6e0c9db346..5751cb4d831 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py @@ -903,75 +903,6 @@ def get_cot_sections(self, cot_text, uncond_kwargs, cot_max_length=None, drop_th dict(type="text", text=cot_text, **uncond_kwargs), ] - def get_cot_sections_from_token_ids( - self, - token_ids, - uncond_kwargs, - cot_max_length=None, - drop_think=False, - ): - """Split AR-sampled token IDs at think/recaption markers without re-encoding. - - Functional mirror of `get_cot_sections` but operates on AR sampled IDs. - Used by KV-reuse-aware callers: tokenize-detokenize-tokenize over the AR - cot text is not lossless (BPE re-merges across multi-byte UTF-8 and - punctuation boundaries). The resulting length drift breaks AR KV - position alignment (`positive_reuse_len` computed in DiT-tok space vs - the actual cached AR KV in AR-tok space, off by N tokens for prompts - containing Chinese + escaped quotes etc.). 
- """ - if not token_ids: - return [] - ids = list(token_ids) - - think_id = self.tokenizer.convert_tokens_to_ids("") - end_think_id = self.end_think_token_id - recaption_id = self.tokenizer.convert_tokens_to_ids("") - end_recaption_id = self.end_recaption_token_id - - def _split_at_pair(seq, start_id, end_id): - if start_id is None or end_id is None: - return None - try: - s = seq.index(start_id) - e = seq.index(end_id, s + 1) - except ValueError: - return None - return seq[:s], seq[s + 1 : e], seq[e + 1 :] - - # Try ... first to mirror text-side split order. - split = _split_at_pair(ids, think_id, end_think_id) - if split is not None: - before, inside, after = split - return ( - self.get_cot_sections_from_token_ids(before, uncond_kwargs, drop_think=drop_think) - + ( - [ - dict(type="text", tokens=[think_id]), - dict(type="text", tokens=inside, max_length=cot_max_length, **uncond_kwargs), - dict(type="text", tokens=[end_think_id]), - ] - if not drop_think - else [] - ) - + self.get_cot_sections_from_token_ids(after, uncond_kwargs, drop_think=drop_think) - ) - - split = _split_at_pair(ids, recaption_id, end_recaption_id) - if split is not None: - before, inside, after = split - return ( - self.get_cot_sections_from_token_ids(before, uncond_kwargs, drop_think=drop_think) - + [ - dict(type="text", tokens=[recaption_id]), - dict(type="text", tokens=inside, max_length=cot_max_length, **uncond_kwargs), - dict(type="text", tokens=[end_recaption_id]), - ] - + self.get_cot_sections_from_token_ids(after, uncond_kwargs, drop_think=drop_think) - ) - - return [dict(type="text", tokens=ids, **uncond_kwargs)] - def apply_general_template( self, message_list, @@ -1022,36 +953,17 @@ def process_successive_message( while _cur_message_idx < len(message_list) and _message_list[_cur_message_idx]["role"] == role: message = _message_list[_cur_message_idx] if message["type"] == "text": - content = message["content"] - ctx_type = message.get("context_type", "str") + text = 
message["content"] if role == "system": - _sub_sections.append(dict(type="text", text=content)) + _sub_sections.append(dict(type="text", text=text)) elif role == "assistant": - if ctx_type == "token_ids": - # Pre-tokenized AR cot tokens; split on marker ids, no re-encode. - if hasattr(content, "tolist"): - content = content.tolist() - think_id = self.tokenizer.convert_tokens_to_ids("") - recaption_id = self.tokenizer.convert_tokens_to_ids("") - has_cot = (think_id in content and self.end_think_token_id in content) or ( - recaption_id in content and self.end_recaption_token_id in content - ) - if has_cot: - _sub_sections.extend( - self.get_cot_sections_from_token_ids(content, uncond_kwargs, drop_think=drop_think) - ) - else: - _sub_sections.append(dict(type="text", tokens=content, **uncond_kwargs)) + if ("" in text and "" in text) or ( + "" in text and "" in text + ): + _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think)) else: - text = content - if ("" in text and "" in text) or ( - "" in text and "" in text - ): - _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think)) - else: - _sub_sections.append(dict(type="text", text=text, **uncond_kwargs)) + _sub_sections.append(dict(type="text", text=text, **uncond_kwargs)) else: - text = content _sub_sections.append( dict(type="text", text=f"{answer_prefix}{text}{answer_suffix}", **uncond_kwargs) ) @@ -1176,7 +1088,6 @@ def apply_chat_template( batch_cond_image_info: list[JointImageInfo] | list[list[JointImageInfo]] | None = None, batch_system_prompt: list[str] | None = None, batch_cot_text: list[str] | None = None, - batch_cot_token_ids: list | None = None, max_length: int | None = None, bot_task: str = "auto", # auto/image/think/recaption/img_ratio image_base_size: int = 1024, @@ -1205,14 +1116,6 @@ def apply_chat_template( ) else: batch_cot_text = [None] * batch_size - # Optional per-item pre-tokenized AR cot ids (used by KV-reuse). 
- if batch_cot_token_ids is not None: - assert len(batch_cot_token_ids) == batch_size, ( - f"batch_cot_token_ids should have the same length as batch_size ({batch_size}), " - f"but got {len(batch_cot_token_ids)}." - ) - else: - batch_cot_token_ids = [None] * batch_size if batch_cond_image_info is not None: assert len(batch_cond_image_info) == batch_size, ( f"batch_cond_image_info should have the same length as batch_size ({batch_size}), " @@ -1231,14 +1134,12 @@ def apply_chat_template( prompt, system_prompt, cot_text, - cot_token_ids, gen_image_info, cond_image_info_list, ) in zip( batch_prompt, batch_system_prompt, batch_cot_text, - batch_cot_token_ids, batch_gen_image_info, batch_cond_image_info, ): @@ -1258,15 +1159,7 @@ def apply_chat_template( # 2.2 text inputs message_list.append(dict(role="user", type="text", content=prompt, context_type="str")) # 3. assistant answer sections - if cot_token_ids is not None: - # Use AR-sampled token IDs verbatim. Avoids the - # tokenize-detokenize-tokenize length drift that breaks KV reuse - # (see process_successive_message context_type="token_ids" branch - # and get_cot_sections_from_token_ids docstring). 
- message_list.append( - dict(role="assistant", type="text", content=cot_token_ids, context_type="token_ids") - ) - elif cot_text is not None: + if cot_text is not None: message_list.append(dict(role="assistant", type="text", content=cot_text, context_type="str")) if mode == "gen_image": message_list.append( diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 63c367a1006..33bfb65fb41 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -754,7 +754,6 @@ def prepare_model_inputs( mode="gen_image", system_prompt=None, cot_text=None, - cot_token_ids=None, num_inference_steps=50, guidance_scale=5.0, image_size="auto", @@ -771,7 +770,6 @@ def prepare_model_inputs( batch_message_list = message_list batch_prompt = prompt batch_cot_text = cot_text - batch_cot_token_ids = cot_token_ids batch_system_prompt = system_prompt batch_gen_image_info = None batch_cond_image_info = kwargs.pop("batch_cond_image_info", None) @@ -850,7 +848,6 @@ def prepare_model_inputs( batch_cond_image_info=batch_cond_image_info, batch_system_prompt=batch_system_prompt, batch_cot_text=batch_cot_text, - batch_cot_token_ids=batch_cot_token_ids, max_length=kwargs.get("max_length"), bot_task=bot_task, image_base_size=self.config.image_base_size, @@ -1379,20 +1376,13 @@ def forward( system_prompt = system_prompt.strip() if system_prompt is not None else "" prompt = [p if isinstance(p, str) else (p.get("prompt") or "") for p in req.prompts] or prompt - # Extract AR-generated CoT/recaption text from each prompt's extra dict. - # The AR-side stage input processor (``ar2diffusion``) already prepends - # the trigger tag (e.g. 
````) when the AR used the KV-reuse - # pretrain format, so ``ar_generated_text`` is a self-contained string - # and ``get_cot_sections()`` can parse the think/recaption structure - # directly. - cot_text_list = [] - for p in req.prompts: - extra = p.get("extra", {}) if isinstance(p, dict) else {} - cot_text_list.append(extra.get("ar_generated_text") or None) + cot_text_list = [ + (p.get("extra", {}).get("ar_generated_text") if isinstance(p, dict) else None) or None + for p in req.prompts + ] cot_text = ( [self._normalize_cot_text(t) for t in cot_text_list] if any(t is not None for t in cot_text_list) else None ) - cot_token_ids = None batch_cond_image_info: list[list[JointImageInfo]] | None = None if any(not isinstance(p, str) for p in req.prompts): @@ -1433,7 +1423,6 @@ def forward( model_inputs = self.prepare_model_inputs( prompt=prompt, cot_text=cot_text, - cot_token_ids=cot_token_ids, system_prompt=system_prompt, mode="gen_image", generator=generator, diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index c54295cf104..c1467f7190a 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1700,11 +1700,8 @@ async def edit_images( # vllm-omni extension for layered models (e.g., Qwen-Image-Layered) layers: int | None = Form(None), resolution: int | None = Form(None), # See SUPPORTED_LAYERED_RESOLUTIONS + # /v1/images/edits is always IT2I; only the prompting knobs are exposed. bot_task: str | None = Form(None), - # ``/v1/images/edits`` is always image-to-image (IT2I); the ``task`` axis - # is fixed and pinned downstream. ``bot_task`` (think / recaption / - # think_recaption / vanilla) + ``sys_type`` / ``system_prompt`` are the - # only HunyuanImage-3.0 knobs callers need to express here. 
sys_type: str | None = Form(None), system_prompt: str | None = Form(None), ) -> ImageGenerationResponse: @@ -1757,10 +1754,8 @@ async def edit_images( status_code=HTTPStatus.BAD_REQUEST.value, detail=detail, ) - # Convert uploads to RGB when the caller opts into the Hunyuan-aware - # API surface (bot_task / sys_type / system_prompt). Keeping uploads - # as RGBA/P PIL objects makes online IT2I observe a different visual - # input than the offline path. + # Match the offline path: RGB normalize when the caller opts into + # Hunyuan-aware behavior. RGBA/P uploads otherwise diverge from offline. normalize_edit_images_rgb = bot_task is not None or sys_type is not None pil_images = await _load_input_images(input_images_list, normalize_rgb=normalize_edit_images_rgb) prompt["multi_modal_data"] = {} @@ -1895,12 +1890,8 @@ async def edit_images( "seed": effective_seed, "num_outputs_per_prompt": n, } - # When size="auto", width/height were resolved from the first - # input images size (e.g. 512x512 logo), NOT a client-requested - # output dimension. Forwarding them to extra_body would override - # AR-driven pipelines (e.g. HunyuanImage-3.0) AR `` - # token decision via gen_params -> sampling_params. Skip the - # forward when auto, matching offline end2end.py img2img. + # size="auto" resolves width/height from input image; forwarding + # those would override AR-driven `` token selection. if not size_was_auto: if width is not None: extra_body["width"] = width @@ -1925,11 +1916,6 @@ async def edit_images( lora_dict = _get_lora_from_json_str(lora) _parse_lora_request(lora_dict) extra_body["lora"] = lora_dict - # ``/v1/images/edits`` is always IT2I; the chat handler's - # default (``task="it2i"`` when neither ``task`` nor - # ``bot_task`` resolves to a task enum) covers this implicitly. - # Legacy callers passing the task enum via ``bot_task`` (e.g. - # ``bot_task="it2i"``) are normalized inside the chat handler. 
if bot_task is not None: extra_body["bot_task"] = bot_task if sys_type is not None: diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 6e2a30f56f2..4677135cdb0 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2247,37 +2247,26 @@ def _build_multistage_generation_inputs( lora_body = extra_body.get("lora") layers = extra_body.get("layers") resolution = extra_body.get("resolution") - # P1: task / bot_task / sys_type / system_prompt quadruple. Legacy - # api_server callers may still pass a task-enum value (i2t / it2i / - # t2i / t2t) under `bot_task`; normalize it to `task` here so - # downstream uses the canonical split. Source the task enum from - # prompt_utils so this layer stays in sync with the model side. from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( MAX_IMAGES_PER_REQUEST as _HUNYUAN3_MAX_IMAGES, ) - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - available_tasks as _hunyuan3_available_tasks, - ) from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( resolve_stop_token_ids as _hunyuan3_resolve_stop_token_ids, ) - task = extra_body.get("task") bot_task = extra_body.get("bot_task") sys_type = extra_body.get("sys_type") custom_system_prompt = extra_body.get("system_prompt") - legacy_task_from_bot_task = False - legacy_task_names = set(_hunyuan3_available_tasks()) | { - "it2i_think", - "it2i_recaption", - "t2i_think", - "t2i_recaption", - "t2i_vanilla", - } - if task is None and bot_task in legacy_task_names: - task = bot_task + + # Legacy callers passed task enums (it2i / t2i / it2i_think / ...) via + # bot_task. Task is now derived from reference_images presence; map + # composites to their semantic bot_task and drop bare task enums. 
+ bot_task_omitted = False + if bot_task in {"it2i", "t2i", "i2t", "t2t"}: bot_task = None - legacy_task_from_bot_task = True + bot_task_omitted = True + elif bot_task in {"it2i_think", "it2i_recaption", "t2i_think", "t2i_recaption", "t2i_vanilla"}: + bot_task = bot_task.split("_", 1)[1] if reference_images and len(reference_images) > _HUNYUAN3_MAX_IMAGES: raise ValueError( @@ -2285,6 +2274,8 @@ def _build_multistage_generation_inputs( f"images per request, got {len(reference_images)}" ) + task = "it2i" if reference_images else "t2i" + engine_prompt_data: dict[str, Any] | None = None modalities = ["image"] if reference_images: @@ -2296,50 +2287,33 @@ def _build_multistage_generation_inputs( prompt_token_ids: list[int] | None = None system_prompt_type: str | None = None - if task or bot_task: + if bot_task is not None or sys_type is not None or custom_system_prompt is not None or bot_task_omitted: from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( build_prompt, build_prompt_tokens, ) - num_images = len(reference_images) if reference_images else 1 - effective_task = task if task is not None else "it2i" - build_kwargs = { - "task": effective_task, + build_kwargs: dict[str, Any] = { + "task": task, "sys_type": sys_type, "custom_system_prompt": custom_system_prompt, - "num_images": num_images, + "num_images": len(reference_images) if reference_images else 1, } if bot_task is not None: build_kwargs["bot_task"] = bot_task - elif "bot_task" in extra_body and not legacy_task_from_bot_task: - # Preserve the prompt_utils distinction between omitted - # bot_task and explicit None. Omitted keeps each task's legacy - # default (`it2i` -> think, `i2t`/`t2t` -> plain), while - # explicit None is the caller's plain-mode request. + elif "bot_task" in extra_body and not bot_task_omitted: + # Explicit None from the caller is plain-mode; omitted lets + # each task fall back to its default trigger. 
build_kwargs["bot_task"] = None if tokenizer is not None: - # HF byte-for-byte path: feed segment-tokenized prompt_token_ids - # so AR sees the same template-tokenization HF apply_chat_template - # produces. Without this, the engine BPE-merges across template - # segment boundaries (e.g. "。\n\n" -> single id) and AR - # diverges from training distribution -- different cot_text, - # different DiT input, different final image. Mirrors offline - # examples/.../end2end.py img2img which always feeds - # prompt_token_ids. See prompt_utils.build_prompt NOTE. - result = build_prompt_tokens( - prompt, - tokenizer, - **build_kwargs, - ) + # Feed segment-tokenized prompt_token_ids so AR matches HF + # apply_chat_template byte-for-byte (engine BPE would merge + # across template boundaries, e.g. "。\n\n" -> single id). + result = build_prompt_tokens(prompt, tokenizer, **build_kwargs) prompt_token_ids = result.token_ids system_prompt_type = result.system_prompt_type else: - # Legacy string path (e.g. unit tests with no tokenizer plumbed). - prompt = build_prompt( - prompt, - **build_kwargs, - ) + prompt = build_prompt(prompt, **build_kwargs) if reference_images and len(reference_images) == 1: engine_prompt_data = {"image": reference_images[0]} modalities = ["image"] @@ -2349,10 +2323,8 @@ def _build_multistage_generation_inputs( engine_prompt["prompt_token_ids"] = prompt_token_ids if system_prompt_type is not None: engine_prompt["use_system_prompt"] = system_prompt_type - # Forward the custom system prompt body too. DiT's - # `get_system_prompt(use_system_prompt, "image", system_prompt)` reads - # the third positional arg, so leaving it None turns a `sys_type=custom` - # request into an empty DiT system prefix (AR/DiT divergence). + # DiT's get_system_prompt(use_system_prompt, "image", system_prompt) reads + # this; omitting it makes sys_type=custom yield an empty DiT prefix. 
if custom_system_prompt is not None: engine_prompt["system_prompt"] = custom_system_prompt engine_prompt["modalities"] = modalities @@ -2399,10 +2371,8 @@ def _build_multistage_generation_inputs( ): default_stage_params.seed = seed - # Inject target_h/w into comprehension (AR) stage sampling params - # for models that need M-RoPE position pre-computation (e.g. - # GLM-Image). max_tokens is handled via the deploy YAML default - # (upper-bound ceiling) rather than computed dynamically here. + # Inject target_h/w into AR stage for M-RoPE position pre-computation + # (e.g. GLM-Image). max_tokens comes from deploy YAML. if comprehension_idx is not None and idx == comprehension_idx and height is not None and width is not None: extra_args = getattr(default_stage_params, "extra_args", None) if extra_args is None: @@ -2411,22 +2381,17 @@ def _build_multistage_generation_inputs( extra_args["target_h"] = int(height) extra_args["target_w"] = int(width) - # Resolve AR stop tokens dynamically from (task, bot_task) so the - # online path matches offline ``end2end.py`` and so the AR stops - # at the natural ```` token for image-output tasks - # (mirrors upstream ``modeling_hunyuan_image_3.py:3289-3303``). - # Surviving yaml-side ``stop_token_ids`` would otherwise stop AR - # too early and leave ``ar2diffusion`` without a ratio token. + # Stop AR at the natural token for image tasks; mirrors + # upstream modeling_hunyuan_image_3.py:3289-3303. 
if ( comprehension_idx is not None and idx == comprehension_idx and hasattr(default_stage_params, "stop_token_ids") ): - resolved_stops = _hunyuan3_resolve_stop_token_ids( - task=task if task is not None else "it2i", + default_stage_params.stop_token_ids = _hunyuan3_resolve_stop_token_ids( + task=task, bot_task=bot_task, ) - default_stage_params.stop_token_ids = resolved_stops if stage_type == "diffusion": self._set_if_supported( diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py index 5b4d5f56529..749e213e099 100644 --- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py +++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py @@ -102,45 +102,19 @@ def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]: return [(r.height, r.width) for r in resolutions] -def _truncate_at_cot_end( - generated_text: str, - generated_token_ids, -) -> tuple[str, list[int]]: +def _truncate_at_cot_end(generated_text: str) -> str: """Truncate AR output at first `` (or `` fallback). - Mirrors `HunyuanImage3ForCausalMM.generate_image` in the official - upstream, which decodes only `generated_tokens[0, :end_pos + 1]` as - `cot_text` for DiT. The trailing `` - sequence is a stage-transition trigger consumed via `image_size` / - height/width; it must NOT be forwarded to DiT's prompt builder, or - the extra `` and ratio tokens drift the DiT's own prompt - structure. + Mirrors upstream `HunyuanImage3ForCausalMM.generate_image` which feeds + DiT only the cot text up to the closing tag; the trailing + `` is consumed via height/width + extraction and must not leak into DiT's prompt builder. 
""" - token_list = list(generated_token_ids) if generated_token_ids is not None else [] - - end_ids = { - "": HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], - "": HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], - } - for marker in ("", ""): - truncated_tokens = token_list - end_id = end_ids[marker] - if token_list: - try: - token_end = token_list.index(end_id) - truncated_tokens = token_list[: token_end + 1] - except ValueError: - pass - idx = generated_text.find(marker) if idx != -1: - text_end = idx + len(marker) - return generated_text[:text_end], truncated_tokens - if truncated_tokens is not token_list: - return generated_text, truncated_tokens - - return generated_text, token_list + return generated_text[: idx + len(marker)] + return generated_text @lru_cache(maxsize=4) @@ -256,14 +230,7 @@ def ar2diffusion( width, ) - # Truncate the AR output at `` (or ``) before - # passing to DiT. Mirrors official `generate_image` which keeps - # `cot_text` clean and routes size/ratio via `image_size` only; - # we already extracted `ratio_idx` above and translated it into - # `height` / `width`, so the `` - # tail has no remaining job and would only contaminate DiT's - # prompt builder if forwarded. 
- cot_text_for_dit, cot_token_ids_for_dit = _truncate_at_cot_end(generated_text, generated_token_ids) + cot_text_for_dit = _truncate_at_cot_end(generated_text) logger.info( "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, " From 8d12ddda27f7f4e9d038a7eb2e5dab10a91eb2ee Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 16:09:14 +0800 Subject: [PATCH 38/43] chore: apply ruff-format fixup for cot_text_list comprehension Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 33bfb65fb41..73b89bb11b0 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -1377,8 +1377,7 @@ def forward( prompt = [p if isinstance(p, str) else (p.get("prompt") or "") for p in req.prompts] or prompt cot_text_list = [ - (p.get("extra", {}).get("ar_generated_text") if isinstance(p, dict) else None) or None - for p in req.prompts + (p.get("extra", {}).get("ar_generated_text") if isinstance(p, dict) else None) or None for p in req.prompts ] cot_text = ( [self._normalize_cot_text(t) for t in cot_text_list] if any(t is not None for t in cot_text_list) else None From bfd17b37599207c86b88e55908daea5d2c160041 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 16:23:39 +0800 Subject: [PATCH 39/43] chore: keep for-loop one-line in apply_chat_template (no spurious diff) Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../models/hunyuan_image3/hunyuan_image3_tokenizer.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py 
b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py index 5751cb4d831..751bfb21af8 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py @@ -1130,13 +1130,7 @@ def apply_chat_template( # Convert single round materials into standard message list batch_message_list = [] - for ( - prompt, - system_prompt, - cot_text, - gen_image_info, - cond_image_info_list, - ) in zip( + for prompt, system_prompt, cot_text, gen_image_info, cond_image_info_list in zip( batch_prompt, batch_system_prompt, batch_cot_text, From 1de9ec8bcd7f0376f521e4c528a0e6758a26eb05 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 16:38:50 +0800 Subject: [PATCH 40/43] test: rename test_hunyuan_image3.py to avoid pytest basename collision Collided with tests/e2e/accuracy/test_hunyuan_image3.py under pytest's default 'prepend' import mode (no __init__.py in either dir). Rename this one to make basenames unique. 
Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../{test_hunyuan_image3.py => test_hunyuan_image3_bridge.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/model_executor/stage_input_processors/{test_hunyuan_image3.py => test_hunyuan_image3_bridge.py} (100%) diff --git a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py b/tests/model_executor/stage_input_processors/test_hunyuan_image3_bridge.py similarity index 100% rename from tests/model_executor/stage_input_processors/test_hunyuan_image3.py rename to tests/model_executor/stage_input_processors/test_hunyuan_image3_bridge.py From 58ce6d86cf547aed75bf8c754f5a018153273bfb Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 22:52:11 +0800 Subject: [PATCH 41/43] fix(hunyuan_image3): mark AR stage is_comprehension=true so online IT2I keeps non-square AR shape Online /v1/images/edits collapsed AR-predicted aspects to a square (e.g. 1024x1024) while offline end2end.py honored the predicted ratio (e.g. 1216x832). Root cause is the AR stage in deploy/hunyuan_image3.yaml was marked ``is_comprehension: false`` (read literally as "this task generates an image, not text"), but ``is_comprehension`` inside vllm-omni is the tokenizer-owning AR-stage marker, not a user-visible task type. The serving path in entrypoints/openai/serving_chat.py looks up the AR stage by that flag to apply ``resolve_stop_token_ids`` (image-task stop set = ```` range). With the flag false the lookup returned None, the AR kept the YAML default ``stop_token_ids: []``, and the HunyuanImage3 custom sampler's forced-transition step `` -> `` triggered an immediate stop. The cumulative token ids never reached ````, so ``ar2diffusion._extract_ratio_index`` could not recover the AR aspect and fell back to the carried-through prompt size (1024x1024 for size=auto edits). 
Offline avoided this because end2end.py overrides the AR stage's stop_token_ids directly without going through the comprehension-stage lookup. Other models did not hit it because their AR stage already had ``is_comprehension: true`` (the field's framework-internal meaning). Fix is one line on the deploy config plus a comment explaining the flag's real semantics so the next model author does not repeat the same misread. Signed-off-by: TaffyOfficial <2324465096@qq.com> --- vllm_omni/deploy/hunyuan_image3.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index 634165cd33a..93294bcdf44 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -22,7 +22,13 @@ connectors: stages: - stage_id: 0 - is_comprehension: false + # ``is_comprehension`` in vllm-omni names the tokenizer-owning AR stage + # (see config/stage_config.py + serving_chat AR-stage lookup), independent + # of whether the AR's task is comprehension (i2t/t2t) or generation + # (it2i/t2i). HunyuanImage-3.0's stage-0 owns the tokenizer and emits the + # cot+ratio token sequence consumed by stage-1, so it must be marked True + # for the serving path to set AR seed/stop_token_ids on this stage. + is_comprehension: true final_output: true final_output_type: text max_num_seqs: 1 From be0c6840046d96cbd83e7c2ce2318e2e1fcb3a98 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Wed, 13 May 2026 23:27:32 +0800 Subject: [PATCH 42/43] chore(hunyuan_image3): drop redundant hunyuan-specific task/stop logic from serving_chat PR #3444 added 84 lines of HunyuanImage-3.0-specific handling to ``serving_chat._build_multistage_generation_inputs`` (task derivation from reference images, legacy task-enum mapping on ``bot_task``, ``MAX_IMAGES_PER_REQUEST`` cap, and an AR-stage ``stop_token_ids`` override via ``resolve_stop_token_ids``). 
The endpoint dispatch in ``api_server.py`` (``/v1/images/edits`` vs ``/v1/images/generations``) already encodes the task split, and the AR-stage stop override is redundant: ``HunyuanImage3ForCausalMM.sample`` already forces an EOS after sampling a ratio token (``hunyuan_image3.py`` generation-mode branch), so leaving the YAML default stop set empty lets the AR run through ```` and stop naturally on EOS; ``ar2diffusion._extract_ratio_index`` then reads the ratio off ``cumulative_token_ids``. The production deploy (``vllm_omni/deploy/hunyuan_image3.yaml``) already omits ``stop_token_ids`` for stage-0. Net effect on ``serving_chat.py``: +84/-19 -> +47/-19 (-37 lines). Behavior verified end-to-end on ``/v1/images/edits`` with a non-square target after removal: ``ar2diffusion`` reports ``AR ratio_idx=19, target size=1216x832`` (matches the offline ``end2end.py`` path), identical to the result with the now-removed override in place. Offline ``end2end.py`` still derives ``task`` and overrides ``stop_token_ids`` because it builds the params list directly without the endpoint-level task signal; that path is intentionally unchanged. 
Signed-off-by: TaffyOfficial <2324465096@qq.com> --- vllm_omni/entrypoints/openai/serving_chat.py | 43 ++------------------ 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 4677135cdb0..2c375fa2928 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2247,35 +2247,10 @@ def _build_multistage_generation_inputs( lora_body = extra_body.get("lora") layers = extra_body.get("layers") resolution = extra_body.get("resolution") - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - MAX_IMAGES_PER_REQUEST as _HUNYUAN3_MAX_IMAGES, - ) - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - resolve_stop_token_ids as _hunyuan3_resolve_stop_token_ids, - ) - bot_task = extra_body.get("bot_task") sys_type = extra_body.get("sys_type") custom_system_prompt = extra_body.get("system_prompt") - # Legacy callers passed task enums (it2i / t2i / it2i_think / ...) via - # bot_task. Task is now derived from reference_images presence; map - # composites to their semantic bot_task and drop bare task enums. 
- bot_task_omitted = False - if bot_task in {"it2i", "t2i", "i2t", "t2t"}: - bot_task = None - bot_task_omitted = True - elif bot_task in {"it2i_think", "it2i_recaption", "t2i_think", "t2i_recaption", "t2i_vanilla"}: - bot_task = bot_task.split("_", 1)[1] - - if reference_images and len(reference_images) > _HUNYUAN3_MAX_IMAGES: - raise ValueError( - f"HunyuanImage-3.0 IT2I accepts at most {_HUNYUAN3_MAX_IMAGES} input " - f"images per request, got {len(reference_images)}" - ) - - task = "it2i" if reference_images else "t2i" - engine_prompt_data: dict[str, Any] | None = None modalities = ["image"] if reference_images: @@ -2287,21 +2262,21 @@ def _build_multistage_generation_inputs( prompt_token_ids: list[int] | None = None system_prompt_type: str | None = None - if bot_task is not None or sys_type is not None or custom_system_prompt is not None or bot_task_omitted: + if bot_task is not None or sys_type is not None or custom_system_prompt is not None: from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( build_prompt, build_prompt_tokens, ) build_kwargs: dict[str, Any] = { - "task": task, + "task": "it2i" if reference_images else "t2i", "sys_type": sys_type, "custom_system_prompt": custom_system_prompt, "num_images": len(reference_images) if reference_images else 1, } if bot_task is not None: build_kwargs["bot_task"] = bot_task - elif "bot_task" in extra_body and not bot_task_omitted: + elif "bot_task" in extra_body: # Explicit None from the caller is plain-mode; omitted lets # each task fall back to its default trigger. build_kwargs["bot_task"] = None @@ -2381,18 +2356,6 @@ def _build_multistage_generation_inputs( extra_args["target_h"] = int(height) extra_args["target_w"] = int(width) - # Stop AR at the natural token for image tasks; mirrors - # upstream modeling_hunyuan_image_3.py:3289-3303. 
- if ( - comprehension_idx is not None - and idx == comprehension_idx - and hasattr(default_stage_params, "stop_token_ids") - ): - default_stage_params.stop_token_ids = _hunyuan3_resolve_stop_token_ids( - task=task, - bot_task=bot_task, - ) - if stage_type == "diffusion": self._set_if_supported( default_stage_params, From 161ba503d52a206a434d681d9c03d7e0632419ad Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2324465096@qq.com> Date: Thu, 14 May 2026 09:41:19 +0800 Subject: [PATCH 43/43] test(hunyuan_image3): drop legacy task-as-bot_task tests after serving_chat cleanup The serving_chat cleanup in the previous commit removed the legacy caller compatibility layer that translated ``bot_task in {"it2i", "t2i", "i2t", "t2t"}`` to ``None`` and ``bot_task in {"it2i_think", "it2i_recaption", ...}`` to the trailing ``think``/``recaption`` part. That translation existed because old callers stuffed task enums into the ``bot_task`` field; the new contract is that the endpoint dispatch (``/v1/images/edits`` vs ``/v1/images/generations``) and ``reference_images`` presence carry the task signal, and ``bot_task`` only takes the documented values (``None`` / ``recaption`` / ``think`` / ``think_recaption`` / ``vanilla``). Two tests in ``test_serving_chat_multistage_generation.py`` were explicitly pinning the now-removed legacy form (``test_..._legacy_bot_task_form_unchanged``, ``test_..._legacy_composite_tasks_still_work``); they are deleted. Three other tests passed ``bot_task="it2i"`` only to trigger the ``build_prompt`` path (the *value* did not matter, just non-None); switching them to ``bot_task="think"`` keeps the same intent against the new validator. 
Signed-off-by: TaffyOfficial <2324465096@qq.com> --- .../openai_api/test_image_server.py | 4 +- ...test_serving_chat_multistage_generation.py | 75 +------------------ 2 files changed, 4 insertions(+), 75 deletions(-) diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py index fb9c126d3fe..40adb7a9151 100644 --- a/tests/entrypoints/openai_api/test_image_server.py +++ b/tests/entrypoints/openai_api/test_image_server.py @@ -1675,7 +1675,7 @@ def test_image_edits_size_auto_preserves_bridge_size(async_omni_stage_configs_on for multi-image fusion). Cross-pins the multi-image fix at the API level: 2 reference images - with bot_task=it2i must produce 2 placeholders in the captured + with bot_task=think must produce 2 placeholders in the captured AR prompt (build_prompt called with num_images=2). """ img_a = make_test_image_bytes((32, 32)) @@ -1686,7 +1686,7 @@ def test_image_edits_size_auto_preserves_bridge_size(async_omni_stage_configs_on data={ "prompt": "fuse", "size": "auto", - "bot_task": "it2i", + "bot_task": "think", }, ) assert response.status_code == 200, response.text diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py index dd7f668611e..4b63588bae7 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py +++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py @@ -127,7 +127,7 @@ def test_build_multistage_generation_inputs_multi_image_emits_n_img_placeholders serving_chat, engine=engine, prompt="edit me", - extra_body={"bot_task": "it2i"}, + extra_body={"bot_task": "think"}, reference_images=images[:n], gen_params=OmniDiffusionSamplingParams(), ) @@ -196,7 +196,7 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]: serving_chat, engine=engine, prompt="edit me", - extra_body={"bot_task": "it2i"}, + extra_body={"bot_task": 
"think"}, reference_images=images[:n], gen_params=OmniDiffusionSamplingParams(), tokenizer=tok, @@ -218,77 +218,6 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]: assert img_count == n, f"N={n}: expected {n} token ids in prompt_token_ids, got {img_count}" -def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(serving_chat): - """Legacy callers passed bot_task="it2i" as an opt-in marker. Task is now - inferred from reference_images; legacy bot_task must still trigger the - default think mode rather than getting silently dropped. - """ - from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat - - engine = SimpleNamespace( - stage_configs=[ - SimpleNamespace(stage_type="llm", is_comprehension=True), - SimpleNamespace(stage_type="diffusion", is_comprehension=False), - ], - default_sampling_params_list=[ - SamplingParams(temperature=0.0), - OmniDiffusionSamplingParams(), - ], - ) - images = [Image.new("RGB", (32, 32), color="red"), Image.new("RGB", (32, 32), color="blue")] - - legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( - serving_chat, - engine=engine, - prompt="edit me", - extra_body={"bot_task": "it2i"}, - reference_images=images, - gen_params=OmniDiffusionSamplingParams(), - ) - assert legacy_prompt["prompt"].count("") == 2 - assert legacy_prompt["prompt"].endswith("Assistant: ") - - -@pytest.mark.parametrize( - "legacy_task,trigger", - [ - ("it2i_think", ""), - ("it2i_recaption", ""), - ], -) -def test_build_multistage_generation_inputs_legacy_composite_tasks_still_work( - serving_chat, - legacy_task: str, - trigger: str, -): - """Legacy composite task names passed through bot_task must still work.""" - from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat - - engine = SimpleNamespace( - stage_configs=[ - SimpleNamespace(stage_type="llm", is_comprehension=True), - SimpleNamespace(stage_type="diffusion", is_comprehension=False), - ], - 
default_sampling_params_list=[ - SamplingParams(temperature=0.0), - OmniDiffusionSamplingParams(), - ], - ) - images = [Image.new("RGB", (32, 32), color="red")] - - legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs( - serving_chat, - engine=engine, - prompt="edit me", - extra_body={"bot_task": legacy_task}, - reference_images=images, - gen_params=OmniDiffusionSamplingParams(), - ) - - assert legacy_prompt["prompt"].count("") == 1 - assert legacy_prompt["prompt"].endswith(f"Assistant: {trigger}") - - def test_build_multistage_generation_inputs_bot_task_semantic_changes_trigger_and_sys(serving_chat): """Passing bot_task=think_recaption (vs default "think") must flip the resolved sys_type to en_think_recaption (and trigger tag is still