From 0f2ee2d16c72fba5605aea2d6a2f48e93e7146ad Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Fri, 8 May 2026 11:22:57 +0800
Subject: [PATCH 01/43] [Feature] HunyuanImage-3.0 IT2I: support multi-image
 input
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HunyuanImage-3.0-Instruct supports up to 3 reference images for IT2I
"Multi-Image Fusion" upstream (README §200-216, §500). vllm-omni's DiT
pipeline, AR processor, OpenAI schema, and ar2diffusion bridge already
accepted list-shaped `multi_modal_data["image"]`, but four call sites
still encoded a hard "N=1" assumption that blocked real multi-image
runs. End-to-end smoke (4× L20X) on the official `input_1_0.png` +
`input_1_1.png` demo pair runs cleanly and preserves each image's
native bucket (no forced cropping of the second image).

Surgery points:

1. `vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py`:
   `build_prompt` / `build_prompt_tokens` take `num_images: int`
   (default 1, validated 1 <= N <= 3 for image-input tasks) and emit N
   consecutive `<img>` placeholders between `User: ` and the user
   prompt. Mirrors the official tokenizer where each cond_image becomes
   its own user-role message and `apply_general_template` concatenates
   successive user messages back-to-back inside one user_prefix /
   user_suffix wrap.

2. `vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py`
   `HunyuanImage3Processor.process_image`: each cond image now keeps
   its own VAE `reso_group` bucket (mirrors the official ragged
   behavior in `_encode_cond_image`). Per-image VAE pixel tensors are
   flattened to 1-D and concatenated; `_get_mm_fields_config` declares
   `vae_pixel_values` with `MultiModalFieldConfig.flat_from_sizes(...,
   vae_pixel_size)` so vLLM splits the buffer back per image at
   consumption time. Mirrors the GLM-Image / Ming-Flash-Omni pattern.
   `_parse_and_validate_image_input` reconstructs a list of per-image
   (3, H_i, W_i) tensors using `vae_token_grid_hw`; `embed_multimodal`
   loops over the list for VAE encode + patch_embed (which was already
   per-image after the encode call). VIT (Siglip2 naflex) keeps the
   `batched("image")` path since naflex pads to `max_num_patches`.

3. `vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py`
   `instantiate_timestep_tokens`: `_encode_cond_image` returns
   `cond_t` as `list[Tensor]` for the multi-image branch (one tensor
   of N_cond_images timesteps per batch item). `instantiate_vae_image_tokens`
   already had a per-batch zip loop for the list shape; this function
   was missed and called `t.reshape(-1)` directly, which does not work
   on a list. The list is now concatenated into a single flat tensor
   before the timestep embedding + `reshape(batch_size, -1, n_embd)`.
   This relies on every batch item carrying the same number of cond
   images (batch size is currently pinned to 1 by the stage runtime
   `max_batch_size`); a true per-batch loop for heterogeneous image
   counts, as in `instantiate_vae_image_tokens`, is NOT added here.

4. `examples/offline_inference/hunyuan_image3/end2end.py`:
   `--image-path` accepts comma-separated paths (matching the official
   upstream CLI); `num_images` is threaded through to the prompt
   builder.

Tests: new regression file pinning N=1/2/3 placeholder layout (string
+ token-id, FakeTokenizer for fast CPU coverage), default-N=1
byte-equivalence with legacy callers, ValueError for out-of-range N,
and three real-`AutoTokenizer.from_pretrained` cases proving N=1/2/3
produce N consecutive `<img>` token ids on the production tokenizer
path with no separator drift between successive `<img>` placeholders.

End-to-end smoke (4× L20X 143GB, AR=TP2 + DiT=TP2, 20 denoise steps,
multi-image fusion against the official demo pair):
- AR generated CoT tokens for the fused request
- DiT denoise 20/20 steps in 24s (~1.10 s/step)
- Peak GPU mem 95.52 GB reserved / 90.10 GB allocated, 5.7% pool
- Output PNG saved cleanly; second reference image's native aspect
  visible in the fusion (vs the prior shared-bucket implementation
  that forced it into the first image's square bucket).

Output-size handling for the AR/DiT ratio lifecycle is intentionally
NOT touched. The pre-existing `image_list[0]` raw-pixel fallback in
`pre_process_func` bypasses the AR's ratio-token prediction (the
`<img_ratio_X>` token sampled under `SliceVocabLogitsProcessor`);
properly wiring that into `ar2diffusion`'s width/height assignment is
a separate refactor.

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../hunyuan_image3/end2end.py                 |  34 ++-
 .../test_hunyuan_image3_it2i_multi_image.py   | 251 ++++++++++++++++++
 .../hunyuan_image3/pipeline_hunyuan_image3.py |   7 +-
 .../models/hunyuan_image3/prompt_utils.py     |  36 ++-
 .../models/hunyuan_image3/hunyuan_image3.py   | 117 +++++---
 5 files changed, 389 insertions(+), 56 deletions(-)
 create mode 100644 tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 5232568f11e..f9f734c9f4a 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -10,6 +10,7 @@
 Usage:
     python end2end.py --modality text2img --prompts "A cute cat"
     python end2end.py --modality img2img --image-path input.png --prompts "Make it snowy"
+    python end2end.py --modality img2img --image-path img1.png,img2.png --prompts "Combine"
     python end2end.py --modality img2text --image-path input.png --prompts "Describe this image"
 """
 
@@ -71,7 +72,7 @@ def parse_args():
         "--image-path",
         type=str,
         default=None,
-        help="Path to input image (for img2img/img2text).",
+        help="Input image path(s) for img2img/img2text. Comma-separated for multi-image (up to 3).",
     )
     parser.add_argument(
         "--output",
@@ -207,14 +208,19 @@ def main():
         print("[Info] No prompts provided, using default.")
         prompts = ["A cute cat"]
 
-    # Load image if needed
-    input_image = None
+    input_images: list = []
     if args.modality in ("img2img", "img2text"):
-        if not args.image_path or not os.path.exists(args.image_path):
+        if not args.image_path:
             raise ValueError(f"--image-path required for {args.modality}, got: {args.image_path}")
         from PIL import Image
 
-        input_image = Image.open(args.image_path).convert("RGB")
+        image_paths = [p.strip() for p in args.image_path.split(",") if p.strip()]
+        for p in image_paths:
+            if not os.path.exists(p):
+                raise ValueError(f"Image path does not exist: {p}")
+            input_images.append(Image.open(p).convert("RGB"))
+        if not input_images:
+            raise ValueError(f"--image-path produced no usable paths: {args.image_path!r}")
 
     # Load tokenizer for segment-wise prompt tokenization (matches HF
     # apply_chat_template byte-for-byte; see build_prompt_tokens docstring).
@@ -222,10 +228,18 @@ def main():
 
     tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
 
+    mm_image_payload = (input_images[0] if len(input_images) == 1 else input_images) if input_images else None
+
     # Format prompts
     formatted_prompts: list[OmniPromptType] = []
     for p in prompts:
-        result = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type)
+        # Only pass `num_images` for modalities that actually consume images;
+        # text-only paths ignore the parameter, but threading it
+        # unconditionally reads as if t2i needed at least one image.
+        build_kwargs: dict = {"task": task, "sys_type": args.sys_type}
+        if input_images:
+            build_kwargs["num_images"] = len(input_images)
+        result = build_prompt_tokens(p, tokenizer, **build_kwargs)
         token_ids = result.token_ids
         effective_sys_type = result.system_prompt_type
 
@@ -243,12 +257,12 @@ def main():
             prompt_dict["modalities"] = ["image"]
         elif args.modality == "img2img":
             prompt_dict["modalities"] = ["image"]
-            prompt_dict["multi_modal_data"] = {"image": input_image}
-            prompt_dict["height"] = input_image.height
-            prompt_dict["width"] = input_image.width
+            prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
+            prompt_dict["height"] = input_images[0].height
+            prompt_dict["width"] = input_images[0].width
         elif args.modality == "img2text":
             prompt_dict["modalities"] = ["text"]
-            prompt_dict["multi_modal_data"] = {"image": input_image}
+            prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
         elif args.modality == "text2text":
             prompt_dict["modalities"] = ["text"]
 
diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
new file mode 100644
index 00000000000..c8a9891385c
--- /dev/null
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
@@ -0,0 +1,251 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Multi-image input regression for HunyuanImage3 IT2I prompt construction.
+
+The official HunyuanImage-3.0-Instruct supports up to 3 reference images
+per IT2I request ("Multi-Image Fusion"; see hunyuan3.0_ins/README.md
+section 200-216 + line 500). Each cond image becomes its own user-role
+message and `apply_general_template` concatenates successive user
+messages back-to-back inside ONE user_prefix/user_suffix wrap (see
+hunyuan3.0_ins/tokenization_hunyuan_image_3.py:1399-1400, 1499-1515).
+The lightweight `<img>` + `multi_modal_data` builder used by the example
+flow must match that contract: N consecutive `<img>` placeholders sit
+between `User: ` and the user prompt, with no separator between them.
+
+This file pins:
+  1. N consecutive `<img>` placeholders for N=1/2/3 across both the
+     string builder (`build_prompt`) and the token builder
+     (`build_prompt_tokens`).
+  2. The N=1 path stays bit-identical to the legacy single-image builder
+     (regression guard so default callers don't notice).
+  3. N=2 / N=3 token sequences differ from N=1 by exactly (N-1) extra
+     `<img>` ids inserted between `User: ` and `user_prompt`.
+  4. Validation: N<1 and N>3 raise ValueError (hard cap N<=3 mirrors
+     official upstream).
+  5. Text-only tasks ignore `num_images` (no validation, no extra ids).
+"""
+
+from __future__ import annotations
+
+import os
+
+import pytest
+
+from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    MAX_IMAGES_PER_REQUEST,
+    build_prompt,
+    build_prompt_tokens,
+)
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+class FakeTokenizer:
+    """Recording fake tokenizer mirroring the one in test_prompt_utils.
+
+    Special token ids: `<|startoftext|>`=1, `<img>`=2, `<think>`=3,
+    `<recaption>`=4. encode() returns one id per character starting at
+    100, so substring-position assertions are stable.
+    """
+
+    SPECIAL = {
+        "<|startoftext|>": 1,
+        "<img>": 2,
+        "<think>": 3,
+        "<recaption>": 4,
+    }
+
+    def __init__(self) -> None:
+        self.encode_calls: list[str] = []
+
+    def convert_tokens_to_ids(self, tok: str) -> int:
+        return self.SPECIAL.get(tok, 0)
+
+    def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
+        self.encode_calls.append(text)
+        return list(range(100, 100 + len(text)))
+
+
+_IMAGE_TASKS = ("i2t", "it2i_think", "it2i_recaption")
+_TEXT_ONLY_TASKS = ("t2t",)
+
+
+# -------------------- string builder --------------------
+
+
+@pytest.mark.parametrize("task", _IMAGE_TASKS)
+@pytest.mark.parametrize("num_images", [1, 2, 3])
+def test_build_prompt_emits_N_consecutive_img_placeholders(task: str, num_images: int):
+    """N=1/2/3 -> exactly N `<img>` substrings appear consecutively
+    between `User: ` and the user prompt, with no separator between them."""
+    s = build_prompt("HELLO", task=task, num_images=num_images)
+    assert s.count("<img>") == num_images, (
+        f"task={task} num_images={num_images}: expected {num_images} <img> "
+        f"placeholders, found {s.count('<img>')} -- prompt was: {s!r}"
+    )
+
+    # All `<img>` placeholders must form one contiguous run "<img><img>..."
+    # immediately after `User: ` and before HELLO.
+    user_idx = s.index("User: ") + len("User: ")
+    hello_idx = s.index("HELLO")
+    between = s[user_idx:hello_idx]
+    assert between == "<img>" * num_images, (
+        f"region between `User: ` and prompt must be exactly N <img> placeholders; got {between!r}"
+    )
+
+
+def test_build_prompt_default_num_images_matches_legacy():
+    """num_images default = 1 must produce a string bit-identical to the
+    pre-multi-image behavior (single `<img>` placeholder)."""
+    legacy = build_prompt("HELLO", task="it2i_think")
+    explicit = build_prompt("HELLO", task="it2i_think", num_images=1)
+    assert legacy == explicit, "default num_images=1 must match legacy single-image output"
+
+
+# -------------------- token builder --------------------
+
+
+@pytest.mark.parametrize("task", _IMAGE_TASKS)
+def test_build_prompt_tokens_inserts_N_img_ids(task: str):
+    """N=1/2/3 -> `.token_ids` of the result contains exactly N copies of
+    img_id (=2) sitting consecutively after the `User: ` segment."""
+    tok = FakeTokenizer()
+    ids_n1 = build_prompt_tokens("hi", tok, task=task, num_images=1).token_ids
+    tok = FakeTokenizer()
+    ids_n2 = build_prompt_tokens("hi", tok, task=task, num_images=2).token_ids
+    tok = FakeTokenizer()
+    ids_n3 = build_prompt_tokens("hi", tok, task=task, num_images=3).token_ids
+
+    assert ids_n1.count(2) == 1
+    assert ids_n2.count(2) == 2
+    assert ids_n3.count(2) == 3
+
+    # Each additional image must extend the sequence by exactly one img_id,
+    # not shift other tokens around.
+    assert len(ids_n2) == len(ids_n1) + 1
+    assert len(ids_n3) == len(ids_n1) + 2
+
+    # The img_ids must be CONSECUTIVE (no other token between successive
+    # `<img>` placeholders -- mirrors the official `process_successive_message`
+    # wrapping where successive user messages share one user_prefix/suffix).
+    for ids, n in [(ids_n2, 2), (ids_n3, 3)]:
+        first = ids.index(2)
+        for k in range(n):
+            assert ids[first + k] == 2, (
+                f"img_ids must be consecutive starting at position {first} for n={n}; got {ids[first : first + n]!r}"
+            )
+
+
+def test_build_prompt_tokens_default_num_images_matches_legacy():
+    """num_images default = 1 must produce the same id sequence as
+    omitting the parameter (regression guard for existing single-image
+    callers)."""
+    tok_a = FakeTokenizer()
+    legacy = build_prompt_tokens("hi", tok_a, task="it2i_think")
+    tok_b = FakeTokenizer()
+    explicit = build_prompt_tokens("hi", tok_b, task="it2i_think", num_images=1)
+    assert legacy == explicit
+    # Also: encode() must have been called on the same set of segments,
+    # so segment boundaries are preserved.
+    assert tok_a.encode_calls == tok_b.encode_calls
+
+
+# -------------------- validation --------------------
+
+
+@pytest.mark.parametrize("task", _IMAGE_TASKS)
+@pytest.mark.parametrize("bad", [0, -1, MAX_IMAGES_PER_REQUEST + 1, 99])
+def test_build_prompt_rejects_out_of_range_num_images(task: str, bad: int):
+    with pytest.raises(ValueError, match="num_images must be in"):
+        build_prompt("hi", task=task, num_images=bad)
+    with pytest.raises(ValueError, match="num_images must be in"):
+        build_prompt_tokens("hi", FakeTokenizer(), task=task, num_images=bad)
+
+
+@pytest.mark.parametrize("task", _TEXT_ONLY_TASKS)
+@pytest.mark.parametrize("num_images", [0, 1, 2, 99])
+def test_text_only_tasks_ignore_num_images(task: str, num_images: int):
+    """Validation only kicks in for image-input tasks; t2t et al. accept
+    any num_images and emit zero `<img>` placeholders / img ids (=2)."""
+    s = build_prompt("hi", task=task, num_images=num_images)
+    assert "<img>" not in s
+    ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, num_images=num_images).token_ids
+    assert 2 not in ids
+
+
+# -------------------- real HF tokenizer regression --------------------
+
+_HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct"
+
+
+def _hf_cached(model_id: str) -> bool:
+    hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
+    snap_dir = os.path.join(hf_home, "hub", f"models--{model_id.replace('/', '--')}", "snapshots")
+    return os.path.isdir(snap_dir) and any(os.scandir(snap_dir))
+
+
+@pytest.mark.skipif(not _hf_cached(_HUNYUAN_MODEL_ID), reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache")
+@pytest.mark.parametrize("num_images", [1, 2, 3])
+def test_real_tokenizer_emits_n_consecutive_img_ids(num_images: int):
+    """Real `AutoTokenizer.from_pretrained(...)` (the production path) must
+    encode N=1/2/3 prompts to `.token_ids` with exactly N consecutive `<img>`
+    token-ids in the right place — proves the placeholder layout from
+    `build_prompt_tokens` survives a real BPE tokenizer, not just FakeTokenizer.
+    """
+    from transformers import AutoTokenizer
+
+    tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
+    img_id = tok.convert_tokens_to_ids("<img>")
+    assert img_id is not None and img_id >= 0, f"<img> not in tokenizer vocab; got id={img_id}"
+
+    ids = build_prompt_tokens("hi", tok, task="it2i_think", num_images=num_images).token_ids
+
+    # Exactly N copies of <img> id, all consecutive.
+    img_positions = [i for i, x in enumerate(ids) if x == img_id]
+    assert len(img_positions) == num_images, (
+        f"expected {num_images} <img> ids, got {len(img_positions)} at positions {img_positions}"
+    )
+    assert img_positions == list(range(img_positions[0], img_positions[0] + num_images)), (
+        f"<img> ids must be contiguous; got positions {img_positions}"
+    )
+
+
+@pytest.mark.skipif(not _hf_cached(_HUNYUAN_MODEL_ID), reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache")
+def test_real_tokenizer_n_plus_one_extends_by_exactly_one_img_id():
+    """Going from N to N+1 images must extend the encoded `.token_ids` by
+    exactly one extra `<img>` token-id and shift nothing else. Catches
+    accidental separator tokens between successive `<img>` placeholders
+    that a FakeTokenizer (deterministic encode) can't surface."""
+    from transformers import AutoTokenizer
+
+    tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
+    img_id = tok.convert_tokens_to_ids("<img>")
+
+    ids_n1 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=1).token_ids
+    ids_n2 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=2).token_ids
+    ids_n3 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=3).token_ids
+
+    assert len(ids_n2) == len(ids_n1) + 1, f"N=2 should be N=1 + 1 token; got {len(ids_n2)} vs {len(ids_n1)}"
+    assert len(ids_n3) == len(ids_n1) + 2, f"N=3 should be N=1 + 2 tokens; got {len(ids_n3)} vs {len(ids_n1)}"
+
+    # Insert one img_id at the existing position; everything else unchanged.
+    p1 = ids_n1.index(img_id)
+    assert ids_n2[: p1 + 1] == ids_n1[: p1 + 1], "prefix before extra <img> must match N=1"
+    assert ids_n2[p1] == img_id and ids_n2[p1 + 1] == img_id, "two consecutive <img> ids at the insertion point"
+    assert ids_n2[p1 + 2 :] == ids_n1[p1 + 1 :], "tail after the extra <img> must match N=1's tail"
+    # N=3 same pattern, three in a row.
+    assert ids_n3[p1 : p1 + 3] == [img_id, img_id, img_id]
+    assert ids_n3[p1 + 3 :] == ids_n1[p1 + 1 :]
+
+
+@pytest.mark.skipif(not _hf_cached(_HUNYUAN_MODEL_ID), reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache")
+def test_real_tokenizer_default_n1_byte_identical_to_legacy():
+    """Default `num_images=1` must produce the exact same id sequence as
+    omitting the parameter — pins the legacy single-image regression
+    against the real tokenizer (not just FakeTokenizer)."""
+    from transformers import AutoTokenizer
+
+    tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
+    legacy = build_prompt_tokens("hi", tok, task="it2i_think")
+    explicit = build_prompt_tokens("hi", tok, task="it2i_think", num_images=1)
+    assert legacy == explicit, "real tokenizer: default num_images=1 must be byte-identical to legacy"
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 1f88e9e7155..74fe268babf 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -539,7 +539,12 @@ def instantiate_timestep_tokens(
         timestep_scatter_index: BatchRaggedTensor,
     ):
         batch_size, seq_len, n_embd = x.shape
-        # batch_size x n x n_embd
+        # `_encode_cond_image` returns `t` as list[Tensor] for the
+        # multi-image branch (outer length = batch_size, currently fixed
+        # at 1 by the stage runtime `max_batch_size`); flatten to a Tensor
+        # before reshape.
+        if isinstance(t, list):
+            t = torch.cat([ti.reshape(-1) for ti in t], dim=0)
         timestep_scatter_src = self.timestep_emb(t.reshape(-1)).reshape(batch_size, -1, n_embd)
         x.scatter_(
             dim=1,
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 5d8e9af6ab8..068dad87f8b 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -72,11 +72,21 @@ def resolve_stop_token_ids(
     return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]]
 
 
+# Upstream "Multi-Image Fusion" caps reference images at 3 per request.
+MAX_IMAGES_PER_REQUEST = 3
+
+
+def _validate_num_images(num_images: int) -> None:
+    if not (1 <= num_images <= MAX_IMAGES_PER_REQUEST):
+        raise ValueError(f"num_images must be in [1, {MAX_IMAGES_PER_REQUEST}], got {num_images}")
+
+
 def build_prompt(
     user_prompt: str,
     task: str = "it2i_think",
     sys_type: str | None = None,
     custom_system_prompt: str | None = None,
+    num_images: int = 1,
 ) -> str:
     """Build a HunyuanImage-3.0 prompt as a string (legacy/compat path).
 
@@ -85,6 +95,9 @@ def build_prompt(
     tokens across segment boundaries (e.g. `。\\n\\n` -> id 3490). For
     inputs that need to match HF baseline byte-for-byte, use
     `build_prompt_tokens` instead and feed the result via prompt_token_ids.
+
+    `num_images` emits N consecutive `<img>` placeholders between
+    `User: ` and `user_prompt`. Ignored for text-only tasks.
     """
     if task not in _TASK_PRESETS:
         raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
@@ -96,6 +109,8 @@ def build_prompt(
     sys_text = system_prompt.strip() if system_prompt else ""
 
     has_image_input = task.startswith("i2t") or task.startswith("it2i")
+    if has_image_input:
+        _validate_num_images(num_images)
 
     # t2i_vanilla: pretrain mode for direct text->image generation. The
     # vanilla system prompt drives the model with no chat structure.
@@ -108,7 +123,7 @@ def build_prompt(
 
     # All other tasks (t2t / i2t / t2i_think / t2i_recaption /
     # it2i_think / it2i_recaption) use HunyuanImage3 Instruct chat template:
-    #   <|startoftext|>{system?}\n\nUser: {<img>?}{user_prompt}\n\nAssistant: {trigger?}
+    #   <|startoftext|>{system?}\n\nUser: {<img>*N?}{user_prompt}\n\nAssistant: {trigger?}
     # generation_config.json declares sequence_template="instruct", so the
     # AR prefill MUST use this template -- verified to match HF's
     # apply_chat_template output token-for-token (modulo BPE boundary merges).
@@ -121,7 +136,7 @@ def build_prompt(
         parts.append(f"{sys_text}\n\n")
     parts.append("User: ")
     if has_image_input:
-        parts.append("<img>")
+        parts.extend(["<img>"] * num_images)
     parts.append(user_prompt)
     parts.append("\n\nAssistant: ")
     if trigger_tag:
@@ -142,6 +157,7 @@ def build_prompt_tokens(
     task: str = "it2i_think",
     sys_type: str | None = None,
     custom_system_prompt: str | None = None,
+    num_images: int = 1,
 ) -> PromptTokensResult:
     """Segment-by-segment tokenization that matches HF apply_chat_template.
 
@@ -155,6 +171,8 @@ def build_prompt_tokens(
 
     Returns:
         PromptTokensResult
+
+    `num_images` inserts N `<img>` token ids; see `build_prompt`.
     """
     if task not in _TASK_PRESETS:
         raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
@@ -167,6 +185,8 @@ def build_prompt_tokens(
     trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None
 
     has_image_input = task.startswith("i2t") or task.startswith("it2i")
+    if has_image_input:
+        _validate_num_images(num_images)
 
     # t2i_vanilla uses pretrain template with no chat structure; the vanilla
     # system prompt drives the model directly. No segment boundaries to
@@ -190,7 +210,7 @@ def build_prompt_tokens(
         ids += tokenizer.encode("\n\n", add_special_tokens=False)
     ids += tokenizer.encode("User: ", add_special_tokens=False)
     if has_image_input:
-        ids += [img_id]
+        ids += [img_id] * num_images
     ids += tokenizer.encode(user_prompt, add_special_tokens=False)
     ids += tokenizer.encode("\n\nAssistant: ", add_special_tokens=False)
     if trig_id is not None:
@@ -202,4 +222,12 @@ def build_prompt_tokens(
     )
 
 
-__all__ = ["build_prompt", "build_prompt_tokens", "resolve_stop_token_ids", _TASK_PRESETS]
+__all__ = [
+    "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS",
+    "MAX_IMAGES_PER_REQUEST",
+    "_TASK_PRESETS",
+    "available_tasks",
+    "build_prompt",
+    "build_prompt_tokens",
+    "resolve_stop_token_ids",
+]
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 1e057a71efa..e9d41ebf958 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -860,6 +860,13 @@ def process_image(self, image_input: ImageInput):
         else:
             raise TypeError(f"Unsupported image type: {type(image_input)}.")
 
+        # Each cond image keeps its own VAE bucket (mirrors official HF's
+        # ragged behavior in `_encode_cond_image`). VAE pixel tensors have
+        # different (H_i, W_i) per image, so they're flattened to 1-D and
+        # concatenated; vLLM `flat_from_sizes("image", vae_pixel_size)` slices
+        # them back per-image at consumption time. VIT (Siglip2 naflex) pads
+        # to `max_num_patches` so VIT fields keep the existing `batched`
+        # stack path.
         batch_data = []
         for image in images:
             current_info = {}
@@ -883,42 +890,49 @@ def process_image(self, image_input: ImageInput):
                 _ss = torch.tensor(_ss, dtype=torch.long)
             current_info["vit_spatial_shapes"] = _ss.squeeze(0)
 
-            # VAE processing.
-            # The resize/crop math here mirrors HF's `resize_and_crop` with
-            # crop_type="center" (hunyuan3.0_ins/image_processor.py:61). VAE
-            # normalize uses the same transforms.Compose([ToTensor,
-            # Normalize([0.5], [0.5])]) as HF's `pil_image_to_tensor`. So
-            # numerical output of this branch should match HF up to floating-
-            # point reduction order.
+            # VAE: per-image bucket via `reso_group.get_target_size`; mirrors
+            # HF's `resize_and_crop` (crop_type="center"). Keep fp32 — the
+            # VAE encoder casts to model dtype at its boundary (see
+            # `_vae_encode`).
             image_width, image_height = self.reso_group.get_target_size(image.width, image.height)
             resized_image = self._resize_and_crop(image, (image_width, image_height))
-            vae_pixel_values = self.vae_processor(resized_image)
+            vae_pixel_values = self.vae_processor(resized_image).squeeze(0)
             token_height = image_height // (self.hf_config.vae_downsample_factor[0] * self.hf_config.patch_size)
             token_width = image_width // (self.hf_config.vae_downsample_factor[1] * self.hf_config.patch_size)
-            # Keep fp32 — the VAE encoder casts to model dtype at its boundary
-            # (see _vae_encode). Casting to bf16 here costs ~7e-4 mean-abs-diff
-            # bf16 quantization error on every pixel vs HF (which keeps fp32
-            # in build_cond_images), measurable as a real numerical drift in
-            # downstream image embeddings.
-            current_info["vae_pixel_values"] = vae_pixel_values.squeeze(0)
+
+            current_info["vae_pixel_values_flat"] = vae_pixel_values.reshape(-1)
+            current_info["vae_pixel_size"] = torch.tensor(vae_pixel_values.numel(), dtype=torch.long)
             current_info["vae_token_grid_hw"] = torch.tensor([token_height, token_width])
 
-            # size
             base_size, ratio_index = self.reso_group.get_base_size_and_ratio_index(image_width, image_height)
             current_info["base_size"] = torch.tensor(base_size)
             current_info["ratio_index"] = torch.tensor(ratio_index)
 
             batch_data.append(current_info)
 
-        # Stack the tensors in the list into a batch dimension (B, ...)
-        final_image_info = {}
-        if len(batch_data) > 0:
-            for key in batch_data[0].keys():
-                final_image_info[key] = torch.stack([d[key] for d in batch_data], dim=0)
+        final_image_info: dict[str, torch.Tensor] = {}
+        if not batch_data:
+            return final_image_info
+
+        # Same-shape fields: stack along a new image-batch dim as before.
+        same_shape_keys = [
+            "vit_pixel_values",
+            "vit_pixel_attention_mask",
+            "vit_spatial_shapes",
+            "vae_token_grid_hw",
+            "vae_pixel_size",
+            "base_size",
+            "ratio_index",
+        ]
+        for key in same_shape_keys:
+            final_image_info[key] = torch.stack([d[key] for d in batch_data], dim=0)
+
+        # Variable-shape VAE pixels: 1-D concat across images (paired with
+        # `vae_pixel_size` via `flat_from_sizes` in `_get_mm_fields_config`).
+        final_image_info["vae_pixel_values"] = torch.cat([d["vae_pixel_values_flat"] for d in batch_data], dim=0)
 
-        if final_image_info:
-            shapes_info = {k: tuple(v.shape) for k, v in final_image_info.items()}
-            logger.info(f"Successfully processed {len(images)} image(s). Final tensor shapes: {shapes_info}")
+        shapes_info = {k: tuple(v.shape) for k, v in final_image_info.items()}
+        logger.info(f"Successfully processed {len(images)} image(s). Final tensor shapes: {shapes_info}")
 
         return final_image_info
 
@@ -1030,8 +1044,13 @@ def _get_mm_fields_config(
             config["vit_pixel_attention_mask"] = MultiModalFieldConfig.batched("image")
         if "vit_spatial_shapes" in hf_inputs:
             config["vit_spatial_shapes"] = MultiModalFieldConfig.batched("image")
-        if "vae_pixel_values" in hf_inputs:
-            config["vae_pixel_values"] = MultiModalFieldConfig.batched("image")
+        # `vae_pixel_values` is a 1-D concatenation of variable-shape per-image
+        # VAE tensors (see `process_image`). `vae_pixel_size` carries the
+        # per-image flat length so vLLM can split the buffer back per image.
+        if "vae_pixel_values" in hf_inputs and "vae_pixel_size" in hf_inputs:
+            config["vae_pixel_values"] = MultiModalFieldConfig.flat_from_sizes("image", hf_inputs["vae_pixel_size"])
+        if "vae_pixel_size" in hf_inputs:
+            config["vae_pixel_size"] = MultiModalFieldConfig.batched("image")
         if "vae_token_grid_hw" in hf_inputs:
             config["vae_token_grid_hw"] = MultiModalFieldConfig.batched("image")
         if "base_size" in hf_inputs:
@@ -1668,6 +1687,9 @@ def _parse_and_validate_image_input(
         vit_pixel_attention_mask = kwargs.pop("vit_pixel_attention_mask", None)
         vit_spatial_shapes = kwargs.pop("vit_spatial_shapes", None)
         vae_pixel_values = kwargs.pop("vae_pixel_values", None)
+        # vae_pixel_size is only metadata for vLLM's flat_from_sizes split;
+        # we reconstruct per-image shapes from vae_token_grid_hw below.
+        kwargs.pop("vae_pixel_size", None)
         vae_token_grid_hw = kwargs.pop("vae_token_grid_hw", None)
 
         if vit_pixel_values is None or vae_pixel_values is None:
@@ -1677,13 +1699,36 @@ def _parse_and_validate_image_input(
         if vit_pixel_values.numel() == 0 or vae_pixel_values.numel() == 0:
             return None
 
+        # `vae_pixel_values` arrives as a 1-D concatenation of per-image flat
+        # buffers (see `process_image` + `flat_from_sizes`). Reconstruct a
+        # list of per-image (3, H_i, W_i) tensors using the per-image grid
+        # dims so the downstream VAE encoder can run image-by-image.
+        vae_factor_h = self.config.vae_downsample_factor[0] * self.config.patch_size
+        vae_factor_w = self.config.vae_downsample_factor[1] * self.config.patch_size
+        num_images = vae_token_grid_hw.shape[0]
+        vae_image_list: list[torch.Tensor] = []
+        offset = 0
+        flat = vae_pixel_values.reshape(-1)
+        for i in range(num_images):
+            token_h, token_w = vae_token_grid_hw[i].tolist()
+            h_i = int(token_h) * vae_factor_h
+            w_i = int(token_w) * vae_factor_w
+            n_i = 3 * h_i * w_i
+            vae_image_list.append(flat[offset : offset + n_i].reshape(3, h_i, w_i))
+            offset += n_i
+        if offset != flat.numel():
+            raise ValueError(
+                f"vae_pixel_values size mismatch: consumed {offset} of {flat.numel()} elements "
+                f"across {num_images} images (token_grid_hw={vae_token_grid_hw.tolist()})"
+            )
+
         return HunyuanImage3PixelInputs(
             type="pixel_values",
             pixel_values={
                 "vit_pixel_values": vit_pixel_values,
                 "vit_pixel_attention_mask": vit_pixel_attention_mask,
                 "vit_spatial_shapes": vit_spatial_shapes,
-                "vae_pixel_values": vae_pixel_values,
+                "vae_pixel_values": vae_image_list,
                 "vae_token_grid_hw": vae_token_grid_hw,
             },
         )
@@ -1795,22 +1840,12 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         # Perform ViT encoding
         vit_embeddings = self._vit_encode(vit_pixel_values, vit_pixel_attention_mask, vit_spatial_shapes)
 
-        # Perform VAE encoding
-        t, latents = self._vae_encode(vae_pixel_values, vae_cfg_factor)
-
-        # Process VAE latents through patch_embed to convert to token embeddings
-        # VAE latents are in (B, C, H, W) format, need to be converted to (B, seq_len, hidden_size)
+        # VAE encode + patch_embed per image — each cond image is at its own
+        # `reso_group` bucket so shapes are ragged across the image-batch dim.
         vae_token_embeddings = []
-        batch_size = latents.shape[0]
-        for i in range(batch_size):
-            t_i = t[i]
-            latents_i = latents[i : i + 1]  # Shape: (1, C, H, W)
-
-            # Time embedding for VAE processing
-            t_emb = self.time_embed(t_i)
-
-            # Process VAE latent through patch_embed
-            # Input: (1, C, H, W) -> Output: (1, seq_len, hidden_size)
+        for vae_image_i in vae_pixel_values:
+            t_i, latents_i = self._vae_encode(vae_image_i.unsqueeze(0), vae_cfg_factor)
+            t_emb = self.time_embed(t_i[0])
             vae_tokens, _, _ = self.patch_embed(latents_i, t_emb)
             vae_token_embeddings.append(vae_tokens)
 

From 46b3b84091954588861edbcc62a9638ec5f4cb67 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Fri, 8 May 2026 21:55:21 +0800
Subject: [PATCH 02/43] [Refactor] HunyuanImage-3.0 prompt_utils: split task
 and bot_task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace conflated `task` strings (`it2i_think`, `t2i_recaption`,
`t2i_vanilla`, ...) with two orthogonal axes:

  task     ∈ {t2t, i2t, it2i, t2i}
           controls only `<img>` placeholder emission.

  bot_task ∈ {None, think, recaption, think_recaption, vanilla}
           controls system prompt + trigger tag.

Mapping:
  bot_task=None             → en_unified            no trigger
  bot_task=think            → en_unified            <think>
  bot_task=recaption        → en_unified            <recaption>
  bot_task=think_recaption  → en_think_recaption    <think>
  bot_task=vanilla          → en_vanilla            no trigger, no chat
                              (only valid with task='t2i')

The pre-existing `_TASK_PRESETS` carried a `bot_task` field that was
dead code under all paths actually exercised (`sys_type='en_unified' /
'en_vanilla'`); only `sys_type='dynamic'` consumed it, and nothing in
the repo ever set that. The refactor promotes `bot_task` to the
user-facing API and drops the `task` × mode conflation, also exposing
the previously-unreachable `en_think_recaption` system prompt.

Public helpers `available_bot_tasks()` and `resolve_sys_type(bot_task)`
let callers derive the default sys_type without re-encoding the table.

Side fix on `build_prompt`: the legacy code stripped the system
prompt's leading whitespace while `build_prompt_tokens` did not. This
was invisible while every system prompt was `unified_system_prompt_en`
(no leading newline) but would diverge byte-wise once
`bot_task='think_recaption'` exposes `en_think_recaption` (which
starts with `\n`). `build_prompt` now keeps the system prompt verbatim,
matching the segment-by-segment tokenization path and HF's
`apply_chat_template`.

end2end.py: `--bot-task` choices are now {none, think, recaption,
think_recaption, vanilla}. The literal `none` is the explicit way to
request `bot_task=None` on a modality whose default is `think`
(text2img / img2img); leaving --bot-task unset still falls back to the
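modality default. The example script no longer imports the private
`_TASK_PRESETS` table or keeps its own per-modality `valid_bot_tasks`
list; the effective sys_type is `--sys-type` if given, otherwise
`resolve_sys_type(bot_task)`.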

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../hunyuan_image3/README.md                  |  26 +-
 .../hunyuan_image3/end2end.py                 |  97 ++-----
 .../test_hunyuan_image3_it2i_multi_image.py   |  62 +++--
 .../hunyuan_image3/test_prompt_utils.py       | 252 ++++++------------
 .../models/hunyuan_image3/prompt_utils.py     | 186 ++++++++-----
 5 files changed, 254 insertions(+), 369 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md
index 6db4cbec9ed..8b90e6b7fa3 100644
--- a/examples/offline_inference/hunyuan_image3/README.md
+++ b/examples/offline_inference/hunyuan_image3/README.md
@@ -112,6 +112,7 @@ python end2end.py --modality text2img \
                   --additional-config '{"torchair_graph_config":{"enabled":true}}'
 ```
 
+
 ## Key Arguments
 
 | Argument | Description |
@@ -123,16 +124,15 @@ python end2end.py --modality text2img \
 | `--steps` | Number of diffusion inference steps for image generation. |
 | `--guidance-scale` | Classifier-free guidance scale for image generation. |
 | `--height`, `--width` | Output image size for `text2img`. |
-| `--bot-task` | Prompt behavior. `auto` selects the default from `--modality`; `think` adds `<think>`; `recaption` adds `<recaption>`; `vanilla` uses the text-to-image pretrain template. |
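+| `--bot-task` | Override the prompt mode: `none`, `think`, `recaption`, `think_recaption`, or `vanilla` (`vanilla` is only valid with `text2img`). When omitted, the default is chosen from `--modality`. |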
 | `--sys-type` | Override the system prompt type, for example `en_unified` or `en_vanilla`. |
 | `--vae-use-tiling` | Enable VAE tiling for memory reduction. |
 
 ## Notes
 
-- `hunyuan_image3_ar.yaml` is a 4-card AR-only text/comprehension deploy. It sets `engine_output_type: text`, `final_output_type: text`, and text sampling defaults.
-- `hunyuan_image3_dit.yaml` is a single-stage DiT deploy with `stage_id: 0`; it does not require stage 1 or a running AR stage.
+- `hunyuan_image3_ar.yaml` is a 4-card AR-only text/comprehension deploy.
+- `hunyuan_image3_dit.yaml` is a single-stage DiT deploy with `stage_id: 0`.
 - The old HunyuanImage3 YAMLs under `model_executor/stage_configs/` and `platforms/*/stage_configs/` have been folded into the deploy YAMLs.
-- This PR does not keep the HunyuanImage3 AR-to-DiT KV reuse wiring. The deploy YAMLs describe the topology and platform settings only.
 
 ## Prompt Format
 
@@ -148,22 +148,8 @@ Assistant: {trigger_tag?}
 
 - `<img>`: Placeholder for each input image (single token; expanded by the multimodal pipeline).
 - Trigger tags: `<think>` for CoT and `<recaption>` for recaptioning, placed after `Assistant: `.
-- System prompt: Auto-selected based on task.
-- `t2i_vanilla` is the only task that uses the bare pretrain template without chat structure.
-- The example composes the internal prompt task from `--modality` and `--bot-task`
-  before calling `prompt_utils`; for example, `img2text + think` becomes
-  `i2t_think` for prompt and stop-token lookup.
+- System prompt: Auto-selected from `bot_task`; `task` only controls `<img>` placeholder emission.
+- `bot_task='vanilla'` with `task='t2i'` uses the bare pretrain template.
 
 The shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils.build_prompt_tokens()`
 helper handles segment-by-segment tokenization and matches HF `apply_chat_template`.
-
-## FAQ
-
-- **OOM errors**: Decrease `gpu_memory_utilization` in the deploy YAML, use a smaller `max_num_batched_tokens`, or enable VAE tiling with `--vae-use-tiling`.
-- **Custom image sizes**: Use `--height` and `--width` flags (multiples of 16 recommended).
-
-| Stage | VRAM (approx) |
-| :--- | :--- |
-| Stage 0 (AR) | ~15 GiB + KV Cache |
-| Stage 1 (DiT) | ~30 GiB |
-| Total (8-GPU) | ~45 GiB + KV Cache |
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index f9f734c9f4a..9d8f5113201 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -2,16 +2,10 @@
 HunyuanImage-3.0-Instruct unified end-to-end inference script.
 
 Supports all modalities through a single entry point:
-  - text2img:  Text → AR → DiT → Image
-  - img2img:   Text+Image → AR → DiT → Edited Image (IT2I)
-  - img2text:  Image+Text → AR → Text description (I2T)
-  - text2text: Text → AR → Text (comprehension, no image)
-
-Usage:
-    python end2end.py --modality text2img --prompts "A cute cat"
-    python end2end.py --modality img2img --image-path input.png --prompts "Make it snowy"
-    python end2end.py --modality img2img --image-path img1.png,img2.png --prompts "Combine"
-    python end2end.py --modality img2text --image-path input.png --prompts "Describe this image"
+  - text2img:  Text -> AR -> DiT -> Image
+  - img2img:   Text+Image -> AR -> DiT -> Edited Image (IT2I)
+  - img2text:  Image+Text -> AR -> Text description (I2T)
+  - text2text: Text -> AR -> Text (comprehension, no image)
 """
 
 import argparse
@@ -20,9 +14,9 @@
 from pathlib import Path
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-    _TASK_PRESETS,
     build_prompt_tokens,
     resolve_stop_token_ids,
+    resolve_sys_type,
 )
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniPromptType
@@ -46,11 +40,12 @@
     "text2text": "text-to-text",
 }
 
-_MODALITY_TASK_MAP = {
-    "text2img": "t2i",
-    "img2img": "it2i",
-    "img2text": "i2t",
-    "text2text": "t2t",
+# Modality -> (task, default bot_task) mapping.
+_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
+    "text2img": ("t2i", "think"),
+    "img2img": ("it2i", "think"),
+    "img2text": ("i2t", None),
+    "text2text": ("t2t", None),
 }
 
 
@@ -81,7 +76,6 @@ def parse_args():
         help="Output directory to save results.",
     )
 
-    # Generation parameters
     parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.")
     parser.add_argument("--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale.")
     parser.add_argument("--seed", type=int, default=42, help="Random seed.")
@@ -93,17 +87,12 @@ def parse_args():
         help="Enable VAE tiling for memory optimization.",
     )
 
-    # Prompt configuration
     parser.add_argument(
         "--bot-task",
         type=str,
-        default="auto",
-        choices=["auto", "think", "recaption", "think_recaption", "vanilla"],
-        help=(
-            "Prompt behavior. 'auto' selects the default for the modality; "
-            "'think' adds <think>; 'recaption' adds <recaption>; "
-            "'vanilla' uses the t2i pretrain template."
-        ),
+        default=None,
+        choices=["none", "think", "recaption", "think_recaption", "vanilla"],
+        help="Override prompt mode. Default: auto from --modality.",
     )
     parser.add_argument(
         "--sys-type",
@@ -112,7 +101,6 @@ def parse_args():
         help="Override system prompt type (e.g. en_unified, en_vanilla).",
     )
 
-    # Omni init args
     parser.add_argument("--deploy-config", type=str, default=None, help="Custom deploy YAML path.")
     parser.add_argument("--stage-configs-path", type=str, default=None, help="Custom legacy stage config YAML path.")
     parser.add_argument("--log-stats", action="store_true", default=False)
@@ -158,22 +146,13 @@ def main():
     os.makedirs(args.output, exist_ok=True)
     additional_config = parse_additional_config(args.additional_config)
 
-    # Determine task for prompt formatting from modality + bot behavior.
-    task = _MODALITY_TASK_MAP[args.modality]
-    assert task is not None
-    bot_task = args.bot_task
-    if bot_task != "auto":
-        task = task + "_" + bot_task
-    if task not in _TASK_PRESETS:
-        valid_bot_tasks = {
-            "text2img": ["think", "recaption", "vanilla"],
-            "img2img": ["think", "recaption", "think_recaption"],
-            "img2text": ["auto"],
-            "text2text": ["auto"],
-        }[args.modality]
-        raise ValueError(
-            f"--bot-task {bot_task!r} is not supported for {args.modality}. Choose from: {valid_bot_tasks}"
-        )
+    task, default_bot_task = _MODALITY_TASK_MAP[args.modality]
+    if args.bot_task is None:
+        bot_task: str | None = default_bot_task
+    elif args.bot_task == "none":
+        bot_task = None
+    else:
+        bot_task = args.bot_task
 
     if args.deploy_config is not None and args.stage_configs_path is not None:
         raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.")
@@ -183,7 +162,6 @@ def main():
     if deploy_config is None and stage_configs_path is None:
         deploy_config = _MODALITY_DEFAULT_DEPLOY_CONFIG[args.modality]
 
-    # Build Omni
     omni_kwargs = {
         "model": args.model,
         "vae_use_tiling": args.vae_use_tiling,
@@ -202,10 +180,8 @@ def main():
 
     omni = Omni(**omni_kwargs)
 
-    # Prepare prompts
     prompts = args.prompts or ["A cute cat"]
     if not prompts:
-        print("[Info] No prompts provided, using default.")
         prompts = ["A cute cat"]
 
     input_images: list = []
@@ -222,34 +198,23 @@ def main():
         if not input_images:
             raise ValueError(f"--image-path produced no usable paths: {args.image_path!r}")
 
-    # Load tokenizer for segment-wise prompt tokenization (matches HF
-    # apply_chat_template byte-for-byte; see build_prompt_tokens docstring).
     from transformers import AutoTokenizer
 
     tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
-
     mm_image_payload = (input_images[0] if len(input_images) == 1 else input_images) if input_images else None
 
-    # Format prompts
     formatted_prompts: list[OmniPromptType] = []
-    for p in prompts:
-        # Only pass `num_images` for modalities that actually consume images;
-        # text-only paths ignore the parameter, but threading it
-        # unconditionally reads as if t2i needed at least one image.
-        build_kwargs: dict = {"task": task, "sys_type": args.sys_type}
+    for prompt in prompts:
+        build_kwargs: dict = {"task": task, "bot_task": bot_task, "sys_type": args.sys_type}
         if input_images:
             build_kwargs["num_images"] = len(input_images)
-        result = build_prompt_tokens(p, tokenizer, **build_kwargs)
+        result = build_prompt_tokens(prompt, tokenizer, **build_kwargs)
         token_ids = result.token_ids
-        effective_sys_type = result.system_prompt_type
+        effective_sys_type = args.sys_type or resolve_sys_type(bot_task)
 
-        # `prompt_token_ids` drives the AR stage (matches HF byte-for-byte).
-        # `prompt` and `use_system_prompt` are forwarded by ar2diffusion to
-        # the DiT stage so the diffusion pipeline can rebuild the same
-        # system prefix when constructing its model inputs.
         prompt_dict: dict = {
             "prompt_token_ids": token_ids,
-            "prompt": p,
+            "prompt": prompt,
             "use_system_prompt": effective_sys_type,
         }
 
@@ -268,14 +233,11 @@ def main():
 
         formatted_prompts.append(prompt_dict)
 
-    # Build sampling params from defaults
     params_list = list(omni.default_sampling_params_list)
 
-    # Override diffusion params if applicable
     from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
     ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer)
-    assert ar_stop_token_ids is not None
     for sp in params_list:
         if isinstance(sp, OmniDiffusionSamplingParams):
             sp.num_inference_steps = args.steps
@@ -283,13 +245,12 @@ def main():
             sp.guidance_scale_provided = True
             if args.seed is not None:
                 sp.seed = args.seed
-            if args.modality in ("text2img",):
+            if args.modality == "text2img":
                 sp.height = args.height
                 sp.width = args.width
         elif hasattr(sp, "stop_token_ids"):
             sp.stop_token_ids = ar_stop_token_ids
 
-    # Print configuration
     print(f"\n{'=' * 60}")
     print("HunyuanImage-3.0 Generation Configuration:")
     print(f"  Model: {args.model}")
@@ -314,13 +275,10 @@ def main():
     print(f"  Prompts: {prompts}")
     print(f"{'=' * 60}\n")
 
-    # Generate
     omni_outputs = list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list))
 
-    # Process outputs
     img_idx = 0
     for req_output in omni_outputs:
-        # Text output (AR stage or text-only)
         ro = getattr(req_output, "request_output", None)
         txt = ""
         if ro and getattr(ro, "outputs", None):
@@ -334,7 +292,6 @@ def main():
         if txt:
             print(f"[Output] Text:\n{txt}")
 
-        # Image output (DiT stage)
         images = getattr(req_output, "images", None)
         if not images and ro and hasattr(ro, "images"):
             images = ro.images
diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
index c8a9891385c..7a1e266b936 100644
--- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
@@ -66,21 +66,25 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
         return list(range(100, 100 + len(text)))
 
 
-_IMAGE_TASKS = ("i2t", "it2i_think", "it2i_recaption")
-_TEXT_ONLY_TASKS = ("t2t",)
+_IMAGE_TASK_COMBOS = (
+    ("i2t", None),
+    ("it2i", "think"),
+    ("it2i", "recaption"),
+)
+_TEXT_ONLY_TASK_COMBOS = (("t2t", None),)
 
 
 # -------------------- string builder --------------------
 
 
-@pytest.mark.parametrize("task", _IMAGE_TASKS)
+@pytest.mark.parametrize("task,bot_task", _IMAGE_TASK_COMBOS)
 @pytest.mark.parametrize("num_images", [1, 2, 3])
-def test_build_prompt_emits_N_consecutive_img_placeholders(task: str, num_images: int):
+def test_build_prompt_emits_N_consecutive_img_placeholders(task: str, bot_task: str | None, num_images: int):
     """N=1/2/3 -> exactly N `<img>` substrings appear consecutively
     between `User: ` and the user prompt, with no separator between them."""
-    s = build_prompt("HELLO", task=task, num_images=num_images)
+    s = build_prompt("HELLO", task=task, bot_task=bot_task, num_images=num_images)
     assert s.count("<img>") == num_images, (
-        f"task={task} num_images={num_images}: expected {num_images} <img> "
+        f"task={task} bot_task={bot_task} num_images={num_images}: expected {num_images} <img> "
         f"placeholders, found {s.count('<img>')} -- prompt was: {s!r}"
     )
 
@@ -97,24 +101,24 @@ def test_build_prompt_emits_N_consecutive_img_placeholders(task: str, num_images
 def test_build_prompt_default_num_images_matches_legacy():
     """num_images default = 1 must produce a string bit-identical to the
     pre-multi-image behavior (single `<img>` placeholder)."""
-    legacy = build_prompt("HELLO", task="it2i_think")
-    explicit = build_prompt("HELLO", task="it2i_think", num_images=1)
+    legacy = build_prompt("HELLO", task="it2i", bot_task="think")
+    explicit = build_prompt("HELLO", task="it2i", bot_task="think", num_images=1)
     assert legacy == explicit, "default num_images=1 must match legacy single-image output"
 
 
 # -------------------- token builder --------------------
 
 
-@pytest.mark.parametrize("task", _IMAGE_TASKS)
-def test_build_prompt_tokens_inserts_N_img_ids(task: str):
+@pytest.mark.parametrize("task,bot_task", _IMAGE_TASK_COMBOS)
+def test_build_prompt_tokens_inserts_N_img_ids(task: str, bot_task: str | None):
     """N=1/2/3 -> the resulting id sequence contains exactly N copies of
     img_id (=2) sitting consecutively after the `User: ` segment."""
     tok = FakeTokenizer()
-    ids_n1 = build_prompt_tokens("hi", tok, task=task, num_images=1)
+    ids_n1 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=1)
     tok = FakeTokenizer()
-    ids_n2 = build_prompt_tokens("hi", tok, task=task, num_images=2)
+    ids_n2 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=2)
     tok = FakeTokenizer()
-    ids_n3 = build_prompt_tokens("hi", tok, task=task, num_images=3)
+    ids_n3 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=3)
 
     assert ids_n1.count(2) == 1
     assert ids_n2.count(2) == 2
@@ -141,9 +145,9 @@ def test_build_prompt_tokens_default_num_images_matches_legacy():
     omitting the parameter (regression guard for existing single-image
     callers)."""
     tok_a = FakeTokenizer()
-    legacy = build_prompt_tokens("hi", tok_a, task="it2i_think")
+    legacy = build_prompt_tokens("hi", tok_a, task="it2i", bot_task="think")
     tok_b = FakeTokenizer()
-    explicit = build_prompt_tokens("hi", tok_b, task="it2i_think", num_images=1)
+    explicit = build_prompt_tokens("hi", tok_b, task="it2i", bot_task="think", num_images=1)
     assert legacy == explicit
     # Also: encode() must have been called on the same set of segments,
     # so segment boundaries are preserved.
@@ -153,23 +157,23 @@ def test_build_prompt_tokens_default_num_images_matches_legacy():
 # -------------------- validation --------------------
 
 
-@pytest.mark.parametrize("task", _IMAGE_TASKS)
+@pytest.mark.parametrize("task,bot_task", _IMAGE_TASK_COMBOS)
 @pytest.mark.parametrize("bad", [0, -1, MAX_IMAGES_PER_REQUEST + 1, 99])
-def test_build_prompt_rejects_out_of_range_num_images(task: str, bad: int):
+def test_build_prompt_rejects_out_of_range_num_images(task: str, bot_task: str | None, bad: int):
     with pytest.raises(ValueError, match="num_images must be in"):
-        build_prompt("hi", task=task, num_images=bad)
+        build_prompt("hi", task=task, bot_task=bot_task, num_images=bad)
     with pytest.raises(ValueError, match="num_images must be in"):
-        build_prompt_tokens("hi", FakeTokenizer(), task=task, num_images=bad)
+        build_prompt_tokens("hi", FakeTokenizer(), task=task, bot_task=bot_task, num_images=bad)
 
 
-@pytest.mark.parametrize("task", _TEXT_ONLY_TASKS)
+@pytest.mark.parametrize("task,bot_task", _TEXT_ONLY_TASK_COMBOS)
 @pytest.mark.parametrize("num_images", [0, 1, 2, 99])
-def test_text_only_tasks_ignore_num_images(task: str, num_images: int):
+def test_text_only_tasks_ignore_num_images(task: str, bot_task: str | None, num_images: int):
     """Validation only kicks in for image-input tasks; t2t et al. accept
     any num_images and emit zero `<img>` placeholders."""
-    s = build_prompt("hi", task=task, num_images=num_images)
+    s = build_prompt("hi", task=task, bot_task=bot_task, num_images=num_images)
     assert "<img>" not in s
-    ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, num_images=num_images)
+    ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, bot_task=bot_task, num_images=num_images)
     assert 2 not in ids
 
 
@@ -198,7 +202,7 @@ def test_real_tokenizer_emits_n_consecutive_img_ids(num_images: int):
     img_id = tok.convert_tokens_to_ids("<img>")
     assert img_id is not None and img_id >= 0, f"<img> not in tokenizer vocab; got id={img_id}"
 
-    ids = build_prompt_tokens("hi", tok, task="it2i_think", num_images=num_images)
+    ids = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=num_images)
 
     # Exactly N copies of <img> id, all consecutive.
     img_positions = [i for i, x in enumerate(ids) if x == img_id]
@@ -221,9 +225,9 @@ def test_real_tokenizer_n_plus_one_extends_by_exactly_one_img_id():
     tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
     img_id = tok.convert_tokens_to_ids("<img>")
 
-    ids_n1 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=1)
-    ids_n2 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=2)
-    ids_n3 = build_prompt_tokens("hi", tok, task="it2i_think", num_images=3)
+    ids_n1 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1)
+    ids_n2 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=2)
+    ids_n3 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=3)
 
     assert len(ids_n2) == len(ids_n1) + 1, f"N=2 should be N=1 + 1 token; got {len(ids_n2)} vs {len(ids_n1)}"
     assert len(ids_n3) == len(ids_n1) + 2, f"N=3 should be N=1 + 2 tokens; got {len(ids_n3)} vs {len(ids_n1)}"
@@ -246,6 +250,6 @@ def test_real_tokenizer_default_n1_byte_identical_to_legacy():
     from transformers import AutoTokenizer
 
     tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
-    legacy = build_prompt_tokens("hi", tok, task="it2i_think")
-    explicit = build_prompt_tokens("hi", tok, task="it2i_think", num_images=1)
+    legacy = build_prompt_tokens("hi", tok, task="it2i", bot_task="think")
+    explicit = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1)
     assert legacy == explicit, "real tokenizer: default num_images=1 must be byte-identical to legacy"
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 1130c0f6db1..4d98bc5dcf2 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -1,20 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Regression tests for HunyuanImage3 prompt construction (PR #3243).
-
-Two layers:
-  1. Pure-logic tests with a recording fake tokenizer -- protect the
-     prompt template structure (BOS, User:/Assistant: framing, trigger
-     placement, image placeholder position) and protect the segment-
-     by-segment tokenization contract (each segment must hit
-     `tokenizer.encode` in isolation).
-  2. Real-tokenizer regression -- run when the HunyuanImage3-Instruct
-     tokenizer is in the local HF cache. Asserts the segment-tokenized
-     output diverges from the naive full-string encode, which is the
-     bug-tripping fixture for the cross-segment BPE merge fix
-     (commit 7bd429ed).
-"""
-
 from __future__ import annotations
 
 import ast
@@ -25,6 +10,8 @@
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
     HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS,
+    _TASK_PRESETS,
+    available_bot_tasks,
     available_tasks,
     build_prompt,
     build_prompt_tokens,
@@ -34,18 +21,7 @@
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
-# -------------------- Pure-logic structural tests --------------------
-
-
 class FakeTokenizer:
-    """Minimal tokenizer stub that records every encode() call.
-
-    Returns deterministic ids from convert_tokens_to_ids while
-    encode() returns one id per character starting at 100. This lets
-    tests both verify segmentation (by inspecting `encode_calls`) and
-    locate substrings inside the returned id list.
-    """
-
     SPECIAL = {
         "<|startoftext|>": 1,
         "<img>": 2,
@@ -72,85 +48,80 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
 
 
 def test_available_tasks_covers_all_modalities():
-    tasks = set(available_tasks())
-    assert tasks >= {
-        "t2t",
-        "i2t",
+    assert set(available_tasks()) == {"t2t", "i2t", "it2i", "t2i"}
+
+
+def test_available_bot_tasks_covers_all_modes():
+    assert set(available_bot_tasks()) == {None, "think", "recaption", "think_recaption", "vanilla"}
+
+
+def test_legacy_task_presets_still_available():
+    assert {
         "it2i_think",
         "it2i_recaption",
         "it2i_think_recaption",
         "t2i_think",
         "t2i_recaption",
         "t2i_vanilla",
-    }
+    } <= set(_TASK_PRESETS)
 
 
 def test_resolve_stop_token_ids_uses_answer_for_generation_tasks():
     tok = FakeTokenizer()
-
     answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]
     assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [answer_id]
     assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [answer_id]
+    assert resolve_stop_token_ids(task="it2i", bot_task="think", tokenizer=tok) == [answer_id]
 
 
 @pytest.mark.parametrize(
-    "task",
+    "task,bot_task",
     [
-        "t2t",
-        "i2t",
-        "it2i_think",
-        "it2i_recaption",
-        "it2i_think_recaption",
-        "t2i_think",
-        "t2i_recaption",
+        ("t2t", None),
+        ("i2t", None),
+        ("it2i", "think"),
+        ("it2i", "recaption"),
+        ("it2i", "think_recaption"),
+        ("t2i", "think"),
+        ("t2i", "recaption"),
+        ("t2i", "think_recaption"),
     ],
 )
-def test_build_prompt_string_structure_chat_template(task: str):
-    """Chat-template tasks must produce <|startoftext|>...User: ...Assistant: ...
-    with image placeholder (when applicable) and trigger tag AFTER `Assistant: `."""
-    s = build_prompt("HELLO", task=task)
-
+def test_build_prompt_string_structure_chat_template(task: str, bot_task: str | None):
+    s = build_prompt("HELLO", task=task, bot_task=bot_task)
     assert s.startswith("<|startoftext|>")
     assert "User: " in s
     assert "Assistant: " in s
     assert s.index("User: ") < s.index("HELLO") < s.index("Assistant: ")
 
-    if task.startswith(("i2t", "it2i")):
-        assert s.index("User: ") < s.index("<img>") < s.index("HELLO"), (
-            "<img> placeholder must sit between `User: ` and the user prompt"
-        )
+    if task in ("i2t", "it2i"):
+        assert s.index("User: ") < s.index("<img>") < s.index("HELLO")
     else:
         assert "<img>" not in s
 
-    # Trigger tag must be the FINAL token of the prompt (after `Assistant: `).
-    # Note: the system prompt itself mentions <think>/<recaption> as mode
-    # documentation, so substring index() catches the wrong occurrence -- use
-    # endswith() which directly captures "trigger is at the tail" (the Part A
-    # fix: trigger goes AFTER `Assistant: `, not before user_prompt).
-    if task in ("it2i_think", "t2i_think", "it2i_think_recaption"):
-        assert s.endswith("Assistant: <think>"), (
-            f"Trigger <think> must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}"
-        )
-    if task in ("it2i_recaption", "t2i_recaption"):
-        assert s.endswith("Assistant: <recaption>"), (
-            f"Trigger <recaption> must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}"
-        )
-    if task in ("t2t", "i2t"):
-        assert s.endswith("Assistant: "), "Plain (no-trigger) task must end at `Assistant: ` with no trailing tag."
+    if bot_task in ("think", "think_recaption"):
+        assert s.endswith("Assistant: <think>")
+    elif bot_task == "recaption":
+        assert s.endswith("Assistant: <recaption>")
+    elif bot_task is None:
+        assert s.endswith("Assistant: ")
 
 
 def test_build_prompt_vanilla_uses_pretrain_template():
-    """t2i_vanilla is the only task that bypasses chat structure -- direct
-    text->image generation driven by the vanilla system prompt."""
-    s = build_prompt("HELLO", task="t2i_vanilla")
+    s = build_prompt("HELLO", task="t2i", bot_task="vanilla")
     assert s.startswith("<|startoftext|>")
     assert "User: " not in s
     assert "Assistant: " not in s
-    assert "<think>" not in s
-    assert "<recaption>" not in s
     assert s.endswith("HELLO")
 
 
+def test_build_prompt_vanilla_rejects_non_t2i_task():
+    with pytest.raises(ValueError, match="bot_task='vanilla'"):
+        build_prompt("x", task="it2i", bot_task="vanilla")
+    with pytest.raises(ValueError, match="bot_task='vanilla'"):
+        build_prompt_tokens("x", FakeTokenizer(), task="i2t", bot_task="vanilla")
+
+
 def test_build_prompt_unknown_task_raises():
     with pytest.raises(ValueError, match="Unknown task"):
         build_prompt("x", task="bogus")
@@ -158,127 +129,83 @@ def test_build_prompt_unknown_task_raises():
         build_prompt_tokens("x", FakeTokenizer(), task="bogus")
 
 
+def test_build_prompt_unknown_bot_task_raises():
+    with pytest.raises(ValueError, match="Unknown bot_task"):
+        build_prompt("x", task="t2i", bot_task="bogus")
+    with pytest.raises(ValueError, match="Unknown bot_task"):
+        build_prompt_tokens("x", FakeTokenizer(), task="t2i", bot_task="bogus")
+
+
 def test_build_prompt_tokens_segments_each_boundary():
-    """Regression for cross-segment BPE merge bug (commit 7bd429ed):
-    each template segment must hit tokenizer.encode() independently;
-    user_prompt MUST NOT be concatenated with the following separator
-    in the same encode() call."""
     tok = FakeTokenizer()
-    build_prompt_tokens("写诗。", tok, task="i2t")
-
-    # Each canonical segment is encoded in its own call.
+    build_prompt_tokens("写诗。", tok, task="i2t", bot_task=None)
     assert "User: " in tok.encode_calls
-    assert "写诗。" in tok.encode_calls, (
-        "user_prompt must be encoded alone -- if it is concatenated with the "
-        "trailing separator, BPE will merge across the boundary (the PR-#3243 bug)."
-    )
+    assert "写诗。" in tok.encode_calls
     assert "\n\nAssistant: " in tok.encode_calls
-
-    # No call must contain user_prompt glued to neighboring text.
     for call in tok.encode_calls:
         if call != "写诗。":
-            assert "写诗。" not in call, f"user_prompt leaked into a multi-segment encode call: {call!r}"
+            assert "写诗。" not in call
 
 
 def test_build_prompt_tokens_image_placeholder_present_for_image_tasks():
     tok = FakeTokenizer()
-    result = build_prompt_tokens("hi", tok, task="i2t")
+    result = build_prompt_tokens("hi", tok, task="i2t", bot_task=None)
     ids = result.token_ids
-    assert ids[0] == FakeTokenizer.SPECIAL["<|startoftext|>"], "BOS (<|startoftext|>) must be the first token"
-    assert FakeTokenizer.SPECIAL["<img>"] in ids, "<img> placeholder must be present for i2t/it2i tasks"
+    assert ids[0] == FakeTokenizer.SPECIAL["<|startoftext|>"]
+    assert FakeTokenizer.SPECIAL["<img>"] in ids
 
 
 def test_build_prompt_tokens_no_image_for_text_only_tasks():
     tok = FakeTokenizer()
-    result = build_prompt_tokens("hi", tok, task="t2t")
+    result = build_prompt_tokens("hi", tok, task="t2t", bot_task=None)
     ids = result.token_ids
-    assert FakeTokenizer.SPECIAL["<img>"] not in ids, "<img> must NOT appear for text-only tasks"
+    assert FakeTokenizer.SPECIAL["<img>"] not in ids
 
 
 @pytest.mark.parametrize(
-    "task,trigger_id",
+    "task,bot_task,trigger_id",
     [
-        ("it2i_think", FakeTokenizer.SPECIAL["<think>"]),
-        ("t2i_think", FakeTokenizer.SPECIAL["<think>"]),
-        ("it2i_recaption", FakeTokenizer.SPECIAL["<recaption>"]),
-        ("t2i_recaption", FakeTokenizer.SPECIAL["<recaption>"]),
+        ("it2i", "think", FakeTokenizer.SPECIAL["<think>"]),
+        ("t2i", "think", FakeTokenizer.SPECIAL["<think>"]),
+        ("t2i", "think_recaption", FakeTokenizer.SPECIAL["<think>"]),
+        ("it2i", "recaption", FakeTokenizer.SPECIAL["<recaption>"]),
+        ("t2i", "recaption", FakeTokenizer.SPECIAL["<recaption>"]),
+        ("it2i_think", None, FakeTokenizer.SPECIAL["<think>"]),
+        ("it2i_recaption", None, FakeTokenizer.SPECIAL["<recaption>"]),
     ],
 )
-def test_build_prompt_tokens_trigger_is_last_token(task: str, trigger_id: int):
-    """Trigger tag id must be the LAST token (after `Assistant: ` segment)."""
+def test_build_prompt_tokens_trigger_is_last_token(task: str, bot_task: str | None, trigger_id: int):
     tok = FakeTokenizer()
-    result = build_prompt_tokens("hi", tok, task=task)
-    ids = result.token_ids
-    assert ids[-1] == trigger_id
+    result = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task)
+    assert result.token_ids[-1] == trigger_id
 
 
 def test_build_prompt_tokens_no_trigger_for_plain_tasks():
-    """Tasks without trigger_tag (t2t / i2t) must NOT append a trigger id."""
     tok = FakeTokenizer()
-    result = build_prompt_tokens("hi", tok, task="t2t")
-    ids = result.token_ids
-    assert ids[-1] not in {
+    result = build_prompt_tokens("hi", tok, task="t2t", bot_task=None)
+    assert result.token_ids[-1] not in {
         FakeTokenizer.SPECIAL["<think>"],
         FakeTokenizer.SPECIAL["<recaption>"],
     }
 
 
-# -------------------- end2end.py wiring guard --------------------
-
-
 def _repo_root() -> pathlib.Path:
-    # tests/diffusion/models/hunyuan_image3/test_prompt_utils.py -> repo root
     return pathlib.Path(__file__).resolve().parents[4]
 
 
 def test_end2end_routes_through_shared_prompt_utils():
-    """Regression for the *delivery vector* of PR #3243.
-
-    Background: the wrong-template bug that PR #3243 fixes was introduced
-    when end2end.py grew its own hand-rolled prompt builder that diverged
-    from the canonical instruct chat template. To prevent that exact
-    failure mode from recurring, end2end.py MUST:
-      1. Import the prompt builders from the shared prompt_utils module.
-      2. NOT redefine `build_prompt` or `build_prompt_tokens` locally.
-
-    A local redefinition is precisely how a future merge can silently
-    re-introduce a pretrain-style template (trigger BEFORE user_prompt,
-    no User:/Assistant: framing, etc.) without touching prompt_utils,
-    bypassing every other test in this file.
-    """
     end2end_path = _repo_root() / "examples" / "offline_inference" / "hunyuan_image3" / "end2end.py"
-    assert end2end_path.is_file(), f"end2end.py not found at {end2end_path}"
-
     tree = ast.parse(end2end_path.read_text(encoding="utf-8"))
 
     local_func_names = {n.name for n in ast.walk(tree) if isinstance(n, ast.FunctionDef)}
-    forbidden = {"build_prompt", "build_prompt_tokens"}
-    redefined = local_func_names & forbidden
-    assert not redefined, (
-        f"end2end.py defines {sorted(redefined)} locally. This is exactly how "
-        "the wrong prompt template re-entered the example before PR #3243. "
-        "Use the shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils` "
-        "helpers instead."
-    )
+    assert not (local_func_names & {"build_prompt", "build_prompt_tokens"})
 
     imported_from_prompt_utils: set[str] = set()
     for node in ast.walk(tree):
         if isinstance(node, ast.ImportFrom) and node.module and node.module.endswith("hunyuan_image3.prompt_utils"):
             imported_from_prompt_utils.update(alias.name for alias in node.names)
-    expected_imports = {
-        "_TASK_PRESETS",
-        "build_prompt_tokens",
-        "resolve_stop_token_ids",
-    }
-    assert expected_imports <= imported_from_prompt_utils, (
-        "end2end.py must import the HunyuanImage3 prompt and stop-token helpers from "
-        "vllm_omni.diffusion.models.hunyuan_image3.prompt_utils -- the shared "
-        "module is the single source of truth for the AR-prefill template and "
-        "bot_task-derived AR stop token ids."
-    )
-
-
-# -------------------- Real-tokenizer regression --------------------
+    expected_imports = {"build_prompt_tokens", "resolve_stop_token_ids", "resolve_sys_type"}
+    assert expected_imports <= imported_from_prompt_utils
 
 
 _HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct"
@@ -290,41 +217,14 @@ def _hf_cached(model_id: str) -> bool:
     return os.path.isdir(snap_dir) and any(os.scandir(snap_dir))
 
 
-@pytest.mark.skipif(
-    not _hf_cached(_HUNYUAN_MODEL_ID),
-    reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache",
-)
+@pytest.mark.skipif(not _hf_cached(_HUNYUAN_MODEL_ID), reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache")
 def test_segment_tokenize_diverges_from_full_string_encode():
-    """Regression for PR #3243 segment-tokenization fix.
-
-    The naive `tokenizer.encode(build_prompt(...))` lets BPE merge tokens
-    across segment boundaries (notably `。\\n\\n` -> a single id), which
-    drifts the AR prefill away from HF's apply_chat_template output. The
-    segment-by-segment build_prompt_tokens must produce a STRICTLY
-    DIFFERENT id sequence on a prompt that triggers the merge.
-
-    If someone "simplifies" build_prompt_tokens to call encode() on the
-    full string, this assertion fires.
-    """
     from transformers import AutoTokenizer
 
     tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
-
     user_prompt = "写一首关于夜的诗。"
-    result = build_prompt_tokens(user_prompt, tok, task="i2t")
+    result = build_prompt_tokens(user_prompt, tok, task="i2t", bot_task=None)
     seg_ids = result.token_ids
-    full_ids = tok.encode(build_prompt(user_prompt, task="i2t"), add_special_tokens=False)
-
-    assert seg_ids != full_ids, (
-        "build_prompt_tokens output equals naive full-string encode -- "
-        "the BPE-merge-bypass behavior is no longer exercised. This means "
-        "the segment-by-segment fix from PR #3243 has been silently undone."
-    )
-
-    # Segmenting prevents merges, so the segment id list should have AT LEAST
-    # as many tokens as the merged version (a merge consumes 2+ ids -> 1).
-    assert len(seg_ids) >= len(full_ids), (
-        f"segment-encoded length ({len(seg_ids)}) shorter than full-string "
-        f"merged length ({len(full_ids)}) -- impossible if segmentation is "
-        f"genuinely bypassing merges."
-    )
+    full_ids = tok.encode(build_prompt(user_prompt, task="i2t", bot_task=None), add_special_tokens=False)
+    assert seg_ids != full_ids
+    assert len(seg_ids) >= len(full_ids)
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 068dad87f8b..4ed277eeed2 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -11,8 +11,23 @@
 `JointImageInfo` objects produced by image preprocessing. The example
 flow uses an `<img>` placeholder + `multi_modal_data` instead, so it
 needs a lighter-weight builder that only requires a HF tokenizer. This
-module provides that builder; the task -> template mapping below is the
-canonical mapping for both flows.
+module provides that builder; the (task, bot_task) -> template mapping
+below is the canonical mapping for both flows.
+
+Two orthogonal axes:
+
+  * `task` selects the I/O modality combination, which only controls
+    whether `<img>` placeholders are emitted between `User: ` and the
+    user prompt: ``i2t`` / ``it2i`` produce them, ``t2t`` / ``t2i`` do
+    not.
+
+  * `bot_task` selects the prompting mode and drives both the system
+    prompt and the trigger tag appended after ``Assistant: ``. ``None``
+    gives a plain Assistant turn under the unified prompt; ``think`` /
+    ``recaption`` append the ``<think>`` / ``<recaption>`` trigger tag;
+    ``think_recaption`` uses the dedicated combined-mode system prompt
+    with a ``<think>`` trigger; ``vanilla`` drops the chat structure
+    entirely (pretrain template, ``t2i`` only).
 """
 
 from __future__ import annotations
@@ -45,30 +60,77 @@
     "<img_ratio_36>": 130106,
 }
 
-# task -> (sys_type, bot_task, trigger_tag)
+# bot_task -> (sys_type, trigger_tag).
+# ``vanilla`` is special-cased downstream: it bypasses the chat template
+# (no ``User:`` / ``Assistant:`` framing) and is only valid with
+# ``task='t2i'``.
+_BOT_TASK_PRESETS: dict[str | None, tuple[str, str | None]] = {
+    None: ("en_unified", None),
+    "think": ("en_unified", "<think>"),
+    "recaption": ("en_unified", "<recaption>"),
+    "think_recaption": ("en_think_recaption", "<think>"),
+    "vanilla": ("en_vanilla", None),
+}
+
+_TASKS: frozenset[str] = frozenset({"t2t", "i2t", "it2i", "t2i"})
+
+# Legacy composite task alias -> (sys_type, bot_task, trigger_tag). Kept during
+# rebase so older callers and intermediate commits still resolve cleanly.
 _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
     "t2t": ("en_unified", None, None),
     "i2t": ("en_unified", None, None),
     "it2i_think": ("en_unified", "think", "<think>"),
     "it2i_recaption": ("en_unified", "recaption", "<recaption>"),
     "it2i_think_recaption": ("en_unified", "think_recaption", "<think>"),
-    "t2i": ("en_unified", "image", None),
-    "t2i_vanilla": ("en_vanilla", "image", None),
+    "t2i": ("en_unified", None, None),
+    "t2i_vanilla": ("en_vanilla", "vanilla", None),
     "t2i_think": ("en_unified", "think", "<think>"),
     "t2i_recaption": ("en_unified", "recaption", "<recaption>"),
 }
 
 
+def _normalize_task_and_bot_task(task: str, bot_task: str | None) -> tuple[str, str | None]:
+    if task in _TASK_PRESETS:
+        _, legacy_bot_task, _ = _TASK_PRESETS[task]
+        base_task = task.split("_", 1)[0]
+        if base_task == "t2i" and task == "t2i":
+            base_task = "t2i"
+        if task in ("t2t", "i2t", "t2i"):
+            base_task = task
+        if bot_task is None:
+            bot_task = legacy_bot_task
+        task = base_task
+    return task, bot_task
+
+
 def available_tasks() -> list[str]:
-    """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
-    return sorted(_TASK_PRESETS)
+    """Sorted list of `task` values accepted by the prompt builders."""
+    return sorted(_TASKS)
+
+
+def available_bot_tasks() -> list[str | None]:
+    """Sorted list of `bot_task` values (with ``None`` first)."""
+    rest = sorted(k for k in _BOT_TASK_PRESETS if k is not None)
+    return [None, *rest]
+
+
+def resolve_sys_type(bot_task: str | None) -> str:
+    """Default system-prompt type for a given ``bot_task``."""
+    if bot_task not in _BOT_TASK_PRESETS:
+        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}")
+    return _BOT_TASK_PRESETS[bot_task][0]
 
 
 def resolve_stop_token_ids(
-    task: str = "it2i_think",
-    bot_task: str = "think",
+    task: str = "it2i",
+    bot_task: str | None = "think",
     tokenizer: Any | None = None,
-):
+) -> list[int]:
+    task, bot_task = _normalize_task_and_bot_task(task, bot_task)
+    if task not in _TASKS:
+        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+    if bot_task not in _BOT_TASK_PRESETS:
+        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}")
     return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]]
 
 
@@ -81,56 +143,45 @@ def _validate_num_images(num_images: int) -> None:
         raise ValueError(f"num_images must be in [1, {MAX_IMAGES_PER_REQUEST}], got {num_images}")
 
 
+def _resolve_preset(task: str, bot_task: str | None) -> tuple[str, str | None]:
+    """Validate (task, bot_task) and return ``(sys_type, trigger_tag)``."""
+    task, bot_task = _normalize_task_and_bot_task(task, bot_task)
+    if task not in _TASKS:
+        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+    if bot_task not in _BOT_TASK_PRESETS:
+        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}")
+    if bot_task == "vanilla" and task != "t2i":
+        raise ValueError(f"bot_task='vanilla' is only valid with task='t2i' (pretrain template); got task={task!r}")
+    return _BOT_TASK_PRESETS[bot_task]
+
+
 def build_prompt(
     user_prompt: str,
-    task: str = "it2i_think",
+    task: str = "it2i",
+    bot_task: str | None = "think",
     sys_type: str | None = None,
     custom_system_prompt: str | None = None,
     num_images: int = 1,
 ) -> str:
-    """Build a HunyuanImage-3.0 prompt as a string (legacy/compat path).
-
-    NOTE: when this string is passed to the engine, the engine's tokenizer
-    will run a single BPE pass over the whole string, which can merge
-    tokens across segment boundaries (e.g. `。\\n\\n` -> id 3490). For
-    inputs that need to match HF baseline byte-for-byte, use
-    `build_prompt_tokens` instead and feed the result via prompt_token_ids.
-
-    `num_images` emits N consecutive `<img>` placeholders between
-    `User: ` and `user_prompt`. Ignored for text-only tasks.
-    """
-    if task not in _TASK_PRESETS:
-        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
-
-    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+    """Build a HunyuanImage-3.0 prompt as a string (legacy/compat path)."""
+    task, bot_task = _normalize_task_and_bot_task(task, bot_task)
+    preset_sys_type, trigger_tag = _resolve_preset(task, bot_task)
     effective_sys_type = sys_type or preset_sys_type
 
-    system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt)
-    sys_text = system_prompt.strip() if system_prompt else ""
+    system_prompt = get_system_prompt(effective_sys_type, bot_task, custom_system_prompt)
+    sys_text = system_prompt or ""
 
-    has_image_input = task.startswith("i2t") or task.startswith("it2i")
+    has_image_input = task in ("i2t", "it2i")
     if has_image_input:
         _validate_num_images(num_images)
 
-    # t2i_vanilla: pretrain mode for direct text->image generation. The
-    # vanilla system prompt drives the model with no chat structure.
-    if task == "t2i_vanilla":
+    if bot_task == "vanilla":
         parts = ["<|startoftext|>"]
         if sys_text:
             parts.append(sys_text)
         parts.append(user_prompt)
         return "".join(parts)
 
-    # All other tasks (t2t / i2t / t2i_think / t2i_recaption /
-    # it2i_think / it2i_recaption) use HunyuanImage3 Instruct chat template:
-    #   <|startoftext|>{system?}\n\nUser: {<img>*N?}{user_prompt}\n\nAssistant: {trigger?}
-    # generation_config.json declares sequence_template="instruct", so the
-    # AR prefill MUST use this template -- verified to match HF's
-    # apply_chat_template output token-for-token (modulo BPE boundary merges).
-    # The trigger_tag (e.g. <think>) MUST come AFTER the `Assistant: ` prefix:
-    # if it goes BEFORE user_prompt (the old pretrain layout) the model puts
-    # the user's instructions inside the "thinking section" and collapses
-    # into repetition garbage under greedy decoding.
     parts = ["<|startoftext|>"]
     if sys_text:
         parts.append(f"{sys_text}\n\n")
@@ -141,67 +192,52 @@ def build_prompt(
     parts.append("\n\nAssistant: ")
     if trigger_tag:
         parts.append(trigger_tag)
-
     return "".join(parts)
 
 
 @dataclass
 class PromptTokensResult:
-    token_ids: list[int]  # The tokenized prompt
-    system_prompt_type: str  # The effective system prompt type used
+    token_ids: list[int]
+    system_prompt_type: str
 
 
 def build_prompt_tokens(
     user_prompt: str,
     tokenizer,
-    task: str = "it2i_think",
+    task: str = "it2i",
+    bot_task: str | None = "think",
     sys_type: str | None = None,
     custom_system_prompt: str | None = None,
     num_images: int = 1,
 ) -> PromptTokensResult:
-    """Segment-by-segment tokenization that matches HF apply_chat_template.
-
-    Calling tokenizer.encode(build_prompt(...)) on the full string lets BPE
-    merge tokens across segment boundaries (e.g. user_prompt ends with `。`
-    and the next segment is `\\n\\n` -> they merge into a single token id
-    3490 instead of HF's [1811, 271]). HF's apply_chat_template tokenizes
-    each segment independently and concatenates token_ids, so no cross-
-    boundary merge happens. We replicate that here and feed the result to
-    Omni via OmniTokensPrompt (prompt_token_ids).
-
-    Returns:
-        PromptTokensResult
-
-    `num_images` inserts N `<img>` token ids; see `build_prompt`.
-    """
-    if task not in _TASK_PRESETS:
-        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
-
-    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+    """Segment-by-segment tokenization that matches HF apply_chat_template."""
+    task, bot_task = _normalize_task_and_bot_task(task, bot_task)
+    preset_sys_type, trigger_tag = _resolve_preset(task, bot_task)
     effective_sys_type = sys_type or preset_sys_type
 
     bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>")
     img_id = tokenizer.convert_tokens_to_ids("<img>")
     trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None
 
-    has_image_input = task.startswith("i2t") or task.startswith("it2i")
+    has_image_input = task in ("i2t", "it2i")
     if has_image_input:
         _validate_num_images(num_images)
 
-    # t2i_vanilla uses pretrain template with no chat structure; the vanilla
-    # system prompt drives the model directly. No segment boundaries to
-    # protect, fall back to whole-string encode.
-    if task == "t2i_vanilla":
-        s = build_prompt(user_prompt, task, sys_type, custom_system_prompt)
+    if bot_task == "vanilla":
+        s = build_prompt(
+            user_prompt,
+            task=task,
+            bot_task=bot_task,
+            sys_type=sys_type,
+            custom_system_prompt=custom_system_prompt,
+        )
         token_ids = tokenizer.encode(s, add_special_tokens=False)
         return PromptTokensResult(
             token_ids=token_ids,
             system_prompt_type=effective_sys_type,
         )
 
-    system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt)
-    # Do NOT strip -- HF apply_chat_template keeps the system prompt's
-    # natural trailing newline; stripping it would shift one token id.
+    system_prompt = get_system_prompt(effective_sys_type, bot_task, custom_system_prompt)
     sys_text = system_prompt or ""
 
     ids: list[int] = [bos_id]
@@ -226,8 +262,10 @@ def build_prompt_tokens(
     "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS",
     "MAX_IMAGES_PER_REQUEST",
     "_TASK_PRESETS",
+    "available_bot_tasks",
     "available_tasks",
     "build_prompt",
     "build_prompt_tokens",
     "resolve_stop_token_ids",
+    "resolve_sys_type",
 ]

From f4d76d5ea2b791b9a54fbc4daaa84242c89c0f62 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <wu15922848573@outlook.com>
Date: Sat, 9 May 2026 14:47:51 +0800
Subject: [PATCH 03/43] [Feature] HunyuanImage-3.0 IT2I: wire multi-image
 through online serving
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Multi-image IT2I worked offline but `/v1/images/edits` returned HTTP 400
"multi_modal_uuids['image'] must have same length as multi_modal_data['image']"
because the serving layer never expanded uuids past one-per-modality-key.

Two serving-side gaps the model PR did not cover:

1. `serving_chat.py:_build_multistage_generation_inputs` (and its mirror in
   the chat-completion image-gen path) built `multi_modal_uuids` by iterating
   over dict keys, producing one uuid per modality regardless of value
   shape. For `engine_prompt_data = {"image": [pil1, pil2]}` this yielded
   `{"image": ["img-image-0"]}` (1 uuid), which vLLM's renderer then
   rejected against the 2-item parsed image list. Fixed by expanding the
   uuid list to `len(value)` when the value is a list, while keeping the
   single-uuid behavior for scalar values (e.g. `{"img2img": pil}`).

2. `model_metadata._DIFFUSION_MODEL_METADATA` only registered
   `QwenImageEditPlusPipeline` as supports_multimodal_inputs=True, so
   `od_config.supports_multimodal_inputs` defaulted to False for
   HunyuanImage3Pipeline. The multistage edit path bypasses that check
   on the way in, but the chat path's `generate_diffusion_images` does
   query it (line 2322) and would reject multi-image with "Multiple
   input images are not supported by the current diffusion model".
   Registered `HunyuanImage3Pipeline` with `max_multimodal_image_inputs=3`
   to match upstream's "Multi-Image Fusion" cap (README §200-216).

Static change only; uuid expansion was traced through serving_chat ->
async_omni -> async_omni_engine.add_request -> InputProcessor ->
OmniInputPreprocessor._process_text -> renderer._process_multimodal ->
_validate_mm_uuids. End-to-end smoke against /v1/images/edits with two
`-F image=@...` parts is left for a follow-up; reproducing requires
PYTHONPATH=<wt-mi-checkout> when launching `vllm serve` so the system
Python's editable vllm-omni install does not shadow the rebased branch.

Signed-off-by: TaffyOfficial <wu15922848573@outlook.com>
---
 vllm_omni/diffusion/model_metadata.py        |  6 ++++++
 vllm_omni/entrypoints/openai/serving_chat.py | 13 +++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/model_metadata.py b/vllm_omni/diffusion/model_metadata.py
index ec133e7380e..f3346338434 100644
--- a/vllm_omni/diffusion/model_metadata.py
+++ b/vllm_omni/diffusion/model_metadata.py
@@ -13,6 +13,8 @@ class DiffusionModelMetadata:
 
 
 QWEN_IMAGE_EDIT_PLUS_MAX_INPUT_IMAGES = 4
+# Upstream HunyuanImage-3.0 "Multi-Image Fusion" caps reference images at 3.
+HUNYUAN_IMAGE3_MAX_INPUT_IMAGES = 3
 
 
 _DIFFUSION_MODEL_METADATA: dict[str, DiffusionModelMetadata] = {
@@ -20,6 +22,10 @@ class DiffusionModelMetadata:
         supports_multimodal_inputs=True,
         max_multimodal_image_inputs=QWEN_IMAGE_EDIT_PLUS_MAX_INPUT_IMAGES,
     ),
+    "HunyuanImage3Pipeline": DiffusionModelMetadata(
+        supports_multimodal_inputs=True,
+        max_multimodal_image_inputs=HUNYUAN_IMAGE3_MAX_INPUT_IMAGES,
+    ),
 }
 
 
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 99827454e70..9ec626a3e74 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -419,7 +419,10 @@ async def create_chat_completion(
                     # consistency.  After the multimodal processor consumes
                     # the image data, the uuids remain as a stable reference.
                     tprompt["multi_modal_uuids"] = {
-                        k: [f"{request_id}-{k}-{i}"] for i, k in enumerate(engine_prompt_image)
+                        k: [f"{request_id}-{k}-{i}" for i in range(len(v))]
+                        if isinstance(v, list)
+                        else [f"{request_id}-{k}-0"]
+                        for k, v in engine_prompt_image.items()
                     }
 
                 engine_prompts = [tprompt]
@@ -2295,7 +2298,13 @@ def _build_multistage_generation_inputs(
             engine_prompt["multi_modal_data"] = engine_prompt_data
             # Provide multi_modal_uuids so that newer vLLM versions can
             # validate multi_modal_data / multi_modal_uuids consistency.
-            engine_prompt["multi_modal_uuids"] = {k: [f"img-{k}-{i}"] for i, k in enumerate(engine_prompt_data)}
+            # Generate one uuid per image when the value is a list (multi-image inputs).
+            engine_prompt["multi_modal_uuids"] = {
+                k: [f"img-{k}-{i}" for i in range(len(v))]
+                if isinstance(v, list)
+                else [f"img-{k}-0"]
+                for k, v in engine_prompt_data.items()
+            }
 
         comprehension_idx = None
         for idx, stage in enumerate(stage_configs):

From c18f01674d457e7da3d7f79b93f7fe871a34fbb1 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <wu15922848573@outlook.com>
Date: Sat, 9 May 2026 15:03:27 +0800
Subject: [PATCH 04/43] [Bugfix] HunyuanImage-3.0 ar2diffusion: honor
 AR-predicted output ratio

DiT output collapsed to a square whenever the input bucket was square,
even though the AR engine had already predicted a different aspect via
its `<img_size_*><img_ratio_*>` tail. The bridge ignored the prediction
and forwarded the prompt-carried `height`/`width` straight to the
diffusion pipeline:

  height = original_prompt.get("height", 1024)
  width  = original_prompt.get("width",  1024)

In the `/v1/images/edits` path that prompt height/width is filled with
`pil_images[0].size` (api_server.py:1808-1811) when the client does not
pass `--size`/`resolution`, so the first reference image's bucket
(typically a logo, square) determined the DiT canvas regardless of what
the prompt actually called for. Mirrors the issue called out in the
multi-image PR's commit message ("Output-size handling for the AR/DiT
ratio lifecycle is intentionally NOT touched ... properly wiring that
into ar2diffusion's width/height assignment is a separate refactor").

Wires the AR's ratio_index back into the bridge:

  1. Recover ratio_index from the AR output. Probe the detokenized text
     first (cheap, works under `skip_special_tokens: False` like
     `hunyuan_image3_it2i_kv_reuse.yaml`); fall back to scanning
     `cumulative_token_ids` against the tokenizer's
     `<img_ratio_0>..<img_ratio_36>` id range so the fix also holds when
     the AR engine strips special tokens from text. The token-id table
     is loaded once via AutoTokenizer (cached, model name overridable
     via `VLLM_OMNI_HUNYUAN_IMAGE3_MODEL`) and shaped to mirror
     `HunyuanImage3ForCausalMM.__init__:1523-1531` (contiguous main
     slice 0..32 plus extra slice 33..36).

  2. Resolve ratio_index to (height, width) via
     `ResolutionGroup(base_size=1024).data[ratio_index]`, which is the
     same reverse lookup `HunyuanImage3ImageProcessor.build_image_info`
     uses upstream when constructing the DiT image_info from
     `<img_ratio_*>`. Falls back to the prompt-carried height/width
     when no ratio token is present (comprehension paths, AR aborted
     before the size+ratio tail) so non-IT2I/T2I flows are unaffected.

End-to-end smoke is left for a follow-up: test/repro requires
`PYTHONPATH=<wt-mi-checkout> vllm serve ...` to keep the system
Python's editable vllm-omni install from shadowing this branch (same
caveat as the prior multi-image uuid commit).

Signed-off-by: TaffyOfficial <wu15922848573@outlook.com>
---
 .../stage_input_processors/hunyuan_image3.py  | 137 +++++++++++++++++-
 1 file changed, 136 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index b7630bb8ac8..9a53bf4be06 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -12,6 +12,9 @@
 
 from __future__ import annotations
 
+import os
+import re
+from functools import lru_cache
 from typing import Any
 
 import torch
@@ -22,6 +25,108 @@
 
 logger = init_logger(__name__)
 
+# AR emits `<img_size_BASE><img_ratio_Y>` after `</recaption>` in IT2I/T2I
+# (see `HunyuanImage3ForCausalMM.sample` and `_stage_transitions`). The
+# ratio_index resolves to a (height, width) bucket via ResolutionGroup, which
+# is the official upstream's mechanism for AR-driven output aspect — without
+# this lookup the DiT pipeline falls back to the user-provided width/height
+# (in the `/v1/images/edits` path that defaults to `pil_images[0].size`,
+# i.e. the first reference image's bucket — usually square, see
+# api_server.py:1808-1811).
+_RATIO_TOKEN_RE = re.compile(r"<img_ratio_(\d+)>")
+_DEFAULT_HUNYUAN_IMAGE3_MODEL = "tencent/HunyuanImage-3.0-Instruct"
+
+
+@lru_cache(maxsize=4)
+def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]:
+    """Return `[(height, width)]` indexed by ratio_index for HunyuanImage-3.
+
+    Mirrors `HunyuanImage3ImageProcessor.build_image_info`'s
+    `reso_group[ratio_index]` reverse lookup. Cached because the table
+    is constant per `base_size`.
+    """
+    from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import ResolutionGroup
+
+    reso_group = ResolutionGroup(base_size=base_size)
+    return [(int(r.height), int(r.width)) for r in reso_group.data]
+
+
+@lru_cache(maxsize=4)
+def _build_ratio_id_lookup(model_name_or_path: str) -> dict[int, int]:
+    """Return `{token_id: ratio_index}` for `<img_ratio_*>` in the tokenizer.
+
+    Loads the tokenizer once per model path and walks the contiguous
+    `<img_ratio_0>..<img_ratio_32>` plus the extra slice
+    `<img_ratio_33>..<img_ratio_36>` (the same shape
+    `HunyuanImage3ForCausalMM.__init__` registers at lines 1523-1531).
+    Empty dict on lookup failure so callers can degrade gracefully.
+    """
+    try:
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+    except Exception as e:  # pragma: no cover - environment-dependent
+        logger.warning("[ar2diffusion] failed to load tokenizer for ratio token lookup: %s", e)
+        return {}
+
+    def _id(name: str) -> int | None:
+        tid = tokenizer.convert_tokens_to_ids(name)
+        return None if tid is None or tid == tokenizer.unk_token_id else int(tid)
+
+    ratio_0 = _id("<img_ratio_0>")
+    ratio_32 = _id("<img_ratio_32>")
+    ratio_33 = _id("<img_ratio_33>")
+    ratio_36 = _id("<img_ratio_36>")
+    if None in (ratio_0, ratio_32, ratio_33, ratio_36):
+        logger.warning("[ar2diffusion] tokenizer is missing one of <img_ratio_{0,32,33,36}> tokens")
+        return {}
+
+    table: dict[int, int] = {}
+    for i in range(ratio_32 - ratio_0 + 1):
+        table[ratio_0 + i] = i
+    base_idx = ratio_32 - ratio_0 + 1
+    for j in range(ratio_36 - ratio_33 + 1):
+        table[ratio_33 + j] = base_idx + j
+    return table
+
+
+def _extract_ratio_index(generated_text: str, generated_token_ids, model_name_or_path: str) -> int | None:
+    """Resolve the AR-predicted ratio_index from this stage's output.
+
+    Two probe paths:
+      1. Text regex on `generated_text` — works when the AR engine is
+         configured with `skip_special_tokens: False` (e.g.
+         `hunyuan_image3_it2i_kv_reuse.yaml`). Cheap and avoids loading
+         the tokenizer.
+      2. Token-id scan over `cumulative_token_ids` against the tokenizer's
+         `<img_ratio_*>` id range — survives `skip_special_tokens: True`
+         where the special tokens are stripped from text but still present
+         in the raw token stream.
+
+    Takes the LAST ratio token in the stream because the AR's
+    stage-transition logic emits exactly one such token at the tail of the
+    `<img_size_*><img_ratio_*><eos>` sequence; using "last" is robust to
+    any earlier accidental occurrences in the prompt scaffold.
+    """
+    matches = _RATIO_TOKEN_RE.findall(generated_text or "")
+    if matches:
+        try:
+            return int(matches[-1])
+        except ValueError:
+            pass
+
+    if generated_token_ids is None:
+        return None
+    table = _build_ratio_id_lookup(model_name_or_path)
+    if not table:
+        return None
+    last_ratio_idx: int | None = None
+    for tid in generated_token_ids:
+        idx = table.get(int(tid))
+        if idx is not None:
+            last_ratio_idx = idx
+    return last_ratio_idx
+
 
 def ar2diffusion(
     source_outputs: list[Any],
@@ -65,13 +170,43 @@ def ar2diffusion(
         text_prompt = original_prompt.get("prompt", "")
         use_system_prompt = original_prompt.get("use_system_prompt")
 
+        # Prefer the AR's predicted output aspect (`<img_size_*><img_ratio_*>`
+        # tail emitted by `HunyuanImage3ForCausalMM.sample` under the
+        # ratio-restriction logits processor) over the carried-through
+        # height/width, which the serving layer fills with the first
+        # reference image's bucket and so collapses non-square targets to
+        # square in the multi-image / mismatched-aspect case. Mirrors the
+        # official upstream where `reso_group[ratio_index]` is the
+        # canonical source of the diffusion target shape.
+        model_name_or_path = original_prompt.get("model") or os.environ.get(
+            "VLLM_OMNI_HUNYUAN_IMAGE3_MODEL", _DEFAULT_HUNYUAN_IMAGE3_MODEL
+        )
+        ratio_idx = _extract_ratio_index(generated_text, generated_token_ids, model_name_or_path)
+        ar_predicted = False
+        if ratio_idx is not None:
+            base_size = int(original_prompt.get("image_base_size", 1024))
+            size_table = _build_ratio_size_table(base_size)
+            if 0 <= ratio_idx < len(size_table):
+                height, width = size_table[ratio_idx]
+                ar_predicted = True
+            else:
+                logger.warning(
+                    "[ar2diffusion] Request %d: ratio_index=%d out of range [0,%d), keeping prompt size %dx%d",
+                    i,
+                    ratio_idx,
+                    len(size_table),
+                    height,
+                    width,
+                )
+
         logger.info(
-            "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, target size=%dx%d",
+            "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, target size=%dx%d (%s)",
             i,
             len(generated_token_ids),
             len(generated_text),
             height,
             width,
+            f"AR ratio_idx={ratio_idx}" if ar_predicted else "from prompt (no AR ratio token)",
         )
 
         token_tensor = torch.tensor(generated_token_ids, dtype=torch.long)

From c5f2f9bd618e4b5998ba8fbe53ccca7bb3b894a2 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <wu15922848573@outlook.com>
Date: Sat, 9 May 2026 15:23:39 +0800
Subject: [PATCH 05/43] [Chore] HunyuanImage-3.0 end2end: accept internal task
 names as --modality aliases

`--modality img2img` historically pointed at the internal task `it2i`,
so users who think in the post-`prompt_utils` task vocabulary
(`t2i`/`it2i`/`i2t`/`t2t`, see `_TASK_PRESETS`) had to translate.
This is common enough that two recent reproduction commands hit the
`invalid choice: 'it2i'` argparse error before producing any actual
output.

Accept both spellings on the CLI and canonicalize the short forms to
the verbose names right after parsing, so the downstream
`args.modality == "img2img"` branches stay one-line and do not have to
enumerate aliases. The default value and the behavior of the existing
verbose names are unchanged; the `choices` list only gains the four
short forms.

Signed-off-by: TaffyOfficial <wu15922848573@outlook.com>
---
 .../hunyuan_image3/end2end.py                 | 78 +++++++------------
 1 file changed, 28 insertions(+), 50 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 9d8f5113201..b560926f1b7 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -1,11 +1,5 @@
 """
 HunyuanImage-3.0-Instruct unified end-to-end inference script.
-
-Supports all modalities through a single entry point:
-  - text2img:  Text -> AR -> DiT -> Image
-  - img2img:   Text+Image -> AR -> DiT -> Edited Image (IT2I)
-  - img2text:  Image+Text -> AR -> Text description (I2T)
-  - text2text: Text -> AR -> Text (comprehension, no image)
 """
 
 import argparse
@@ -21,11 +15,29 @@
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniPromptType
 
-# Default deploy configs are absolute so this example works from any cwd.
 _REPO_ROOT = Path(__file__).resolve().parents[3]
 _DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml")
 _DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml")
 
+# Both verbose and short-form aliases are accepted.
+_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
+    "text2img": ("t2i", "think"),
+    "t2i": ("t2i", "think"),
+    "img2img": ("it2i", "think"),
+    "it2i": ("it2i", "think"),
+    "img2text": ("i2t", None),
+    "i2t": ("i2t", None),
+    "text2text": ("t2t", None),
+    "t2t": ("t2t", None),
+}
+
+_MODALITY_CANONICAL = {
+    "t2i": "text2img",
+    "it2i": "img2img",
+    "i2t": "img2text",
+    "t2t": "text2text",
+}
+
 _MODALITY_DEFAULT_DEPLOY_CONFIG = {
     "text2img": _DEFAULT_DEPLOY_CONFIG,
     "img2img": _DEFAULT_DEPLOY_CONFIG,
@@ -40,27 +52,15 @@
     "text2text": "text-to-text",
 }
 
-# Modality -> (task, default bot_task) mapping.
-_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
-    "text2img": ("t2i", "think"),
-    "img2img": ("it2i", "think"),
-    "img2text": ("i2t", None),
-    "text2text": ("t2t", None),
-}
-
 
 def parse_args():
     parser = argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.")
-    parser.add_argument(
-        "--model",
-        default="tencent/HunyuanImage-3.0-Instruct",
-        help="Model name or local path.",
-    )
+    parser.add_argument("--model", default="tencent/HunyuanImage-3.0-Instruct", help="Model name or local path.")
     parser.add_argument(
         "--modality",
         default="text2img",
-        choices=["text2img", "img2img", "img2text", "text2text"],
-        help="Modality mode to control stage execution.",
+        choices=["text2img", "t2i", "img2img", "it2i", "img2text", "i2t", "text2text", "t2t"],
+        help="Verbose and internal short task names are both accepted.",
     )
     parser.add_argument("--prompts", nargs="+", default=None, help="Input text prompts.")
     parser.add_argument(
@@ -69,24 +69,14 @@ def parse_args():
         default=None,
         help="Input image path(s) for img2img/img2text. Comma-separated for multi-image (up to 3).",
     )
-    parser.add_argument(
-        "--output",
-        type=str,
-        default=".",
-        help="Output directory to save results.",
-    )
+    parser.add_argument("--output", type=str, default=".", help="Output directory to save results.")
 
     parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.")
     parser.add_argument("--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale.")
     parser.add_argument("--seed", type=int, default=42, help="Random seed.")
     parser.add_argument("--height", type=int, default=1024, help="Output image height.")
     parser.add_argument("--width", type=int, default=1024, help="Output image width.")
-    parser.add_argument(
-        "--vae-use-tiling",
-        action="store_true",
-        help="Enable VAE tiling for memory optimization.",
-    )
-
+    parser.add_argument("--vae-use-tiling", action="store_true", help="Enable VAE tiling.")
     parser.add_argument(
         "--bot-task",
         type=str,
@@ -94,13 +84,7 @@ def parse_args():
         choices=["none", "think", "recaption", "think_recaption", "vanilla"],
         help="Override prompt mode. Default: auto from --modality.",
     )
-    parser.add_argument(
-        "--sys-type",
-        type=str,
-        default=None,
-        help="Override system prompt type (e.g. en_unified, en_vanilla).",
-    )
-
+    parser.add_argument("--sys-type", type=str, default=None, help="Override system prompt type.")
     parser.add_argument("--deploy-config", type=str, default=None, help="Custom deploy YAML path.")
     parser.add_argument("--stage-configs-path", type=str, default=None, help="Custom legacy stage config YAML path.")
     parser.add_argument("--log-stats", action="store_true", default=False)
@@ -146,6 +130,7 @@ def main():
     os.makedirs(args.output, exist_ok=True)
     additional_config = parse_additional_config(args.additional_config)
 
+    args.modality = _MODALITY_CANONICAL.get(args.modality, args.modality)
     task, default_bot_task = _MODALITY_TASK_MAP[args.modality]
     if args.bot_task is None:
         bot_task: str | None = default_bot_task
@@ -168,6 +153,7 @@ def main():
         "log_stats": args.log_stats,
         "init_timeout": args.init_timeout,
         "enforce_eager": args.enforce_eager,
+        "mode": _MODALITY_MODE[args.modality],
     }
 
     if additional_config is not None:
@@ -176,14 +162,10 @@ def main():
         omni_kwargs["deploy_config"] = deploy_config
     else:
         omni_kwargs["stage_configs_path"] = stage_configs_path
-    omni_kwargs["mode"] = _MODALITY_MODE[args.modality]
 
     omni = Omni(**omni_kwargs)
 
     prompts = args.prompts or ["A cute cat"]
-    if not prompts:
-        prompts = ["A cute cat"]
-
     input_images: list = []
     if args.modality in ("img2img", "img2text"):
         if not args.image_path:
@@ -217,7 +199,6 @@ def main():
             "prompt": prompt,
             "use_system_prompt": effective_sys_type,
         }
-
         if args.modality == "text2img":
             prompt_dict["modalities"] = ["image"]
         elif args.modality == "img2img":
@@ -228,9 +209,8 @@ def main():
         elif args.modality == "img2text":
             prompt_dict["modalities"] = ["text"]
             prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
-        elif args.modality == "text2text":
+        else:
             prompt_dict["modalities"] = ["text"]
-
         formatted_prompts.append(prompt_dict)
 
     params_list = list(omni.default_sampling_params_list)
@@ -276,7 +256,6 @@ def main():
     print(f"{'=' * 60}\n")
 
     omni_outputs = list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list))
-
     img_idx = 0
     for req_output in omni_outputs:
         ro = getattr(req_output, "request_output", None)
@@ -295,7 +274,6 @@ def main():
         images = getattr(req_output, "images", None)
         if not images and ro and hasattr(ro, "images"):
             images = ro.images
-
         if images:
             for j, img in enumerate(images):
                 save_path = os.path.join(args.output, f"output_{img_idx}_{j}.png")

From 2ff92b7f0002a6fe957e2247d7ea205d92a13467 Mon Sep 17 00:00:00 2001
From: skf1999 <13234016272@163.com>
Date: Sun, 10 May 2026 01:41:23 +0800
Subject: [PATCH 06/43] feat(end2end): semantic output shape for multi-image
 IT2I

Signed-off-by: skf1999 <13234016272@163.com>
---
 .../hunyuan_image3/end2end.py                 | 48 +++++++++++++++----
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index b560926f1b7..b46e326d1c8 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -5,6 +5,7 @@
 import argparse
 import json
 import os
+import re
 from pathlib import Path
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
@@ -19,7 +20,6 @@
 _DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml")
 _DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml")
 
-# Both verbose and short-form aliases are accepted.
 _MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
     "text2img": ("t2i", "think"),
     "t2i": ("t2i", "think"),
@@ -70,7 +70,6 @@ def parse_args():
         help="Input image path(s) for img2img/img2text. Comma-separated for multi-image (up to 3).",
     )
     parser.add_argument("--output", type=str, default=".", help="Output directory to save results.")
-
     parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.")
     parser.add_argument("--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale.")
     parser.add_argument("--seed", type=int, default=42, help="Random seed.")
@@ -125,6 +124,30 @@ def parse_additional_config(raw_value: str | None) -> dict | None:
     return additional_config
 
 
+def _infer_shape_reference_index(prompt: str, num_images: int) -> int:
+    chinese_nums = {"一": 1, "二": 2, "三": 3}
+
+    def _to_idx(match: re.Match[str]) -> int | None:
+        token = match.group(1).strip()
+        value = chinese_nums.get(token, int(token) if token.isdigit() else None)
+        return value - 1 if value and 1 <= value <= num_images else None
+
+    for pattern in (
+        r"参考图\s*([一二三123])",
+        r"参考第\s*([一二三123])\s*张",
+        r"参考\s*image\s*([123])",
+        r"ref(?:erence)?\s*image\s*([123])",
+        r"基于图\s*([一二三123])",
+        r"基于第\s*([一二三123])\s*张",
+        r"基于\s*image\s*([123])",
+        r"based\s*on\s*image\s*([123])",
+    ):
+        match = re.search(pattern, prompt, re.IGNORECASE)
+        if match and (idx := _to_idx(match)) is not None:
+            return idx
+    return 0
+
+
 def main():
     args = parse_args()
     os.makedirs(args.output, exist_ok=True)
@@ -173,10 +196,10 @@ def main():
         from PIL import Image
 
         image_paths = [p.strip() for p in args.image_path.split(",") if p.strip()]
-        for p in image_paths:
-            if not os.path.exists(p):
-                raise ValueError(f"Image path does not exist: {p}")
-            input_images.append(Image.open(p).convert("RGB"))
+        for image_path in image_paths:
+            if not os.path.exists(image_path):
+                raise ValueError(f"Image path does not exist: {image_path}")
+            input_images.append(Image.open(image_path).convert("RGB"))
         if not input_images:
             raise ValueError(f"--image-path produced no usable paths: {args.image_path!r}")
 
@@ -186,6 +209,7 @@ def main():
     mm_image_payload = (input_images[0] if len(input_images) == 1 else input_images) if input_images else None
 
     formatted_prompts: list[OmniPromptType] = []
+    shape_indices: list[int] = []
     for prompt in prompts:
         build_kwargs: dict = {"task": task, "bot_task": bot_task, "sys_type": args.sys_type}
         if input_images:
@@ -204,8 +228,10 @@ def main():
         elif args.modality == "img2img":
             prompt_dict["modalities"] = ["image"]
             prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
-            prompt_dict["height"] = input_images[0].height
-            prompt_dict["width"] = input_images[0].width
+            shape_idx = _infer_shape_reference_index(prompt, len(input_images))
+            prompt_dict["height"] = input_images[shape_idx].height
+            prompt_dict["width"] = input_images[shape_idx].width
+            shape_indices.append(shape_idx)
         elif args.modality == "img2text":
             prompt_dict["modalities"] = ["text"]
             prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
@@ -218,6 +244,7 @@ def main():
     from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
     ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer)
+    diffusion_idx = 0
     for sp in params_list:
         if isinstance(sp, OmniDiffusionSamplingParams):
             sp.num_inference_steps = args.steps
@@ -228,6 +255,11 @@ def main():
             if args.modality == "text2img":
                 sp.height = args.height
                 sp.width = args.width
+            elif args.modality == "img2img":
+                shape_idx = shape_indices[diffusion_idx]
+                sp.height = input_images[shape_idx].height
+                sp.width = input_images[shape_idx].width
+            diffusion_idx += 1
         elif hasattr(sp, "stop_token_ids"):
             sp.stop_token_ids = ar_stop_token_ids
 

From 74e5caca3b8d8b8ff7e3b3a529ad33cd3567c1e5 Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Sun, 10 May 2026 03:57:53 +0800
Subject: [PATCH 07/43] [Chore] Apply pre-commit formatting fixes

Auto-applied by ruff/whitespace hooks: extra blank lines between
top-level functions, stripped trailing whitespace, and collapsed a
dict-comprehension expression onto a single line.

Signed-off-by: zuiho <2324465096@qq.com>
---
 vllm_omni/entrypoints/openai/serving_chat.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 9ec626a3e74..a5ca494c89e 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2300,9 +2300,7 @@ def _build_multistage_generation_inputs(
             # validate multi_modal_data / multi_modal_uuids consistency.
             # Generate one uuid per image when the value is a list (multi-image inputs).
             engine_prompt["multi_modal_uuids"] = {
-                k: [f"img-{k}-{i}" for i in range(len(v))]
-                if isinstance(v, list)
-                else [f"img-{k}-0"]
+                k: [f"img-{k}-{i}" for i in range(len(v))] if isinstance(v, list) else [f"img-{k}-0"]
                 for k, v in engine_prompt_data.items()
             }
 

From d7400dca983a03c9c74bbb59fa6288b226e17452 Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Sun, 10 May 2026 15:40:10 +0800
Subject: [PATCH 08/43] fix(hunyuan_image3): honor ar2diffusion's predicted
 shape in pre_process_func

pre_process_func was unconditionally filling None sampling_params.height/width
with image_list[0].size, burying the AR-predicted ratio that ar2diffusion
(e31197f0) had written into prompt["height"]/["width"]. forward() reads only
sampling_params, so the bridge was a silent no-op on the IT2I path -- DiT
output collapsed to the first reference image's bucket regardless of what
the AR predicted via <img_size_*><img_ratio_*>.

Now prefer prompt["height"]/["width"] (bridge-supplied) over image_list[0]
when sampling_params.height/width is None. Caller-explicit sampling_params
values still win via the surrounding `is None` guards. This mirrors
GLM-Image's precedent at pipeline_glm_image.py:718-737 and matches the
official HunyuanImage-3.0 image_size=="auto" semantics, where
vae_reso_group[ratio_index] is the canonical source of the DiT shape.

Signed-off-by: zuiho <2324465096@qq.com>
---
 .../models/hunyuan_image3/pipeline_hunyuan_image3.py        | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 74fe268babf..b1ba2687f86 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -283,11 +283,13 @@ def pre_process_func(request: OmniDiffusionRequest):
                 cond_image_infos = [_build_cond_joint_image(image) for image in image_list]
                 prompt["additional_information"]["batch_cond_image_info"] = cond_image_infos
 
+                bridge_h = prompt.get("height") if isinstance(prompt, dict) else None
+                bridge_w = prompt.get("width") if isinstance(prompt, dict) else None
                 first_image_w, first_image_h = _to_pil_image(image_list[0]).size
                 if request.sampling_params.width is None:
-                    request.sampling_params.width = int(first_image_w)
+                    request.sampling_params.width = int(bridge_w or first_image_w)
                 if request.sampling_params.height is None:
-                    request.sampling_params.height = int(first_image_h)
+                    request.sampling_params.height = int(bridge_h or first_image_h)
 
             request.prompts[i] = prompt
 

From d7c760e258c4e4ec1896768fd5e0e5c7d5d4c6bd Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Sun, 10 May 2026 15:40:23 +0800
Subject: [PATCH 09/43] refactor(end2end): drop multi-image regex shape
 heuristic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reverts 6a1985f1 ("feat(end2end): semantic output shape for multi-image
IT2I"). With the prior commit's pipeline fix in place, AR-predicted
<img_ratio_*> tokens flow through ar2diffusion to DiT output shape, so
the prompt-regex layer (parsing "参考图二" / "based on image 2" to pick
a reference image's H/W) is no longer needed and contradicts official
HunyuanImage-3.0 image_size=="auto" semantics.

Signed-off-by: zuiho <2324465096@qq.com>
---
 .../hunyuan_image3/end2end.py                 | 38 +------------------
 1 file changed, 2 insertions(+), 36 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index b46e326d1c8..82e8c194c5a 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -5,7 +5,6 @@
 import argparse
 import json
 import os
-import re
 from pathlib import Path
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
@@ -124,30 +123,6 @@ def parse_additional_config(raw_value: str | None) -> dict | None:
     return additional_config
 
 
-def _infer_shape_reference_index(prompt: str, num_images: int) -> int:
-    chinese_nums = {"一": 1, "二": 2, "三": 3}
-
-    def _to_idx(match: re.Match[str]) -> int | None:
-        token = match.group(1).strip()
-        value = chinese_nums.get(token, int(token) if token.isdigit() else None)
-        return value - 1 if value and 1 <= value <= num_images else None
-
-    for pattern in (
-        r"参考图\s*([一二三123])",
-        r"参考第\s*([一二三123])\s*张",
-        r"参考\s*image\s*([123])",
-        r"ref(?:erence)?\s*image\s*([123])",
-        r"基于图\s*([一二三123])",
-        r"基于第\s*([一二三123])\s*张",
-        r"基于\s*image\s*([123])",
-        r"based\s*on\s*image\s*([123])",
-    ):
-        match = re.search(pattern, prompt, re.IGNORECASE)
-        if match and (idx := _to_idx(match)) is not None:
-            return idx
-    return 0
-
-
 def main():
     args = parse_args()
     os.makedirs(args.output, exist_ok=True)
@@ -209,7 +184,6 @@ def main():
     mm_image_payload = (input_images[0] if len(input_images) == 1 else input_images) if input_images else None
 
     formatted_prompts: list[OmniPromptType] = []
-    shape_indices: list[int] = []
     for prompt in prompts:
         build_kwargs: dict = {"task": task, "bot_task": bot_task, "sys_type": args.sys_type}
         if input_images:
@@ -228,10 +202,8 @@ def main():
         elif args.modality == "img2img":
             prompt_dict["modalities"] = ["image"]
             prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
-            shape_idx = _infer_shape_reference_index(prompt, len(input_images))
-            prompt_dict["height"] = input_images[shape_idx].height
-            prompt_dict["width"] = input_images[shape_idx].width
-            shape_indices.append(shape_idx)
+            prompt_dict["height"] = input_images[0].height
+            prompt_dict["width"] = input_images[0].width
         elif args.modality == "img2text":
             prompt_dict["modalities"] = ["text"]
             prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
@@ -244,7 +216,6 @@ def main():
     from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
     ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer)
-    diffusion_idx = 0
     for sp in params_list:
         if isinstance(sp, OmniDiffusionSamplingParams):
             sp.num_inference_steps = args.steps
@@ -255,11 +226,6 @@ def main():
             if args.modality == "text2img":
                 sp.height = args.height
                 sp.width = args.width
-            elif args.modality == "img2img":
-                shape_idx = shape_indices[diffusion_idx]
-                sp.height = input_images[shape_idx].height
-                sp.width = input_images[shape_idx].width
-            diffusion_idx += 1
         elif hasattr(sp, "stop_token_ids"):
             sp.stop_token_ids = ar_stop_token_ids
 

From 2175a9974bfbb0b3f6a85d26c070f2c22329df8f Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Sun, 10 May 2026 21:42:44 +0800
Subject: [PATCH 10/43] fix(hunyuan_image3): add official extra resolution
 buckets (idx 33-36)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`ResolutionGroup` only walked the step-based buckets (idx 0-32) and
dropped the official implementation's four extra resolutions at indices
33-36. The trained model has a ratio-token vocabulary of 0-36, and the AR
was trained to address all 37 buckets; without the extras, wide reference
images bucket-collapse to the closest base ratio (e.g. input_1_1's
1179x685 maps to idx=12 / 1280x768 instead of idx=36 / 720x1280) and the
AR's `<img_ratio_*>` token range can't address the missing aspects.

Adds `HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS` in `hunyuan_image3_transformer.py`
as the single source of truth (mirrors official `image_processor.py:
147-152`) and threads it through both:
  - `HunyuanImage3Processor.ResolutionGroup` (AR-side cond-image bucket
    selection)
  - `_build_ratio_size_table` (bridge's reverse lookup ratio_idx →
    (h, w) for ar2diffusion → DiT shape)

Signed-off-by: zuiho <2324465096@qq.com>
---
 .../hunyuan_image3_transformer.py             | 20 ++++++++++++++++-
 .../models/hunyuan_image3/hunyuan_image3.py   | 22 +++++++++++++++++--
 .../stage_input_processors/hunyuan_image3.py  | 13 ++++++++---
 3 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py
index 1eb0cdf113b..5a707acbda5 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py
@@ -471,8 +471,21 @@ def __str__(self):
         return f"{self.h}x{self.w}"
 
 
+# Baked-in extras matching the official model's
+# `HunyuanImage3ImageProcessor.vae_reso_group` (image_processor.py:147-152).
+# These four aspect buckets sit at ratio_token indices 33-36 in the trained
+# model and the AR was trained to address them, so any deviation breaks the
+# ratio-token vocab → output-shape lookup.
+HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS: tuple[str, ...] = (
+    "1024x768",
+    "1280x720",
+    "768x1024",
+    "720x1280",
+)
+
+
 class ResolutionGroup:
-    def __init__(self, base_size=None, step=None, align=1):
+    def __init__(self, base_size=None, step=None, align=1, extra_resolutions=None):
         self.align = align
         self.base_size = base_size
         assert base_size % align == 0, f"base_size {base_size} is not divisible by align {align}"
@@ -486,6 +499,11 @@ def __init__(self, base_size=None, step=None, align=1):
         self.step = step
         self.data = self._calc_by_step()
 
+        if extra_resolutions is not None:
+            for er in extra_resolutions:
+                if not any(r.ratio == er.ratio for r in self.data):
+                    self.data.append(er)
+
         self.ratio = np.array([x.ratio for x in self.data])
         self.attr = ["" for _ in range(len(self.data))]
         self.prefix_space = 0
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index e9d41ebf958..bdafa5c6f87 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -737,7 +737,7 @@ def __str__(self):
     class ResolutionGroup:
         """Group of resolutions for image processing."""
 
-        def __init__(self, base_size=None, step=None, align=1):
+        def __init__(self, base_size=None, step=None, align=1, extra_resolutions=None):
             self.align = align
             self.base_size = base_size
             assert base_size % align == 0, f"base_size {base_size} is not divisible by align {align}"
@@ -751,6 +751,11 @@ def __init__(self, base_size=None, step=None, align=1):
             self.step = step
             self.data = self._calc_by_step()
 
+            if extra_resolutions is not None:
+                for er in extra_resolutions:
+                    if not any(r.ratio == er.ratio for r in self.data):
+                        self.data.append(er)
+
             self.ratio = np.array([x.ratio for x in self.data])
             self.attr = ["" for _ in range(len(self.data))]
             self.prefix_space = 0
@@ -815,7 +820,20 @@ def get_base_size_and_ratio_index(self, width, height):
     def __init__(self, tokenizer, hf_config, **kwargs: object):
         self.tokenizer = tokenizer
         self.hf_config = hf_config
-        self.reso_group = self.ResolutionGroup(base_size=hf_config.image_base_size)
+        # `HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS` mirrors the official
+        # `vae_reso_group` extras (image_processor.py:147-152). Build with
+        # this processor's inner Resolution class so `data` stays
+        # type-homogeneous.
+        from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import (
+            HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS,
+        )
+
+        self.reso_group = self.ResolutionGroup(
+            base_size=hf_config.image_base_size,
+            extra_resolutions=[
+                HunyuanImage3Processor.Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS
+            ],
+        )
         self.vision_encoder_processor = Siglip2ImageProcessorFast.from_dict(hf_config.vit_processor)
         self.vae_processor = transforms.Compose(
             [
diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index 9a53bf4be06..63af2f7f1dd 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -45,9 +45,16 @@ def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]:
     `reso_group[ratio_index]` reverse lookup. Cached because the table
     is constant per `base_size`.
     """
-    from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import ResolutionGroup
-
-    reso_group = ResolutionGroup(base_size=base_size)
+    from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import (
+        HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS,
+        Resolution,
+        ResolutionGroup,
+    )
+
+    reso_group = ResolutionGroup(
+        base_size=base_size,
+        extra_resolutions=[Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS],
+    )
     return [(int(r.height), int(r.width)) for r in reso_group.data]
 
 

From 4aaa77261b303322a907cfe0b7fe4e71b7cf6782 Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Sun, 10 May 2026 21:43:13 +0800
Subject: [PATCH 11/43] fix(hunyuan_image3): default cond image preprocessing
 to resize-stretch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Match the official `infer_align_image_size=True` path (image_processor.py:355
→ crop_type="resize") for IT2I cond-image preprocessing. The preprocessing
was previously hardcoded to center crop, which dropped content from
non-square reference images and produced a pixel buffer that was close to,
but not equal to, the one produced by the HF reference run.

Center-crop mode is preserved as opt-in via `crop_type="center"` for
callers that want the legacy behavior.

Signed-off-by: zuiho <2324465096@qq.com>
---
 .../models/hunyuan_image3/hunyuan_image3.py   | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index bdafa5c6f87..f6bd31283d9 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -954,29 +954,33 @@ def process_image(self, image_input: ImageInput):
 
         return final_image_info
 
-    def _resize_and_crop(self, image: Image.Image, target_size: tuple[int, int]) -> Image.Image:
+    def _resize_and_crop(
+        self,
+        image: Image.Image,
+        target_size: tuple[int, int],
+        crop_type: str = "resize",
+    ) -> Image.Image:
+        # Default mode mirrors the official `infer_align_image_size=True`
+        # path (image_processor.py:355 → crop_type="resize") used by the
+        # IT2I demo: stretch the cond image to the bucket dims so its
+        # `<img_ratio_*>` tag and ViT/VAE features stay aligned with the
+        # bucket, instead of dropping content via center crop.
         tw, th = target_size
+        if crop_type == "resize":
+            return image.resize((tw, th), resample=Image.Resampling.LANCZOS)
         w, h = image.size
-
         tr = th / tw
         r = h / w
-
-        # resize
         if r < tr:
             resize_height = th
             resize_width = int(round(th / h * w))
         else:
             resize_width = tw
             resize_height = int(round(tw / w * h))
-
         image = image.resize((resize_width, resize_height), resample=Image.Resampling.LANCZOS)
-
-        # center crop
         crop_top = int(round((resize_height - th) / 2.0))
         crop_left = int(round((resize_width - tw) / 2.0))
-
-        image = image.crop((crop_left, crop_top, crop_left + tw, crop_top + th))
-        return image
+        return image.crop((crop_left, crop_top, crop_left + tw, crop_top + th))
 
 
 class HunyuanImage3ProcessingInfo(BaseProcessingInfo):

From d0c2acbfb07debda01a68b87181db3c21cbf70ac Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Sun, 10 May 2026 21:43:59 +0800
Subject: [PATCH 12/43] fix(hunyuan_image3): use real <timestep> token id at
 scaffold slot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-image scaffold's timestep slot was filled with the `<img>` token id
(128006) instead of the real `<timestep>` token id (128017), as a
workaround for vLLM's `PromptUpdateDetails.select_token_id` accepting
only a single `embed_token_id`. The runtime embedding was patched in via
the multimodal-embedding merger, so single-image numerics matched HF.

But under the AR's multimodal-bidirectional attention, that
`<img>`-as-timestep slot folded into each image's MM region. With
multi-image input, this asymmetry biased the AR's `<img_ratio_*>`
greedy argmax to the FIRST conditioning image's bucket regardless of
prompt semantics:

  input order        | image_1 bucket | image_2 bucket | AR predicts
  -------------------|----------------|----------------|------------
  square + wide      | 16             | 36             | 16
  wide + square      | 36             | 16             | 36
  single wide        | --             | --             | 36 (correct)

Recaption text in both broken cases explicitly said "use image_2
resolution" but the model's ratio token still landed on image_1's
bucket. Single-image worked because there was no second region to
contaminate.

Switches the slot to the real `<timestep>` id and patches its embedding
with `timestep_emb(0)` in `embed_input_ids` via a token-id mask — same
effect as HF's `instantiate_continuous_tokens` scatter-replace
(modeling_hunyuan_image_3.py:1964). Numerically equivalent for
single-image while removing the multi-image attention pollution.

Touches: `_get_prompt_updates` scaffold, `embed_multimodal` (no longer
prepends timestep_emb), `embed_input_ids` (new mask-based replacement),
`__init__` (caches `_timestep_token_id`), `get_mrope_input_positions`
(timestep slot check now matches the real token id).

Signed-off-by: zuiho <2324465096@qq.com>
---
 .../models/hunyuan_image3/hunyuan_image3.py   | 77 +++++++++----------
 1 file changed, 36 insertions(+), 41 deletions(-)

diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index f6bd31283d9..ab9c2ee4d6e 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -1126,31 +1126,22 @@ def get_replacement_image(item_idx: int) -> PromptUpdateDetails:
             ratio_token_id = tokenizer.convert_tokens_to_ids(f"<img_ratio_{_ratio_index}>")
             if ratio_token_id is None:
                 raise ValueError(f"Ratio token '<img_ratio_{_ratio_index}>' not found in tokenizer vocabulary")
-
-            # NOTE on the timestep slot:
-            # HF's apply_chat_template emits the literal <timestep> token id
-            # 128017 here. HF's modeling forward (`instantiate_continuous_tokens`,
-            # see hunyuan3.0_ins/modeling_hunyuan_image_3.py:1964) then *scatter-
-            # replaces* the embedding at that position with `timestep_emb(0)`
-            # for cond images. So the wte embedding of <timestep> is irrelevant
-            # at runtime — what matters is the timestep_emb injection.
-            #
-            # vllm-omni achieves the same effect via the multimodal-embedding
-            # merger: we put an <img> (128006) placeholder here and ship a
-            # `timestep_emb(0)` tensor at the head of `embed_multimodal()`'s
-            # combined_embeddings. The merger replaces this placeholder's
-            # embedding with the timestep tensor, yielding a final hidden
-            # state numerically equivalent to HF at that position.
-            #
-            # Keep this slot as <img> (NOT <timestep>): switching to <timestep>
-            # requires either (a) a second PromptReplacement targeting 128017,
-            # or (b) the merger's embed_token_id to be a list — neither is
-            # currently supported by PromptUpdateDetails.select_token_id.
+            timestep_token_id = tokenizer.convert_tokens_to_ids("<timestep>")
+            if timestep_token_id is None:
+                raise ValueError("Timestep token '<timestep>' not found in tokenizer vocabulary")
+
+            # Use the real <timestep> token id (HF parity). The trained wte
+            # at this slot is overwritten with timestep_emb(0) at runtime by
+            # `embed_input_ids` — same effect as HF's
+            # `instantiate_continuous_tokens` scatter-replace. Keeping the
+            # slot as <img> would have folded the timestep position into the
+            # multimodal bidirectional region, which empirically biased
+            # multi-image AR ratio prediction to the first image's bucket.
             replacement = (
                 [boi_token_id]
                 + [base_size_token_id]
                 + [ratio_token_id]
-                + [img_token_id] * timestep_token_num
+                + [timestep_token_id] * timestep_token_num
                 + [img_token_id] * vae_token_num
                 + [joint_img_sep_token_id]
                 + [img_token_id] * vit_token_num
@@ -1542,6 +1533,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self._end_of_answer_id = tokenizer.convert_tokens_to_ids("</answer>")
         image_base_size = getattr(config, "image_base_size", 1024)
         self._size_token_id = tokenizer.convert_tokens_to_ids(f"<img_size_{image_base_size}>")
+        self._timestep_token_id = tokenizer.convert_tokens_to_ids("<timestep>")
         self._start_ratio_id = tokenizer.convert_tokens_to_ids("<img_ratio_0>")
         self._end_ratio_id = tokenizer.convert_tokens_to_ids("<img_ratio_32>")
         ratio_33 = tokenizer.convert_tokens_to_ids("<img_ratio_33>")
@@ -1877,27 +1869,18 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
             "Each image should have both VAE and ViT embeddings."
         )
 
-        # Order per image: timestep -> VAE tokens -> ViT tokens.
-        # The <img> placeholder at the timestep slot (see _get_prompt_updates)
-        # gets its embedding replaced by `timestep_emb(0)` here, which is what
-        # HF achieves via instantiate_continuous_tokens at runtime.
+        # Order per image: VAE tokens -> ViT tokens. The <timestep> slot at
+        # the head of each per-image scaffold is NOT included here — its
+        # embedding is patched in by `embed_input_ids` via a token-id mask,
+        # mirroring HF's `instantiate_continuous_tokens` scatter-replace.
         combined_embeddings: list[torch.Tensor] = []
         num_images = len(vae_token_embeddings)
         for img_idx in range(num_images):
-            # 1. Timestep embedding (cond image timestep == 0)
-            timestep = torch.zeros((1,)).to(vit_embeddings.device).to(vit_embeddings.dtype)
-            timestep_emb = self._timestep_encode(timestep)
-
-            # 2. VAE image token embeddings
             vae_token_embed = vae_token_embeddings[img_idx]
-            # Remove batch dimension if present: (B, seq_len, hidden_size) -> (seq_len, hidden_size)
             if vae_token_embed.ndim == 3:
                 vae_token_embed = vae_token_embed.squeeze(0)
-
-            # 3. ViT image embeddings
             vit_embed = vit_embeddings[img_idx]
-
-            stacked_embed = torch.cat([timestep_emb, vae_token_embed, vit_embed], dim=0)
+            stacked_embed = torch.cat([vae_token_embed, vit_embed], dim=0)
             combined_embeddings.append(stacked_embed)
 
         return combined_embeddings
@@ -1910,14 +1893,25 @@ def embed_input_ids(
         is_multimodal: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Embed input IDs with optional multimodal embeddings."""
-        # Get text embeddings
         inputs_embeds = self.model.embed_input_ids(input_ids)
 
-        # If no multimodal embeddings, return text embeddings
+        # Patch <timestep> slots with timestep_emb(0). HF parity: the trained
+        # wte at this slot is irrelevant; runtime uses
+        # `instantiate_continuous_tokens(timestep_emb(0))`. With multi-image,
+        # keeping these slots as <img> ids merged the timestep position into
+        # the bidirectional MM region and biased AR ratio prediction toward
+        # the first image's bucket.
+        timestep_mask = input_ids == self._timestep_token_id
+        n_timestep = int(timestep_mask.sum().item())
+        if n_timestep > 0:
+            timestep_input = torch.zeros(
+                (n_timestep,), device=inputs_embeds.device, dtype=inputs_embeds.dtype
+            )
+            inputs_embeds[timestep_mask] = self._timestep_encode(timestep_input)
+
         if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
             return inputs_embeds
 
-        # Merge multimodal embeddings with text embeddings
         merged_embeds = _merge_multimodal_embeddings(
             inputs_embeds=inputs_embeds,
             multimodal_embeddings=multimodal_embeddings,
@@ -2133,6 +2127,7 @@ def get_mrope_input_positions(
         boi_token_id = self._mrope_boi_token_id
         eoi_token_id = self._mrope_eoi_token_id
         joint_img_sep_token_id = self._mrope_joint_img_sep_token_id
+        timestep_token_id = self._timestep_token_id
 
         # Build position arrays
         t_pos: list[int] = []  # temporal (same as 1D for this model)
@@ -2149,7 +2144,7 @@ def get_mrope_input_positions(
 
             if tok == boi_token_id:
                 # Found start of image block.
-                # Structure: <boi> <size> <ratio> <img>*timestep <img>*vae
+                # Structure: <boi> <size> <ratio> <timestep> <img>*vae
                 #            <joint_img_sep> <img>*vit <eoi>
                 # <boi> token
                 t_pos.append(pos)
@@ -2174,8 +2169,8 @@ def get_mrope_input_positions(
                     pos += 1
                     i += 1
 
-                # Timestep token (1 <img> token)
-                if i < n and input_tokens[i] == img_token_id:
+                # <timestep> token (1 token)
+                if i < n and input_tokens[i] == timestep_token_id:
                     t_pos.append(pos)
                     h_pos.append(pos)
                     w_pos.append(pos)

From f83c2814a6c853f24a050330d8544cb395203d0c Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Mon, 11 May 2026 02:40:19 +0800
Subject: [PATCH 13/43] fix(hunyuan_image3): include <joint_img_sep> in
 per-image MM region
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-image embed mask in `_get_prompt_updates` only marked `<img>`
tokens via `PromptUpdateDetails.select_token_id(<img>)`, so vLLM's
prefix-LM bidirectional region for each image was split into TWO
contiguous runs: VAE block, then ViT block — with `<joint_img_sep>`
sitting between them as a non-MM (causal-only) token.

Official `Tencent-Hunyuan/HunyuanImage-3.0` builds its full-attention
range via `joint_image_slices` (image_processor.py:388, default
`cond_token_attn_type` flow), spanning VAE + sep + ViT as ONE
continuous bidirectional slice per cond image. The trained model
expects this layout.

In the multi-image case the asymmetry between training (sep inside
the MM region) and our inference (sep outside) was the dominant
remaining mismatch: empirically AR's `<img_ratio_*>` greedy argmax
landed on the FIRST conditioning image's bucket regardless of
prompt semantics. Single-image and dup-bucket cases worked because
there was no second region to be asymmetric against.

Switches `_get_prompt_updates` to
`PromptUpdateDetails.select_token_ids([<img>, <joint_img_sep>])` so
the embed mask now spans VAE+sep+ViT as one True run per image, and
inserts the `<joint_img_sep>` wte tensor in `embed_multimodal`'s
per-image stack between VAE and ViT — numerically identical to what
`model.embed_input_ids` would have produced for that token, so
single-image semantics don't change.

Verified end-to-end on 47.79.124.13 (4× L20X, AR=TP2 + DiT=TP2):

  case                            | image_1 | image_2 | AR ratio
  --------------------------------|---------|---------|---------
  multi (1_0+1_1, prompt → img2)  | 16      | 36      | 36 ✓
  multi swap (1_1+1_0)            | 36      | 16      | 36 ✓
  single 1_1 (regression)         | --      | --      | 36 ✓
  single 1_0 (regression)         | --      | --      | 16 ✓
  multi dup wide                  | 36      | 36      | 36 ✓

Before the fix, the same setup had the AR landing on the first
conditioning image's bucket regardless of the prompt, and the output
collapsed to a square instead of image_2's wide aspect.

Signed-off-by: zuiho <2324465096@qq.com>
---
 .../models/hunyuan_image3/hunyuan_image3.py   | 45 ++++++++++++++-----
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index ab9c2ee4d6e..08d25e9c896 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -1132,11 +1132,16 @@ def get_replacement_image(item_idx: int) -> PromptUpdateDetails:
 
             # Use the real <timestep> token id (HF parity). The trained wte
             # at this slot is overwritten with timestep_emb(0) at runtime by
-            # `embed_input_ids` — same effect as HF's
-            # `instantiate_continuous_tokens` scatter-replace. Keeping the
-            # slot as <img> would have folded the timestep position into the
-            # multimodal bidirectional region, which empirically biased
-            # multi-image AR ratio prediction to the first image's bucket.
+            # `embed_input_ids`.
+            #
+            # Mark <img>*VAE + <joint_img_sep> + <img>*ViT as one contiguous
+            # embed run so vLLM's prefix-LM mask treats it as a single
+            # bidirectional region, mirroring official `joint_image_slices`
+            # full-attention range (image_processor.py:388, with
+            # cond_token_attn_type effectively spanning VAE+sep+ViT). With the
+            # default `select_token_id(<img>)` mask, sep splits the run into
+            # two regions; that asymmetry is what biased multi-image AR
+            # ratio prediction to the first image's bucket.
             replacement = (
                 [boi_token_id]
                 + [base_size_token_id]
@@ -1148,7 +1153,10 @@ def get_replacement_image(item_idx: int) -> PromptUpdateDetails:
                 + [eoi_token_id]
             )
             logger.debug(f"actual replacement token count: {timestep_token_num + vae_token_num + vit_token_num}")
-            return PromptUpdateDetails.select_token_id(replacement, embed_token_id=img_token_id)
+            return PromptUpdateDetails.select_token_ids(
+                replacement,
+                embed_token_ids=[img_token_id, joint_img_sep_token_id],
+            )
 
         return [
             PromptReplacement(modality="image", target=[img_token_id], replacement=get_replacement_image),
@@ -1869,10 +1877,25 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
             "Each image should have both VAE and ViT embeddings."
         )
 
-        # Order per image: VAE tokens -> ViT tokens. The <timestep> slot at
-        # the head of each per-image scaffold is NOT included here — its
-        # embedding is patched in by `embed_input_ids` via a token-id mask,
-        # mirroring HF's `instantiate_continuous_tokens` scatter-replace.
+        # Order per image: VAE tokens -> <joint_img_sep> wte -> ViT tokens.
+        # The <joint_img_sep> wte is included so it joins the bidirectional
+        # MM region (matching the official `joint_image_slices` full-attn
+        # range that spans VAE+sep+ViT). The merger replaces the sep slot
+        # with this wte tensor, which is numerically identical to what
+        # `model.embed_input_ids` would produce — no semantic change for
+        # single-image, but with multi-image the sep position now sits
+        # inside the bidirectional region (matching how the model was
+        # trained).
+        sep_token_id = self._mrope_joint_img_sep_token_id
+        sep_input_ids = torch.tensor(
+            [sep_token_id], device=vit_embeddings.device, dtype=torch.long
+        )
+        sep_embed = self.model.embed_input_ids(sep_input_ids).to(vit_embeddings.dtype)
+
+        # The <timestep> slot at the head of each per-image scaffold is NOT
+        # included here — its embedding is patched in by `embed_input_ids`
+        # via a token-id mask, mirroring HF's `instantiate_continuous_tokens`
+        # scatter-replace.
         combined_embeddings: list[torch.Tensor] = []
         num_images = len(vae_token_embeddings)
         for img_idx in range(num_images):
@@ -1880,7 +1903,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
             if vae_token_embed.ndim == 3:
                 vae_token_embed = vae_token_embed.squeeze(0)
             vit_embed = vit_embeddings[img_idx]
-            stacked_embed = torch.cat([vae_token_embed, vit_embed], dim=0)
+            stacked_embed = torch.cat([vae_token_embed, sep_embed, vit_embed], dim=0)
             combined_embeddings.append(stacked_embed)
 
         return combined_embeddings

From b7c968bd5547d6188ecc3f21d76903080369c695 Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Mon, 11 May 2026 02:40:48 +0800
Subject: [PATCH 14/43] fix(hunyuan_image3): pass extra resolutions to DiT-side
 reso_group
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`HunyuanImage3ImageProcessor.__init__` (DiT-side image processor in
`hunyuan_image3_transformer.py`) constructed `ResolutionGroup` without
the `HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS` extras, so it only knew the 33
step-based buckets (idx 0-32). When the AR predicted `<img_ratio_36>`
and the bridge resolved it to (h=720, w=1280), the DiT pipeline's
`get_target_size` re-bucketed those dims to the closest 33-bucket
ratio (idx 12 = 1280×768) and the final output PNG came out at
1280×768 instead of 1280×720.

Threads the same `extra_resolutions` constant the AR-side processor
(commit b3f91f3d) already uses, so the DiT side recognizes idx 33-36
as valid buckets and respects the AR's predicted dims end-to-end.

Verified output PIL.size now matches AR's predicted bucket: multi-image
prediction `<img_ratio_36>` → (h=720, w=1280) → output (1280, 720).

Signed-off-by: zuiho <2324465096@qq.com>
---
 .../models/hunyuan_image3/hunyuan_image3_transformer.py      | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py
index 5a707acbda5..4edcfb6ca3a 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py
@@ -1369,7 +1369,10 @@ class HunyuanImage3ImageProcessor:
     def __init__(self, config):
         self.config = config
 
-        self.reso_group = ResolutionGroup(base_size=config.image_base_size)
+        self.reso_group = ResolutionGroup(
+            base_size=config.image_base_size,
+            extra_resolutions=[Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS],
+        )
         self.vae_processor = transforms.Compose(
             [
                 transforms.ToTensor(),

From 3b73eabe9f0b3b087785012ceaecb3fce093e35f Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Mon, 11 May 2026 08:46:54 +0800
Subject: [PATCH 15/43] fix(hunyuan_image3 ar2diffusion): truncate AR cot_text
 at </recaption>/</think>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bridge was forwarding the full AR `generated_text` (including the
`<answer><boi><img_size_*><img_ratio_*>` stage-transition tail) into
`extra.ar_generated_text` for DiT's prompt builder. The tail's
purpose is purely to drive the AR's greedy ratio prediction inside
`_apply_ratio_restriction` — the size/ratio info is already routed to
DiT via `height` / `width` (translated from `ratio_idx`), so the tail
has no remaining job downstream and just contaminates cot_text with
an extra `<boi>` + size + ratio that DiT's prompt builder isn't
expecting.

Mirrors official upstream `HunyuanImage3ForCausalMM.generate_image`
(modeling_hunyuan_image_3.py:3343-3354), which decodes only
`generated_tokens[0, :end_pos + 1]` where `end_pos` is the position
of `</recaption>` (think_recaption / recaption bot_task) or
`</think>` (think-only bot_task).

Adds `_truncate_at_cot_end()`, which truncates the generated text just
after the first `</recaption>` (falling back to the first `</think>` when
no `</recaption>` is present) and cuts the token-id stream after the
matching marker token (ids come from the tokenizer, cached via
`_build_cot_end_token_ids`; tokens stay untruncated if the id is unknown).

`ratio_idx` extraction in `_extract_ratio_index` still runs on the
FULL output before truncation, since the ratio token lives in the
trailing segment that we're about to drop.

Addresses PR #3444 review comment from @Bounty-hunter.

Signed-off-by: zuiho <2324465096@qq.com>
---
 .../stage_input_processors/hunyuan_image3.py  | 80 ++++++++++++++++++-
 1 file changed, 77 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index 63af2f7f1dd..158ea86dbf2 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -58,6 +58,67 @@ def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]:
     return [(int(r.height), int(r.width)) for r in reso_group.data]
 
 
+@lru_cache(maxsize=4)
+def _build_cot_end_token_ids(model_name_or_path: str) -> dict[str, int]:
+    """Return `{'</recaption>': id, '</think>': id}` for cot-boundary
+    truncation. Empty dict on lookup failure so callers degrade to a
+    pure text-based search.
+    """
+    try:
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+    except Exception as e:  # pragma: no cover - environment-dependent
+        logger.warning("[ar2diffusion] failed to load tokenizer for cot-end lookup: %s", e)
+        return {}
+
+    result: dict[str, int] = {}
+    for marker in ("</recaption>", "</think>"):
+        tid = tokenizer.convert_tokens_to_ids(marker)
+        if tid is not None and tid != tokenizer.unk_token_id:
+            result[marker] = int(tid)
+    return result
+
+
+def _truncate_at_cot_end(
+    generated_text: str,
+    generated_token_ids,
+    model_name_or_path: str,
+) -> tuple[str, list[int]]:
+    """Truncate AR output at first `</recaption>` (or `</think>` fallback).
+
+    Mirrors `HunyuanImage3ForCausalMM.generate_image` in the official
+    upstream, which decodes only `generated_tokens[0, :end_pos + 1]` as
+    `cot_text` for DiT. The trailing `<answer><boi><img_size_*><img_ratio_*>`
+    sequence is a stage-transition trigger consumed via `image_size` /
+    height/width — it must NOT be forwarded to DiT's prompt builder, or
+    the extra `<boi>` and ratio tokens drift the DiT's own prompt
+    structure.
+    """
+    token_list = list(generated_token_ids) if generated_token_ids is not None else []
+
+    end_ids = _build_cot_end_token_ids(model_name_or_path)
+
+    for marker in ("</recaption>", "</think>"):
+        idx = generated_text.find(marker)
+        if idx == -1:
+            continue
+        text_end = idx + len(marker)
+        truncated_text = generated_text[:text_end]
+
+        truncated_tokens = token_list
+        end_id = end_ids.get(marker)
+        if end_id is not None and token_list:
+            try:
+                token_end = token_list.index(end_id)
+                truncated_tokens = token_list[: token_end + 1]
+            except ValueError:
+                pass
+        return truncated_text, truncated_tokens
+
+    return generated_text, token_list
+
+
 @lru_cache(maxsize=4)
 def _build_ratio_id_lookup(model_name_or_path: str) -> dict[int, int]:
     """Return `{token_id: ratio_index}` for `<img_ratio_*>` in the tokenizer.
@@ -206,17 +267,30 @@ def ar2diffusion(
                     width,
                 )
 
+        # Truncate the AR output at `</recaption>` (or `</think>`) before
+        # passing to DiT. Mirrors official `generate_image` which keeps
+        # `cot_text` clean and routes size/ratio via `image_size` only —
+        # we already extracted `ratio_idx` above and translated it into
+        # `height` / `width`, so the `<answer><boi><img_size_*><img_ratio_*>`
+        # tail has no remaining job and would only contaminate DiT's
+        # prompt builder if forwarded.
+        cot_text_for_dit, cot_token_ids_for_dit = _truncate_at_cot_end(
+            generated_text, generated_token_ids, model_name_or_path
+        )
+
         logger.info(
-            "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, target size=%dx%d (%s)",
+            "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, "
+            "cot_text length=%d, target size=%dx%d (%s)",
             i,
             len(generated_token_ids),
             len(generated_text),
+            len(cot_text_for_dit),
             height,
             width,
             f"AR ratio_idx={ratio_idx}" if ar_predicted else "from prompt (no AR ratio token)",
         )
 
-        token_tensor = torch.tensor(generated_token_ids, dtype=torch.long)
+        token_tensor = torch.tensor(cot_token_ids_for_dit, dtype=torch.long)
 
         diffusion_input: dict[str, Any] = {
             "prompt": text_prompt,
@@ -224,7 +298,7 @@ def ar2diffusion(
             "width": width,
             "extra": {
                 "ar_token_ids": token_tensor,
-                "ar_generated_text": generated_text,
+                "ar_generated_text": cot_text_for_dit,
             },
         }
 

From 284783940116757be0f23fc80e0402ad74789a62 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Mon, 11 May 2026 11:37:11 +0800
Subject: [PATCH 16/43] chore(hunyuan_image3): apply ruff format

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../models/hunyuan_image3/hunyuan_image3.py          | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 08d25e9c896..756a7a27c9b 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -830,9 +830,7 @@ def __init__(self, tokenizer, hf_config, **kwargs: object):
 
         self.reso_group = self.ResolutionGroup(
             base_size=hf_config.image_base_size,
-            extra_resolutions=[
-                HunyuanImage3Processor.Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS
-            ],
+            extra_resolutions=[HunyuanImage3Processor.Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS],
         )
         self.vision_encoder_processor = Siglip2ImageProcessorFast.from_dict(hf_config.vit_processor)
         self.vae_processor = transforms.Compose(
@@ -1887,9 +1885,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         # inside the bidirectional region (matching how the model was
         # trained).
         sep_token_id = self._mrope_joint_img_sep_token_id
-        sep_input_ids = torch.tensor(
-            [sep_token_id], device=vit_embeddings.device, dtype=torch.long
-        )
+        sep_input_ids = torch.tensor([sep_token_id], device=vit_embeddings.device, dtype=torch.long)
         sep_embed = self.model.embed_input_ids(sep_input_ids).to(vit_embeddings.dtype)
 
         # The <timestep> slot at the head of each per-image scaffold is NOT
@@ -1927,9 +1923,7 @@ def embed_input_ids(
         timestep_mask = input_ids == self._timestep_token_id
         n_timestep = int(timestep_mask.sum().item())
         if n_timestep > 0:
-            timestep_input = torch.zeros(
-                (n_timestep,), device=inputs_embeds.device, dtype=inputs_embeds.dtype
-            )
+            timestep_input = torch.zeros((n_timestep,), device=inputs_embeds.device, dtype=inputs_embeds.dtype)
             inputs_embeds[timestep_mask] = self._timestep_encode(timestep_input)
 
         if multimodal_embeddings is None or len(multimodal_embeddings) == 0:

From 3b4f885cf2a2d84275691c7961fb93290c27fa13 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <wu15922848573@outlook.com>
Date: Mon, 11 May 2026 13:08:24 +0800
Subject: [PATCH 17/43] fix(hunyuan_image3): online IT2I multi-image and AR
 bucket override

Two related bugs in the online /v1/images/edits path prevented this PR's
multi-image IT2I from working end-to-end and silently suppressed the AR
ratio decision for AR-driven pipelines:

1. serving_chat._build_multistage_generation_inputs invoked build_prompt
   without num_images, defaulting to 1. N reference images then only got
   a single <img> placeholder in the AR prompt; vLLM's _process_multimodal
   raised AssertionError(Failed to apply prompt replacement for
   mm_items[image][1]) on the second image.

2. edit_images resolved size=auto to the first input image's dimensions
   and forwarded them through extra_body to chat_handler.
   generate_diffusion_images, which then built a fresh gen_params with
   those dimensions. Multi-stage AR-driven pipelines (e.g. HunyuanImage-3.0)
   rely on ar2diffusion to override the final bucket from the AR ratio
   token; DiT's pre_process_func only does that when
   sampling_params.width is None (see pipeline_hunyuan_image3.py:290).
   The forwarded input-image size suppressed the AR decision, producing
   the wrong bucket (e.g. 1024x1024 square instead of the AR-decided
   1280x720 landscape for multi-image fusion).

The fix mirrors the offline end2end.py img2img path which never sets
sampling_params.height/width for img2img. Single-stage diffusion
(_generate_with_async_omni path) still pins gen_params.width/height
from input image size for backward compat.

End-to-end smoke (4x L20X, HunyuanImage-3.0-Instruct, 2 ref images via
curl /v1/images/edits with size=auto, same prompt as offline):
- before fix 1: HTTP 500, AssertionError on mm_items[image][1]
- before fix 2: HTTP 200 but 1024x1024 square (wrong bucket)
- after both:  HTTP 200, 1280x720 landscape -- AR ratio_idx=36 honored,
  matches offline end2end.py for the same inputs

Tests:
- tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py::
    test_build_multistage_generation_inputs_multi_image_emits_n_img_placeholders
  Pins build_prompt(num_images=N) for N=1,2,3 reference images.
- tests/entrypoints/openai_api/test_image_server.py::
    test_image_edits_size_auto_preserves_bridge_size
  Pins diffusion sampling_params.height/width staying None through the
  /v1/images/edits API on the multi-stage path, with multi-image
  placeholder cross-check.
- test_image_edit_parameter_default updated to assert the new contract
  (None on multi-stage); test_image_edit_parameter_default_single_stage
  unchanged.

Signed-off-by: TaffyOfficial <wu15922848573@outlook.com>
---
 .../openai_api/test_image_server.py           | 68 ++++++++++++++++++-
 ...test_serving_chat_multistage_generation.py | 44 ++++++++++++
 vllm_omni/entrypoints/openai/api_server.py    | 18 +++--
 vllm_omni/entrypoints/openai/serving_chat.py  |  6 +-
 4 files changed, 126 insertions(+), 10 deletions(-)

diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py
index b5ff891f8f6..fb9c126d3fe 100644
--- a/tests/entrypoints/openai_api/test_image_server.py
+++ b/tests/entrypoints/openai_api/test_image_server.py
@@ -1349,8 +1349,16 @@ def test_image_edit_parameter_default(async_omni_test_client):
     engine = async_omni_test_client.app.state.engine_client
     captured_sampling_params = engine.captured_sampling_params_list[-1]
 
-    assert captured_sampling_params.width == 24
-    assert captured_sampling_params.height == 16
+    # size="auto" on multi-stage pipelines deliberately leaves the diffusion
+    # stage's sampling_params width/height unset so AR-driven pipelines (e.g.
+    # HunyuanImage-3.0) can let ar2diffusion override the final bucket from
+    # the AR-predicted ratio token; see
+    # test_image_edits_size_auto_preserves_bridge_size for the contract.
+    # Single-stage diffusion (test_image_edit_parameter_default_single_stage)
+    # still pins width/height to the input image size via api_server's
+    # gen_params, which is unchanged.
+    assert captured_sampling_params.width is None
+    assert captured_sampling_params.height is None
     assert captured_sampling_params.num_outputs_per_prompt == 1
     assert captured_sampling_params.num_inference_steps == 4
     assert captured_sampling_params.guidance_scale == 7.5
@@ -1649,3 +1657,59 @@ def __init__(self):
     assert len(images) == 1
     assert isinstance(images[0], Image.Image)
     assert images[0].size == (32, 32)
+
+
+def test_image_edits_size_auto_preserves_bridge_size(async_omni_stage_configs_only_client):
+    """size=auto must NOT pin the diffusion stage sampling_params.height/width.
+
+    Regression: prior to the fix, edit_images resolved size=auto to the
+    first input image dimensions and forwarded them through gen_params +
+    extra_body to the diffusion stage's sampling_params. AR-driven
+    pipelines (e.g. HunyuanImage-3.0) rely on ar2diffusion's
+    bridge to override the final bucket via the AR-predicted ratio token,
+    and the DiT pre_process_func only fills sampling_params from the
+    bridge value when sampling_params.width is None (see
+    pipeline_hunyuan_image3.py:290). Non-None width from the input image
+    silently suppressed the AR decision, producing the wrong bucket
+    (e.g. 1024x1024 square instead of the AR-decided 1280x720 landscape
+    for multi-image fusion).
+
+    Cross-pins the multi-image fix at the API level: 2 reference images
+    with bot_task=it2i must produce 2 <img> placeholders in the captured
+    AR prompt (build_prompt called with num_images=2).
+    """
+    img_a = make_test_image_bytes((32, 32))
+    img_b = make_test_image_bytes((128, 64))
+    response = async_omni_stage_configs_only_client.post(
+        "/v1/images/edits",
+        files=[("image", img_a), ("image", img_b)],
+        data={
+            "prompt": "fuse",
+            "size": "auto",
+            "bot_task": "it2i",
+        },
+    )
+    assert response.status_code == 200, response.text
+
+    engine = async_omni_stage_configs_only_client.app.state.engine_client
+    captured = engine.captured_sampling_params_list
+    assert captured is not None
+    assert len(captured) == 2
+
+    diffusion_params = captured[1]
+    assert diffusion_params.height is None, (
+        f"size=auto leaked into diffusion sampling_params.height={diffusion_params.height}; "
+        "must stay None so AR-driven pipelines can apply the bridges decision."
+    )
+    assert diffusion_params.width is None, (
+        f"size=auto leaked into diffusion sampling_params.width={diffusion_params.width}; "
+        "must stay None so AR-driven pipelines can apply the bridges decision."
+    )
+
+    KEY = "prompt"
+    IMG = "<img>"
+    captured_prompt = engine.captured_prompt
+    if isinstance(captured_prompt, dict) and isinstance(captured_prompt.get("prompt"), str):
+        assert captured_prompt["prompt"].count("<img>") == 2, (
+            f"N=2 reference images must emit 2 <img> placeholders in AR prompt; got {captured_prompt[KEY].count(IMG)} -- prompt: {captured_prompt[KEY]!r}"
+        )
diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index 144a0e97a6c..618c2573078 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -91,3 +91,47 @@ def test_build_multistage_generation_inputs_applies_stage_specific_overrides(ser
     assert engine.default_sampling_params_list[1].lora_request is None
     assert engine.default_sampling_params_list[2].resolution == 640
     assert engine.default_sampling_params_list[2].lora_request is None
+
+
+def test_build_multistage_generation_inputs_multi_image_emits_n_img_placeholders(serving_chat):
+    """N reference images with bot_task set must emit N <img> placeholders.
+
+    Regression: prior to the multi-image online fix, build_prompt was
+    called without num_images, defaulting to 1. A 2-image edit request
+    would only get a single <img> placeholder in the AR prompt; vLLM's
+    _process_multimodal then raised
+    AssertionError(Failed to apply prompt replacement for mm_items[image][1])
+    when trying to replace the second image (no placeholder left for it).
+
+    Pins the contract that build_prompt() is invoked with the actual image
+    count so multi-image IT2I is wired correctly through the online
+    /v1/images/edits path.
+    """
+    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+    engine = SimpleNamespace(
+        stage_configs=[
+            SimpleNamespace(stage_type="llm", is_comprehension=True),
+            SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+        ],
+        default_sampling_params_list=[
+            SamplingParams(temperature=0.0),
+            OmniDiffusionSamplingParams(),
+        ],
+    )
+    IMG = "<img>"
+    images = [Image.new("RGB", (32, 32), color="red") for _ in range(3)]
+
+    for n in (1, 2, 3):
+        engine_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+            serving_chat,
+            engine=engine,
+            prompt="edit me",
+            extra_body={"bot_task": "it2i"},
+            reference_images=images[:n],
+            gen_params=OmniDiffusionSamplingParams(),
+        )
+        prompt_str = engine_prompt["prompt"]
+        assert prompt_str.count("<img>") == n, (
+            f"N={n}: expected {n} <img> placeholders, got {prompt_str.count(IMG)} -- prompt: {prompt_str!r}"
+        )
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 06fb0a7f4cb..4227cff2fb6 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1811,7 +1811,8 @@ async def edit_images(
 
         # 3.3 Parse and add size if provided
         width, height = None, None
-        if size.lower() == "auto":
+        size_was_auto = size.lower() == "auto"
+        if size_was_auto:
             if resolution is None:
                 # No resolution specified, use input image size
                 width, height = pil_images[0].size
@@ -1882,10 +1883,17 @@ async def edit_images(
                 "seed": effective_seed,
                 "num_outputs_per_prompt": n,
             }
-            if width is not None:
-                extra_body["width"] = width
-            if height is not None:
-                extra_body["height"] = height
+            # When size="auto", width/height were resolved from the first
+            # input image's size (e.g. 512x512 logo), NOT a client-requested
+            # output dimension. Forwarding them to extra_body would override
+            # AR-driven pipelines' (e.g. HunyuanImage-3.0) AR `<img_ratio_*>`
+            # token decision via gen_params -> sampling_params. Skip the
+            # forward when auto, matching offline end2end.py img2img.
+            if not size_was_auto:
+                if width is not None:
+                    extra_body["width"] = width
+                if height is not None:
+                    extra_body["height"] = height
             if negative_prompt is not None:
                 extra_body["negative_prompt"] = negative_prompt
             if num_inference_steps is not None:
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index a5ca494c89e..022b5d2e95d 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2265,16 +2265,16 @@ def _build_multistage_generation_inputs(
                 build_prompt_tokens,
             )
 
+            num_images = len(reference_images) if reference_images else 1
             prompt_token_ids: list[int] | None = None
             system_prompt_type: str | None = None
             if tokenizer is not None:
-                result = build_prompt_tokens(prompt, tokenizer, task=bot_task)
+                result = build_prompt_tokens(prompt, tokenizer, task=bot_task, num_images=num_images)
                 prompt_token_ids = result.token_ids
                 system_prompt_type = result.system_prompt_type
             else:
-                prompt = build_prompt(prompt, task=bot_task)
+                prompt = build_prompt(prompt, task=bot_task, num_images=num_images)
                 engine_prompt["prompt"] = prompt
-
             if reference_images and len(reference_images) == 1:
                 engine_prompt_data = {"image": reference_images[0]}
                 modalities = ["image"]

From ca830c851b63b9d2deea3d08b53b8315b4a4b5b4 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <wu15922848573@outlook.com>
Date: Mon, 11 May 2026 15:26:07 +0800
Subject: [PATCH 18/43] fix(hunyuan_image3): online IT2I HF byte-equivalent
 prompt path

Follow-up to 815ac732 (online IT2I multi-image + size=auto). Online
still passed the prompt as a string and let the engine BPE-tokenize
the full chat template at once, while offline end2end.py img2img
feeds prompt_token_ids built segment-by-segment via build_prompt_tokens
(mirrors HF apply_chat_template). The two paths produced different
AR input token sequences for the same user inputs:

- offline (build_prompt_tokens): AR 661 tokens / 1118 chars cot
- online  (build_prompt string):  AR 706 tokens / 1190 chars cot

The mismatch silently pushed AR's input off its training distribution (cross-segment
BPE merges, e.g. <full_stop><newline><newline> -> single id, vs HF's
[1811, 271]). AR produced different cot_text and DiT produced a visually
different image even with the same seed/prompt/reference images.

This patch threads the comprehension stage's tokenizer through
generate_diffusion_images -> _build_multistage_generation_inputs.
When a tokenizer is available (multi-stage AR-driven path), the helper:

  1. Calls build_prompt_tokens(prompt, tokenizer, task=bot_task,
     num_images=N) and writes engine_prompt[prompt_token_ids];
     engine_prompt[prompt] stays as the raw user text so ar2diffusion
     can hand it through to DiT.
  2. Sets engine_prompt[use_system_prompt] = resolve_sys_type(think)
     -> en_unified, matching offline end2end.py img2img which always
     forwards an explicit use_system_prompt.

Falls back to the original build_prompt string path when no tokenizer
is plumbed (legacy callers / unit tests), so existing flows still work.

E2E smoke (4x L20X, HunyuanImage-3.0-Instruct, 2 ref images, curl
/v1/images/edits with size=auto, seed=42, steps=50, guidance=5.0):

- before: AR 706 / 1190, brushed-metal yin-yang (BPE merges diverged)
- after:  AR 660 / 1148, canvas background restored (1 token / 30 char
  delta vs offline 661 / 1118 is within sampling noise; same en_unified
  sys prompt + <think> trigger on both sides).

Tests:
- test_build_multistage_generation_inputs_tokenizer_path_emits_prompt_token_ids
  pins:
    (a) engine_prompt[prompt_token_ids] set when tokenizer is passed,
    (b) engine_prompt[prompt] preserved as raw user text,
    (c) engine_prompt[use_system_prompt] == en_unified,
    (d) N <img> token ids in prompt_token_ids for N=1,2,3.

Follow-ups (separate patches):
  - Public API surface for task / bot_task separation (online callers
    currently pass bot_task in extra_body but the value semantically
    means task; needed to express think_recaption / recaption / vanilla).
  - HF byte-for-byte parity assertion across offline and online once the
    API split lands.

Signed-off-by: TaffyOfficial <wu15922848573@outlook.com>
---
 ...test_serving_chat_multistage_generation.py | 81 +++++++++++++++++++
 vllm_omni/entrypoints/openai/serving_chat.py  | 37 ++++++---
 2 files changed, 108 insertions(+), 10 deletions(-)

diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index 618c2573078..b0871732f6a 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -135,3 +135,84 @@ def test_build_multistage_generation_inputs_multi_image_emits_n_img_placeholders
         assert prompt_str.count("<img>") == n, (
             f"N={n}: expected {n} <img> placeholders, got {prompt_str.count(IMG)} -- prompt: {prompt_str!r}"
         )
+
+
+def test_build_multistage_generation_inputs_tokenizer_path_emits_prompt_token_ids(serving_chat):
+    """When a tokenizer is provided, the helper must emit HF byte-for-byte
+    prompt_token_ids and forward use_system_prompt to the engine prompt.
+
+    Regression: prior to the HF-byte-equivalent fix, online IT2I always
+    passed the prompt as a single string. The engine then BPE-merged across
+    chat-template segment boundaries (e.g. user_prompt-ending punctuation
+    plus the trailing \n\n before \"Assistant: \") producing a token
+    sequence that differs from HF apply_chat_template / offline
+    end2end.py. AR generated different cot_text (706 tokens / 1190 chars
+    vs offline 661 / 1118 for the same inputs) and DiT produced a visually
+    different image (yin-yang on brushed-metal vs three-blue swirl on
+    canvas) under the same seed.
+
+    Pins:
+      1. engine_prompt[\"prompt_token_ids\"] is set when tokenizer is passed.
+      2. engine_prompt[\"prompt\"] stays as the raw user prompt -- the DiT
+         side rebuilds its own system prefix via use_system_prompt.
+      3. engine_prompt[\"use_system_prompt\"] == \"en_unified\" so
+         ar2diffusion forwards the matching system prompt to DiT.
+      4. N reference images emit N <img> token ids in the AR sequence.
+    """
+    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+    # Minimal FakeTokenizer mirroring tests/diffusion/.../test_hunyuan_image3_it2i_multi_image.py
+    class FakeTokenizer:
+        SPECIAL = {
+            "<|startoftext|>": 1,
+            "<img>": 2,
+            "<think>": 3,
+            "<recaption>": 4,
+        }
+
+        def convert_tokens_to_ids(self, tok: str) -> int:
+            return self.SPECIAL.get(tok, 0)
+
+        def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
+            return list(range(100, 100 + len(text)))
+
+    engine = SimpleNamespace(
+        stage_configs=[
+            SimpleNamespace(stage_type="llm", is_comprehension=True),
+            SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+        ],
+        default_sampling_params_list=[
+            SamplingParams(temperature=0.0),
+            OmniDiffusionSamplingParams(),
+        ],
+    )
+    PROMPT_KEY = "prompt"
+    USP_KEY = "use_system_prompt"
+    images = [Image.new("RGB", (32, 32), color="red") for _ in range(3)]
+
+    for n in (1, 2, 3):
+        tok = FakeTokenizer()
+        engine_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+            serving_chat,
+            engine=engine,
+            prompt="edit me",
+            extra_body={"bot_task": "it2i"},
+            reference_images=images[:n],
+            gen_params=OmniDiffusionSamplingParams(),
+            tokenizer=tok,
+        )
+        # (1) prompt_token_ids must be set and non-empty
+        assert "prompt_token_ids" in engine_prompt, f"N={n}: prompt_token_ids missing"
+        token_ids = engine_prompt["prompt_token_ids"]
+        assert isinstance(token_ids, list) and len(token_ids) > 0, f"N={n}: prompt_token_ids empty"
+        # (2) raw prompt preserved (DiT bridge needs raw user text)
+        assert engine_prompt["prompt"] == "edit me", (
+            f"N={n}: prompt must stay raw user text, got {engine_prompt[PROMPT_KEY]!r}"
+        )
+        # (3) use_system_prompt forwarded for ar2diffusion bridge
+        assert engine_prompt.get("use_system_prompt") == "en_unified", (
+            f"N={n}: use_system_prompt must be en_unified, got {engine_prompt.get(USP_KEY)!r}"
+        )
+        # (4) N <img> token ids (id=2 in FakeTokenizer)
+        img_count = token_ids.count(2)
+        assert img_count == n, f"N={n}: expected {n} <img> token ids in prompt_token_ids, got {img_count}"
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 022b5d2e95d..2738f648e09 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2258,7 +2258,8 @@ def _build_multistage_generation_inputs(
             else:
                 engine_prompt_data = {"image": reference_images}
 
-        engine_prompt: OmniTextPrompt = {"prompt": prompt}
+        prompt_token_ids: list[int] | None = None
+        system_prompt_type: str | None = None
         if bot_task:
             from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
                 build_prompt,
@@ -2266,23 +2267,35 @@ def _build_multistage_generation_inputs(
             )
 
             num_images = len(reference_images) if reference_images else 1
-            prompt_token_ids: list[int] | None = None
-            system_prompt_type: str | None = None
             if tokenizer is not None:
-                result = build_prompt_tokens(prompt, tokenizer, task=bot_task, num_images=num_images)
+                # HF byte-for-byte path: feed segment-tokenized prompt_token_ids
+                # so AR sees the same template-tokenization HF apply_chat_template
+                # produces. Without this, the engine BPE-merges across template
+                # segment boundaries (e.g. "。\n\n" -> single id) and AR
+                # diverges from training distribution -- different cot_text,
+                # different DiT input, different final image. Mirrors offline
+                # examples/.../end2end.py img2img which always feeds
+                # prompt_token_ids. See prompt_utils.build_prompt NOTE.
+                result = build_prompt_tokens(
+                    prompt,
+                    tokenizer,
+                    task=bot_task,
+                    num_images=num_images,
+                )
                 prompt_token_ids = result.token_ids
                 system_prompt_type = result.system_prompt_type
             else:
+                # Legacy string path (e.g. unit tests with no tokenizer plumbed).
                 prompt = build_prompt(prompt, task=bot_task, num_images=num_images)
-                engine_prompt["prompt"] = prompt
             if reference_images and len(reference_images) == 1:
                 engine_prompt_data = {"image": reference_images[0]}
                 modalities = ["image"]
-            if prompt_token_ids is not None:
-                engine_prompt["prompt_token_ids"] = prompt_token_ids
-            if system_prompt_type is not None:
-                engine_prompt["use_system_prompt"] = system_prompt_type
 
+        engine_prompt: OmniTextPrompt = {"prompt": prompt}
+        if prompt_token_ids is not None:
+            engine_prompt["prompt_token_ids"] = prompt_token_ids
+        if system_prompt_type is not None:
+            engine_prompt["use_system_prompt"] = system_prompt_type
         engine_prompt["modalities"] = modalities
         if negative_prompt is not None:
             engine_prompt["negative_prompt"] = negative_prompt
@@ -2456,13 +2469,17 @@ async def generate_diffusion_images(
             diffusion_engine = cast(AsyncOmni, engine)
             stage_configs = getattr(diffusion_engine, "stage_configs", None) or []
             if len(stage_configs) > 1:
+                # Pull tokenizer from the comprehension (AR) stage so we can
+                # build HF byte-for-byte prompt_token_ids in the helper. If
+                # the engine doesn't expose one, fall back to the legacy
+                # string-prompt path (engine re-tokenizes).
                 tokenizer = None
                 get_tok = getattr(diffusion_engine, "get_tokenizer", None)
                 if get_tok is not None:
                     try:
                         tokenizer = await get_tok()
                     except Exception as exc:
-                        logger.warning("get_tokenizer failed: %s", exc)
+                        logger.warning("get_tokenizer failed; falling back to string prompt path: %s", exc)
                 engine_prompt, sampling_params_list = self._build_multistage_generation_inputs(
                     engine=diffusion_engine,
                     prompt=prompt,

From c2ea079927380256fc5424cf513d25d721577f6b Mon Sep 17 00:00:00 2001
From: TaffyOfficial <wu15922848573@outlook.com>
Date: Mon, 11 May 2026 16:03:25 +0800
Subject: [PATCH 19/43] fix(hunyuan_image3): align DiT tokenization with
 AR-sampled token IDs

Follow-up to 94830bdd (HF byte-equivalent prompt on AR side). DiT side
was still re-encoding the AR-decoded cot text via tokenizer.encode,
which is not lossless when AR-sampled tokens decode to text whose BPE
re-merges differ from AR's original token sequence -- e.g. Chinese
punctuation, escaped quotes, and multi-byte UTF-8 boundaries silently
shift the token count by N for the same content.

For KV-reuse-enabled requests this is fatal: AR caches K/V at AR-tok
positions (length L_ar), but DiT computes positive_reuse_len from
think_recaption_end_pos in its OWN tokenizer_output (length L_dit !=
L_ar). inject_ar_kv_into_layers then silently slices k[:positive_reuse_len]
from a shorter tensor (Python slice tolerates out-of-bounds) and
_cache_prompt_kv's assert q_len + ar_kv_len == seq_len fires with
ar_kv_len = L_ar while seq_len was computed with positive_reuse_len = L_dit.
User-observed: q_len(4105) + ar_kv_len(6740) != seq_len(10854), off by 9
on a Chinese-heavy IT2I prompt.

For non-KV-reuse requests the same drift exists but is silently
absorbed: AR sees its training-distribution tokens, DiT sees a
different prefix, and output image quality subtly diverges (the
3-magnet vs 1-magnet pattern in the earlier P0 e2e smoke).

ar2diffusion bridge already forwards extra.ar_token_ids alongside
extra.ar_generated_text since the multi-image PR landed -- this patch
just teaches DiT to consume it.

Surgery points:

1. hunyuan_image3_tokenizer.py: get_cot_sections_from_token_ids
   Mirror of get_cot_sections but splits at <think>/<recaption> marker
   token IDs in AR-sampled space instead of text-split. Emits sections
   carrying pre-tokenized tokens=[...] which encode_text already
   consumes verbatim (line 152-154: if isinstance(text, str): encode;
   else: use as-is).

2. hunyuan_image3_tokenizer.py: apply_chat_template adds optional
   batch_cot_token_ids: list[Any] | None param. When provided per
   batch item, the assistant message is built with context_type=token_ids
   (vs str). Backward compatible: callers passing only batch_cot_text
   keep working.

3. hunyuan_image3_tokenizer.py: process_successive_message handles
   context_type==token_ids for assistant role -- splits on marker IDs
   when both <think>+</think> or <recaption>+</recaption> tokens are
   present, otherwise wraps the full ID sequence as a single text
   section with tokens=... .

4. pipeline_hunyuan_image3.py: forward() extracts extra.ar_token_ids
   alongside extra.ar_generated_text from each prompt and threads
   cot_token_ids through prepare_model_inputs ->
   apply_chat_template.batch_cot_token_ids. Prefer ID path when
   available; fall back to text path otherwise (back-compat for
   non-AR-driven flows that don't set ar_token_ids).

E2E smoke (4x L20X, HunyuanImage-3.0-Instruct, two ref images, curl
/v1/images/edits, size=auto, seed=42, steps=50, guidance=5.0,
non-KV-reuse stage configs): HTTP 200, 1280x720 PNG, AR 641 tokens /
1107 chars cot. No regression in existing flows (149 unit tests pass).

KV-reuse e2e validation in this run was blocked by an orthogonal
environment issue (gpu_memory_utilization=0.95 in user yaml + post-load
FusedMoeRunner workspace allocation overshoots) rather than a code
defect; the byte-aligned ar_token_ids path is what the assertion
requires, verified via unit tests.

Tests:
- tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py
    * test_get_cot_sections_from_token_ids_round_trips_ar_ids
      pins lossless splitting at AR-tok <think>/</think> markers (no
      re-encode).
    * test_apply_chat_template_batch_cot_token_ids_preserves_ar_ids
      pins end-to-end contract that apply_chat_template emits the
      AR-sampled ID sequence verbatim in the final encoded output.

Signed-off-by: TaffyOfficial <wu15922848573@outlook.com>
---
 .../hunyuan_image3/test_kvreuse_alignment.py  | 135 ++++++++++++++++++
 .../hunyuan_image3_tokenizer.py               | 131 +++++++++++++++--
 .../hunyuan_image3/pipeline_hunyuan_image3.py |  15 ++
 3 files changed, 272 insertions(+), 9 deletions(-)
 create mode 100644 tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py

diff --git a/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py b/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py
new file mode 100644
index 00000000000..20faf5487dc
--- /dev/null
+++ b/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py
@@ -0,0 +1,135 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Regression tests for AR-token-IDs preservation through DiT prompt building.
+
+Pins the KV-reuse alignment contract: when the AR-side stage input
+processor (`ar2diffusion`) forwards `ar_token_ids` to the diffusion
+stage, `apply_chat_template` must consume those IDs verbatim (no
+re-encode of the decoded cot text via `tokenizer.encode`) so that the
+DiT-side prompt tokenization matches AR's actually-sampled token
+sequence byte-for-byte.
+
+Why this matters: tokenize-detokenize-tokenize over the cot text is not
+lossless (BPE re-merges on multi-byte UTF-8 / punctuation boundaries),
+and the resulting length drift breaks AR KV position alignment --
+DiT's `positive_reuse_len` (computed from `tokenizer.encode(cot_text)`)
+ends up larger than the actual cached AR KV length, and
+`inject_ar_kv_into_layers` then silently truncates via Python slice,
+leaving `_cache_prompt_kv`'s `q_len + ar_kv_len == seq_len` assert off
+by N (hard 500 on KV-reuse-enabled requests; see
+`pipeline_hunyuan_image3.py:_cache_prompt_kv`).
+"""
+
+from __future__ import annotations
+
+import os
+
+import pytest
+
+pytestmark = [pytest.mark.core_model]
+
+
+def _hf_cached(model_id: str) -> bool:
+    hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
+    snap_dir = os.path.join(hf_home, "hub", f"models--{model_id.replace('/', '--')}", "snapshots")
+    return os.path.isdir(snap_dir) and any(os.scandir(snap_dir))
+
+
+_HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct"
+
+
+@pytest.mark.skipif(
+    not _hf_cached(_HUNYUAN_MODEL_ID),
+    reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache",
+)
+def test_get_cot_sections_from_token_ids_round_trips_ar_ids():
+    """`get_cot_sections_from_token_ids` must split AR-sampled IDs at the
+    `<think>` / `</think>` token-id positions and emit sections whose
+    concatenated tokens equal the input (no re-encode).
+
+    Catches the failure mode where DiT re-encodes the decoded cot text
+    and the BPE merges differ from AR's sampled tokens (length drift).
+    """
+    from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer import (
+        TokenizerWrapper,
+    )
+
+    tkw = TokenizerWrapper(_HUNYUAN_MODEL_ID)
+
+    think_id = tkw.tokenizer.convert_tokens_to_ids("<think>")
+    end_think_id = tkw.end_think_token_id
+
+    # Fabricate an AR-style id sequence: arbitrary "thought" payload tokens
+    # surrounded by <think>/</think> markers, plus some leading + trailing
+    # tokens (e.g. <answer>/<boi> tail that gets truncated upstream).
+    thought_payload = [1000, 1001, 1002, 1003, 1004]
+    leading = [2000, 2001]
+    trailing = [3000]
+    ar_token_ids = leading + [think_id] + thought_payload + [end_think_id] + trailing
+
+    sections = tkw.get_cot_sections_from_token_ids(
+        ar_token_ids,
+        uncond_kwargs={},
+        drop_think=False,
+    )
+
+    # Sections concatenated must equal the input verbatim.
+    out: list[int] = []
+    for sec in sections:
+        assert sec["type"] == "text", f"unexpected section type: {sec}"
+        toks = sec.get("tokens")
+        assert toks is not None, f"section missing 'tokens' field: {sec}"
+        out.extend(toks)
+    assert out == ar_token_ids, (
+        f"split-by-token-id must be lossless; got {len(out)} ids vs {len(ar_token_ids)} input; "
+        f"diff at first mismatch index = {next((i for i, (a, b) in enumerate(zip(out, ar_token_ids)) if a != b), None)}"
+    )
+
+
+@pytest.mark.skipif(
+    not _hf_cached(_HUNYUAN_MODEL_ID),
+    reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache",
+)
+def test_apply_chat_template_batch_cot_token_ids_preserves_ar_ids():
+    """When `batch_cot_token_ids` is passed, the assistant section in the
+    final encoded token sequence must contain the AR-sampled token ids
+    verbatim -- no `tokenizer.encode(cot_text)` round-trip.
+
+    Pins the end-to-end contract that KV-reuse alignment relies on.
+    """
+    from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer import (
+        TokenizerWrapper,
+    )
+
+    tkw = TokenizerWrapper(_HUNYUAN_MODEL_ID)
+    think_id = tkw.tokenizer.convert_tokens_to_ids("<think>")
+    end_think_id = tkw.end_think_token_id
+
+    # Construct a synthetic AR cot id sequence. Use mid-range vocab ids
+    # that are very unlikely to collide with any chat-template specials.
+    payload = [55001, 55002, 55003]
+    ar_token_ids = [think_id] + payload + [end_think_id]
+
+    out_with_ids = tkw.apply_chat_template(
+        batch_prompt=["draw a robot"],
+        batch_system_prompt=[None],
+        batch_cot_token_ids=[ar_token_ids],
+        mode="gen_text",
+        sequence_template="instruct",
+    )
+    tokens_with_ids = out_with_ids["output"].tokens.tolist()[0]  # batched output: take batch 0
+
+    # The exact AR payload must appear as a contiguous subsequence in the
+    # encoded output, sandwiched by the think markers we forwarded.
+    def _find_subseq(haystack: list[int], needle: list[int]) -> int:
+        n = len(needle)
+        for i in range(len(haystack) - n + 1):
+            if haystack[i : i + n] == needle:
+                return i
+        return -1
+
+    full_cot = [think_id] + payload + [end_think_id]
+    idx = _find_subseq(tokens_with_ids, full_cot)
+    assert idx >= 0, (
+        f"AR cot ids {full_cot} not found as contiguous subseq in encoded output; "
+        f"means apply_chat_template did NOT respect batch_cot_token_ids and re-encoded cot text instead"
+    )
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
index 751bfb21af8..e6e0c9db346 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
@@ -903,6 +903,75 @@ def get_cot_sections(self, cot_text, uncond_kwargs, cot_max_length=None, drop_th
             dict(type="text", text=cot_text, **uncond_kwargs),
         ]
 
+    def get_cot_sections_from_token_ids(
+        self,
+        token_ids,
+        uncond_kwargs,
+        cot_max_length=None,
+        drop_think=False,
+    ):
+        """Split AR-sampled token IDs at think/recaption markers without re-encoding.
+
+        Functional mirror of `get_cot_sections` but operates on AR sampled IDs.
+        Used by KV-reuse-aware callers: tokenize-detokenize-tokenize over the AR
+        cot text is not lossless (BPE re-merges across multi-byte UTF-8 and
+        punctuation boundaries). The resulting length drift breaks AR KV
+        position alignment (`positive_reuse_len` computed in DiT-tok space vs
+        the actual cached AR KV in AR-tok space, off by N tokens for prompts
+        containing Chinese + escaped quotes etc.).
+        """
+        if not token_ids:
+            return []
+        ids = list(token_ids)
+
+        think_id = self.tokenizer.convert_tokens_to_ids("<think>")
+        end_think_id = self.end_think_token_id
+        recaption_id = self.tokenizer.convert_tokens_to_ids("<recaption>")
+        end_recaption_id = self.end_recaption_token_id
+
+        def _split_at_pair(seq, start_id, end_id):
+            if start_id is None or end_id is None:
+                return None
+            try:
+                s = seq.index(start_id)
+                e = seq.index(end_id, s + 1)
+            except ValueError:
+                return None
+            return seq[:s], seq[s + 1 : e], seq[e + 1 :]
+
+        # Try <think>...</think> first to mirror text-side split order.
+        split = _split_at_pair(ids, think_id, end_think_id)
+        if split is not None:
+            before, inside, after = split
+            return (
+                self.get_cot_sections_from_token_ids(before, uncond_kwargs, drop_think=drop_think)
+                + (
+                    [
+                        dict(type="text", tokens=[think_id]),
+                        dict(type="text", tokens=inside, max_length=cot_max_length, **uncond_kwargs),
+                        dict(type="text", tokens=[end_think_id]),
+                    ]
+                    if not drop_think
+                    else []
+                )
+                + self.get_cot_sections_from_token_ids(after, uncond_kwargs, drop_think=drop_think)
+            )
+
+        split = _split_at_pair(ids, recaption_id, end_recaption_id)
+        if split is not None:
+            before, inside, after = split
+            return (
+                self.get_cot_sections_from_token_ids(before, uncond_kwargs, drop_think=drop_think)
+                + [
+                    dict(type="text", tokens=[recaption_id]),
+                    dict(type="text", tokens=inside, max_length=cot_max_length, **uncond_kwargs),
+                    dict(type="text", tokens=[end_recaption_id]),
+                ]
+                + self.get_cot_sections_from_token_ids(after, uncond_kwargs, drop_think=drop_think)
+            )
+
+        return [dict(type="text", tokens=ids, **uncond_kwargs)]
+
     def apply_general_template(
         self,
         message_list,
@@ -953,17 +1022,36 @@ def process_successive_message(
             while _cur_message_idx < len(message_list) and _message_list[_cur_message_idx]["role"] == role:
                 message = _message_list[_cur_message_idx]
                 if message["type"] == "text":
-                    text = message["content"]
+                    content = message["content"]
+                    ctx_type = message.get("context_type", "str")
                     if role == "system":
-                        _sub_sections.append(dict(type="text", text=text))
+                        _sub_sections.append(dict(type="text", text=content))
                     elif role == "assistant":
-                        if ("<recaption>" in text and "</recaption>" in text) or (
-                            "<think>" in text and "</think>" in text
-                        ):
-                            _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think))
+                        if ctx_type == "token_ids":
+                            # Pre-tokenized AR cot tokens; split on marker ids, no re-encode.
+                            if hasattr(content, "tolist"):
+                                content = content.tolist()
+                            think_id = self.tokenizer.convert_tokens_to_ids("<think>")
+                            recaption_id = self.tokenizer.convert_tokens_to_ids("<recaption>")
+                            has_cot = (think_id in content and self.end_think_token_id in content) or (
+                                recaption_id in content and self.end_recaption_token_id in content
+                            )
+                            if has_cot:
+                                _sub_sections.extend(
+                                    self.get_cot_sections_from_token_ids(content, uncond_kwargs, drop_think=drop_think)
+                                )
+                            else:
+                                _sub_sections.append(dict(type="text", tokens=content, **uncond_kwargs))
                         else:
-                            _sub_sections.append(dict(type="text", text=text, **uncond_kwargs))
+                            text = content
+                            if ("<recaption>" in text and "</recaption>" in text) or (
+                                "<think>" in text and "</think>" in text
+                            ):
+                                _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think))
+                            else:
+                                _sub_sections.append(dict(type="text", text=text, **uncond_kwargs))
                     else:
+                        text = content
                         _sub_sections.append(
                             dict(type="text", text=f"{answer_prefix}{text}{answer_suffix}", **uncond_kwargs)
                         )
@@ -1088,6 +1176,7 @@ def apply_chat_template(
         batch_cond_image_info: list[JointImageInfo] | list[list[JointImageInfo]] | None = None,
         batch_system_prompt: list[str] | None = None,
         batch_cot_text: list[str] | None = None,
+        batch_cot_token_ids: list | None = None,
         max_length: int | None = None,
         bot_task: str = "auto",  # auto/image/think/recaption/img_ratio
         image_base_size: int = 1024,
@@ -1116,6 +1205,14 @@ def apply_chat_template(
                 )
             else:
                 batch_cot_text = [None] * batch_size
+            # Optional per-item pre-tokenized AR cot ids (used by KV-reuse).
+            if batch_cot_token_ids is not None:
+                assert len(batch_cot_token_ids) == batch_size, (
+                    f"batch_cot_token_ids should have the same length as batch_size ({batch_size}), "
+                    f"but got {len(batch_cot_token_ids)}."
+                )
+            else:
+                batch_cot_token_ids = [None] * batch_size
             if batch_cond_image_info is not None:
                 assert len(batch_cond_image_info) == batch_size, (
                     f"batch_cond_image_info should have the same length as batch_size ({batch_size}), "
@@ -1130,10 +1227,18 @@ def apply_chat_template(
 
             # Convert single round materials into standard message list
             batch_message_list = []
-            for prompt, system_prompt, cot_text, gen_image_info, cond_image_info_list in zip(
+            for (
+                prompt,
+                system_prompt,
+                cot_text,
+                cot_token_ids,
+                gen_image_info,
+                cond_image_info_list,
+            ) in zip(
                 batch_prompt,
                 batch_system_prompt,
                 batch_cot_text,
+                batch_cot_token_ids,
                 batch_gen_image_info,
                 batch_cond_image_info,
             ):
@@ -1153,7 +1258,15 @@ def apply_chat_template(
                 #   2.2 text inputs
                 message_list.append(dict(role="user", type="text", content=prompt, context_type="str"))
                 # 3. assistant answer sections
-                if cot_text is not None:
+                if cot_token_ids is not None:
+                    # Use AR-sampled token IDs verbatim. Avoids the
+                    # tokenize-detokenize-tokenize length drift that breaks KV reuse
+                    # (see process_successive_message context_type="token_ids" branch
+                    # and get_cot_sections_from_token_ids docstring).
+                    message_list.append(
+                        dict(role="assistant", type="text", content=cot_token_ids, context_type="token_ids")
+                    )
+                elif cot_text is not None:
                     message_list.append(dict(role="assistant", type="text", content=cot_text, context_type="str"))
                 if mode == "gen_image":
                     message_list.append(
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index b1ba2687f86..5c6ddba0b64 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -750,6 +750,7 @@ def prepare_model_inputs(
         mode="gen_image",
         system_prompt=None,
         cot_text=None,
+        cot_token_ids=None,
         num_inference_steps=50,
         guidance_scale=5.0,
         image_size="auto",
@@ -766,6 +767,7 @@ def prepare_model_inputs(
         batch_message_list = message_list
         batch_prompt = prompt
         batch_cot_text = cot_text
+        batch_cot_token_ids = cot_token_ids
         batch_system_prompt = system_prompt
         batch_gen_image_info = None
         batch_cond_image_info = kwargs.pop("batch_cond_image_info", None)
@@ -844,6 +846,7 @@ def prepare_model_inputs(
             batch_cond_image_info=batch_cond_image_info,
             batch_system_prompt=batch_system_prompt,
             batch_cot_text=batch_cot_text,
+            batch_cot_token_ids=batch_cot_token_ids,
             max_length=kwargs.get("max_length"),
             bot_task=bot_task,
             image_base_size=self.config.image_base_size,
@@ -1376,12 +1379,23 @@ def forward(
         # and ``get_cot_sections()`` can parse the think/recaption structure
         # directly.
         cot_text_list = []
+        cot_token_ids_list = []
         for p in req.prompts:
             extra = p.get("extra", {}) if isinstance(p, dict) else {}
             cot_text_list.append(extra.get("ar_generated_text") or None)
+            cot_token_ids_list.append(extra.get("ar_token_ids"))
         cot_text = (
             [self._normalize_cot_text(t) for t in cot_text_list] if any(t is not None for t in cot_text_list) else None
         )
+        # Prefer AR-sampled token IDs over the decoded cot text so DiTs prompt
+        # tokenization matches ARs actual token sequence byte-for-byte. Required
+        # when KV reuse is enabled: positive_reuse_len computed from DiT-side
+        # tokenization must equal the AR-side KV cache length, otherwise the
+        # silent slice in inject_ar_kv_into_layers leaves _cache_prompt_kvs
+        # `q_len + ar_kv_len == seq_len` assert off by N (BPE re-merge drift on
+        # multi-byte/punctuation boundaries; see get_cot_sections_from_token_ids
+        # in hunyuan_image3_tokenizer.py).
+        cot_token_ids = cot_token_ids_list if any(t is not None for t in cot_token_ids_list) else None
 
         batch_cond_image_info: list[list[JointImageInfo]] | None = None
         if any(not isinstance(p, str) for p in req.prompts):
@@ -1422,6 +1436,7 @@ def forward(
         model_inputs = self.prepare_model_inputs(
             prompt=prompt,
             cot_text=cot_text,
+            cot_token_ids=cot_token_ids,
             system_prompt=system_prompt,
             mode="gen_image",
             generator=generator,

From 1454f441ecf76152bcb67629f6fcb446ad9aa3f4 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <wu15922848573@outlook.com>
Date: Mon, 11 May 2026 16:17:04 +0800
Subject: [PATCH 20/43] fix(hunyuan_image3): split task / bot_task / sys_type
 at /v1/images/edits

Before P1, /v1/images/edits exposed a single `bot_task` Form field that
was misused: callers passed a `task` enum value (i2t / it2i / t2i / t2t)
under that name, and _build_multistage_generation_inputs forwarded it
as `task` to build_prompt with bot_task defaulted to "think". This
blocked clients from expressing:

  - the bot_task semantic (think / recaption / think_recaption / vanilla)
  - sys_type override (offline `--sys-type`)

Both knobs are needed to drive the online OpenAI API 1:1 against the
offline examples/.../end2end.py img2img surface.

Changes:

1. api_server.py: edit_images Form params add task: str | None and
   sys_type: str | None. Legacy bot_task=<task-enum-value> is auto
   promoted to task=<value>, bot_task=None so old clients keep working.

2. api_server.py: forward all three keys (task / bot_task / sys_type)
   to extra_body instead of writing a single misleading bot_task key.

3. serving_chat.py:_build_multistage_generation_inputs reads the
   triple, applies the same legacy normalization (defends against
   direct chat_handler callers passing the pre-P1 shape), and threads
   bot_task + sys_type through build_prompt_tokens / build_prompt.
   use_system_prompt forwarded to ar2diffusion now respects the
   override.

Tests (new):

  - test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged
    Legacy extra_body={"bot_task": "it2i"} produces a prompt byte
    identical to extra_body={"task": "it2i"} (back-compat).

  - test_build_multistage_generation_inputs_bot_task_semantic_changes_trigger_and_sys
    bot_task=think vs bot_task=think_recaption produce different
    rendered prompts (system body differs); pins that bot_task is
    actually plumbed through rather than collapsed to think default.

  - test_build_multistage_generation_inputs_sys_type_override
    sys_type=en_unified over bot_task=think_recaption reproduces the
    same prompt body as bot_task=think (offline override pattern).

Follow-up (not in this patch):

  - Mirror task / bot_task / sys_type on /v1/images/generations JSON
    schema (ImageGenerationRequest) for consistency across endpoints.

Signed-off-by: TaffyOfficial <wu15922848573@outlook.com>
---
 ...test_serving_chat_multistage_generation.py | 195 ++++++++++++++++++
 vllm_omni/entrypoints/openai/api_server.py    |  29 ++-
 vllm_omni/entrypoints/openai/serving_chat.py  |  38 +++-
 3 files changed, 257 insertions(+), 5 deletions(-)

diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index b0871732f6a..88d15a684b6 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -216,3 +216,198 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
         # (4) N <img> token ids (id=2 in FakeTokenizer)
         img_count = token_ids.count(2)
         assert img_count == n, f"N={n}: expected {n} <img> token ids in prompt_token_ids, got {img_count}"
+
+
+def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(serving_chat):
+    """Legacy callers passed a task-enum value (i2t/it2i/t2i/t2t) under
+    `bot_task` in extra_body. After the P1 task/bot_task split, the helper
+    must still treat that legacy form as `task=<value>, bot_task=None`
+    (i.e. defaults bot_task semantic to "think"), so the resulting prompt
+    is identical to the pre-P1 output.
+
+    Pins the back-compat contract.
+    """
+    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+    engine = SimpleNamespace(
+        stage_configs=[
+            SimpleNamespace(stage_type="llm", is_comprehension=True),
+            SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+        ],
+        default_sampling_params_list=[
+            SamplingParams(temperature=0.0),
+            OmniDiffusionSamplingParams(),
+        ],
+    )
+    images = [Image.new("RGB", (32, 32), color="red"), Image.new("RGB", (32, 32), color="blue")]
+
+    # Legacy form: only bot_task=<task-enum>.
+    legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+        serving_chat,
+        engine=engine,
+        prompt="edit me",
+        extra_body={"bot_task": "it2i"},
+        reference_images=images,
+        gen_params=OmniDiffusionSamplingParams(),
+    )
+    # New form: explicit task=<task-enum>, no bot_task.
+    new_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+        serving_chat,
+        engine=engine,
+        prompt="edit me",
+        extra_body={"task": "it2i"},
+        reference_images=images,
+        gen_params=OmniDiffusionSamplingParams(),
+    )
+    assert legacy_prompt["prompt"] == new_prompt["prompt"], (
+        f"legacy bot_task=<task> form must produce the same prompt as task=<task>; "
+        f"legacy={legacy_prompt['prompt']!r} new={new_prompt['prompt']!r}"
+    )
+
+
+def test_build_multistage_generation_inputs_bot_task_semantic_changes_trigger_and_sys(serving_chat):
+    """Passing bot_task=think_recaption (vs default "think") must flip the
+    resolved sys_type to en_think_recaption (and trigger tag is still
+    <think>). Pins that the API actually plumbs the bot_task semantic
+    through to build_prompt rather than ignoring it.
+    """
+    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+    engine = SimpleNamespace(
+        stage_configs=[
+            SimpleNamespace(stage_type="llm", is_comprehension=True),
+            SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+        ],
+        default_sampling_params_list=[
+            SamplingParams(temperature=0.0),
+            OmniDiffusionSamplingParams(),
+        ],
+    )
+    images = [Image.new("RGB", (32, 32), color="red")]
+
+    # Default bot_task (think) -> en_unified system prompt baked into the
+    # legacy string path. Use legacy build_prompt (tokenizer=None) so the
+    # rendered prompt is a string we can grep.
+    think_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+        serving_chat,
+        engine=engine,
+        prompt="edit me",
+        extra_body={"task": "it2i", "bot_task": "think"},
+        reference_images=images,
+        gen_params=OmniDiffusionSamplingParams(),
+    )
+    # think_recaption -> en_think_recaption system prompt (different content).
+    recap_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+        serving_chat,
+        engine=engine,
+        prompt="edit me",
+        extra_body={"task": "it2i", "bot_task": "think_recaption"},
+        reference_images=images,
+        gen_params=OmniDiffusionSamplingParams(),
+    )
+    assert think_prompt["prompt"] != recap_prompt["prompt"], (
+        "bot_task semantic must change the rendered system prompt: "
+        f"think/think_recaption produced identical strings (len={len(think_prompt['prompt'])})"
+    )
+
+
+def test_build_multistage_generation_inputs_sys_type_override(serving_chat):
+    """Caller-supplied sys_type must override the bot_task-derived default.
+    Mirrors offline `--bot-task think_recaption --sys-type en_unified`
+    where the user wants think_recaptions trigger but the unified system
+    prompt body.
+    """
+    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+    engine = SimpleNamespace(
+        stage_configs=[
+            SimpleNamespace(stage_type="llm", is_comprehension=True),
+            SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+        ],
+        default_sampling_params_list=[
+            SamplingParams(temperature=0.0),
+            OmniDiffusionSamplingParams(),
+        ],
+    )
+    images = [Image.new("RGB", (32, 32), color="red")]
+
+    # think_recaption defaults sys_type -> en_think_recaption.
+    default_sys, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+        serving_chat,
+        engine=engine,
+        prompt="edit me",
+        extra_body={"task": "it2i", "bot_task": "think_recaption"},
+        reference_images=images,
+        gen_params=OmniDiffusionSamplingParams(),
+    )
+    # sys_type=en_unified overrides -> same system body as bot_task=think.
+    overridden, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+        serving_chat,
+        engine=engine,
+        prompt="edit me",
+        extra_body={"task": "it2i", "bot_task": "think_recaption", "sys_type": "en_unified"},
+        reference_images=images,
+        gen_params=OmniDiffusionSamplingParams(),
+    )
+    plain_think, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+        serving_chat,
+        engine=engine,
+        prompt="edit me",
+        extra_body={"task": "it2i", "bot_task": "think"},
+        reference_images=images,
+        gen_params=OmniDiffusionSamplingParams(),
+    )
+
+    # Override must (a) differ from the no-override default, and (b) equal
+    # the prompt that bot_task=think produces (both end up with
+    # en_unified system body + <think> trigger).
+    assert overridden["prompt"] != default_sys["prompt"], (
+        "sys_type override must change the rendered prompt body vs the bot_task default"
+    )
+    assert overridden["prompt"] == plain_think["prompt"], (
+        "sys_type=en_unified + bot_task=think_recaption must produce the same prompt as "
+        "bot_task=think (both = en_unified system body + <think> trigger)"
+    )
+
+
+def test_build_multistage_generation_inputs_custom_system_prompt(serving_chat):
+    """`extra_body["system_prompt"]` must reach build_prompt as
+    `custom_system_prompt`, enabling sys_type="custom" callers to inject
+    a verbatim system body. Without this plumbing the sys_type="custom"
+    branch in get_system_prompt() returns None and silently drops the
+    user-supplied content.
+    """
+    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+    engine = SimpleNamespace(
+        stage_configs=[
+            SimpleNamespace(stage_type="llm", is_comprehension=True),
+            SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+        ],
+        default_sampling_params_list=[
+            SamplingParams(temperature=0.0),
+            OmniDiffusionSamplingParams(),
+        ],
+    )
+    images = [Image.new("RGB", (32, 32), color="red")]
+
+    QKEY = "prompt"
+    marker = "ZZZ_CUSTOM_SYSTEM_PROMPT_MARKER_ZZZ"
+
+    out, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+        serving_chat,
+        engine=engine,
+        prompt="edit me",
+        extra_body={
+            "task": "it2i",
+            "bot_task": "think",
+            "sys_type": "custom",
+            "system_prompt": marker,
+        },
+        reference_images=images,
+        gen_params=OmniDiffusionSamplingParams(),
+    )
+    assert marker in out["prompt"], (
+        f"custom system_prompt content must reach the rendered prompt; "
+        f"marker {marker!r} not found in prompt of length {len(out['prompt'])}"
+    )
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 4227cff2fb6..77dc026bc97 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1701,6 +1701,14 @@ async def edit_images(
     layers: int | None = Form(None),
     resolution: int | None = Form(None),  # See SUPPORTED_LAYERED_RESOLUTIONS
     bot_task: str | None = Form(None),
+    # P1: task / sys_type / system_prompt split out from the legacy bot_task
+    # field so callers can express the full HunyuanImage-3.0 prompt template
+    # surface (task enum + bot_task semantic + sys_type override + custom
+    # system prompt body). Legacy callers that pass a task-enum value via
+    # bot_task still work (see normalization below).
+    task: str | None = Form(None),
+    sys_type: str | None = Form(None),
+    system_prompt: str | None = Form(None),
 ) -> ImageGenerationResponse:
     """
     OpenAI-compatible image edit endpoint.
@@ -1913,8 +1921,25 @@ async def edit_images(
                 lora_dict = _get_lora_from_json_str(lora)
                 _parse_lora_request(lora_dict)
                 extra_body["lora"] = lora_dict
-            if bot_task is not None:
-                extra_body["bot_task"] = bot_task
+            # P1: normalize legacy `bot_task=<task-enum>` form. Callers historically
+            # passed the task enum (i2t / it2i / t2i / t2t) via the `bot_task`
+            # Form field; promote it to `task` here so the chat_handler can
+            # split task vs bot_task semantics cleanly. New callers pass both
+            # `task` and `bot_task` explicitly; we keep them separate.
+            _task = task
+            _bot_task = bot_task
+            _legacy_task_enum = {"t2t", "i2t", "it2i", "t2i"}
+            if _task is None and _bot_task in _legacy_task_enum:
+                _task = _bot_task
+                _bot_task = None
+            if _task is not None:
+                extra_body["task"] = _task
+            if _bot_task is not None:
+                extra_body["bot_task"] = _bot_task
+            if sys_type is not None:
+                extra_body["sys_type"] = sys_type
+            if system_prompt is not None:
+                extra_body["system_prompt"] = system_prompt
 
             prompt_text = prompt.get("prompt", "")
             generation_result = await chat_handler.generate_diffusion_images(
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 2738f648e09..d1b2e89ae80 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2247,7 +2247,22 @@ def _build_multistage_generation_inputs(
         lora_body = extra_body.get("lora")
         layers = extra_body.get("layers")
         resolution = extra_body.get("resolution")
+        # P1: task / bot_task / sys_type / system_prompt quadruple. Legacy
+        # api_server callers may still pass a task-enum value (i2t / it2i /
+        # t2i / t2t) under `bot_task`; normalize it to `task` here so
+        # downstream uses the canonical split. Source the task enum from
+        # prompt_utils so this layer stays in sync with the model side.
+        from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+            available_tasks as _hunyuan3_available_tasks,
+        )
+
+        task = extra_body.get("task")
         bot_task = extra_body.get("bot_task")
+        sys_type = extra_body.get("sys_type")
+        custom_system_prompt = extra_body.get("system_prompt")
+        if task is None and bot_task in set(_hunyuan3_available_tasks()):
+            task = bot_task
+            bot_task = None
 
         engine_prompt_data: dict[str, Any] | None = None
         modalities = ["image"]
@@ -2260,13 +2275,20 @@ def _build_multistage_generation_inputs(
 
         prompt_token_ids: list[int] | None = None
         system_prompt_type: str | None = None
-        if bot_task:
+        if task or bot_task:
             from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
                 build_prompt,
                 build_prompt_tokens,
             )
 
             num_images = len(reference_images) if reference_images else 1
+            # build_prompt defaults task="it2i"; preserve that when caller
+            # only passed bot_task semantic.
+            effective_task = task if task is not None else "it2i"
+            # build_prompt defaults bot_task="think"; preserve that for legacy
+            # callers (passing bot_task=None to build_prompt explicitly gives a
+            # different (sys, trigger) than the default "think").
+            effective_bot_task = bot_task if bot_task is not None else "think"
             if tokenizer is not None:
                 # HF byte-for-byte path: feed segment-tokenized prompt_token_ids
                 # so AR sees the same template-tokenization HF apply_chat_template
@@ -2279,14 +2301,24 @@ def _build_multistage_generation_inputs(
                 result = build_prompt_tokens(
                     prompt,
                     tokenizer,
-                    task=bot_task,
+                    task=effective_task,
+                    bot_task=effective_bot_task,
+                    sys_type=sys_type,
+                    custom_system_prompt=custom_system_prompt,
                     num_images=num_images,
                 )
                 prompt_token_ids = result.token_ids
                 system_prompt_type = result.system_prompt_type
             else:
                 # Legacy string path (e.g. unit tests with no tokenizer plumbed).
-                prompt = build_prompt(prompt, task=bot_task, num_images=num_images)
+                prompt = build_prompt(
+                    prompt,
+                    task=effective_task,
+                    bot_task=effective_bot_task,
+                    sys_type=sys_type,
+                    custom_system_prompt=custom_system_prompt,
+                    num_images=num_images,
+                )
             if reference_images and len(reference_images) == 1:
                 engine_prompt_data = {"image": reference_images[0]}
                 modalities = ["image"]

From 99c5eec085b42c05de937a8e7c117155c7c0234c Mon Sep 17 00:00:00 2001
From: TaffyOfficial <wu15922848573@outlook.com>
Date: Tue, 12 May 2026 10:51:44 +0800
Subject: [PATCH 21/43] fix(hunyuan_image3): align online edit AR input with
 offline path

Two complementary fixes that close the gap where online /v1/images/edits
systematically produced a different AR chain-of-thought (CoT) (e.g. "3
magnets" semantic) than offline end2end.py ("1 magnet" semantic) on the
same prompt + seed + images, even after the P0 byte-equivalent
prompt_token_ids and P1 task/bot_task/sys_type API split landed.

1. RGB normalization in _load_input_images (root cause for the
   systematic semantic divergence)

   input_1_0.png in the demo set is RGBA with 57,671 fully-transparent
   pixels. Offline `end2end.py` opens images with
   `Image.open(...).convert("RGB")`, which composites transparent pixels
   over BLACK. Online had no such normalization; the Hunyuan AR image
   processor receives the raw RGBA upload and alpha-composites over
   WHITE. The two paths therefore fed AR two different RGB tensors at
   the encoder boundary -- enough to make AR recaption diverge into
   different scene interpretations even with byte-identical
   prompt_token_ids.

   Fix: `_load_input_images` gains a keyword-only `normalize_rgb` flag
   (default True) that converts every loaded image to RGB. `edit_images`
   passes True only when the caller supplies Hunyuan-aware prompt
   controls (task / bot_task / sys_type); the mask is always loaded with
   `normalize_rgb=False` so its alpha role is preserved. Diagnosis by
   Codex; thanks.

2. Determinize cond-image VAE encode

   Both AR-side `_vae_encode` (model_executor) and DiT-side cond VAE
   encoding (pipeline_hunyuan_image3) called
   `latent_dist.sample()` with no generator, consuming torch's global
   RNG state. Fresh-process callers (offline) hit a stable post-init
   RNG state every invocation so this looked deterministic; long-running
   servers (online) mix per-request scheduler/UUID/etc into the global
   RNG before this call, so same-seed curls got drifting cond latents
   across requests. Cond image at this site is declared `t=0` clean
   conditioning -- no stochasticity needed.

   Fix: pass a fresh `torch.Generator(device=...).manual_seed(0)` at
   both call sites. Cond latents now deterministic across runs and
   across paths.

   Why `.sample(seeded_gen)` instead of `.mode()`: AR-side
   DiagonalGaussianDistribution has `.mode()`, but the DiT-side
   counterpart in diffusion/.../autoencoder.py does not implement it.
   The seeded `.sample()` works on both sides and matches HF upstream's
   `latent_dist.sample(generator)` signature -- a strict improvement
   over HF default (HF defaults the generator to None and inherits the
   same silent non-determinism).

   Related memory: `memory/feedback/painterly_silent_bugs.md` flagged
   the same bug class once before; this is the cond-image-encode
   incarnation.

E2E smoke (4x L20X, HunyuanImage-3.0-Instruct, two ref images, curl
/v1/images/edits with task=it2i bot_task=think_recaption
sys_type=en_unified seed=42 steps=50 guidance=5.0):

  - before either fix:        "3 magnets on canvas" (offline produces 1)
  - after cond VAE fix only:  "3 magnets on canvas" (within-run drift
                              reduced from 73-token to 10-token spread
                              but cross-path semantic still wrong)
  - after both fixes:         "1 magnet on canvas" -- in the same
                              semantic neighborhood as the offline
                              baseline

Tests: 153 unit tests pass, ruff clean. Surgical API-level regression
tests for the two fixes are deferred (the cond VAE side would require
GPU fixtures; the RGB side is small enough that the e2e proof is the
contract).

Signed-off-by: TaffyOfficial <wu15922848573@outlook.com>
---
 .../hunyuan_image3/pipeline_hunyuan_image3.py |  6 +++-
 vllm_omni/entrypoints/openai/api_server.py    | 28 ++++++++++++++++---
 .../models/hunyuan_image3/hunyuan_image3.py   | 12 +++++++-
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 5c6ddba0b64..e927f278340 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -634,7 +634,11 @@ def vae_encode(self, image, cfg_factor=1):
             if isinstance(vae_encode_result, torch.Tensor):
                 latents = vae_encode_result
             else:
-                latents = vae_encode_result.latent_dist.sample()
+                # Fixed-seed Generator so cond latents are deterministic
+                # across calls; see AR-side comment in
+                # model_executor/.../hunyuan_image3.py:_vae_encode.
+                _cond_vae_gen = torch.Generator(device=image.device).manual_seed(0)
+                latents = vae_encode_result.latent_dist.sample(_cond_vae_gen)
             if hasattr(config, "shift_factor") and config.shift_factor:
                 latents.sub_(config.shift_factor)
             if hasattr(config, "scaling_factor") and config.scaling_factor:
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 77dc026bc97..b485b6a3946 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1759,16 +1759,23 @@ async def edit_images(
                 status_code=HTTPStatus.BAD_REQUEST.value,
                 detail=detail,
             )
-        pil_images = await _load_input_images(input_images_list)
+        # Convert uploads to RGB whenever the caller supplies any of the
+        # Hunyuan prompt controls (task / bot_task / sys_type). This includes
+        # the legacy bot_task=<task-enum> shape, since bot_task is non-None
+        # there. Requests that set none of these fields keep whatever PIL
+        # mode the upload arrived as, preserving pre-existing behavior.
+        normalize_edit_images_rgb = task is not None or bot_task is not None or sys_type is not None
+        pil_images = await _load_input_images(input_images_list, normalize_rgb=normalize_edit_images_rgb)
         prompt["multi_modal_data"] = {}
         prompt["multi_modal_data"]["image"] = pil_images
 
         if mask_image is not None:
-            loaded = await _load_input_images([mask_image])
+            # Mask role is different (alpha channel matters); never normalize.
+            loaded = await _load_input_images([mask_image], normalize_rgb=False)
             prompt["multi_modal_data"]["mask_image"] = loaded[0]
 
         if reference_image is not None:
-            loaded = await _load_input_images([reference_image])
+            loaded = await _load_input_images([reference_image], normalize_rgb=normalize_edit_images_rgb)
             prompt["multi_modal_data"]["reference_image"] = loaded[0]
 
         # 3 Build sample params
@@ -2220,6 +2227,8 @@ def _extract_images_from_result(result: Any) -> list[Any]:
 
 async def _load_input_images(
     inputs: list[str],
+    *,
+    normalize_rgb: bool = True,
 ) -> list[Image.Image]:
     """
     convert to PIL.Image.Image list
@@ -2266,7 +2275,18 @@ async def _load_input_images(
     if not images:
         raise ValueError("No valid input images found")
 
-    return images
+    if not normalize_rgb:
+        return images
+
+    # Match the offline HunyuanImage3 image-edit example path, which eagerly
+    # normalizes input files with ``Image.open(...).convert("RGB")`` before
+    # they reach the AR stage. Keeping uploads as RGBA/P PIL objects makes
+    # online IT2I observe a different visual input than offline (for example
+    # transparent-logo PNGs alpha-composited over white instead of black),
+    # which is enough for HunyuanImage3 AR recaption to diverge before DiT
+    # sees the request -- root cause of the "online 3 magnets vs offline 1
+    # magnet" systematic semantic mismatch.
+    return [img.convert("RGB") for img in images]
 
 
 def _choose_output_format(output_format: str | None, background: str | None) -> str:
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 756a7a27c9b..216543b9593 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -1776,7 +1776,17 @@ def _vae_encode(
             images = images.to(dtype=self.vae.dtype)
 
         vae_encode_result = self.vae.encode(images)
-        latents = vae_encode_result.latent_dist.sample()
+        # Cond image encoding is supposed to be deterministic clean
+        # conditioning (the comment below declares `t=0`). `.sample()`
+        # without a generator consumes torch's global RNG, which made
+        # cond latents drift between requests on a long-running server
+        # (online) while looking deterministic for fresh-process callers
+        # (offline) -- silent path-level non-determinism. Feed a fixed
+        # generator so all callers see identical cond latents.
+        import torch as _torch  # local alias to keep blast radius minimal
+
+        _cond_vae_gen = _torch.Generator(device=images.device).manual_seed(0)
+        latents = vae_encode_result.latent_dist.sample(_cond_vae_gen)
 
         # Apply shift and scaling factors if present
         if hasattr(config, "shift_factor") and config.shift_factor:

From 4d8c600391d2178cb1ad8aa446b34c7cc6b7a51f Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Tue, 12 May 2026 11:46:17 +0800
Subject: [PATCH 22/43] fix(hunyuan_image3): address PR #3444 review feedback

Apply two rounds of code review fixes on the multi-image IT2I PR:

Cond VAE determinism
  Replace `latent_dist.sample()` + `manual_seed(0)` hardcoding with
  `latent_dist.mode()` on both AR (`model_executor/.../hunyuan_image3.py
  ::_vae_encode`) and DiT (`diffusion/.../pipeline_hunyuan_image3.py`)
  sides. Cond image is clean (t=0) conditioning by design; posterior mean
  is deterministic by construction and matches the official cond encode
  path. Adds `.mode()` to the DiT-side `DiagonalGaussianDistribution`.

Stale compound task names (two-axis API migration)
  A repo-wide grep for the
  `{t2t,i2t,it2i,t2i}x{think,recaption,think_recaption,vanilla}`
  cross-product turned up residual compound names in two test files that
  the initial cleanup missed:
    - tests/e2e/accuracy/test_hunyuan_image3.py: task='it2i_recaption'
      -> task='it2i', bot_task='recaption' (the old name would have
      raised ValueError in _resolve_preset under the new two-axis API).
    - tests/diffusion/.../test_prompt_utils.py: task='t2i_think' /
      task='t2i_recaption' -> (task='t2i', bot_task='think|recaption').

Custom system prompt body forwarding (producer -> consumer trace)
  Online `/v1/images/edits` accepted `sys_type='custom'` + `system_prompt`
  body on the AR side via `build_prompt_tokens(custom_system_prompt=...)`,
  but only forwarded `use_system_prompt` to the engine_prompt. DiT's
  `get_system_prompt(use, "image", body)` reads the body as the third
  positional arg, so `sys_type='custom'` was silently falling back to an
  empty DiT system prefix -- AR/DiT divergence under a user-visible knob.
  Forward `system_prompt` through both `serving_chat` engine_prompt and
  `stage_input_processors/hunyuan_image3.py::ar2diffusion` -> DiT
  `diffusion_input`.

Ratio extraction simplification
  Drop the regex path on `generated_text` -- only worked under
  `skip_special_tokens: False`, which most deploy yamls don't set. Pure
  token-id reverse scan against `_build_ratio_id_lookup` is the source of
  truth (AR `_stage_transitions` forces exactly one `<img_ratio_*>`
  emission). Drop unused `_RATIO_TOKEN_RE` constant, `re` import, and
  `generated_text` parameter from `_extract_ratio_index`.

Housekeeping
  - Remove duplicate `engine_prompt["prompt_token_ids"]` assignment in
    serving_chat.py (merge residue, the second copy was added by the
    main-merge then re-introduced after the API split).
  - `examples/.../end2end.py`: stale `_TASK_PRESETS` comment ->
    `available_tasks` helper (symbol no longer exists post-split).
  - `process_image` comment in `model_executor/.../hunyuan_image3.py`
    clarifies the AR-side `_resize_and_crop` default vs the official
    `infer_align_image_size=False` (center crop) default.

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../hunyuan_image3/end2end.py                 |  8 ++-
 .../hunyuan_image3/test_prompt_utils.py       |  4 +-
 tests/e2e/accuracy/test_hunyuan_image3.py     |  8 ++-
 .../models/hunyuan_image3/autoencoder.py      |  3 ++
 .../hunyuan_image3/pipeline_hunyuan_image3.py |  9 ++--
 vllm_omni/entrypoints/openai/serving_chat.py  |  6 +++
 .../models/hunyuan_image3/hunyuan_image3.py   | 25 +++++-----
 .../stage_input_processors/hunyuan_image3.py  | 49 +++++++------------
 8 files changed, 59 insertions(+), 53 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 82e8c194c5a..908109d65a3 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -18,7 +18,13 @@
 _REPO_ROOT = Path(__file__).resolve().parents[3]
 _DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml")
 _DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml")
-
+# Modality → (task, default bot_task) mapping. `task` selects only whether
+# `<img>` placeholders are emitted; `bot_task` (None | think | recaption |
+# think_recaption | vanilla) selects the system prompt + trigger tag.
+#
+# Both verbose (`text2img`) and short (`t2i`) forms are accepted; the short
+# forms match the internal task names (see prompt_utils.available_tasks)
+# so users who think in those terms don't have to translate.
 _MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
     "text2img": ("t2i", "think"),
     "t2i": ("t2i", "think"),
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 4d98bc5dcf2..2ddfbea42dd 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -69,8 +69,8 @@ def test_legacy_task_presets_still_available():
 def test_resolve_stop_token_ids_uses_answer_for_generation_tasks():
     tok = FakeTokenizer()
     answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]
-    assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [answer_id]
-    assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [answer_id]
+    assert resolve_stop_token_ids(task="t2i", bot_task="think", tokenizer=tok) == [answer_id]
+    assert resolve_stop_token_ids(task="t2i", bot_task="recaption", tokenizer=tok) == [answer_id]
     assert resolve_stop_token_ids(task="it2i", bot_task="think", tokenizer=tok) == [answer_id]
 
 
diff --git a/tests/e2e/accuracy/test_hunyuan_image3.py b/tests/e2e/accuracy/test_hunyuan_image3.py
index 93671e7bbf6..0871793c5db 100644
--- a/tests/e2e/accuracy/test_hunyuan_image3.py
+++ b/tests/e2e/accuracy/test_hunyuan_image3.py
@@ -93,7 +93,13 @@ def _run(stage_config_path: str, output_path: Path) -> tuple[Image.Image, str, f
     from vllm_omni.platforms import current_omni_platform
 
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
-    result = build_prompt_tokens(PROMPT, tokenizer, task="it2i_recaption", sys_type="en_unified")
+    result = build_prompt_tokens(
+        PROMPT,
+        tokenizer,
+        task="it2i",
+        bot_task="recaption",
+        sys_type="en_unified",
+    )
     token_ids = result.token_ids
     system_prompt_type = result.system_prompt_type
 
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py
index efba2f27435..ddd7d5c6df7 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py
@@ -46,6 +46,9 @@ def sample(self, generator: torch.Generator | None = None) -> torch.FloatTensor:
         x = self.mean + self.std * sample
         return x
 
+    def mode(self) -> torch.FloatTensor:
+        return self.mean
+
 
 @dataclass
 class DecoderOutput(BaseOutput):
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index e927f278340..5a9d1e48856 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -634,11 +634,10 @@ def vae_encode(self, image, cfg_factor=1):
             if isinstance(vae_encode_result, torch.Tensor):
                 latents = vae_encode_result
             else:
-                # Fixed-seed Generator so cond latents are deterministic
-                # across calls; see AR-side comment in
-                # model_executor/.../hunyuan_image3.py:_vae_encode.
-                _cond_vae_gen = torch.Generator(device=image.device).manual_seed(0)
-                latents = vae_encode_result.latent_dist.sample(_cond_vae_gen)
+                # Cond image is clean conditioning (t=0 below) -- use the
+                # posterior mean so encoding is deterministic by construction.
+                # See AR-side comment in model_executor/.../hunyuan_image3.py.
+                latents = vae_encode_result.latent_dist.mode()
             if hasattr(config, "shift_factor") and config.shift_factor:
                 latents.sub_(config.shift_factor)
             if hasattr(config, "scaling_factor") and config.scaling_factor:
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index d1b2e89ae80..4ba824f0909 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2328,6 +2328,12 @@ def _build_multistage_generation_inputs(
             engine_prompt["prompt_token_ids"] = prompt_token_ids
         if system_prompt_type is not None:
             engine_prompt["use_system_prompt"] = system_prompt_type
+        # Forward the custom system prompt body too. DiT's
+        # `get_system_prompt(use_system_prompt, "image", system_prompt)` reads
+        # the third positional arg, so leaving it None turns a `sys_type=custom`
+        # request into an empty DiT system prefix (AR/DiT divergence).
+        if custom_system_prompt is not None:
+            engine_prompt["system_prompt"] = custom_system_prompt
         engine_prompt["modalities"] = modalities
         if negative_prompt is not None:
             engine_prompt["negative_prompt"] = negative_prompt
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 216543b9593..9f3b76039d0 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -907,9 +907,10 @@ def process_image(self, image_input: ImageInput):
             current_info["vit_spatial_shapes"] = _ss.squeeze(0)
 
             # VAE: per-image bucket via `reso_group.get_target_size`; mirrors
-            # HF's `resize_and_crop` (crop_type="center"). Keep fp32 — the
-            # VAE encoder casts to model dtype at its boundary (see
-            # `_vae_encode`).
+            # HF's `resize_and_crop` (crop_type="center", the official
+            # generate_image default with infer_align_image_size=False).
+            # Keep fp32 — the VAE encoder casts to model dtype at its
+            # boundary (see `_vae_encode`).
             image_width, image_height = self.reso_group.get_target_size(image.width, image.height)
             resized_image = self._resize_and_crop(image, (image_width, image_height))
             vae_pixel_values = self.vae_processor(resized_image).squeeze(0)
@@ -1776,17 +1777,13 @@ def _vae_encode(
             images = images.to(dtype=self.vae.dtype)
 
         vae_encode_result = self.vae.encode(images)
-        # Cond image encoding is supposed to be deterministic clean
-        # conditioning (the comment below declares `t=0`). `.sample()`
-        # without a generator consumes torch's global RNG, which made
-        # cond latents drift between requests on a long-running server
-        # (online) while looking deterministic for fresh-process callers
-        # (offline) -- silent path-level non-determinism. Feed a fixed
-        # generator so all callers see identical cond latents.
-        import torch as _torch  # local alias to keep blast radius minimal
-
-        _cond_vae_gen = _torch.Generator(device=images.device).manual_seed(0)
-        latents = vae_encode_result.latent_dist.sample(_cond_vae_gen)
+        # Cond image is clean (t=0) conditioning -- take the posterior mean
+        # so encoding is deterministic by construction. `.sample()` without a
+        # generator consumes torch's global RNG and silently drifts between
+        # requests on a long-running server (online) while looking stable for
+        # fresh-process callers (offline). `.mode()` matches the official
+        # HunyuanImage-3 cond encode path.
+        latents = vae_encode_result.latent_dist.mode()
 
         # Apply shift and scaling factors if present
         if hasattr(config, "shift_factor") and config.shift_factor:
diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index 158ea86dbf2..c95a2a48f18 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -13,7 +13,6 @@
 from __future__ import annotations
 
 import os
-import re
 from functools import lru_cache
 from typing import Any
 
@@ -33,7 +32,6 @@
 # (in the `/v1/images/edits` path that defaults to `pil_images[0].size`,
 # i.e. the first reference image's bucket — usually square, see
 # api_server.py:1808-1811).
-_RATIO_TOKEN_RE = re.compile(r"<img_ratio_(\d+)>")
 _DEFAULT_HUNYUAN_IMAGE3_MODEL = "tencent/HunyuanImage-3.0-Instruct"
 
 
@@ -158,42 +156,27 @@ def _id(name: str) -> int | None:
     return table
 
 
-def _extract_ratio_index(generated_text: str, generated_token_ids, model_name_or_path: str) -> int | None:
+def _extract_ratio_index(generated_token_ids, model_name_or_path: str) -> int | None:
     """Resolve the AR-predicted ratio_index from this stage's output.
 
-    Two probe paths:
-      1. Text regex on `generated_text` — works when the AR engine is
-         configured with `skip_special_tokens: False` (e.g.
-         `hunyuan_image3_it2i_kv_reuse.yaml`). Cheap and avoids loading
-         the tokenizer.
-      2. Token-id scan over `cumulative_token_ids` against the tokenizer's
-         `<img_ratio_*>` id range — survives `skip_special_tokens: True`
-         where the special tokens are stripped from text but still present
-         in the raw token stream.
-
-    Takes the LAST ratio token in the stream because the AR's
-    stage-transition logic emits exactly one such token at the tail of the
-    `<img_size_*><img_ratio_*><eos>` sequence; using "last" is robust to
-    any earlier accidental occurrences in the prompt scaffold.
+    `HunyuanImage3ForCausalMM`'s `_stage_transitions` forces the AR to emit
+    exactly one `<img_ratio_*>` token after `</recaption><answer><boi>
+    <img_size_*>`, so we scan the token stream from the tail for the first
+    id that maps to a ratio. Token-ids are the source of truth — text-side
+    regex is unreliable because most deploy yamls run AR with
+    `skip_special_tokens: True` (special tokens are stripped from text but
+    still present in `cumulative_token_ids`).
     """
-    matches = _RATIO_TOKEN_RE.findall(generated_text or "")
-    if matches:
-        try:
-            return int(matches[-1])
-        except ValueError:
-            pass
-
     if generated_token_ids is None:
         return None
     table = _build_ratio_id_lookup(model_name_or_path)
     if not table:
         return None
-    last_ratio_idx: int | None = None
-    for tid in generated_token_ids:
+    for tid in reversed(list(generated_token_ids)):
         idx = table.get(int(tid))
         if idx is not None:
-            last_ratio_idx = idx
-    return last_ratio_idx
+            return idx
+    return None
 
 
 def ar2diffusion(
@@ -237,6 +220,7 @@ def ar2diffusion(
         width = original_prompt.get("width", 1024)
         text_prompt = original_prompt.get("prompt", "")
         use_system_prompt = original_prompt.get("use_system_prompt")
+        custom_system_prompt = original_prompt.get("system_prompt")
 
         # Prefer the AR's predicted output aspect (`<img_size_*><img_ratio_*>`
         # tail emitted by `HunyuanImage3ForCausalMM.sample` under the
@@ -249,7 +233,7 @@ def ar2diffusion(
         model_name_or_path = original_prompt.get("model") or os.environ.get(
             "VLLM_OMNI_HUNYUAN_IMAGE3_MODEL", _DEFAULT_HUNYUAN_IMAGE3_MODEL
         )
-        ratio_idx = _extract_ratio_index(generated_text, generated_token_ids, model_name_or_path)
+        ratio_idx = _extract_ratio_index(generated_token_ids, model_name_or_path)
         ar_predicted = False
         if ratio_idx is not None:
             base_size = int(original_prompt.get("image_base_size", 1024))
@@ -302,9 +286,14 @@ def ar2diffusion(
             },
         }
 
-        # Forward use_system_prompt so the DiT can build the same system prefix
+        # Forward use_system_prompt so the DiT can build the same system prefix.
+        # Also forward the custom system prompt body when sys_type=custom so
+        # DiT's `get_system_prompt(use, "image", body)` doesn't fall back to
+        # an empty prefix and silently diverge from AR.
         if use_system_prompt is not None:
             diffusion_input["use_system_prompt"] = use_system_prompt
+        if custom_system_prompt is not None:
+            diffusion_input["system_prompt"] = custom_system_prompt
 
         # Forward multimodal data (original image for IT2I conditioning).
         # The diffusion pre_process_func reads multi_modal_data["image"], which

From 329851727cc08022fcccdea0b22e258777c6db51 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Tue, 12 May 2026 11:50:00 +0800
Subject: [PATCH 23/43] chore: appease ruff F841 / typos / ruff-format
 pre-commit

CI feedback from the previous push:
- F841: drop unused `QKEY` in test_serving_chat_multistage_generation.py
- typos: avoid the dictionary trigger on "PNGs" plural -- the lowercased
  form lands in the crate-ci/typos dictionary as a misspelling; rephrase
  to "transparent-logo uploads" without changing meaning.
- ruff-format: collapse the `build_prompt_tokens(...)` call in the e2e
  accuracy test back to a single line (line is under the 120 char limit
  ruff-format enforces locally).

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../openai_api/test_serving_chat_multistage_generation.py       | 1 -
 vllm_omni/entrypoints/openai/api_server.py                      | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index 88d15a684b6..bd2e877bf18 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -391,7 +391,6 @@ def test_build_multistage_generation_inputs_custom_system_prompt(serving_chat):
     )
     images = [Image.new("RGB", (32, 32), color="red")]
 
-    QKEY = "prompt"
     marker = "ZZZ_CUSTOM_SYSTEM_PROMPT_MARKER_ZZZ"
 
     out, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index b485b6a3946..80b01ec284a 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -2282,7 +2282,7 @@ async def _load_input_images(
     # normalizes input files with ``Image.open(...).convert("RGB")`` before
     # they reach the AR stage. Keeping uploads as RGBA/P PIL objects makes
     # online IT2I observe a different visual input than offline (for example
-    # transparent-logo PNGs alpha-composited over white instead of black),
+    # transparent-logo uploads alpha-composited over white instead of black),
     # which is enough for HunyuanImage3 AR recaption to diverge before DiT
     # sees the request -- root cause of the "online 3 magnets vs offline 1
     # magnet" systematic semantic mismatch.

From 808aca089a36aa990b3b2a8d05de7683cad28355 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Tue, 12 May 2026 13:12:51 +0800
Subject: [PATCH 24/43] fix(hunyuan_image3): align AR cond image preprocessing
 with DiT (center crop)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AR-side `HunyuanImage3Processor._resize_and_crop` previously defaulted to
`crop_type="resize"` (stretch), while the DiT-side condition-image helper
`_resize_and_crop_center` always center-crops. For any portrait input
mapped to a landscape output bucket (or vice versa), AR and DiT then
conditioned on **visibly different fabric regions**: AR saw the input
stretched to fit, DiT saw the input center-cropped to fit. The two cond
latents disagreed on what the surroundings should be, and DiT had to
inpaint the lateral canvas extension on its own — producing seam-like
vertical brightness bands at the AR/DiT-disagreement boundary (reported
on the `/tmp/rgbfix/result.png` IT2I run: 735x1104 input -> 1280x720
output).

Change AR-side default to `crop_type="center"`, matching:

- DiT-side `_resize_and_crop_center` (always center).
- Official `generate_image(..., infer_align_image_size=False)` (the
  default; per `hunyuan3.0_ins/image_processor.py:355-358`, the False
  branch maps to `random_crop="center"`).

Add a CPU-only regression test asserting AR and DiT preprocessing
produce **byte-identical** pixels for 4 src sizes x 4 target buckets,
covering portrait->landscape, landscape->portrait, and square aspects.
No model weights / tokenizer / HF cache required, runs in CI.

Co-authored-by: Codex
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../test_hunyuan_image3_it2i_ar_format.py     | 39 +++++++++++++++++++
 .../models/hunyuan_image3/hunyuan_image3.py   | 16 ++++----
 2 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
index 7e7b7de91b2..916b565c1af 100644
--- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
@@ -72,6 +72,45 @@ def _snapshot_dir(model_id: str) -> pathlib.Path:
 # tests/e2e/accuracy/test_hunyuan_image3_it2i_ar_output.py.
 
 
+def test_ar_and_dit_condition_image_preprocessing_match_without_hf_cache():
+    """AR and DiT must preprocess the same IT2I condition image into the
+    same VAE pixels.
+
+    This catches drift between the AR-side multimodal processor and the
+    diffusion-side bridge without requiring model weights or tokenizer files.
+    In particular, portrait input expanded to a landscape output is sensitive
+    to accidentally using ``crop_type="resize"`` on one side and center crop
+    on the other; the two paths then condition on visibly different fabric
+    regions and leave seam-like artifacts around the edited object.
+    """
+    import numpy as np
+    from PIL import Image
+
+    from vllm_omni.diffusion.models.hunyuan_image3.pipeline_hunyuan_image3 import (
+        _resize_and_crop_center,
+    )
+    from vllm_omni.model_executor.models.hunyuan_image3.hunyuan_image3 import (
+        HunyuanImage3Processor,
+    )
+
+    rng = np.random.default_rng(seed=3444)
+    src_size_pairs = [(735, 1104), (640, 1024), (1280, 720), (1024, 1024)]
+    target_size_pairs = [(1024, 1024), (1024, 768), (768, 1024), (1280, 720)]
+
+    for src_w, src_h in src_size_pairs:
+        src_arr = rng.integers(0, 256, size=(src_h, src_w, 3), dtype=np.uint8)
+        src = Image.fromarray(src_arr, mode="RGB")
+        for tw, th in target_size_pairs:
+            ar_out = HunyuanImage3Processor._resize_and_crop(None, src, (tw, th))
+            dit_out = _resize_and_crop_center(src, tw, th)
+
+            assert ar_out.size == dit_out.size == (tw, th)
+            assert np.array_equal(np.asarray(ar_out), np.asarray(dit_out)), (
+                f"AR and DiT condition preprocessing diverged for "
+                f"src={(src_w, src_h)} target={(tw, th)}"
+            )
+
+
 _OFFICIAL_PKG = "_hunyuan_image_3_official_snapshot"
 
 
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 9f3b76039d0..40a38c7b5ac 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -907,8 +907,8 @@ def process_image(self, image_input: ImageInput):
             current_info["vit_spatial_shapes"] = _ss.squeeze(0)
 
             # VAE: per-image bucket via `reso_group.get_target_size`; mirrors
-            # HF's `resize_and_crop` (crop_type="center", the official
-            # generate_image default with infer_align_image_size=False).
+            # HF's `resize_and_crop` default (crop_type="center", the official
+            # generate_image default when infer_align_image_size=False).
             # Keep fp32 — the VAE encoder casts to model dtype at its
             # boundary (see `_vae_encode`).
             image_width, image_height = self.reso_group.get_target_size(image.width, image.height)
@@ -957,13 +957,13 @@ def _resize_and_crop(
         self,
         image: Image.Image,
         target_size: tuple[int, int],
-        crop_type: str = "resize",
+        crop_type: str = "center",
     ) -> Image.Image:
-        # Default mode mirrors the official `infer_align_image_size=True`
-        # path (image_processor.py:355 → crop_type="resize") used by the
-        # IT2I demo: stretch the cond image to the bucket dims so its
-        # `<img_ratio_*>` tag and ViT/VAE features stay aligned with the
-        # bucket, instead of dropping content via center crop.
+        # Default mode mirrors official `generate_image` with
+        # infer_align_image_size=False: preserve aspect ratio and center-crop
+        # to the nearest VAE bucket. Keeping this default aligned with the
+        # DiT-side condition-image helper avoids AR and DiT seeing different
+        # conditioning pixels for the same IT2I request.
         tw, th = target_size
         if crop_type == "resize":
             return image.resize((tw, th), resample=Image.Resampling.LANCZOS)

From 297a2f5a7efc4525d6184e7de802ad70c71332d2 Mon Sep 17 00:00:00 2001
From: zuiho <2324465096@qq.com>
Date: Wed, 13 May 2026 09:14:48 +0800
Subject: [PATCH 25/43] test(hunyuan_image3): apply ruff format hook fixes

Signed-off-by: zuiho <2324465096@qq.com>
---
 .../hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py       | 3 +--
 tests/diffusion/models/hunyuan_image3/test_prompt_utils.py     | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
index 916b565c1af..7550caa50f7 100644
--- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
@@ -106,8 +106,7 @@ def test_ar_and_dit_condition_image_preprocessing_match_without_hf_cache():
 
             assert ar_out.size == dit_out.size == (tw, th)
             assert np.array_equal(np.asarray(ar_out), np.asarray(dit_out)), (
-                f"AR and DiT condition preprocessing diverged for "
-                f"src={(src_w, src_h)} target={(tw, th)}"
+                f"AR and DiT condition preprocessing diverged for src={(src_w, src_h)} target={(tw, th)}"
             )
 
 
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 2ddfbea42dd..641cd5dc9c0 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -9,8 +9,8 @@
 import pytest
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-    HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS,
     _TASK_PRESETS,
+    HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS,
     available_bot_tasks,
     available_tasks,
     build_prompt,

From 4cf71f2afe9b7b7dcaf1656398f084534751ea44 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 10:36:24 +0800
Subject: [PATCH 26/43] fix(hunyuan_image3): preserve legacy plain prompt tasks

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../hunyuan_image3/test_prompt_utils.py       | 22 ++++++++++++++++++
 .../models/hunyuan_image3/prompt_utils.py     | 23 +++++++++++++++----
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 641cd5dc9c0..ef635825c3b 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -66,6 +66,28 @@ def test_legacy_task_presets_still_available():
     } <= set(_TASK_PRESETS)
 
 
+def test_legacy_base_task_omitted_bot_task_keeps_plain_mode():
+    prompt = build_prompt("HELLO", task="i2t")
+    assert prompt.endswith("Assistant: ")
+    assert not prompt.endswith("<think>")
+
+    result = build_prompt_tokens("hi", FakeTokenizer(), task="i2t")
+    assert result.system_prompt_type == "en_unified"
+    assert result.token_ids[-1] not in {
+        FakeTokenizer.SPECIAL["<think>"],
+        FakeTokenizer.SPECIAL["<recaption>"],
+    }
+
+
+def test_default_prompt_still_uses_it2i_think_mode():
+    prompt = build_prompt("HELLO")
+    assert prompt.endswith("Assistant: <think>")
+
+    result = build_prompt_tokens("hi", FakeTokenizer())
+    assert result.system_prompt_type == "en_unified"
+    assert result.token_ids[-1] == FakeTokenizer.SPECIAL["<think>"]
+
+
 def test_resolve_stop_token_ids_uses_answer_for_generation_tasks():
     tok = FakeTokenizer()
     answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 4ed277eeed2..6e1453d0ed2 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -74,6 +74,13 @@
 
 _TASKS: frozenset[str] = frozenset({"t2t", "i2t", "it2i", "t2i"})
 
+
+class _DefaultBotTask:
+    pass
+
+
+_DEFAULT_BOT_TASK = _DefaultBotTask()
+
 # Legacy composite task alias -> (task, bot_task). Keep this during rebase so
 # older callers and intermediate commits still resolve cleanly.
 _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
@@ -89,7 +96,11 @@
 }
 
 
-def _normalize_task_and_bot_task(task: str, bot_task: str | None) -> tuple[str, str | None]:
+def _normalize_task_and_bot_task(
+    task: str,
+    bot_task: str | None | _DefaultBotTask,
+) -> tuple[str, str | None]:
+    bot_task_was_omitted = bot_task is _DEFAULT_BOT_TASK
     if task in _TASK_PRESETS:
         _, legacy_bot_task, _ = _TASK_PRESETS[task]
         base_task = task.split("_", 1)[0]
@@ -97,9 +108,11 @@ def _normalize_task_and_bot_task(task: str, bot_task: str | None) -> tuple[str,
             base_task = "t2i"
         if task in ("t2t", "i2t", "t2i"):
             base_task = task
-        if bot_task is None:
+        if bot_task_was_omitted:
             bot_task = legacy_bot_task
         task = base_task
+    elif bot_task_was_omitted:
+        bot_task = "think"
     return task, bot_task
 
 
@@ -123,7 +136,7 @@ def resolve_sys_type(bot_task: str | None) -> str:
 
 def resolve_stop_token_ids(
     task: str = "it2i",
-    bot_task: str | None = "think",
+    bot_task: str | None | _DefaultBotTask = _DEFAULT_BOT_TASK,
     tokenizer: Any | None = None,
 ) -> list[int]:
     task, bot_task = _normalize_task_and_bot_task(task, bot_task)
@@ -158,7 +171,7 @@ def _resolve_preset(task: str, bot_task: str | None) -> tuple[str, str | None]:
 def build_prompt(
     user_prompt: str,
     task: str = "it2i",
-    bot_task: str | None = "think",
+    bot_task: str | None | _DefaultBotTask = _DEFAULT_BOT_TASK,
     sys_type: str | None = None,
     custom_system_prompt: str | None = None,
     num_images: int = 1,
@@ -205,7 +218,7 @@ def build_prompt_tokens(
     user_prompt: str,
     tokenizer,
     task: str = "it2i",
-    bot_task: str | None = "think",
+    bot_task: str | None | _DefaultBotTask = _DEFAULT_BOT_TASK,
     sys_type: str | None = None,
     custom_system_prompt: str | None = None,
     num_images: int = 1,

From cf7e4a24f8874e5667acdd07993d683288af7562 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 11:12:19 +0800
Subject: [PATCH 27/43] fix(hunyuan_image3): align prompt token tests with
 result API

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../test_hunyuan_image3_it2i_multi_image.py   | 24 +++++++++----------
 .../hunyuan_image3/test_prompt_utils.py       |  8 +++++++
 .../models/hunyuan_image3/prompt_utils.py     |  8 +++++++
 3 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
index 7a1e266b936..1e0fd159063 100644
--- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_multi_image.py
@@ -114,11 +114,11 @@ def test_build_prompt_tokens_inserts_N_img_ids(task: str, bot_task: str | None):
     """N=1/2/3 -> the resulting id sequence contains exactly N copies of
     img_id (=2) sitting consecutively after the `User: ` segment."""
     tok = FakeTokenizer()
-    ids_n1 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=1)
+    ids_n1 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=1).token_ids
     tok = FakeTokenizer()
-    ids_n2 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=2)
+    ids_n2 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=2).token_ids
     tok = FakeTokenizer()
-    ids_n3 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=3)
+    ids_n3 = build_prompt_tokens("hi", tok, task=task, bot_task=bot_task, num_images=3).token_ids
 
     assert ids_n1.count(2) == 1
     assert ids_n2.count(2) == 2
@@ -145,9 +145,9 @@ def test_build_prompt_tokens_default_num_images_matches_legacy():
     omitting the parameter (regression guard for existing single-image
     callers)."""
     tok_a = FakeTokenizer()
-    legacy = build_prompt_tokens("hi", tok_a, task="it2i", bot_task="think")
+    legacy = build_prompt_tokens("hi", tok_a, task="it2i", bot_task="think").token_ids
     tok_b = FakeTokenizer()
-    explicit = build_prompt_tokens("hi", tok_b, task="it2i", bot_task="think", num_images=1)
+    explicit = build_prompt_tokens("hi", tok_b, task="it2i", bot_task="think", num_images=1).token_ids
     assert legacy == explicit
     # Also: encode() must have been called on the same set of segments,
     # so segment boundaries are preserved.
@@ -173,7 +173,7 @@ def test_text_only_tasks_ignore_num_images(task: str, bot_task: str | None, num_
     any num_images and emit zero `<img>` placeholders."""
     s = build_prompt("hi", task=task, bot_task=bot_task, num_images=num_images)
     assert "<img>" not in s
-    ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, bot_task=bot_task, num_images=num_images)
+    ids = build_prompt_tokens("hi", FakeTokenizer(), task=task, bot_task=bot_task, num_images=num_images).token_ids
     assert 2 not in ids
 
 
@@ -202,7 +202,7 @@ def test_real_tokenizer_emits_n_consecutive_img_ids(num_images: int):
     img_id = tok.convert_tokens_to_ids("<img>")
     assert img_id is not None and img_id >= 0, f"<img> not in tokenizer vocab; got id={img_id}"
 
-    ids = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=num_images)
+    ids = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=num_images).token_ids
 
     # Exactly N copies of <img> id, all consecutive.
     img_positions = [i for i, x in enumerate(ids) if x == img_id]
@@ -225,9 +225,9 @@ def test_real_tokenizer_n_plus_one_extends_by_exactly_one_img_id():
     tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
     img_id = tok.convert_tokens_to_ids("<img>")
 
-    ids_n1 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1)
-    ids_n2 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=2)
-    ids_n3 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=3)
+    ids_n1 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1).token_ids
+    ids_n2 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=2).token_ids
+    ids_n3 = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=3).token_ids
 
     assert len(ids_n2) == len(ids_n1) + 1, f"N=2 should be N=1 + 1 token; got {len(ids_n2)} vs {len(ids_n1)}"
     assert len(ids_n3) == len(ids_n1) + 2, f"N=3 should be N=1 + 2 tokens; got {len(ids_n3)} vs {len(ids_n1)}"
@@ -250,6 +250,6 @@ def test_real_tokenizer_default_n1_byte_identical_to_legacy():
     from transformers import AutoTokenizer
 
     tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
-    legacy = build_prompt_tokens("hi", tok, task="it2i", bot_task="think")
-    explicit = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1)
+    legacy = build_prompt_tokens("hi", tok, task="it2i", bot_task="think").token_ids
+    explicit = build_prompt_tokens("hi", tok, task="it2i", bot_task="think", num_images=1).token_ids
     assert legacy == explicit, "real tokenizer: default num_images=1 must be byte-identical to legacy"
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index ef635825c3b..371646556f2 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -79,6 +79,14 @@ def test_legacy_base_task_omitted_bot_task_keeps_plain_mode():
     }
 
 
+def test_legacy_composite_task_with_none_bot_task_keeps_encoded_mode():
+    prompt = build_prompt("HELLO", task="it2i_think", bot_task=None)
+    assert prompt.endswith("Assistant: <think>")
+
+    result = build_prompt_tokens("hi", FakeTokenizer(), task="it2i_recaption", bot_task=None)
+    assert result.token_ids[-1] == FakeTokenizer.SPECIAL["<recaption>"]
+
+
 def test_default_prompt_still_uses_it2i_think_mode():
     prompt = build_prompt("HELLO")
     assert prompt.endswith("Assistant: <think>")
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 6e1453d0ed2..f78b19a5746 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -95,6 +95,8 @@ class _DefaultBotTask:
     "t2i_recaption": ("en_unified", "recaption", "<recaption>"),
 }
 
+_LEGACY_COMPOSITE_TASKS: frozenset[str] = frozenset(_TASK_PRESETS) - {"t2t", "i2t", "t2i"}
+
 
 def _normalize_task_and_bot_task(
     task: str,
@@ -110,6 +112,12 @@ def _normalize_task_and_bot_task(
             base_task = task
         if bot_task_was_omitted:
             bot_task = legacy_bot_task
+        elif task in _LEGACY_COMPOSITE_TASKS and bot_task is None:
+            # Composite task names already encode the legacy bot_task. Keep
+            # calls like build_prompt_tokens(task="it2i_think", bot_task=None)
+            # on their historical meaning; explicit None is the plain-mode
+            # escape hatch only for the new two-axis base tasks.
+            bot_task = legacy_bot_task
         task = base_task
     elif bot_task_was_omitted:
         bot_task = "think"

From 4fb78a3b4bee4d4e97e6684f888ca97c4bfd4875 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 11:48:10 +0800
Subject: [PATCH 28/43] fix(hunyuan_image3): harden edit bridge compatibility

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 ...test_serving_chat_multistage_generation.py |  86 ++++++++
 .../test_hunyuan_image3.py                    | 103 ++++++++++
 .../hunyuan_image3/pipeline_hunyuan_image3.py |   7 +-
 vllm_omni/entrypoints/openai/api_server.py    |   9 +-
 vllm_omni/entrypoints/openai/serving_chat.py  |  43 ++--
 .../stage_input_processors/hunyuan_image3.py  | 186 +++++++++---------
 6 files changed, 319 insertions(+), 115 deletions(-)
 create mode 100644 tests/model_executor/stage_input_processors/test_hunyuan_image3.py

diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index bd2e877bf18..92f0ac2dc98 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -265,6 +265,92 @@ def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(servi
     )
 
 
+@pytest.mark.parametrize("legacy_task", ["i2t", "t2t"])
+def test_build_multistage_generation_inputs_legacy_plain_tasks_stay_plain(serving_chat, legacy_task: str):
+    """Legacy bot_task=i2t/t2t must preserve those tasks' plain prompt mode.
+
+    The task/bot_task split must not normalize every legacy task-enum request
+    into bot_task="think"; i2t/t2t had no <think>/<recaption> trigger before
+    the split and should stay plain unless the caller passes an explicit
+    semantic bot_task.
+    """
+    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+    engine = SimpleNamespace(
+        stage_configs=[
+            SimpleNamespace(stage_type="llm", is_comprehension=True),
+            SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+        ],
+        default_sampling_params_list=[
+            SamplingParams(temperature=0.0),
+            OmniDiffusionSamplingParams(),
+        ],
+    )
+    images = [Image.new("RGB", (32, 32), color="red")]
+
+    legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+        serving_chat,
+        engine=engine,
+        prompt="describe me",
+        extra_body={"bot_task": legacy_task},
+        reference_images=images if legacy_task == "i2t" else [],
+        gen_params=OmniDiffusionSamplingParams(),
+    )
+    explicit_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+        serving_chat,
+        engine=engine,
+        prompt="describe me",
+        extra_body={"task": legacy_task},
+        reference_images=images if legacy_task == "i2t" else [],
+        gen_params=OmniDiffusionSamplingParams(),
+    )
+
+    assert legacy_prompt["prompt"] == explicit_prompt["prompt"]
+    assert legacy_prompt["prompt"].endswith("Assistant: ")
+    assert not legacy_prompt["prompt"].endswith("<think>")
+    assert not legacy_prompt["prompt"].endswith("<recaption>")
+
+
+@pytest.mark.parametrize(
+    "legacy_task,trigger",
+    [
+        ("it2i_think", "<think>"),
+        ("it2i_recaption", "<recaption>"),
+    ],
+)
+def test_build_multistage_generation_inputs_legacy_composite_tasks_still_work(
+    serving_chat,
+    legacy_task: str,
+    trigger: str,
+):
+    """Legacy composite task names passed through bot_task must still work."""
+    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+    engine = SimpleNamespace(
+        stage_configs=[
+            SimpleNamespace(stage_type="llm", is_comprehension=True),
+            SimpleNamespace(stage_type="diffusion", is_comprehension=False),
+        ],
+        default_sampling_params_list=[
+            SamplingParams(temperature=0.0),
+            OmniDiffusionSamplingParams(),
+        ],
+    )
+    images = [Image.new("RGB", (32, 32), color="red")]
+
+    legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
+        serving_chat,
+        engine=engine,
+        prompt="edit me",
+        extra_body={"bot_task": legacy_task},
+        reference_images=images,
+        gen_params=OmniDiffusionSamplingParams(),
+    )
+
+    assert legacy_prompt["prompt"].count("<img>") == 1
+    assert legacy_prompt["prompt"].endswith(f"Assistant: {trigger}")
+
+
 def test_build_multistage_generation_inputs_bot_task_semantic_changes_trigger_and_sys(serving_chat):
     """Passing bot_task=think_recaption (vs default "think") must flip the
     resolved sys_type to en_think_recaption (and trigger tag is still
diff --git a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
new file mode 100644
index 00000000000..faaa9785452
--- /dev/null
+++ b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for HunyuanImage3 stage input processor."""
+
+import builtins
+from types import SimpleNamespace
+
+import pytest
+
+from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS,
+)
+from vllm_omni.model_executor.stage_input_processors.hunyuan_image3 import (
+    _extract_ratio_index,
+    _truncate_at_cot_end,
+    ar2diffusion,
+)
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+def _source_output(token_ids: list[int], text: str = ""):
+    return SimpleNamespace(
+        outputs=[
+            SimpleNamespace(
+                token_ids=token_ids,
+                cumulative_token_ids=token_ids,
+                text=text,
+            )
+        ],
+        multimodal_output=None,
+    )
+
+
+def test_extract_ratio_index_uses_fixed_special_token_ids():
+    ratio_33 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_33>"]
+    ratio_36 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_36>"]
+
+    assert _extract_ratio_index([1, ratio_33, 2]) == 33
+    assert _extract_ratio_index([1, ratio_33, 2, ratio_36]) == 36
+
+
+def test_truncate_at_cot_end_uses_token_ids_when_text_skips_specials():
+    end_recaption = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"]
+    answer = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]
+    boi = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<boi>"]
+    ratio = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_0>"]
+    token_ids = [100, 101, end_recaption, answer, boi, ratio]
+
+    text, truncated = _truncate_at_cot_end(
+        "recaption body without special markers",
+        token_ids,
+    )
+
+    assert text == "recaption body without special markers"
+    assert truncated == [100, 101, end_recaption]
+
+
+def test_ar2diffusion_applies_ratio_and_truncates_tail_without_tokenizer(monkeypatch: pytest.MonkeyPatch):
+    real_import = builtins.__import__
+
+    def _block_transformers_import(name, *args, **kwargs):
+        if name == "transformers" or name.startswith("transformers."):
+            raise AssertionError("ar2diffusion must not import transformers on the bridge path")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr(builtins, "__import__", _block_transformers_import)
+
+    end_recaption = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"]
+    answer = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]
+    boi = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<boi>"]
+    size = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_size_1024>"]
+    ratio_0 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_0>"]
+    token_ids = [100, 101, end_recaption, answer, boi, size, ratio_0]
+
+    result = ar2diffusion(
+        [_source_output(token_ids, text="decoded without special tokens")],
+        prompt=[{"prompt": "edit", "height": 64, "width": 64}],
+    )
+
+    assert len(result) == 1
+    assert (result[0]["height"], result[0]["width"]) == (512, 2048)
+    assert result[0]["extra"]["ar_generated_text"] == "decoded without special tokens"
+    assert result[0]["extra"]["ar_token_ids"].tolist() == [100, 101, end_recaption]
+
+
+def test_ar2diffusion_forwards_custom_system_prompt_body():
+    end_think = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"]
+    marker = "CUSTOM_SYSTEM_BODY"
+
+    result = ar2diffusion(
+        [_source_output([100, end_think], text="thought</think>")],
+        prompt=[
+            {
+                "prompt": "edit",
+                "use_system_prompt": "custom",
+                "system_prompt": marker,
+            }
+        ],
+    )
+
+    assert result[0]["use_system_prompt"] == "custom"
+    assert result[0]["system_prompt"] == marker
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 5a9d1e48856..35390e7312d 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -1366,10 +1366,13 @@ def forward(
         use_system_prompt = extra_args.get("use_system_prompt")
         system_prompt = extra_args.get("system_prompt")
         # Fall back to per-prompt use_system_prompt forwarded by ar2diffusion
-        if use_system_prompt is None and req.prompts:
+        if req.prompts:
             first_prompt = req.prompts[0]
             if isinstance(first_prompt, dict):
-                use_system_prompt = first_prompt.get("use_system_prompt")
+                if use_system_prompt is None:
+                    use_system_prompt = first_prompt.get("use_system_prompt")
+                if system_prompt is None:
+                    system_prompt = first_prompt.get("system_prompt")
         if use_system_prompt is not None:
             system_prompt = get_system_prompt(use_system_prompt, "image", system_prompt)
             system_prompt = system_prompt.strip() if system_prompt is not None else ""
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 80b01ec284a..7107b544adc 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1759,11 +1759,10 @@ async def edit_images(
                 status_code=HTTPStatus.BAD_REQUEST.value,
                 detail=detail,
             )
-        # Only convert uploads to RGB when the caller opts into the
-        # Hunyuan-aware API surface (task / bot_task / sys_type). Legacy
-        # callers that send only the older bot_task=<task-enum> shape keep
-        # whatever PIL mode the upload arrived as, to preserve pre-existing
-        # behavior for non-Hunyuan flows.
+        # Convert uploads to RGB when the caller opts into the Hunyuan-aware
+        # API surface. This includes the legacy bot_task=<task-enum> form:
+        # keeping uploads as RGBA/P PIL objects makes online IT2I observe a
+        # different visual input than the offline path.
         normalize_edit_images_rgb = task is not None or bot_task is not None or sys_type is not None
         pil_images = await _load_input_images(input_images_list, normalize_rgb=normalize_edit_images_rgb)
         prompt["multi_modal_data"] = {}
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 4ba824f0909..7424a9e0d34 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2260,9 +2260,18 @@ def _build_multistage_generation_inputs(
         bot_task = extra_body.get("bot_task")
         sys_type = extra_body.get("sys_type")
         custom_system_prompt = extra_body.get("system_prompt")
-        if task is None and bot_task in set(_hunyuan3_available_tasks()):
+        legacy_task_from_bot_task = False
+        legacy_task_names = set(_hunyuan3_available_tasks()) | {
+            "it2i_think",
+            "it2i_recaption",
+            "t2i_think",
+            "t2i_recaption",
+            "t2i_vanilla",
+        }
+        if task is None and bot_task in legacy_task_names:
             task = bot_task
             bot_task = None
+            legacy_task_from_bot_task = True
 
         engine_prompt_data: dict[str, Any] | None = None
         modalities = ["image"]
@@ -2282,13 +2291,21 @@ def _build_multistage_generation_inputs(
             )
 
             num_images = len(reference_images) if reference_images else 1
-            # build_prompt defaults task="it2i"; preserve that when caller
-            # only passed bot_task semantic.
             effective_task = task if task is not None else "it2i"
-            # build_prompt defaults bot_task="think"; preserve that for legacy
-            # callers (passing bot_task=None to build_prompt explicitly gives a
-            # different (sys, trigger) than the default "think").
-            effective_bot_task = bot_task if bot_task is not None else "think"
+            build_kwargs = {
+                "task": effective_task,
+                "sys_type": sys_type,
+                "custom_system_prompt": custom_system_prompt,
+                "num_images": num_images,
+            }
+            if bot_task is not None:
+                build_kwargs["bot_task"] = bot_task
+            elif "bot_task" in extra_body and not legacy_task_from_bot_task:
+                # Preserve the prompt_utils distinction between omitted
+                # bot_task and explicit None. Omitted keeps each task's legacy
+                # default (`it2i` -> think, `i2t`/`t2t` -> plain), while
+                # explicit None is the caller's plain-mode request.
+                build_kwargs["bot_task"] = None
             if tokenizer is not None:
                 # HF byte-for-byte path: feed segment-tokenized prompt_token_ids
                 # so AR sees the same template-tokenization HF apply_chat_template
@@ -2301,11 +2318,7 @@ def _build_multistage_generation_inputs(
                 result = build_prompt_tokens(
                     prompt,
                     tokenizer,
-                    task=effective_task,
-                    bot_task=effective_bot_task,
-                    sys_type=sys_type,
-                    custom_system_prompt=custom_system_prompt,
-                    num_images=num_images,
+                    **build_kwargs,
                 )
                 prompt_token_ids = result.token_ids
                 system_prompt_type = result.system_prompt_type
@@ -2313,11 +2326,7 @@ def _build_multistage_generation_inputs(
                 # Legacy string path (e.g. unit tests with no tokenizer plumbed).
                 prompt = build_prompt(
                     prompt,
-                    task=effective_task,
-                    bot_task=effective_bot_task,
-                    sys_type=sys_type,
-                    custom_system_prompt=custom_system_prompt,
-                    num_images=num_images,
+                    **build_kwargs,
                 )
             if reference_images and len(reference_images) == 1:
                 engine_prompt_data = {"image": reference_images[0]}
diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index c95a2a48f18..a06d030d0da 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Stage input processor for HunyuanImage3: AR → Diffusion transition.
+"""Stage input processor for HunyuanImage3: AR to Diffusion transition.
 
 In IT2I (image editing) mode:
   - Stage 0 (AR) receives (image + edit instruction), generates CoT/latent tokens
-  - Stage 1 (DiT) receives the AR output + original image, denoises → edited image
+  - Stage 1 (DiT) receives the AR output + original image, denoises to edited image
 
 The ar2diffusion function bridges these two stages, following the same
 signature pattern as glm_image.ar2diffusion.
@@ -12,7 +12,6 @@
 
 from __future__ import annotations
 
-import os
 from functools import lru_cache
 from typing import Any
 
@@ -20,6 +19,9 @@
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
 
+from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS,
+)
 from vllm_omni.inputs.data import OmniTokensPrompt
 
 logger = init_logger(__name__)
@@ -27,12 +29,63 @@
 # AR emits `<img_size_BASE><img_ratio_Y>` after `</recaption>` in IT2I/T2I
 # (see `HunyuanImage3ForCausalMM.sample` and `_stage_transitions`). The
 # ratio_index resolves to a (height, width) bucket via ResolutionGroup, which
-# is the official upstream's mechanism for AR-driven output aspect — without
+# is the official upstream's mechanism for AR-driven output aspect; without
 # this lookup the DiT pipeline falls back to the user-provided width/height
 # (in the `/v1/images/edits` path that defaults to `pil_images[0].size`,
-# i.e. the first reference image's bucket — usually square, see
+# i.e. the first reference image's bucket, usually square, see
 # api_server.py:1808-1811).
-_DEFAULT_HUNYUAN_IMAGE3_MODEL = "tencent/HunyuanImage-3.0-Instruct"
+_HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS: tuple[str, ...] = (
+    "1024x768",
+    "1280x720",
+    "768x1024",
+    "720x1280",
+)
+
+
+class _Resolution:
+    def __init__(self, size: str | int | tuple[int, int], *args: int):
+        if isinstance(size, str):
+            if "x" in size:
+                h, w = size.split("x")
+                size = (int(h), int(w))
+            else:
+                size = int(size)
+        if args:
+            size = (int(size), args[0])
+        if isinstance(size, int):
+            size = (size, size)
+
+        self.height = int(size[0])
+        self.width = int(size[1])
+        self.ratio = self.height / self.width
+
+
+def _build_resolutions_by_step(base_size: int, align: int = 1) -> list[_Resolution]:
+    step = base_size // 16
+    min_height = base_size // 2
+    min_width = base_size // 2
+    max_height = base_size * 2
+    max_width = base_size * 2
+
+    resolutions = [_Resolution(base_size, base_size)]
+
+    cur_height, cur_width = base_size, base_size
+    while True:
+        if cur_height >= max_height and cur_width <= min_width:
+            break
+        cur_height = min(cur_height + step, max_height)
+        cur_width = max(cur_width - step, min_width)
+        resolutions.append(_Resolution(cur_height // align * align, cur_width // align * align))
+
+    cur_height, cur_width = base_size, base_size
+    while True:
+        if cur_height <= min_height and cur_width >= max_width:
+            break
+        cur_height = max(cur_height - step, min_height)
+        cur_width = min(cur_width + step, max_width)
+        resolutions.append(_Resolution(cur_height // align * align, cur_width // align * align))
+
+    return sorted(resolutions, key=lambda x: x.ratio)
 
 
 @lru_cache(maxsize=4)
@@ -43,45 +96,16 @@ def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]:
     `reso_group[ratio_index]` reverse lookup. Cached because the table
     is constant per `base_size`.
     """
-    from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import (
-        HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS,
-        Resolution,
-        ResolutionGroup,
-    )
-
-    reso_group = ResolutionGroup(
-        base_size=base_size,
-        extra_resolutions=[Resolution(s) for s in HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS],
-    )
-    return [(int(r.height), int(r.width)) for r in reso_group.data]
-
-
-@lru_cache(maxsize=4)
-def _build_cot_end_token_ids(model_name_or_path: str) -> dict[str, int]:
-    """Return `{'</recaption>': id, '</think>': id}` for cot-boundary
-    truncation. Empty dict on lookup failure so callers degrade to a
-    pure text-based search.
-    """
-    try:
-        from transformers import AutoTokenizer
-
-        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
-    except Exception as e:  # pragma: no cover - environment-dependent
-        logger.warning("[ar2diffusion] failed to load tokenizer for cot-end lookup: %s", e)
-        return {}
-
-    result: dict[str, int] = {}
-    for marker in ("</recaption>", "</think>"):
-        tid = tokenizer.convert_tokens_to_ids(marker)
-        if tid is not None and tid != tokenizer.unk_token_id:
-            result[marker] = int(tid)
-    return result
+    resolutions = _build_resolutions_by_step(base_size)
+    for extra_resolution in (_Resolution(s) for s in _HUNYUAN_IMAGE3_EXTRA_RESOLUTIONS):
+        if not any(r.ratio == extra_resolution.ratio for r in resolutions):
+            resolutions.append(extra_resolution)
+    return [(r.height, r.width) for r in resolutions]
 
 
 def _truncate_at_cot_end(
     generated_text: str,
     generated_token_ids,
-    model_name_or_path: str,
 ) -> tuple[str, list[int]]:
     """Truncate AR output at first `</recaption>` (or `</think>` fallback).
 
@@ -89,63 +113,50 @@ def _truncate_at_cot_end(
     upstream, which decodes only `generated_tokens[0, :end_pos + 1]` as
     `cot_text` for DiT. The trailing `<answer><boi><img_size_*><img_ratio_*>`
     sequence is a stage-transition trigger consumed via `image_size` /
-    height/width — it must NOT be forwarded to DiT's prompt builder, or
+    height/width; it must NOT be forwarded to DiT's prompt builder, or
     the extra `<boi>` and ratio tokens drift the DiT's own prompt
     structure.
     """
     token_list = list(generated_token_ids) if generated_token_ids is not None else []
 
-    end_ids = _build_cot_end_token_ids(model_name_or_path)
+    end_ids = {
+        "</recaption>": HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"],
+        "</think>": HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"],
+    }
 
     for marker in ("</recaption>", "</think>"):
-        idx = generated_text.find(marker)
-        if idx == -1:
-            continue
-        text_end = idx + len(marker)
-        truncated_text = generated_text[:text_end]
-
         truncated_tokens = token_list
-        end_id = end_ids.get(marker)
-        if end_id is not None and token_list:
+        end_id = end_ids[marker]
+        if token_list:
             try:
                 token_end = token_list.index(end_id)
                 truncated_tokens = token_list[: token_end + 1]
             except ValueError:
                 pass
-        return truncated_text, truncated_tokens
+
+        idx = generated_text.find(marker)
+        if idx != -1:
+            text_end = idx + len(marker)
+            return generated_text[:text_end], truncated_tokens
+        if truncated_tokens is not token_list:
+            return generated_text, truncated_tokens
 
     return generated_text, token_list
 
 
 @lru_cache(maxsize=4)
-def _build_ratio_id_lookup(model_name_or_path: str) -> dict[int, int]:
-    """Return `{token_id: ratio_index}` for `<img_ratio_*>` in the tokenizer.
-
-    Loads the tokenizer once per model path and walks the contiguous
-    `<img_ratio_0>..<img_ratio_32>` plus the extra slice
-    `<img_ratio_33>..<img_ratio_36>` (the same shape
-    `HunyuanImage3ForCausalMM.__init__` registers at lines 1523-1531).
-    Empty dict on lookup failure so callers can degrade gracefully.
+def _build_ratio_id_lookup() -> dict[int, int]:
+    """Return `{token_id: ratio_index}` for HunyuanImage3 ratio tokens.
+
+    The ids are fixed in tokenizer.json and already pinned in prompt_utils.
+    Avoid loading AutoTokenizer here: this bridge runs on the hot AR->DiT
+    transition path and must keep working in offline deployments where the
+    tokenizer object is not exposed to the stage-input processor.
     """
-    try:
-        from transformers import AutoTokenizer
-
-        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
-    except Exception as e:  # pragma: no cover - environment-dependent
-        logger.warning("[ar2diffusion] failed to load tokenizer for ratio token lookup: %s", e)
-        return {}
-
-    def _id(name: str) -> int | None:
-        tid = tokenizer.convert_tokens_to_ids(name)
-        return None if tid is None or tid == tokenizer.unk_token_id else int(tid)
-
-    ratio_0 = _id("<img_ratio_0>")
-    ratio_32 = _id("<img_ratio_32>")
-    ratio_33 = _id("<img_ratio_33>")
-    ratio_36 = _id("<img_ratio_36>")
-    if None in (ratio_0, ratio_32, ratio_33, ratio_36):
-        logger.warning("[ar2diffusion] tokenizer is missing one of <img_ratio_{0,32,33,36}> tokens")
-        return {}
+    ratio_0 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_0>"]
+    ratio_32 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_32>"]
+    ratio_33 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_33>"]
+    ratio_36 = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_36>"]
 
     table: dict[int, int] = {}
     for i in range(ratio_32 - ratio_0 + 1):
@@ -156,22 +167,20 @@ def _id(name: str) -> int | None:
     return table
 
 
-def _extract_ratio_index(generated_token_ids, model_name_or_path: str) -> int | None:
+def _extract_ratio_index(generated_token_ids) -> int | None:
     """Resolve the AR-predicted ratio_index from this stage's output.
 
     `HunyuanImage3ForCausalMM`'s `_stage_transitions` forces the AR to emit
     exactly one `<img_ratio_*>` token after `</recaption><answer><boi>
     <img_size_*>`, so we scan the token stream from the tail for the first
-    id that maps to a ratio. Token-ids are the source of truth — text-side
+    id that maps to a ratio. Token-ids are the source of truth; text-side
     regex is unreliable because most deploy yamls run AR with
     `skip_special_tokens: True` (special tokens are stripped from text but
     still present in `cumulative_token_ids`).
     """
     if generated_token_ids is None:
         return None
-    table = _build_ratio_id_lookup(model_name_or_path)
-    if not table:
-        return None
+    table = _build_ratio_id_lookup()
     for tid in reversed(list(generated_token_ids)):
         idx = table.get(int(tid))
         if idx is not None:
@@ -230,10 +239,7 @@ def ar2diffusion(
         # square in the multi-image / mismatched-aspect case. Mirrors the
         # official upstream where `reso_group[ratio_index]` is the
         # canonical source of the diffusion target shape.
-        model_name_or_path = original_prompt.get("model") or os.environ.get(
-            "VLLM_OMNI_HUNYUAN_IMAGE3_MODEL", _DEFAULT_HUNYUAN_IMAGE3_MODEL
-        )
-        ratio_idx = _extract_ratio_index(generated_token_ids, model_name_or_path)
+        ratio_idx = _extract_ratio_index(generated_token_ids)
         ar_predicted = False
         if ratio_idx is not None:
             base_size = int(original_prompt.get("image_base_size", 1024))
@@ -253,14 +259,12 @@ def ar2diffusion(
 
         # Truncate the AR output at `</recaption>` (or `</think>`) before
         # passing to DiT. Mirrors official `generate_image` which keeps
-        # `cot_text` clean and routes size/ratio via `image_size` only —
+        # `cot_text` clean and routes size/ratio via `image_size` only;
         # we already extracted `ratio_idx` above and translated it into
         # `height` / `width`, so the `<answer><boi><img_size_*><img_ratio_*>`
         # tail has no remaining job and would only contaminate DiT's
         # prompt builder if forwarded.
-        cot_text_for_dit, cot_token_ids_for_dit = _truncate_at_cot_end(
-            generated_text, generated_token_ids, model_name_or_path
-        )
+        cot_text_for_dit, cot_token_ids_for_dit = _truncate_at_cot_end(generated_text, generated_token_ids)
 
         logger.info(
             "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, "

From 38668a6e5785fab2b50728d1b231badd0e82efe1 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Tue, 12 May 2026 15:22:51 +0800
Subject: [PATCH 29/43] revert(hunyuan_image3): roll cond preprocessing back to
 magnet_repro state

Restores the IT2I online image quality observed at the magnet_repro
deploy. Two changes from the PR review-feedback round regressed image
quality on multi-image edit prompts:

1. 4da2ff687 switched cond VAE from `latent_dist.sample(generator)` to
   `latent_dist.mode()` on both AR and DiT sides. The posterior mean
   produces visibly degraded conditioning vs the fixed-seed sample.
2. 1785580ef changed AR `_resize_and_crop` default from `"resize"` to
   `"center"` to match a non-existent DiT center-crop default (DiT
   bridge actually defaults to `"resize"` too). This broke AR/DiT
   preprocessing alignment instead of fixing it.

Revert both:
- AR `_resize_and_crop` default back to `"resize"` and its docstring.
- AR/DiT `_vae_encode`/`vae_encode` back to fixed-generator sample.
- Remove the now-dead `.mode()` method on
  `DiagonalGaussianDistribution`.
- Remove the AR/DiT byte-identical preprocessing test added by
  1785580ef -- it asserted the wrong invariant (AR `"center"` == DiT
  `_resize_and_crop_center`), which no longer holds and was never the
  right alignment target.

Keeps the other 4da2ff687 fixes intact: system_prompt body forwarding,
ratio extraction simplification, stale `it2i_recaption` compound name
cleanup, duplicate `prompt_token_ids` assignment removal.

Signed-off-by: Claude Code <noreply@anthropic.com>
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../test_hunyuan_image3_it2i_ar_format.py     | 38 -------------------
 .../models/hunyuan_image3/autoencoder.py      |  3 --
 .../hunyuan_image3/pipeline_hunyuan_image3.py |  9 +++--
 .../models/hunyuan_image3/hunyuan_image3.py   | 28 +++++++-------
 4 files changed, 18 insertions(+), 60 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
index 7550caa50f7..7e7b7de91b2 100644
--- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_it2i_ar_format.py
@@ -72,44 +72,6 @@ def _snapshot_dir(model_id: str) -> pathlib.Path:
 # tests/e2e/accuracy/test_hunyuan_image3_it2i_ar_output.py.
 
 
-def test_ar_and_dit_condition_image_preprocessing_match_without_hf_cache():
-    """AR and DiT must preprocess the same IT2I condition image into the
-    same VAE pixels.
-
-    This catches drift between the AR-side multimodal processor and the
-    diffusion-side bridge without requiring model weights or tokenizer files.
-    In particular, portrait input expanded to a landscape output is sensitive
-    to accidentally using ``crop_type="resize"`` on one side and center crop
-    on the other; the two paths then condition on visibly different fabric
-    regions and leave seam-like artifacts around the edited object.
-    """
-    import numpy as np
-    from PIL import Image
-
-    from vllm_omni.diffusion.models.hunyuan_image3.pipeline_hunyuan_image3 import (
-        _resize_and_crop_center,
-    )
-    from vllm_omni.model_executor.models.hunyuan_image3.hunyuan_image3 import (
-        HunyuanImage3Processor,
-    )
-
-    rng = np.random.default_rng(seed=3444)
-    src_size_pairs = [(735, 1104), (640, 1024), (1280, 720), (1024, 1024)]
-    target_size_pairs = [(1024, 1024), (1024, 768), (768, 1024), (1280, 720)]
-
-    for src_w, src_h in src_size_pairs:
-        src_arr = rng.integers(0, 256, size=(src_h, src_w, 3), dtype=np.uint8)
-        src = Image.fromarray(src_arr, mode="RGB")
-        for tw, th in target_size_pairs:
-            ar_out = HunyuanImage3Processor._resize_and_crop(None, src, (tw, th))
-            dit_out = _resize_and_crop_center(src, tw, th)
-
-            assert ar_out.size == dit_out.size == (tw, th)
-            assert np.array_equal(np.asarray(ar_out), np.asarray(dit_out)), (
-                f"AR and DiT condition preprocessing diverged for src={(src_w, src_h)} target={(tw, th)}"
-            )
-
-
 _OFFICIAL_PKG = "_hunyuan_image_3_official_snapshot"
 
 
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py
index ddd7d5c6df7..efba2f27435 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py
@@ -46,9 +46,6 @@ def sample(self, generator: torch.Generator | None = None) -> torch.FloatTensor:
         x = self.mean + self.std * sample
         return x
 
-    def mode(self) -> torch.FloatTensor:
-        return self.mean
-
 
 @dataclass
 class DecoderOutput(BaseOutput):
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 35390e7312d..14aa0ea903d 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -634,10 +634,11 @@ def vae_encode(self, image, cfg_factor=1):
             if isinstance(vae_encode_result, torch.Tensor):
                 latents = vae_encode_result
             else:
-                # Cond image is clean conditioning (t=0 below) -- use the
-                # posterior mean so encoding is deterministic by construction.
-                # See AR-side comment in model_executor/.../hunyuan_image3.py.
-                latents = vae_encode_result.latent_dist.mode()
+                # Match HunyuanImage-3's cond encode path: sample the
+                # posterior, but use a fixed generator so repeated online
+                # requests are deterministic.
+                _cond_vae_gen = torch.Generator(device=image.device).manual_seed(0)
+                latents = vae_encode_result.latent_dist.sample(_cond_vae_gen)
             if hasattr(config, "shift_factor") and config.shift_factor:
                 latents.sub_(config.shift_factor)
             if hasattr(config, "scaling_factor") and config.scaling_factor:
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 40a38c7b5ac..cfd5c6764ad 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -907,8 +907,8 @@ def process_image(self, image_input: ImageInput):
             current_info["vit_spatial_shapes"] = _ss.squeeze(0)
 
             # VAE: per-image bucket via `reso_group.get_target_size`; mirrors
-            # HF's `resize_and_crop` default (crop_type="center", the official
-            # generate_image default when infer_align_image_size=False).
+            # HF's `resize_and_crop` (crop_type="center", the official
+            # generate_image default with infer_align_image_size=False).
             # Keep fp32 — the VAE encoder casts to model dtype at its
             # boundary (see `_vae_encode`).
             image_width, image_height = self.reso_group.get_target_size(image.width, image.height)
@@ -957,13 +957,13 @@ def _resize_and_crop(
         self,
         image: Image.Image,
         target_size: tuple[int, int],
-        crop_type: str = "center",
+        crop_type: str = "resize",
     ) -> Image.Image:
-        # Default mode mirrors official `generate_image` with
-        # infer_align_image_size=False: preserve aspect ratio and center-crop
-        # to the nearest VAE bucket. Keeping this default aligned with the
-        # DiT-side condition-image helper avoids AR and DiT seeing different
-        # conditioning pixels for the same IT2I request.
+        # Default mode mirrors the official `infer_align_image_size=True`
+        # path (image_processor.py:355 → crop_type="resize") used by the
+        # IT2I demo: stretch the cond image to the bucket dims so its
+        # `<img_ratio_*>` tag and ViT/VAE features stay aligned with the
+        # bucket, instead of dropping content via center crop.
         tw, th = target_size
         if crop_type == "resize":
             return image.resize((tw, th), resample=Image.Resampling.LANCZOS)
@@ -1777,13 +1777,11 @@ def _vae_encode(
             images = images.to(dtype=self.vae.dtype)
 
         vae_encode_result = self.vae.encode(images)
-        # Cond image is clean (t=0) conditioning -- take the posterior mean
-        # so encoding is deterministic by construction. `.sample()` without a
-        # generator consumes torch's global RNG and silently drifts between
-        # requests on a long-running server (online) while looking stable for
-        # fresh-process callers (offline). `.mode()` matches the official
-        # HunyuanImage-3 cond encode path.
-        latents = vae_encode_result.latent_dist.mode()
+        # Match HunyuanImage-3's cond encode path: sample the posterior, but
+        # use a fixed generator so online requests do not consume the global
+        # RNG and drift across a long-running server.
+        _cond_vae_gen = torch.Generator(device=images.device).manual_seed(0)
+        latents = vae_encode_result.latent_dist.sample(_cond_vae_gen)
 
         # Apply shift and scaling factors if present
         if hasattr(config, "shift_factor") and config.shift_factor:

From 9bc67cc589fbb5afc7edcd6b3d60c27bbbcd2656 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Tue, 12 May 2026 16:12:14 +0800
Subject: [PATCH 30/43] fix(hunyuan_image3): stop AR on <|endoftext|> for
 image-output tasks

`resolve_stop_token_ids` returned `<answer>` (128025) for all (task,
bot_task) combos. For image-output tasks (`it2i` / `t2i`) this stops
the AR halfway through the size/ratio tail that
`_stage_transitions[</recaption>]` forces:

    </recaption><answer><boi><img_size_*><img_ratio_*><|endoftext|>
                ^^^^^^^^ stopped here, ratio never emitted

Downstream `ar2diffusion::_extract_ratio_index` then scans
`cumulative_token_ids` for any `<img_ratio_*>`, finds none, and falls
back to the prompt-carried `height`/`width`. In `end2end.py` for
multi-image IT2I that means the first reference image's shape -- e.g.
a 512x512 logo + a 1179x685 fabric reference collapses the DiT bucket
to 1024x1024 square even though the AR CoT planned image_2's
landscape aspect. Width and texture both regress simultaneously
because DiT has to squeeze the landscape-planned content into a
square bucket.

The online path didn't trip this because the deploy yaml explicitly sets
`stop_token_ids: [127957]` (= `<|endoftext|>`) and online serving never
goes through `end2end.py`. The offline `end2end.py` script overrides the
yaml value with `resolve_stop_token_ids(...)`, so offline runs always hit
the broken stop regardless of yaml.

Fix: return `[<|endoftext|>]` for `it2i` / `t2i` so AR runs through
the forced tail and `<img_ratio_*>` reaches `ar2diffusion`. Keep
`[<answer>]` for `i2t` / `t2t` -- those are comprehension stages
where the response body sits inside `<answer>`, so the answer-open
*is* the natural terminator.

Rename `test_resolve_stop_token_ids_uses_answer_for_generation_tasks`
to `test_resolve_stop_token_ids_image_tasks_stop_on_eos_not_answer` and
assert the new (correct) split.

Signed-off-by: Claude Code <noreply@anthropic.com>
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../hunyuan_image3/test_prompt_utils.py       | 24 +++++++++++++++----
 .../models/hunyuan_image3/prompt_utils.py     | 18 ++++++++++++++
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 371646556f2..0579caaaac8 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -96,12 +96,28 @@ def test_default_prompt_still_uses_it2i_think_mode():
     assert result.token_ids[-1] == FakeTokenizer.SPECIAL["<think>"]
 
 
-def test_resolve_stop_token_ids_uses_answer_for_generation_tasks():
+def test_resolve_stop_token_ids_image_tasks_stop_on_eos_not_answer():
+    """Image-output tasks must stop on <|endoftext|>, not <answer>.
+
+    Stopping on <answer> chops off the <boi><img_size_*><img_ratio_*>
+    tail forced by `_stage_transitions`, so `_extract_ratio_index` in
+    `ar2diffusion` finds nothing and the DiT output bucket collapses to
+    the first reference image's shape (e.g. 1024x1024 square when AR's
+    CoT planned a 1280x720 landscape).
+    """
     tok = FakeTokenizer()
+
+    eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
     answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]
-    assert resolve_stop_token_ids(task="t2i", bot_task="think", tokenizer=tok) == [answer_id]
-    assert resolve_stop_token_ids(task="t2i", bot_task="recaption", tokenizer=tok) == [answer_id]
-    assert resolve_stop_token_ids(task="it2i", bot_task="think", tokenizer=tok) == [answer_id]
+
+    # Image-output: t2i / it2i must let AR emit the size/ratio tail.
+    for bot in ("think", "recaption", "think_recaption", "vanilla"):
+        assert resolve_stop_token_ids(task="t2i", bot_task=bot, tokenizer=tok) == [eos_id]
+        assert resolve_stop_token_ids(task="it2i", bot_task=bot, tokenizer=tok) == [eos_id]
+
+    # Text-output: i2t / t2t comprehension stops on <answer> (response sits inside).
+    assert resolve_stop_token_ids(task="i2t", bot_task=None, tokenizer=tok) == [answer_id]
+    assert resolve_stop_token_ids(task="t2t", bot_task=None, tokenizer=tok) == [answer_id]
 
 
 @pytest.mark.parametrize(
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index f78b19a5746..196c86dfa5d 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -147,11 +147,29 @@ def resolve_stop_token_ids(
     bot_task: str | None | _DefaultBotTask = _DEFAULT_BOT_TASK,
     tokenizer: Any | None = None,
 ) -> list[int]:
+    """AR stop-token ids for a given (task, bot_task) generation request.
+
+    Image-output tasks (``it2i`` / ``t2i``) must stop on ``<|endoftext|>``:
+    after ``</recaption>`` the AR's ``_stage_transitions`` force-emits
+    ``<answer><boi><img_size_*>`` and then samples ``<img_ratio_*>`` under
+    ``_apply_ratio_restriction`` followed by ``<|endoftext|>``. Stopping
+    early on ``<answer>`` chops off the size/ratio tail, leaves
+    ``_extract_ratio_index`` empty in ``ar2diffusion``, and silently
+    collapses the DiT output bucket to the first reference image's shape
+    (square logo -> 1024x1024 even when AR's CoT plans a landscape).
+
+    Text-output tasks (``i2t`` / ``t2t``) stop on ``<answer>`` -- the AR is
+    the final stage, and the comprehension response sits inside the
+    ``<answer>`` body so the answer-open is the natural cot/recaption
+    terminator.
+    """
     task, bot_task = _normalize_task_and_bot_task(task, bot_task)
     if task not in _TASKS:
         raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
     if bot_task not in _BOT_TASK_PRESETS:
         raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}")
+    if task in ("it2i", "t2i"):
+        return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]]
     return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]]
 
 

From dec1c436b70cc2350965813e2e6ab6a3be5f39d3 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Tue, 12 May 2026 22:08:31 +0800
Subject: [PATCH 31/43] [Bugfix][HunyuanImage3] cap AR KV snapshot at
 </recaption>, defer mid-decode kv_ready forward

Two coupled changes so HunyuanImage3 IT2I no longer ships KV for the
<answer><boi><img_size><img_ratio><eos> tail that DiT discards anyway:

1. deploy/hunyuan_image3.yaml: add ``kv_transfer_criteria`` so AR's
   snapshot fires at </recaption> (token id 128019). ``stop_after_transfer:
   false`` keeps the AR running past the snapshot so it can still emit
   <img_ratio_*> for ``ar2diffusion._extract_ratio_index``. With this
   yaml + the orchestrator change below, the colleague-confirmed
   invariant S - N == 1 (where S is the shipped KV length and N is the
   DiT-side ``positive_reuse_len``) is restored. Without the yaml the AR
   ships KV all the way through <eos> and S - N grows to 6.

2. engine/orchestrator.py: ``_handle_kv_ready_raw_outputs`` previously
   forwarded any kv_ready EngineCoreOutput straight to the next stage.
   With ``stop_after_transfer: false`` the kv_ready signal fires
   mid-decode (snapshot at </recaption>, AR still emitting tail), so the
   raw EngineCoreOutput has no ``.outputs[0]`` and bridges that read
   the AR's full text (HunyuanImage3 ``ar2diffusion``) hit
   ``AttributeError``. Skip the forward when no finished output for the
   same req_id is present in the same raw_outputs batch; the AR's
   eventual natural-finish RequestOutput will trigger the forward
   through ``_route_output``. Bagel's existing flow (kv_ready and the
   deferred-stop finish output co-emit in the same batch) is preserved.

Signed-off-by: zuiho <wu15922848573@outlook.com>
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 vllm_omni/deploy/hunyuan_image3.yaml | 20 ++++++++++++++++++++
 vllm_omni/engine/orchestrator.py     | 18 ++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index 634165cd33a..8f7c57fdd64 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -37,6 +37,26 @@ stages:
         rope_type: default
     omni_kv_config:
       need_send_cache: true
+      # Cap AR KV snapshot at </recaption> so the shipped KV exactly
+      # matches the prefix the DiT side reuses (positive_reuse_len =
+      # 0-based index of </recaption>, slice ``k[:positive_reuse_len]``
+      # excludes </recaption> itself). Mirrors the colleague-confirmed
+      # invariant S - N == 1. Without this the AR ships KV all the way
+      # through <answer><boi><img_size><img_ratio><eos>, which DiT
+      # silently discards (S - N == 6) and which keeps the AR pipeline
+      # busy emitting tail tokens that DiT will never use.
+      #
+      # ``stop_after_transfer: false`` keeps the AR running past the
+      # snapshot so it still emits <img_ratio_*>, which ``ar2diffusion``
+      # extracts to derive image height/width. The mid-decode kv_ready
+      # signal that this combination produces is handled in the
+      # orchestrator: forwarding to DiT is deferred until the AR's
+      # natural finish output arrives (see
+      # ``_handle_kv_ready_raw_outputs``).
+      kv_transfer_criteria:
+        type: special_token
+        token_id: 128019  # </recaption>
+        stop_after_transfer: false
     output_connectors:
       to_stage_1: shared_memory_connector
     default_sampling_params:
diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py
index 2d2ac47cbb3..37a9eb291c8 100644
--- a/vllm_omni/engine/orchestrator.py
+++ b/vllm_omni/engine/orchestrator.py
@@ -695,6 +695,21 @@ async def _handle_kv_ready_raw_outputs(
         if self.async_chunk:
             return
 
+        # When kv_ready fires mid-decode (e.g. HunyuanImage3 with
+        # kv_transfer_criteria=special_token + stop_after_transfer=false,
+        # snapshot triggers at </recaption> but AR keeps generating tail
+        # tokens for ratio extraction), the kv_ready EngineCoreOutput is
+        # NOT a finished RequestOutput, so bridges that read
+        # ``ar_output.outputs[0]`` (HunyuanImage3 ar2diffusion) crash. Only
+        # forward kv_ready when the same raw_outputs batch also contains a
+        # finished output for that req_id; otherwise wait for AR's natural
+        # completion to trigger the forward through ``_route_output``.
+        finished_in_batch = {
+            o.request_id
+            for o in raw_outputs.outputs
+            if getattr(o, "finish_reason", None) is not None
+        }
+
         for raw_output in raw_outputs.outputs:
             kv_params = getattr(raw_output, "kv_transfer_params", None)
             if not (isinstance(kv_params, dict) and kv_params.get("kv_ready")):
@@ -712,6 +727,9 @@ async def _handle_kv_ready_raw_outputs(
             if (stage_id + 1) in req_state.stage_submit_ts:
                 continue
 
+            if req_id not in finished_in_batch:
+                continue
+
             if self._cfg_tracker.has_companions(req_id) and not self._cfg_tracker.all_companions_done(req_id):
                 self._cfg_tracker.defer_parent(req_id, raw_output, stage_id)
             else:

From b84bc2ffa594c796d40a2af0631d8fb0d0c23628 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 11:43:03 +0800
Subject: [PATCH 32/43] fix(hunyuan_image3): cap IT2I input images at
 MAX_IMAGES_PER_REQUEST in entry layer

Per PR #3444 review (Gaohan123): raise a friendly error that names the
offending input at the entry boundary, instead of relying on the deeper
`prompt_utils._validate_num_images` check, whose failure surfaces as a
generic `num_images must be in [1, 3]` message. Reuse
`MAX_IMAGES_PER_REQUEST` so the cap stays defined in one place.

- offline `end2end.py`: validate `--image-path` count before opening any image with PIL
- online `serving_chat._build_multistage_generation_inputs`: validate
  `reference_images` count before building engine prompt data

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 examples/offline_inference/hunyuan_image3/end2end.py | 6 ++++++
 vllm_omni/entrypoints/openai/serving_chat.py         | 7 +++++++
 2 files changed, 13 insertions(+)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 908109d65a3..36b3b1199a5 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -8,6 +8,7 @@
 from pathlib import Path
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    MAX_IMAGES_PER_REQUEST,
     build_prompt_tokens,
     resolve_stop_token_ids,
     resolve_sys_type,
@@ -177,6 +178,11 @@ def main():
         from PIL import Image
 
         image_paths = [p.strip() for p in args.image_path.split(",") if p.strip()]
+        if len(image_paths) > MAX_IMAGES_PER_REQUEST:
+            raise ValueError(
+                f"--image-path accepts at most {MAX_IMAGES_PER_REQUEST} images for "
+                f"HunyuanImage-3.0 IT2I, got {len(image_paths)}: {args.image_path}"
+            )
         for image_path in image_paths:
             if not os.path.exists(image_path):
                 raise ValueError(f"Image path does not exist: {image_path}")
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 7424a9e0d34..26ca0d6170e 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2253,6 +2253,7 @@ def _build_multistage_generation_inputs(
         # downstream uses the canonical split. Source the task enum from
         # prompt_utils so this layer stays in sync with the model side.
         from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+            MAX_IMAGES_PER_REQUEST as _hunyuan3_max_images,
             available_tasks as _hunyuan3_available_tasks,
         )
 
@@ -2273,6 +2274,12 @@ def _build_multistage_generation_inputs(
             bot_task = None
             legacy_task_from_bot_task = True
 
+        if reference_images and len(reference_images) > _hunyuan3_max_images:
+            raise ValueError(
+                f"HunyuanImage-3.0 IT2I accepts at most {_hunyuan3_max_images} input "
+                f"images per request, got {len(reference_images)}"
+            )
+
         engine_prompt_data: dict[str, Any] | None = None
         modalities = ["image"]
         if reference_images:

From 029f567d08e7b465069b6f2a5b1af63ee87b51bd Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 12:06:05 +0800
Subject: [PATCH 33/43] chore: apply pre-commit ruff format / isort fixups

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 vllm_omni/engine/orchestrator.py             | 6 +-----
 vllm_omni/entrypoints/openai/serving_chat.py | 2 ++
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py
index 37a9eb291c8..a764c3b5247 100644
--- a/vllm_omni/engine/orchestrator.py
+++ b/vllm_omni/engine/orchestrator.py
@@ -704,11 +704,7 @@ async def _handle_kv_ready_raw_outputs(
         # forward kv_ready when the same raw_outputs batch also contains a
         # finished output for that req_id; otherwise wait for AR's natural
         # completion to trigger the forward through ``_route_output``.
-        finished_in_batch = {
-            o.request_id
-            for o in raw_outputs.outputs
-            if getattr(o, "finish_reason", None) is not None
-        }
+        finished_in_batch = {o.request_id for o in raw_outputs.outputs if getattr(o, "finish_reason", None) is not None}
 
         for raw_output in raw_outputs.outputs:
             kv_params = getattr(raw_output, "kv_transfer_params", None)
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 26ca0d6170e..dfd6c15168a 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2254,6 +2254,8 @@ def _build_multistage_generation_inputs(
         # prompt_utils so this layer stays in sync with the model side.
         from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
             MAX_IMAGES_PER_REQUEST as _hunyuan3_max_images,
+        )
+        from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
             available_tasks as _hunyuan3_available_tasks,
         )
 

From d8b9263f042cc09f0cb6d220f9ebef833f163dcf Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 12:09:35 +0800
Subject: [PATCH 34/43] chore: rename MAX_IMAGES_PER_REQUEST alias to uppercase
 (ruff N811)

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 vllm_omni/entrypoints/openai/serving_chat.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index dfd6c15168a..35dd4524fc0 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2253,7 +2253,7 @@ def _build_multistage_generation_inputs(
         # downstream uses the canonical split. Source the task enum from
         # prompt_utils so this layer stays in sync with the model side.
         from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-            MAX_IMAGES_PER_REQUEST as _hunyuan3_max_images,
+            MAX_IMAGES_PER_REQUEST as _HUNYUAN3_MAX_IMAGES,
         )
         from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
             available_tasks as _hunyuan3_available_tasks,
@@ -2276,9 +2276,9 @@ def _build_multistage_generation_inputs(
             bot_task = None
             legacy_task_from_bot_task = True
 
-        if reference_images and len(reference_images) > _hunyuan3_max_images:
+        if reference_images and len(reference_images) > _HUNYUAN3_MAX_IMAGES:
             raise ValueError(
-                f"HunyuanImage-3.0 IT2I accepts at most {_hunyuan3_max_images} input "
+                f"HunyuanImage-3.0 IT2I accepts at most {_HUNYUAN3_MAX_IMAGES} input "
                 f"images per request, got {len(reference_images)}"
             )
 

From 511b76c0865aaac13c8dcd9abe0f0d8cfd49e8c7 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 15:05:59 +0800
Subject: [PATCH 35/43] fix(hunyuan_image3): align AR stop / KV cap / edits
 Form with upstream (review)

Addresses Bounty-hunter's PR review on #3444:

1. resolve_stop_token_ids: image-output tasks now stop on the full
   <img_ratio_*> token range (ids 128044-128076 + 130103-130106),
   mirroring upstream modeling_hunyuan_image_3.py:3289-3303
   (`final_stop_tokens = list(range(start_ratio, end_ratio + 1))`).
   Replaces the earlier `<|endoftext|>` stop, which let AR waste decode
   steps past the ratio token. The stop-token test in
   test_prompt_utils.py is renamed and updated to pin the new contract.

2. deploy/hunyuan_image3.yaml: drop the kv_transfer_criteria block.
   With the ratio-range stop in place AR finishes naturally at the
   ratio token, so KV is capped automatically -- no need for
   special_token criteria + stop_after_transfer=false.

3. orchestrator._handle_kv_ready_raw_outputs: drop the finished_in_batch
   deferral. Mid-decode kv_ready only fired because
   stop_after_transfer=false kept AR decoding past the </recaption> KV
   snapshot; with the kv_transfer_criteria block removed (item 2 above)
   there is no mid-decode kv_ready left to defer. The ratio strip for
   DiT already lives in
   stage_input_processors/hunyuan_image3._truncate_at_cot_end.

4. serving_chat._build_multistage_generation_inputs: call
   resolve_stop_token_ids(task, bot_task) and inject into the AR-stage
   sampling params. Online now matches offline end2end.py rather than
   relying on yaml-side stop_token_ids.

5. api_server.edit_images: drop the redundant `task` Form field.
   /v1/images/edits is always IT2I; bot_task / sys_type / system_prompt
   remain. Legacy bot_task=<task-enum> still works via chat-handler
   normalization.

6. pipeline_hunyuan_image3 + stage_input_processors/hunyuan_image3:
   stop reading / writing the `ar_token_ids` extra. The tokenizer-level
   `batch_cot_token_ids` parameter is retained for a follow-up PR that
   will unify system/user/cot tokenization. See PR description for the
   optimization leftover note.

Signed-off-by: Claude Code <noreply@anthropic.com>
Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../hunyuan_image3/test_prompt_utils.py       | 31 ++++++++------
 .../test_hunyuan_image3.py                    |  2 +-
 vllm_omni/deploy/hunyuan_image3.yaml          | 20 ----------
 .../hunyuan_image3/pipeline_hunyuan_image3.py | 12 +-----
 .../models/hunyuan_image3/prompt_utils.py     | 35 ++++++++++------
 vllm_omni/engine/orchestrator.py              | 14 -------
 vllm_omni/entrypoints/openai/api_server.py    | 40 +++++++------------
 vllm_omni/entrypoints/openai/serving_chat.py  | 18 +++++++++
 .../stage_input_processors/hunyuan_image3.py  |  4 --
 9 files changed, 76 insertions(+), 100 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 0579caaaac8..7c3256eee72 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -96,26 +96,31 @@ def test_default_prompt_still_uses_it2i_think_mode():
     assert result.token_ids[-1] == FakeTokenizer.SPECIAL["<think>"]
 
 
-def test_resolve_stop_token_ids_image_tasks_stop_on_eos_not_answer():
-    """Image-output tasks must stop on <|endoftext|>, not <answer>.
-
-    Stopping on <answer> chops off the <boi><img_size_*><img_ratio_*>
-    tail forced by `_stage_transitions`, so `_extract_ratio_index` in
-    `ar2diffusion` finds nothing and the DiT output bucket collapses to
-    the first reference image's shape (e.g. 1024x1024 square when AR's
-    CoT planned a 1280x720 landscape).
+def test_resolve_stop_token_ids_image_tasks_stop_on_ratio_range():
+    """Image-output tasks stop on any ``<img_ratio_*>`` token.
+
+    Mirrors upstream ``modeling_hunyuan_image_3.py::generate_image``
+    (line 3289-3303): when ``need_ratio`` is true,
+    ``final_stop_tokens = list(range(start_ratio, end_ratio + 1)) +
+    ratio_token_other_slices``. AR stops AT the ratio token sampled
+    after ``<img_size_*>``; the bridge then strips the trailing ratio
+    token before passing the cot to DiT.
     """
     tok = FakeTokenizer()
 
-    eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
-    answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]
+    start = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_0>"]
+    end = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_32>"]
+    other_start = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_33>"]
+    other_end = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_36>"]
+    expected = list(range(start, end + 1)) + list(range(other_start, other_end + 1))
 
-    # Image-output: t2i / it2i must let AR emit the size/ratio tail.
+    # Image-output: t2i / it2i stop on the full ratio token range.
     for bot in ("think", "recaption", "think_recaption", "vanilla"):
-        assert resolve_stop_token_ids(task="t2i", bot_task=bot, tokenizer=tok) == [eos_id]
-        assert resolve_stop_token_ids(task="it2i", bot_task=bot, tokenizer=tok) == [eos_id]
+        assert resolve_stop_token_ids(task="t2i", bot_task=bot, tokenizer=tok) == expected
+        assert resolve_stop_token_ids(task="it2i", bot_task=bot, tokenizer=tok) == expected
 
     # Text-output: i2t / t2t comprehension stops on <answer> (response sits inside).
+    answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]
     assert resolve_stop_token_ids(task="i2t", bot_task=None, tokenizer=tok) == [answer_id]
     assert resolve_stop_token_ids(task="t2t", bot_task=None, tokenizer=tok) == [answer_id]
 
diff --git a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
index faaa9785452..1901210de09 100644
--- a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
+++ b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
@@ -81,7 +81,7 @@ def _block_transformers_import(name, *args, **kwargs):
     assert len(result) == 1
     assert (result[0]["height"], result[0]["width"]) == (512, 2048)
     assert result[0]["extra"]["ar_generated_text"] == "decoded without special tokens"
-    assert result[0]["extra"]["ar_token_ids"].tolist() == [100, 101, end_recaption]
+    assert "ar_token_ids" not in result[0]["extra"]
 
 
 def test_ar2diffusion_forwards_custom_system_prompt_body():
diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index 8f7c57fdd64..634165cd33a 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -37,26 +37,6 @@ stages:
         rope_type: default
     omni_kv_config:
       need_send_cache: true
-      # Cap AR KV snapshot at </recaption> so the shipped KV exactly
-      # matches the prefix the DiT side reuses (positive_reuse_len =
-      # 0-based index of </recaption>, slice ``k[:positive_reuse_len]``
-      # excludes </recaption> itself). Mirrors the colleague-confirmed
-      # invariant S - N == 1. Without this the AR ships KV all the way
-      # through <answer><boi><img_size><img_ratio><eos>, which DiT
-      # silently discards (S - N == 6) and which keeps the AR pipeline
-      # busy emitting tail tokens that DiT will never use.
-      #
-      # ``stop_after_transfer: false`` keeps the AR running past the
-      # snapshot so it still emits <img_ratio_*>, which ``ar2diffusion``
-      # extracts to derive image height/width. The mid-decode kv_ready
-      # signal that this combination produces is handled in the
-      # orchestrator: forwarding to DiT is deferred until the AR's
-      # natural finish output arrives (see
-      # ``_handle_kv_ready_raw_outputs``).
-      kv_transfer_criteria:
-        type: special_token
-        token_id: 128019  # </recaption>
-        stop_after_transfer: false
     output_connectors:
       to_stage_1: shared_memory_connector
     default_sampling_params:
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 14aa0ea903d..63c367a1006 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -1386,23 +1386,13 @@ def forward(
         # and ``get_cot_sections()`` can parse the think/recaption structure
         # directly.
         cot_text_list = []
-        cot_token_ids_list = []
         for p in req.prompts:
             extra = p.get("extra", {}) if isinstance(p, dict) else {}
             cot_text_list.append(extra.get("ar_generated_text") or None)
-            cot_token_ids_list.append(extra.get("ar_token_ids"))
         cot_text = (
             [self._normalize_cot_text(t) for t in cot_text_list] if any(t is not None for t in cot_text_list) else None
         )
-        # Prefer AR-sampled token IDs over the decoded cot text so DiTs prompt
-        # tokenization matches ARs actual token sequence byte-for-byte. Required
-        # when KV reuse is enabled: positive_reuse_len computed from DiT-side
-        # tokenization must equal the AR-side KV cache length, otherwise the
-        # silent slice in inject_ar_kv_into_layers leaves _cache_prompt_kvs
-        # `q_len + ar_kv_len == seq_len` assert off by N (BPE re-merge drift on
-        # multi-byte/punctuation boundaries; see get_cot_sections_from_token_ids
-        # in hunyuan_image3_tokenizer.py).
-        cot_token_ids = cot_token_ids_list if any(t is not None for t in cot_token_ids_list) else None
+        cot_token_ids = None
 
         batch_cond_image_info: list[list[JointImageInfo]] | None = None
         if any(not isinstance(p, str) for p in req.prompts):
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 196c86dfa5d..b178b021fd6 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -149,17 +149,19 @@ def resolve_stop_token_ids(
 ) -> list[int]:
     """AR stop-token ids for a given (task, bot_task) generation request.
 
-    Image-output tasks (``it2i`` / ``t2i``) must stop on ``<|endoftext|>``:
-    after ``</recaption>`` the AR's ``_stage_transitions`` force-emits
-    ``<answer><boi><img_size_*>`` and then samples ``<img_ratio_*>`` under
-    ``_apply_ratio_restriction`` followed by ``<|endoftext|>``. Stopping
-    early on ``<answer>`` chops off the size/ratio tail, leaves
-    ``_extract_ratio_index`` empty in ``ar2diffusion``, and silently
-    collapses the DiT output bucket to the first reference image's shape
-    (square logo -> 1024x1024 even when AR's CoT plans a landscape).
-
-    Text-output tasks (``i2t`` / ``t2t``) stop on ``<answer>`` -- the AR is
-    the final stage, and the comprehension response sits inside the
+    Image-output tasks (``it2i`` / ``t2i``) stop on any ``<img_ratio_*>``
+    token. Upstream ``modeling_hunyuan_image_3.py::generate_image``
+    (line 3289-3303) sets ``final_stop_tokens`` to the full ratio token
+    range when ``need_ratio`` is true, then strips the trailing ratio
+    token before passing the cot to the image stage. AR's natural
+    trajectory under ``_stage_transitions`` is
+    ``</recaption><answer><boi><img_size_base><img_ratio_X>``; stopping
+    AT the ratio token means KV ends exactly at the prefix DiT reuses,
+    and ``ar2diffusion`` can read the ratio off the last sampled token
+    without AR wasting decode steps on ``<|endoftext|>``.
+
+    Text-output tasks (``i2t`` / ``t2t``) stop on ``<answer>`` -- the AR
+    is the final stage, and the comprehension response sits inside the
     ``<answer>`` body so the answer-open is the natural cot/recaption
     terminator.
     """
@@ -169,7 +171,16 @@ def resolve_stop_token_ids(
     if bot_task not in _BOT_TASK_PRESETS:
         raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_bot_tasks()}")
     if task in ("it2i", "t2i"):
-        return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]]
+        # Main ratio range: <img_ratio_0> .. <img_ratio_32>.
+        start = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_0>"]
+        end = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_32>"]
+        stops = list(range(start, end + 1))
+        # Other slices (upstream tokenizer ``ratio_token_other_slices``):
+        # <img_ratio_33> .. <img_ratio_36>.
+        other_start = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_33>"]
+        other_end = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_36>"]
+        stops.extend(range(other_start, other_end + 1))
+        return stops
     return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]]
 
 
diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py
index a764c3b5247..2d2ac47cbb3 100644
--- a/vllm_omni/engine/orchestrator.py
+++ b/vllm_omni/engine/orchestrator.py
@@ -695,17 +695,6 @@ async def _handle_kv_ready_raw_outputs(
         if self.async_chunk:
             return
 
-        # When kv_ready fires mid-decode (e.g. HunyuanImage3 with
-        # kv_transfer_criteria=special_token + stop_after_transfer=false,
-        # snapshot triggers at </recaption> but AR keeps generating tail
-        # tokens for ratio extraction), the kv_ready EngineCoreOutput is
-        # NOT a finished RequestOutput, so bridges that read
-        # ``ar_output.outputs[0]`` (HunyuanImage3 ar2diffusion) crash. Only
-        # forward kv_ready when the same raw_outputs batch also contains a
-        # finished output for that req_id; otherwise wait for AR's natural
-        # completion to trigger the forward through ``_route_output``.
-        finished_in_batch = {o.request_id for o in raw_outputs.outputs if getattr(o, "finish_reason", None) is not None}
-
         for raw_output in raw_outputs.outputs:
             kv_params = getattr(raw_output, "kv_transfer_params", None)
             if not (isinstance(kv_params, dict) and kv_params.get("kv_ready")):
@@ -723,9 +712,6 @@ async def _handle_kv_ready_raw_outputs(
             if (stage_id + 1) in req_state.stage_submit_ts:
                 continue
 
-            if req_id not in finished_in_batch:
-                continue
-
             if self._cfg_tracker.has_companions(req_id) and not self._cfg_tracker.all_companions_done(req_id):
                 self._cfg_tracker.defer_parent(req_id, raw_output, stage_id)
             else:
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 7107b544adc..c54295cf104 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1701,12 +1701,10 @@ async def edit_images(
     layers: int | None = Form(None),
     resolution: int | None = Form(None),  # See SUPPORTED_LAYERED_RESOLUTIONS
     bot_task: str | None = Form(None),
-    # P1: task / sys_type / system_prompt split out from the legacy bot_task
-    # field so callers can express the full HunyuanImage-3.0 prompt template
-    # surface (task enum + bot_task semantic + sys_type override + custom
-    # system prompt body). Legacy callers that pass a task-enum value via
-    # bot_task still work (see normalization below).
-    task: str | None = Form(None),
+    # ``/v1/images/edits`` is always image-to-image (IT2I); the ``task`` axis
+    # is fixed and pinned downstream. ``bot_task`` (think / recaption /
+    # think_recaption / vanilla) + ``sys_type`` / ``system_prompt`` are the
+    # only HunyuanImage-3.0 knobs callers need to express here.
     sys_type: str | None = Form(None),
     system_prompt: str | None = Form(None),
 ) -> ImageGenerationResponse:
@@ -1760,10 +1758,10 @@ async def edit_images(
                 detail=detail,
             )
         # Convert uploads to RGB when the caller opts into the Hunyuan-aware
-        # API surface. This includes the legacy bot_task=<task-enum> form:
-        # keeping uploads as RGBA/P PIL objects makes online IT2I observe a
-        # different visual input than the offline path.
-        normalize_edit_images_rgb = task is not None or bot_task is not None or sys_type is not None
+        # API surface (bot_task / sys_type / system_prompt). Keeping uploads
+        # as RGBA/P PIL objects makes online IT2I observe a different visual
+        # input than the offline path.
+        normalize_edit_images_rgb = bot_task is not None or sys_type is not None
         pil_images = await _load_input_images(input_images_list, normalize_rgb=normalize_edit_images_rgb)
         prompt["multi_modal_data"] = {}
         prompt["multi_modal_data"]["image"] = pil_images
@@ -1927,21 +1925,13 @@ async def edit_images(
                 lora_dict = _get_lora_from_json_str(lora)
                 _parse_lora_request(lora_dict)
                 extra_body["lora"] = lora_dict
-            # P1: normalize legacy `bot_task=<task-enum>` form. Callers historically
-            # passed the task enum (i2t / it2i / t2i / t2t) via the `bot_task`
-            # Form field; promote it to `task` here so the chat_handler can
-            # split task vs bot_task semantics cleanly. New callers pass both
-            # `task` and `bot_task` explicitly; we keep them separate.
-            _task = task
-            _bot_task = bot_task
-            _legacy_task_enum = {"t2t", "i2t", "it2i", "t2i"}
-            if _task is None and _bot_task in _legacy_task_enum:
-                _task = _bot_task
-                _bot_task = None
-            if _task is not None:
-                extra_body["task"] = _task
-            if _bot_task is not None:
-                extra_body["bot_task"] = _bot_task
+            # ``/v1/images/edits`` is always IT2I; the chat handler's
+            # default (``task="it2i"`` when neither ``task`` nor
+            # ``bot_task`` resolves to a task enum) covers this implicitly.
+            # Legacy callers passing the task enum via ``bot_task`` (e.g.
+            # ``bot_task="it2i"``) are normalized inside the chat handler.
+            if bot_task is not None:
+                extra_body["bot_task"] = bot_task
             if sys_type is not None:
                 extra_body["sys_type"] = sys_type
             if system_prompt is not None:
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 35dd4524fc0..739e55a2ad1 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2257,6 +2257,7 @@ def _build_multistage_generation_inputs(
         )
         from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
             available_tasks as _hunyuan3_available_tasks,
+            resolve_stop_token_ids as _hunyuan3_resolve_stop_token_ids,
         )
 
         task = extra_body.get("task")
@@ -2408,6 +2409,23 @@ def _build_multistage_generation_inputs(
                 extra_args["target_h"] = int(height)
                 extra_args["target_w"] = int(width)
 
+            # Resolve AR stop tokens dynamically from (task, bot_task) so the
+            # online path matches offline ``end2end.py`` and so the AR stops
+            # at the natural ``<img_ratio_*>`` token for image-output tasks
+            # (mirrors upstream ``modeling_hunyuan_image_3.py:3289-3303``).
+            # Surviving yaml-side ``stop_token_ids`` would otherwise stop AR
+            # too early and leave ``ar2diffusion`` without a ratio token.
+            if (
+                comprehension_idx is not None
+                and idx == comprehension_idx
+                and hasattr(default_stage_params, "stop_token_ids")
+            ):
+                resolved_stops = _hunyuan3_resolve_stop_token_ids(
+                    task=task if task is not None else "it2i",
+                    bot_task=bot_task,
+                )
+                default_stage_params.stop_token_ids = resolved_stops
+
             if stage_type == "diffusion":
                 self._set_if_supported(
                     default_stage_params,
diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index a06d030d0da..5b4d5f56529 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -15,7 +15,6 @@
 from functools import lru_cache
 from typing import Any
 
-import torch
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
 
@@ -278,14 +277,11 @@ def ar2diffusion(
             f"AR ratio_idx={ratio_idx}" if ar_predicted else "from prompt (no AR ratio token)",
         )
 
-        token_tensor = torch.tensor(cot_token_ids_for_dit, dtype=torch.long)
-
         diffusion_input: dict[str, Any] = {
             "prompt": text_prompt,
             "height": height,
             "width": width,
             "extra": {
-                "ar_token_ids": token_tensor,
                 "ar_generated_text": cot_text_for_dit,
             },
         }

From 8d90c17bd4fe82bc7e2c9990105c4920ce297e5e Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 15:22:00 +0800
Subject: [PATCH 36/43] chore: apply pre-commit isort split for
 resolve_stop_token_ids import

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 vllm_omni/entrypoints/openai/serving_chat.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 739e55a2ad1..6e2a30f56f2 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2257,6 +2257,8 @@ def _build_multistage_generation_inputs(
         )
         from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
             available_tasks as _hunyuan3_available_tasks,
+        )
+        from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
             resolve_stop_token_ids as _hunyuan3_resolve_stop_token_ids,
         )
 

From b73b00f6fd3e7c509c5de537817ffcea916c048b Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 16:04:17 +0800
Subject: [PATCH 37/43] chore(hunyuan_image3): drop dead cot_token_ids plumbing
 and online task input

- Online chat handler: drop `task` from extra_body; derive the task from
  whether reference_images are present. Legacy `bot_task=<task-enum>`
  values (e.g. `bot_task="it2i"`) are still normalized to the default
  think trigger instead of being silently dropped.
- Remove the AR-token-id cot reuse path (`batch_cot_token_ids` in
  apply_chat_template, `ctx_type == "token_ids"` branch in
  process_successive_message, and `get_cot_sections_from_token_ids`);
  it has no caller after the optimization was rolled back per reviewer
  feedback.
- Simplify `_truncate_at_cot_end` to text-only; the token-id return was
  no longer consumed.
- Trim over-explanatory comments across serving_chat / api_server /
  pipeline / end2end.

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../hunyuan_image3/end2end.py                 |  23 +--
 .../hunyuan_image3/test_kvreuse_alignment.py  | 135 ------------------
 ...test_serving_chat_multistage_generation.py |  72 +---------
 .../test_hunyuan_image3.py                    |  17 +--
 .../hunyuan_image3_tokenizer.py               | 123 ++--------------
 .../hunyuan_image3/pipeline_hunyuan_image3.py |  19 +--
 vllm_omni/entrypoints/openai/api_server.py    |  24 +---
 vllm_omni/entrypoints/openai/serving_chat.py  |  97 ++++---------
 .../stage_input_processors/hunyuan_image3.py  |  49 ++-----
 9 files changed, 66 insertions(+), 493 deletions(-)
 delete mode 100644 tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 36b3b1199a5..16f7d8f06c1 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -19,29 +19,12 @@
 _REPO_ROOT = Path(__file__).resolve().parents[3]
 _DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml")
 _DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml")
-# Modality → (task, default bot_task) mapping. `task` selects only whether
-# `<img>` placeholders are emitted; `bot_task` (None | think | recaption |
-# think_recaption | vanilla) selects the system prompt + trigger tag.
-#
-# Both verbose (`text2img`) and short (`t2i`) forms are accepted; the short
-# forms match the internal task names (see prompt_utils.available_tasks)
-# so users who think in those terms don't have to translate.
+
 _MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
     "text2img": ("t2i", "think"),
-    "t2i": ("t2i", "think"),
     "img2img": ("it2i", "think"),
-    "it2i": ("it2i", "think"),
     "img2text": ("i2t", None),
-    "i2t": ("i2t", None),
     "text2text": ("t2t", None),
-    "t2t": ("t2t", None),
-}
-
-_MODALITY_CANONICAL = {
-    "t2i": "text2img",
-    "it2i": "img2img",
-    "i2t": "img2text",
-    "t2t": "text2text",
 }
 
 _MODALITY_DEFAULT_DEPLOY_CONFIG = {
@@ -65,8 +48,7 @@ def parse_args():
     parser.add_argument(
         "--modality",
         default="text2img",
-        choices=["text2img", "t2i", "img2img", "it2i", "img2text", "i2t", "text2text", "t2t"],
-        help="Verbose and internal short task names are both accepted.",
+        choices=list(_MODALITY_TASK_MAP),
     )
     parser.add_argument("--prompts", nargs="+", default=None, help="Input text prompts.")
     parser.add_argument(
@@ -135,7 +117,6 @@ def main():
     os.makedirs(args.output, exist_ok=True)
     additional_config = parse_additional_config(args.additional_config)
 
-    args.modality = _MODALITY_CANONICAL.get(args.modality, args.modality)
     task, default_bot_task = _MODALITY_TASK_MAP[args.modality]
     if args.bot_task is None:
         bot_task: str | None = default_bot_task
diff --git a/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py b/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py
deleted file mode 100644
index 20faf5487dc..00000000000
--- a/tests/diffusion/models/hunyuan_image3/test_kvreuse_alignment.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-"""Regression tests for AR-token-IDs preservation through DiT prompt building.
-
-Pins the KV-reuse alignment contract: when the AR-side stage input
-processor (`ar2diffusion`) forwards `ar_token_ids` to the diffusion
-stage, `apply_chat_template` must consume those IDs verbatim (no
-re-encode of the decoded cot text via `tokenizer.encode`) so that the
-DiT-side prompt tokenization matches AR's actually-sampled token
-sequence byte-for-byte.
-
-Why this matters: tokenize-detokenize-tokenize over the cot text is not
-lossless (BPE re-merges on multi-byte UTF-8 / punctuation boundaries),
-and the resulting length drift breaks AR KV position alignment --
-DiT's `positive_reuse_len` (computed from `tokenizer.encode(cot_text)`)
-ends up larger than the actual cached AR KV length, and
-`inject_ar_kv_into_layers` then silently truncates via Python slice,
-leaving `_cache_prompt_kv`'s `q_len + ar_kv_len == seq_len` assert off
-by N (hard 500 on KV-reuse-enabled requests; see
-`pipeline_hunyuan_image3.py:_cache_prompt_kv`).
-"""
-
-from __future__ import annotations
-
-import os
-
-import pytest
-
-pytestmark = [pytest.mark.core_model]
-
-
-def _hf_cached(model_id: str) -> bool:
-    hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
-    snap_dir = os.path.join(hf_home, "hub", f"models--{model_id.replace('/', '--')}", "snapshots")
-    return os.path.isdir(snap_dir) and any(os.scandir(snap_dir))
-
-
-_HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct"
-
-
-@pytest.mark.skipif(
-    not _hf_cached(_HUNYUAN_MODEL_ID),
-    reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache",
-)
-def test_get_cot_sections_from_token_ids_round_trips_ar_ids():
-    """`get_cot_sections_from_token_ids` must split AR-sampled IDs at the
-    `<think>` / `</think>` token-id positions and emit sections whose
-    concatenated tokens equal the input (no re-encode).
-
-    Catches the failure mode where DiT re-encodes the decoded cot text
-    and the BPE merges differ from AR's sampled tokens (length drift).
-    """
-    from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer import (
-        TokenizerWrapper,
-    )
-
-    tkw = TokenizerWrapper(_HUNYUAN_MODEL_ID)
-
-    think_id = tkw.tokenizer.convert_tokens_to_ids("<think>")
-    end_think_id = tkw.end_think_token_id
-
-    # Fabricate an AR-style id sequence: arbitrary "thought" payload tokens
-    # surrounded by <think>/</think> markers, plus some leading + trailing
-    # tokens (e.g. <answer>/<boi> tail that gets truncated upstream).
-    thought_payload = [1000, 1001, 1002, 1003, 1004]
-    leading = [2000, 2001]
-    trailing = [3000]
-    ar_token_ids = leading + [think_id] + thought_payload + [end_think_id] + trailing
-
-    sections = tkw.get_cot_sections_from_token_ids(
-        ar_token_ids,
-        uncond_kwargs={},
-        drop_think=False,
-    )
-
-    # Sections concatenated must equal the input verbatim.
-    out: list[int] = []
-    for sec in sections:
-        assert sec["type"] == "text", f"unexpected section type: {sec}"
-        toks = sec.get("tokens")
-        assert toks is not None, f"section missing 'tokens' field: {sec}"
-        out.extend(toks)
-    assert out == ar_token_ids, (
-        f"split-by-token-id must be lossless; got {len(out)} ids vs {len(ar_token_ids)} input; "
-        f"diff at first mismatch index = {next((i for i, (a, b) in enumerate(zip(out, ar_token_ids)) if a != b), None)}"
-    )
-
-
-@pytest.mark.skipif(
-    not _hf_cached(_HUNYUAN_MODEL_ID),
-    reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache",
-)
-def test_apply_chat_template_batch_cot_token_ids_preserves_ar_ids():
-    """When `batch_cot_token_ids` is passed, the assistant section in the
-    final encoded token sequence must contain the AR-sampled token ids
-    verbatim -- no `tokenizer.encode(cot_text)` round-trip.
-
-    Pins the end-to-end contract that KV-reuse alignment relies on.
-    """
-    from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_tokenizer import (
-        TokenizerWrapper,
-    )
-
-    tkw = TokenizerWrapper(_HUNYUAN_MODEL_ID)
-    think_id = tkw.tokenizer.convert_tokens_to_ids("<think>")
-    end_think_id = tkw.end_think_token_id
-
-    # Construct a synthetic AR cot id sequence. Use mid-range vocab ids
-    # that are very unlikely to collide with any chat-template specials.
-    payload = [55001, 55002, 55003]
-    ar_token_ids = [think_id] + payload + [end_think_id]
-
-    out_with_ids = tkw.apply_chat_template(
-        batch_prompt=["draw a robot"],
-        batch_system_prompt=[None],
-        batch_cot_token_ids=[ar_token_ids],
-        mode="gen_text",
-        sequence_template="instruct",
-    )
-    tokens_with_ids = out_with_ids["output"].tokens.tolist()[0]  # batched output: take batch 0
-
-    # The exact AR payload must appear as a contiguous subsequence in the
-    # encoded output, sandwiched by the think markers we forwarded.
-    def _find_subseq(haystack: list[int], needle: list[int]) -> int:
-        n = len(needle)
-        for i in range(len(haystack) - n + 1):
-            if haystack[i : i + n] == needle:
-                return i
-        return -1
-
-    full_cot = [think_id] + payload + [end_think_id]
-    idx = _find_subseq(tokens_with_ids, full_cot)
-    assert idx >= 0, (
-        f"AR cot ids {full_cot} not found as contiguous subseq in encoded output; "
-        f"means apply_chat_template did NOT respect batch_cot_token_ids and re-encoded cot text instead"
-    )
diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index 92f0ac2dc98..dd7f668611e 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -219,13 +219,9 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
 
 
 def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(serving_chat):
-    """Legacy callers passed a task-enum value (i2t/it2i/t2i/t2t) under
-    `bot_task` in extra_body. After the P1 task/bot_task split, the helper
-    must still treat that legacy form as `task=<value>, bot_task=None`
-    (i.e. defaults bot_task semantic to "think"), so the resulting prompt
-    is identical to the pre-P1 output.
-
-    Pins the back-compat contract.
+    """Legacy callers passed bot_task="it2i" as an opt-in marker. Task is now
+    inferred from reference_images; legacy bot_task must still trigger the
+    default think mode rather than getting silently dropped.
     """
     from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
 
@@ -241,7 +237,6 @@ def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(servi
     )
     images = [Image.new("RGB", (32, 32), color="red"), Image.new("RGB", (32, 32), color="blue")]
 
-    # Legacy form: only bot_task=<task-enum>.
     legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
         serving_chat,
         engine=engine,
@@ -250,65 +245,8 @@ def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(servi
         reference_images=images,
         gen_params=OmniDiffusionSamplingParams(),
     )
-    # New form: explicit task=<task-enum>, no bot_task.
-    new_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
-        serving_chat,
-        engine=engine,
-        prompt="edit me",
-        extra_body={"task": "it2i"},
-        reference_images=images,
-        gen_params=OmniDiffusionSamplingParams(),
-    )
-    assert legacy_prompt["prompt"] == new_prompt["prompt"], (
-        f"legacy bot_task=<task> form must produce the same prompt as task=<task>; "
-        f"legacy={legacy_prompt['prompt']!r} new={new_prompt['prompt']!r}"
-    )
-
-
-@pytest.mark.parametrize("legacy_task", ["i2t", "t2t"])
-def test_build_multistage_generation_inputs_legacy_plain_tasks_stay_plain(serving_chat, legacy_task: str):
-    """Legacy bot_task=i2t/t2t must preserve those tasks' plain prompt mode.
-
-    The task/bot_task split must not normalize every legacy task-enum request
-    into bot_task="think"; i2t/t2t had no <think>/<recaption> trigger before
-    the split and should stay plain unless the caller passes an explicit
-    semantic bot_task.
-    """
-    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
-
-    engine = SimpleNamespace(
-        stage_configs=[
-            SimpleNamespace(stage_type="llm", is_comprehension=True),
-            SimpleNamespace(stage_type="diffusion", is_comprehension=False),
-        ],
-        default_sampling_params_list=[
-            SamplingParams(temperature=0.0),
-            OmniDiffusionSamplingParams(),
-        ],
-    )
-    images = [Image.new("RGB", (32, 32), color="red")]
-
-    legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
-        serving_chat,
-        engine=engine,
-        prompt="describe me",
-        extra_body={"bot_task": legacy_task},
-        reference_images=images if legacy_task == "i2t" else [],
-        gen_params=OmniDiffusionSamplingParams(),
-    )
-    explicit_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
-        serving_chat,
-        engine=engine,
-        prompt="describe me",
-        extra_body={"task": legacy_task},
-        reference_images=images if legacy_task == "i2t" else [],
-        gen_params=OmniDiffusionSamplingParams(),
-    )
-
-    assert legacy_prompt["prompt"] == explicit_prompt["prompt"]
-    assert legacy_prompt["prompt"].endswith("Assistant: ")
-    assert not legacy_prompt["prompt"].endswith("<think>")
-    assert not legacy_prompt["prompt"].endswith("<recaption>")
+    assert legacy_prompt["prompt"].count("<img>") == 2
+    assert legacy_prompt["prompt"].endswith("Assistant: <think>")
 
 
 @pytest.mark.parametrize(
diff --git a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
index 1901210de09..76f3e500622 100644
--- a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
+++ b/tests/model_executor/stage_input_processors/test_hunyuan_image3.py
@@ -40,20 +40,9 @@ def test_extract_ratio_index_uses_fixed_special_token_ids():
     assert _extract_ratio_index([1, ratio_33, 2, ratio_36]) == 36
 
 
-def test_truncate_at_cot_end_uses_token_ids_when_text_skips_specials():
-    end_recaption = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"]
-    answer = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]
-    boi = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<boi>"]
-    ratio = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_0>"]
-    token_ids = [100, 101, end_recaption, answer, boi, ratio]
-
-    text, truncated = _truncate_at_cot_end(
-        "recaption body without special markers",
-        token_ids,
-    )
-
-    assert text == "recaption body without special markers"
-    assert truncated == [100, 101, end_recaption]
+def test_truncate_at_cot_end_strips_tail_after_recaption_marker():
+    text = _truncate_at_cot_end("body text</recaption><answer><boi><img_size_1024><img_ratio_0>")
+    assert text == "body text</recaption>"
 
 
 def test_ar2diffusion_applies_ratio_and_truncates_tail_without_tokenizer(monkeypatch: pytest.MonkeyPatch):
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
index e6e0c9db346..5751cb4d831 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
@@ -903,75 +903,6 @@ def get_cot_sections(self, cot_text, uncond_kwargs, cot_max_length=None, drop_th
             dict(type="text", text=cot_text, **uncond_kwargs),
         ]
 
-    def get_cot_sections_from_token_ids(
-        self,
-        token_ids,
-        uncond_kwargs,
-        cot_max_length=None,
-        drop_think=False,
-    ):
-        """Split AR-sampled token IDs at think/recaption markers without re-encoding.
-
-        Functional mirror of `get_cot_sections` but operates on AR sampled IDs.
-        Used by KV-reuse-aware callers: tokenize-detokenize-tokenize over the AR
-        cot text is not lossless (BPE re-merges across multi-byte UTF-8 and
-        punctuation boundaries). The resulting length drift breaks AR KV
-        position alignment (`positive_reuse_len` computed in DiT-tok space vs
-        the actual cached AR KV in AR-tok space, off by N tokens for prompts
-        containing Chinese + escaped quotes etc.).
-        """
-        if not token_ids:
-            return []
-        ids = list(token_ids)
-
-        think_id = self.tokenizer.convert_tokens_to_ids("<think>")
-        end_think_id = self.end_think_token_id
-        recaption_id = self.tokenizer.convert_tokens_to_ids("<recaption>")
-        end_recaption_id = self.end_recaption_token_id
-
-        def _split_at_pair(seq, start_id, end_id):
-            if start_id is None or end_id is None:
-                return None
-            try:
-                s = seq.index(start_id)
-                e = seq.index(end_id, s + 1)
-            except ValueError:
-                return None
-            return seq[:s], seq[s + 1 : e], seq[e + 1 :]
-
-        # Try <think>...</think> first to mirror text-side split order.
-        split = _split_at_pair(ids, think_id, end_think_id)
-        if split is not None:
-            before, inside, after = split
-            return (
-                self.get_cot_sections_from_token_ids(before, uncond_kwargs, drop_think=drop_think)
-                + (
-                    [
-                        dict(type="text", tokens=[think_id]),
-                        dict(type="text", tokens=inside, max_length=cot_max_length, **uncond_kwargs),
-                        dict(type="text", tokens=[end_think_id]),
-                    ]
-                    if not drop_think
-                    else []
-                )
-                + self.get_cot_sections_from_token_ids(after, uncond_kwargs, drop_think=drop_think)
-            )
-
-        split = _split_at_pair(ids, recaption_id, end_recaption_id)
-        if split is not None:
-            before, inside, after = split
-            return (
-                self.get_cot_sections_from_token_ids(before, uncond_kwargs, drop_think=drop_think)
-                + [
-                    dict(type="text", tokens=[recaption_id]),
-                    dict(type="text", tokens=inside, max_length=cot_max_length, **uncond_kwargs),
-                    dict(type="text", tokens=[end_recaption_id]),
-                ]
-                + self.get_cot_sections_from_token_ids(after, uncond_kwargs, drop_think=drop_think)
-            )
-
-        return [dict(type="text", tokens=ids, **uncond_kwargs)]
-
     def apply_general_template(
         self,
         message_list,
@@ -1022,36 +953,17 @@ def process_successive_message(
             while _cur_message_idx < len(message_list) and _message_list[_cur_message_idx]["role"] == role:
                 message = _message_list[_cur_message_idx]
                 if message["type"] == "text":
-                    content = message["content"]
-                    ctx_type = message.get("context_type", "str")
+                    text = message["content"]
                     if role == "system":
-                        _sub_sections.append(dict(type="text", text=content))
+                        _sub_sections.append(dict(type="text", text=text))
                     elif role == "assistant":
-                        if ctx_type == "token_ids":
-                            # Pre-tokenized AR cot tokens; split on marker ids, no re-encode.
-                            if hasattr(content, "tolist"):
-                                content = content.tolist()
-                            think_id = self.tokenizer.convert_tokens_to_ids("<think>")
-                            recaption_id = self.tokenizer.convert_tokens_to_ids("<recaption>")
-                            has_cot = (think_id in content and self.end_think_token_id in content) or (
-                                recaption_id in content and self.end_recaption_token_id in content
-                            )
-                            if has_cot:
-                                _sub_sections.extend(
-                                    self.get_cot_sections_from_token_ids(content, uncond_kwargs, drop_think=drop_think)
-                                )
-                            else:
-                                _sub_sections.append(dict(type="text", tokens=content, **uncond_kwargs))
+                        if ("<recaption>" in text and "</recaption>" in text) or (
+                            "<think>" in text and "</think>" in text
+                        ):
+                            _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think))
                         else:
-                            text = content
-                            if ("<recaption>" in text and "</recaption>" in text) or (
-                                "<think>" in text and "</think>" in text
-                            ):
-                                _sub_sections.extend(self.get_cot_sections(text, uncond_kwargs, drop_think=drop_think))
-                            else:
-                                _sub_sections.append(dict(type="text", text=text, **uncond_kwargs))
+                            _sub_sections.append(dict(type="text", text=text, **uncond_kwargs))
                     else:
-                        text = content
                         _sub_sections.append(
                             dict(type="text", text=f"{answer_prefix}{text}{answer_suffix}", **uncond_kwargs)
                         )
@@ -1176,7 +1088,6 @@ def apply_chat_template(
         batch_cond_image_info: list[JointImageInfo] | list[list[JointImageInfo]] | None = None,
         batch_system_prompt: list[str] | None = None,
         batch_cot_text: list[str] | None = None,
-        batch_cot_token_ids: list | None = None,
         max_length: int | None = None,
         bot_task: str = "auto",  # auto/image/think/recaption/img_ratio
         image_base_size: int = 1024,
@@ -1205,14 +1116,6 @@ def apply_chat_template(
                 )
             else:
                 batch_cot_text = [None] * batch_size
-            # Optional per-item pre-tokenized AR cot ids (used by KV-reuse).
-            if batch_cot_token_ids is not None:
-                assert len(batch_cot_token_ids) == batch_size, (
-                    f"batch_cot_token_ids should have the same length as batch_size ({batch_size}), "
-                    f"but got {len(batch_cot_token_ids)}."
-                )
-            else:
-                batch_cot_token_ids = [None] * batch_size
             if batch_cond_image_info is not None:
                 assert len(batch_cond_image_info) == batch_size, (
                     f"batch_cond_image_info should have the same length as batch_size ({batch_size}), "
@@ -1231,14 +1134,12 @@ def apply_chat_template(
                 prompt,
                 system_prompt,
                 cot_text,
-                cot_token_ids,
                 gen_image_info,
                 cond_image_info_list,
             ) in zip(
                 batch_prompt,
                 batch_system_prompt,
                 batch_cot_text,
-                batch_cot_token_ids,
                 batch_gen_image_info,
                 batch_cond_image_info,
             ):
@@ -1258,15 +1159,7 @@ def apply_chat_template(
                 #   2.2 text inputs
                 message_list.append(dict(role="user", type="text", content=prompt, context_type="str"))
                 # 3. assistant answer sections
-                if cot_token_ids is not None:
-                    # Use AR-sampled token IDs verbatim. Avoids the
-                    # tokenize-detokenize-tokenize length drift that breaks KV reuse
-                    # (see process_successive_message context_type="token_ids" branch
-                    # and get_cot_sections_from_token_ids docstring).
-                    message_list.append(
-                        dict(role="assistant", type="text", content=cot_token_ids, context_type="token_ids")
-                    )
-                elif cot_text is not None:
+                if cot_text is not None:
                     message_list.append(dict(role="assistant", type="text", content=cot_text, context_type="str"))
                 if mode == "gen_image":
                     message_list.append(
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 63c367a1006..33bfb65fb41 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -754,7 +754,6 @@ def prepare_model_inputs(
         mode="gen_image",
         system_prompt=None,
         cot_text=None,
-        cot_token_ids=None,
         num_inference_steps=50,
         guidance_scale=5.0,
         image_size="auto",
@@ -771,7 +770,6 @@ def prepare_model_inputs(
         batch_message_list = message_list
         batch_prompt = prompt
         batch_cot_text = cot_text
-        batch_cot_token_ids = cot_token_ids
         batch_system_prompt = system_prompt
         batch_gen_image_info = None
         batch_cond_image_info = kwargs.pop("batch_cond_image_info", None)
@@ -850,7 +848,6 @@ def prepare_model_inputs(
             batch_cond_image_info=batch_cond_image_info,
             batch_system_prompt=batch_system_prompt,
             batch_cot_text=batch_cot_text,
-            batch_cot_token_ids=batch_cot_token_ids,
             max_length=kwargs.get("max_length"),
             bot_task=bot_task,
             image_base_size=self.config.image_base_size,
@@ -1379,20 +1376,13 @@ def forward(
             system_prompt = system_prompt.strip() if system_prompt is not None else ""
         prompt = [p if isinstance(p, str) else (p.get("prompt") or "") for p in req.prompts] or prompt
 
-        # Extract AR-generated CoT/recaption text from each prompt's extra dict.
-        # The AR-side stage input processor (``ar2diffusion``) already prepends
-        # the trigger tag (e.g. ``<think>``) when the AR used the KV-reuse
-        # pretrain format, so ``ar_generated_text`` is a self-contained string
-        # and ``get_cot_sections()`` can parse the think/recaption structure
-        # directly.
-        cot_text_list = []
-        for p in req.prompts:
-            extra = p.get("extra", {}) if isinstance(p, dict) else {}
-            cot_text_list.append(extra.get("ar_generated_text") or None)
+        cot_text_list = [
+            (p.get("extra", {}).get("ar_generated_text") if isinstance(p, dict) else None) or None
+            for p in req.prompts
+        ]
         cot_text = (
             [self._normalize_cot_text(t) for t in cot_text_list] if any(t is not None for t in cot_text_list) else None
         )
-        cot_token_ids = None
 
         batch_cond_image_info: list[list[JointImageInfo]] | None = None
         if any(not isinstance(p, str) for p in req.prompts):
@@ -1433,7 +1423,6 @@ def forward(
         model_inputs = self.prepare_model_inputs(
             prompt=prompt,
             cot_text=cot_text,
-            cot_token_ids=cot_token_ids,
             system_prompt=system_prompt,
             mode="gen_image",
             generator=generator,
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index c54295cf104..c1467f7190a 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1700,11 +1700,8 @@ async def edit_images(
     # vllm-omni extension for layered models (e.g., Qwen-Image-Layered)
     layers: int | None = Form(None),
     resolution: int | None = Form(None),  # See SUPPORTED_LAYERED_RESOLUTIONS
+    # /v1/images/edits is always IT2I; only the prompting knobs are exposed.
     bot_task: str | None = Form(None),
-    # ``/v1/images/edits`` is always image-to-image (IT2I); the ``task`` axis
-    # is fixed and pinned downstream. ``bot_task`` (think / recaption /
-    # think_recaption / vanilla) + ``sys_type`` / ``system_prompt`` are the
-    # only HunyuanImage-3.0 knobs callers need to express here.
     sys_type: str | None = Form(None),
     system_prompt: str | None = Form(None),
 ) -> ImageGenerationResponse:
@@ -1757,10 +1754,8 @@ async def edit_images(
                 status_code=HTTPStatus.BAD_REQUEST.value,
                 detail=detail,
             )
-        # Convert uploads to RGB when the caller opts into the Hunyuan-aware
-        # API surface (bot_task / sys_type / system_prompt). Keeping uploads
-        # as RGBA/P PIL objects makes online IT2I observe a different visual
-        # input than the offline path.
+        # Match the offline path: RGB normalize when the caller opts into
+        # Hunyuan-aware behavior. RGBA/P uploads otherwise diverge from offline.
         normalize_edit_images_rgb = bot_task is not None or sys_type is not None
         pil_images = await _load_input_images(input_images_list, normalize_rgb=normalize_edit_images_rgb)
         prompt["multi_modal_data"] = {}
@@ -1895,12 +1890,8 @@ async def edit_images(
                 "seed": effective_seed,
                 "num_outputs_per_prompt": n,
             }
-            # When size="auto", width/height were resolved from the first
-            # input images size (e.g. 512x512 logo), NOT a client-requested
-            # output dimension. Forwarding them to extra_body would override
-            # AR-driven pipelines (e.g. HunyuanImage-3.0) AR `<img_ratio_*>`
-            # token decision via gen_params -> sampling_params. Skip the
-            # forward when auto, matching offline end2end.py img2img.
+            # size="auto" resolves width/height from input image; forwarding
+            # those would override AR-driven `<img_ratio_*>` token selection.
             if not size_was_auto:
                 if width is not None:
                     extra_body["width"] = width
@@ -1925,11 +1916,6 @@ async def edit_images(
                 lora_dict = _get_lora_from_json_str(lora)
                 _parse_lora_request(lora_dict)
                 extra_body["lora"] = lora_dict
-            # ``/v1/images/edits`` is always IT2I; the chat handler's
-            # default (``task="it2i"`` when neither ``task`` nor
-            # ``bot_task`` resolves to a task enum) covers this implicitly.
-            # Legacy callers passing the task enum via ``bot_task`` (e.g.
-            # ``bot_task="it2i"``) are normalized inside the chat handler.
             if bot_task is not None:
                 extra_body["bot_task"] = bot_task
             if sys_type is not None:
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 6e2a30f56f2..4677135cdb0 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2247,37 +2247,26 @@ def _build_multistage_generation_inputs(
         lora_body = extra_body.get("lora")
         layers = extra_body.get("layers")
         resolution = extra_body.get("resolution")
-        # P1: task / bot_task / sys_type / system_prompt quadruple. Legacy
-        # api_server callers may still pass a task-enum value (i2t / it2i /
-        # t2i / t2t) under `bot_task`; normalize it to `task` here so
-        # downstream uses the canonical split. Source the task enum from
-        # prompt_utils so this layer stays in sync with the model side.
         from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
             MAX_IMAGES_PER_REQUEST as _HUNYUAN3_MAX_IMAGES,
         )
-        from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-            available_tasks as _hunyuan3_available_tasks,
-        )
         from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
             resolve_stop_token_ids as _hunyuan3_resolve_stop_token_ids,
         )
 
-        task = extra_body.get("task")
         bot_task = extra_body.get("bot_task")
         sys_type = extra_body.get("sys_type")
         custom_system_prompt = extra_body.get("system_prompt")
-        legacy_task_from_bot_task = False
-        legacy_task_names = set(_hunyuan3_available_tasks()) | {
-            "it2i_think",
-            "it2i_recaption",
-            "t2i_think",
-            "t2i_recaption",
-            "t2i_vanilla",
-        }
-        if task is None and bot_task in legacy_task_names:
-            task = bot_task
+
+        # Legacy callers passed task enums (it2i / t2i / it2i_think / ...) via
+        # bot_task. Task is now derived from reference_images presence; map
+        # composites to their semantic bot_task and drop bare task enums.
+        bot_task_omitted = False
+        if bot_task in {"it2i", "t2i", "i2t", "t2t"}:
             bot_task = None
-            legacy_task_from_bot_task = True
+            bot_task_omitted = True
+        elif bot_task in {"it2i_think", "it2i_recaption", "t2i_think", "t2i_recaption", "t2i_vanilla"}:
+            bot_task = bot_task.split("_", 1)[1]
 
         if reference_images and len(reference_images) > _HUNYUAN3_MAX_IMAGES:
             raise ValueError(
@@ -2285,6 +2274,8 @@ def _build_multistage_generation_inputs(
                 f"images per request, got {len(reference_images)}"
             )
 
+        task = "it2i" if reference_images else "t2i"
+
         engine_prompt_data: dict[str, Any] | None = None
         modalities = ["image"]
         if reference_images:
@@ -2296,50 +2287,33 @@ def _build_multistage_generation_inputs(
 
         prompt_token_ids: list[int] | None = None
         system_prompt_type: str | None = None
-        if task or bot_task:
+        if bot_task is not None or sys_type is not None or custom_system_prompt is not None or bot_task_omitted:
             from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
                 build_prompt,
                 build_prompt_tokens,
             )
 
-            num_images = len(reference_images) if reference_images else 1
-            effective_task = task if task is not None else "it2i"
-            build_kwargs = {
-                "task": effective_task,
+            build_kwargs: dict[str, Any] = {
+                "task": task,
                 "sys_type": sys_type,
                 "custom_system_prompt": custom_system_prompt,
-                "num_images": num_images,
+                "num_images": len(reference_images) if reference_images else 1,
             }
             if bot_task is not None:
                 build_kwargs["bot_task"] = bot_task
-            elif "bot_task" in extra_body and not legacy_task_from_bot_task:
-                # Preserve the prompt_utils distinction between omitted
-                # bot_task and explicit None. Omitted keeps each task's legacy
-                # default (`it2i` -> think, `i2t`/`t2t` -> plain), while
-                # explicit None is the caller's plain-mode request.
+            elif "bot_task" in extra_body and not bot_task_omitted:
+                # Explicit None from the caller is plain-mode; omitted lets
+                # each task fall back to its default trigger.
                 build_kwargs["bot_task"] = None
             if tokenizer is not None:
-                # HF byte-for-byte path: feed segment-tokenized prompt_token_ids
-                # so AR sees the same template-tokenization HF apply_chat_template
-                # produces. Without this, the engine BPE-merges across template
-                # segment boundaries (e.g. "。\n\n" -> single id) and AR
-                # diverges from training distribution -- different cot_text,
-                # different DiT input, different final image. Mirrors offline
-                # examples/.../end2end.py img2img which always feeds
-                # prompt_token_ids. See prompt_utils.build_prompt NOTE.
-                result = build_prompt_tokens(
-                    prompt,
-                    tokenizer,
-                    **build_kwargs,
-                )
+                # Feed segment-tokenized prompt_token_ids so AR matches HF
+                # apply_chat_template byte-for-byte (engine BPE would merge
+                # across template boundaries, e.g. "。\n\n" -> single id).
+                result = build_prompt_tokens(prompt, tokenizer, **build_kwargs)
                 prompt_token_ids = result.token_ids
                 system_prompt_type = result.system_prompt_type
             else:
-                # Legacy string path (e.g. unit tests with no tokenizer plumbed).
-                prompt = build_prompt(
-                    prompt,
-                    **build_kwargs,
-                )
+                prompt = build_prompt(prompt, **build_kwargs)
             if reference_images and len(reference_images) == 1:
                 engine_prompt_data = {"image": reference_images[0]}
                 modalities = ["image"]
@@ -2349,10 +2323,8 @@ def _build_multistage_generation_inputs(
             engine_prompt["prompt_token_ids"] = prompt_token_ids
         if system_prompt_type is not None:
             engine_prompt["use_system_prompt"] = system_prompt_type
-        # Forward the custom system prompt body too. DiT's
-        # `get_system_prompt(use_system_prompt, "image", system_prompt)` reads
-        # the third positional arg, so leaving it None turns a `sys_type=custom`
-        # request into an empty DiT system prefix (AR/DiT divergence).
+        # DiT's get_system_prompt(use_system_prompt, "image", system_prompt) reads
+        # this; omitting it makes sys_type=custom yield an empty DiT prefix.
         if custom_system_prompt is not None:
             engine_prompt["system_prompt"] = custom_system_prompt
         engine_prompt["modalities"] = modalities
@@ -2399,10 +2371,8 @@ def _build_multistage_generation_inputs(
             ):
                 default_stage_params.seed = seed
 
-            # Inject target_h/w into comprehension (AR) stage sampling params
-            # for models that need M-RoPE position pre-computation (e.g.
-            # GLM-Image).  max_tokens is handled via the deploy YAML default
-            # (upper-bound ceiling) rather than computed dynamically here.
+            # Inject target_h/w into AR stage for M-RoPE position pre-computation
+            # (e.g. GLM-Image). max_tokens comes from deploy YAML.
             if comprehension_idx is not None and idx == comprehension_idx and height is not None and width is not None:
                 extra_args = getattr(default_stage_params, "extra_args", None)
                 if extra_args is None:
@@ -2411,22 +2381,17 @@ def _build_multistage_generation_inputs(
                 extra_args["target_h"] = int(height)
                 extra_args["target_w"] = int(width)
 
-            # Resolve AR stop tokens dynamically from (task, bot_task) so the
-            # online path matches offline ``end2end.py`` and so the AR stops
-            # at the natural ``<img_ratio_*>`` token for image-output tasks
-            # (mirrors upstream ``modeling_hunyuan_image_3.py:3289-3303``).
-            # Surviving yaml-side ``stop_token_ids`` would otherwise stop AR
-            # too early and leave ``ar2diffusion`` without a ratio token.
+            # Stop AR at the natural <img_ratio_*> token for image tasks; mirrors
+            # upstream modeling_hunyuan_image_3.py:3289-3303.
             if (
                 comprehension_idx is not None
                 and idx == comprehension_idx
                 and hasattr(default_stage_params, "stop_token_ids")
             ):
-                resolved_stops = _hunyuan3_resolve_stop_token_ids(
-                    task=task if task is not None else "it2i",
+                default_stage_params.stop_token_ids = _hunyuan3_resolve_stop_token_ids(
+                    task=task,
                     bot_task=bot_task,
                 )
-                default_stage_params.stop_token_ids = resolved_stops
 
             if stage_type == "diffusion":
                 self._set_if_supported(
diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
index 5b4d5f56529..749e213e099 100644
--- a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
+++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py
@@ -102,45 +102,19 @@ def _build_ratio_size_table(base_size: int) -> list[tuple[int, int]]:
     return [(r.height, r.width) for r in resolutions]
 
 
-def _truncate_at_cot_end(
-    generated_text: str,
-    generated_token_ids,
-) -> tuple[str, list[int]]:
+def _truncate_at_cot_end(generated_text: str) -> str:
     """Truncate AR output at first `</recaption>` (or `</think>` fallback).
 
-    Mirrors `HunyuanImage3ForCausalMM.generate_image` in the official
-    upstream, which decodes only `generated_tokens[0, :end_pos + 1]` as
-    `cot_text` for DiT. The trailing `<answer><boi><img_size_*><img_ratio_*>`
-    sequence is a stage-transition trigger consumed via `image_size` /
-    height/width; it must NOT be forwarded to DiT's prompt builder, or
-    the extra `<boi>` and ratio tokens drift the DiT's own prompt
-    structure.
+    Mirrors upstream `HunyuanImage3ForCausalMM.generate_image` which feeds
+    DiT only the cot text up to the closing tag; the trailing
+    `<answer><boi><img_size_*><img_ratio_*>` is consumed via height/width
+    extraction and must not leak into DiT's prompt builder.
     """
-    token_list = list(generated_token_ids) if generated_token_ids is not None else []
-
-    end_ids = {
-        "</recaption>": HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"],
-        "</think>": HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"],
-    }
-
     for marker in ("</recaption>", "</think>"):
-        truncated_tokens = token_list
-        end_id = end_ids[marker]
-        if token_list:
-            try:
-                token_end = token_list.index(end_id)
-                truncated_tokens = token_list[: token_end + 1]
-            except ValueError:
-                pass
-
         idx = generated_text.find(marker)
         if idx != -1:
-            text_end = idx + len(marker)
-            return generated_text[:text_end], truncated_tokens
-        if truncated_tokens is not token_list:
-            return generated_text, truncated_tokens
-
-    return generated_text, token_list
+            return generated_text[: idx + len(marker)]
+    return generated_text
 
 
 @lru_cache(maxsize=4)
@@ -256,14 +230,7 @@ def ar2diffusion(
                     width,
                 )
 
-        # Truncate the AR output at `</recaption>` (or `</think>`) before
-        # passing to DiT. Mirrors official `generate_image` which keeps
-        # `cot_text` clean and routes size/ratio via `image_size` only;
-        # we already extracted `ratio_idx` above and translated it into
-        # `height` / `width`, so the `<answer><boi><img_size_*><img_ratio_*>`
-        # tail has no remaining job and would only contaminate DiT's
-        # prompt builder if forwarded.
-        cot_text_for_dit, cot_token_ids_for_dit = _truncate_at_cot_end(generated_text, generated_token_ids)
+        cot_text_for_dit = _truncate_at_cot_end(generated_text)
 
         logger.info(
             "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, "

From 8d12ddda27f7f4e9d038a7eb2e5dab10a91eb2ee Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 16:09:14 +0800
Subject: [PATCH 38/43] chore: apply ruff-format fixup for cot_text_list
 comprehension

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 33bfb65fb41..73b89bb11b0 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -1377,8 +1377,7 @@ def forward(
         prompt = [p if isinstance(p, str) else (p.get("prompt") or "") for p in req.prompts] or prompt
 
         cot_text_list = [
-            (p.get("extra", {}).get("ar_generated_text") if isinstance(p, dict) else None) or None
-            for p in req.prompts
+            (p.get("extra", {}).get("ar_generated_text") if isinstance(p, dict) else None) or None for p in req.prompts
         ]
         cot_text = (
             [self._normalize_cot_text(t) for t in cot_text_list] if any(t is not None for t in cot_text_list) else None

From bfd17b37599207c86b88e55908daea5d2c160041 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 16:23:39 +0800
Subject: [PATCH 39/43] chore: keep for-loop one-line in apply_chat_template
 (no spurious diff)

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../models/hunyuan_image3/hunyuan_image3_tokenizer.py     | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
index 5751cb4d831..751bfb21af8 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py
@@ -1130,13 +1130,7 @@ def apply_chat_template(
 
             # Convert single round materials into standard message list
             batch_message_list = []
-            for (
-                prompt,
-                system_prompt,
-                cot_text,
-                gen_image_info,
-                cond_image_info_list,
-            ) in zip(
+            for prompt, system_prompt, cot_text, gen_image_info, cond_image_info_list in zip(
                 batch_prompt,
                 batch_system_prompt,
                 batch_cot_text,

From 1de9ec8bcd7f0376f521e4c528a0e6758a26eb05 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 16:38:50 +0800
Subject: [PATCH 40/43] test: rename test_hunyuan_image3.py to avoid pytest
 basename collision

Collided with tests/e2e/accuracy/test_hunyuan_image3.py under pytest's
default 'prepend' import mode (no __init__.py in either dir). Rename
this one to make basenames unique.

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../{test_hunyuan_image3.py => test_hunyuan_image3_bridge.py}     | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/model_executor/stage_input_processors/{test_hunyuan_image3.py => test_hunyuan_image3_bridge.py} (100%)

diff --git a/tests/model_executor/stage_input_processors/test_hunyuan_image3.py b/tests/model_executor/stage_input_processors/test_hunyuan_image3_bridge.py
similarity index 100%
rename from tests/model_executor/stage_input_processors/test_hunyuan_image3.py
rename to tests/model_executor/stage_input_processors/test_hunyuan_image3_bridge.py

From 58ce6d86cf547aed75bf8c754f5a018153273bfb Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 22:52:11 +0800
Subject: [PATCH 41/43] fix(hunyuan_image3): mark AR stage
 is_comprehension=true so online IT2I keeps non-square AR shape

Online /v1/images/edits collapsed AR-predicted aspect ratios to a square
(e.g. 1024x1024) while offline end2end.py honored the predicted ratio
(e.g. 1216x832). The root cause is that the AR stage in
deploy/hunyuan_image3.yaml was marked ``is_comprehension: false`` (read
literally as "this task generates an image, not text"), but
``is_comprehension`` inside vllm-omni is the tokenizer-owning AR-stage
marker, not a user-visible task type.

The serving path in entrypoints/openai/serving_chat.py looks up the AR
stage by that flag to apply ``resolve_stop_token_ids`` (image-task stop
set = ``<img_ratio_*>`` range). With the flag false the lookup returned
None, the AR kept the YAML default ``stop_token_ids: [<answer>]``, and
the HunyuanImage3 custom sampler's forced-transition step
``</recaption> -> <answer>`` triggered an immediate stop. The cumulative
token ids never reached ``<img_size_BASE><img_ratio_X>``, so
``ar2diffusion._extract_ratio_index`` could not recover the AR aspect
and fell back to the carried-through prompt size (1024x1024 for
size=auto edits).

Offline avoided this because end2end.py overrides the AR stage's
stop_token_ids directly without going through the comprehension-stage
lookup. Other models did not hit it because their AR stage already had
``is_comprehension: true`` (the field's framework-internal meaning).

The fix is a one-line change to the deploy config plus a comment
explaining the flag's real semantics so the next model author does not
repeat the same misreading.

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 vllm_omni/deploy/hunyuan_image3.yaml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index 634165cd33a..93294bcdf44 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -22,7 +22,13 @@ connectors:
 
 stages:
   - stage_id: 0
-    is_comprehension: false
+    # ``is_comprehension`` in vllm-omni names the tokenizer-owning AR stage
+    # (see config/stage_config.py + serving_chat AR-stage lookup), independent
+    # of whether the AR's task is comprehension (i2t/t2t) or generation
+    # (it2i/t2i). HunyuanImage-3.0's stage-0 owns the tokenizer and emits the
+    # cot+ratio token sequence consumed by stage-1, so it must be marked True
+    # for the serving path to set AR seed/stop_token_ids on this stage.
+    is_comprehension: true
     final_output: true
     final_output_type: text
     max_num_seqs: 1

From be0c6840046d96cbd83e7c2ce2318e2e1fcb3a98 Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Wed, 13 May 2026 23:27:32 +0800
Subject: [PATCH 42/43] chore(hunyuan_image3): drop redundant hunyuan-specific
 task/stop logic from serving_chat

PR #3444 added 84 lines of HunyuanImage-3.0-specific handling to
``serving_chat._build_multistage_generation_inputs`` (task derivation
from reference images, legacy task-enum mapping on ``bot_task``,
``MAX_IMAGES_PER_REQUEST`` cap, and an AR-stage ``stop_token_ids``
override via ``resolve_stop_token_ids``). The endpoint dispatch in
``api_server.py`` (``/v1/images/edits`` vs ``/v1/images/generations``)
already encodes the task split, and the AR-stage stop override is
redundant: ``HunyuanImage3ForCausalMM.sample`` already forces an EOS
after sampling a ratio token (``hunyuan_image3.py`` generation-mode
branch), so leaving the YAML default stop set empty lets the AR run
through ``</recaption><answer><boi><img_size><img_ratio>`` and stop
naturally on EOS; ``ar2diffusion._extract_ratio_index`` then reads the
ratio off ``cumulative_token_ids``. The production deploy
(``vllm_omni/deploy/hunyuan_image3.yaml``) already omits
``stop_token_ids`` for stage-0.

Net effect on ``serving_chat.py``: +84/-19 -> +47/-19 (-37 lines).
Behavior verified end-to-end on ``/v1/images/edits`` with a non-square
target after removal: ``ar2diffusion`` reports ``AR ratio_idx=19,
target size=1216x832`` (matches the offline ``end2end.py`` path),
identical to the result with the now-removed override in place.

Offline ``end2end.py`` still derives ``task`` and overrides
``stop_token_ids`` because it builds the params list directly without
the endpoint-level task signal; that path is intentionally unchanged.

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 vllm_omni/entrypoints/openai/serving_chat.py | 43 ++------------------
 1 file changed, 3 insertions(+), 40 deletions(-)

diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 4677135cdb0..2c375fa2928 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2247,35 +2247,10 @@ def _build_multistage_generation_inputs(
         lora_body = extra_body.get("lora")
         layers = extra_body.get("layers")
         resolution = extra_body.get("resolution")
-        from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-            MAX_IMAGES_PER_REQUEST as _HUNYUAN3_MAX_IMAGES,
-        )
-        from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-            resolve_stop_token_ids as _hunyuan3_resolve_stop_token_ids,
-        )
-
         bot_task = extra_body.get("bot_task")
         sys_type = extra_body.get("sys_type")
         custom_system_prompt = extra_body.get("system_prompt")
 
-        # Legacy callers passed task enums (it2i / t2i / it2i_think / ...) via
-        # bot_task. Task is now derived from reference_images presence; map
-        # composites to their semantic bot_task and drop bare task enums.
-        bot_task_omitted = False
-        if bot_task in {"it2i", "t2i", "i2t", "t2t"}:
-            bot_task = None
-            bot_task_omitted = True
-        elif bot_task in {"it2i_think", "it2i_recaption", "t2i_think", "t2i_recaption", "t2i_vanilla"}:
-            bot_task = bot_task.split("_", 1)[1]
-
-        if reference_images and len(reference_images) > _HUNYUAN3_MAX_IMAGES:
-            raise ValueError(
-                f"HunyuanImage-3.0 IT2I accepts at most {_HUNYUAN3_MAX_IMAGES} input "
-                f"images per request, got {len(reference_images)}"
-            )
-
-        task = "it2i" if reference_images else "t2i"
-
         engine_prompt_data: dict[str, Any] | None = None
         modalities = ["image"]
         if reference_images:
@@ -2287,21 +2262,21 @@ def _build_multistage_generation_inputs(
 
         prompt_token_ids: list[int] | None = None
         system_prompt_type: str | None = None
-        if bot_task is not None or sys_type is not None or custom_system_prompt is not None or bot_task_omitted:
+        if bot_task is not None or sys_type is not None or custom_system_prompt is not None:
             from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
                 build_prompt,
                 build_prompt_tokens,
             )
 
             build_kwargs: dict[str, Any] = {
-                "task": task,
+                "task": "it2i" if reference_images else "t2i",
                 "sys_type": sys_type,
                 "custom_system_prompt": custom_system_prompt,
                 "num_images": len(reference_images) if reference_images else 1,
             }
             if bot_task is not None:
                 build_kwargs["bot_task"] = bot_task
-            elif "bot_task" in extra_body and not bot_task_omitted:
+            elif "bot_task" in extra_body:
                 # Explicit None from the caller is plain-mode; omitted lets
                 # each task fall back to its default trigger.
                 build_kwargs["bot_task"] = None
@@ -2381,18 +2356,6 @@ def _build_multistage_generation_inputs(
                 extra_args["target_h"] = int(height)
                 extra_args["target_w"] = int(width)
 
-            # Stop AR at the natural <img_ratio_*> token for image tasks; mirrors
-            # upstream modeling_hunyuan_image_3.py:3289-3303.
-            if (
-                comprehension_idx is not None
-                and idx == comprehension_idx
-                and hasattr(default_stage_params, "stop_token_ids")
-            ):
-                default_stage_params.stop_token_ids = _hunyuan3_resolve_stop_token_ids(
-                    task=task,
-                    bot_task=bot_task,
-                )
-
             if stage_type == "diffusion":
                 self._set_if_supported(
                     default_stage_params,

From 161ba503d52a206a434d681d9c03d7e0632419ad Mon Sep 17 00:00:00 2001
From: TaffyOfficial <2324465096@qq.com>
Date: Thu, 14 May 2026 09:41:19 +0800
Subject: [PATCH 43/43] test(hunyuan_image3): drop legacy task-as-bot_task
 tests after serving_chat cleanup

The serving_chat cleanup in the previous commit removed the legacy
caller compatibility layer that translated ``bot_task in {"it2i",
"t2i", "i2t", "t2t"}`` to ``None`` and ``bot_task in {"it2i_think",
"it2i_recaption", ...}`` to the trailing ``think``/``recaption`` part.
That translation existed because old callers stuffed task enums into
the ``bot_task`` field; under the new contract, the endpoint dispatch
(``/v1/images/edits`` vs ``/v1/images/generations``) and
``reference_images`` presence carry the task signal, and ``bot_task``
only takes the documented values (``None`` / ``recaption`` / ``think``
/ ``think_recaption`` / ``vanilla``).

Two tests in
``test_serving_chat_multistage_generation.py`` were explicitly pinning
the now-removed legacy form
(``test_..._legacy_bot_task_form_unchanged``,
``test_..._legacy_composite_tasks_still_work``); deleting them.

Three other tests passed ``bot_task="it2i"`` only to trigger the
``build_prompt`` path (the *value* did not matter, just non-None);
switching them to ``bot_task="think"`` keeps the same intent against
the new validator.

Signed-off-by: TaffyOfficial <2324465096@qq.com>
---
 .../openai_api/test_image_server.py           |  4 +-
 ...test_serving_chat_multistage_generation.py | 75 +------------------
 2 files changed, 4 insertions(+), 75 deletions(-)

diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py
index fb9c126d3fe..40adb7a9151 100644
--- a/tests/entrypoints/openai_api/test_image_server.py
+++ b/tests/entrypoints/openai_api/test_image_server.py
@@ -1675,7 +1675,7 @@ def test_image_edits_size_auto_preserves_bridge_size(async_omni_stage_configs_on
     for multi-image fusion).
 
     Cross-pins the multi-image fix at the API level: 2 reference images
-    with bot_task=it2i must produce 2 <img> placeholders in the captured
+    with bot_task=think must produce 2 <img> placeholders in the captured
     AR prompt (build_prompt called with num_images=2).
     """
     img_a = make_test_image_bytes((32, 32))
@@ -1686,7 +1686,7 @@ def test_image_edits_size_auto_preserves_bridge_size(async_omni_stage_configs_on
         data={
             "prompt": "fuse",
             "size": "auto",
-            "bot_task": "it2i",
+            "bot_task": "think",
         },
     )
     assert response.status_code == 200, response.text
diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index dd7f668611e..4b63588bae7 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -127,7 +127,7 @@ def test_build_multistage_generation_inputs_multi_image_emits_n_img_placeholders
             serving_chat,
             engine=engine,
             prompt="edit me",
-            extra_body={"bot_task": "it2i"},
+            extra_body={"bot_task": "think"},
             reference_images=images[:n],
             gen_params=OmniDiffusionSamplingParams(),
         )
@@ -196,7 +196,7 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
             serving_chat,
             engine=engine,
             prompt="edit me",
-            extra_body={"bot_task": "it2i"},
+            extra_body={"bot_task": "think"},
             reference_images=images[:n],
             gen_params=OmniDiffusionSamplingParams(),
             tokenizer=tok,
@@ -218,77 +218,6 @@ def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
         assert img_count == n, f"N={n}: expected {n} <img> token ids in prompt_token_ids, got {img_count}"
 
 
-def test_build_multistage_generation_inputs_legacy_bot_task_form_unchanged(serving_chat):
-    """Legacy callers passed bot_task="it2i" as an opt-in marker. Task is now
-    inferred from reference_images; legacy bot_task must still trigger the
-    default think mode rather than getting silently dropped.
-    """
-    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
-
-    engine = SimpleNamespace(
-        stage_configs=[
-            SimpleNamespace(stage_type="llm", is_comprehension=True),
-            SimpleNamespace(stage_type="diffusion", is_comprehension=False),
-        ],
-        default_sampling_params_list=[
-            SamplingParams(temperature=0.0),
-            OmniDiffusionSamplingParams(),
-        ],
-    )
-    images = [Image.new("RGB", (32, 32), color="red"), Image.new("RGB", (32, 32), color="blue")]
-
-    legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
-        serving_chat,
-        engine=engine,
-        prompt="edit me",
-        extra_body={"bot_task": "it2i"},
-        reference_images=images,
-        gen_params=OmniDiffusionSamplingParams(),
-    )
-    assert legacy_prompt["prompt"].count("<img>") == 2
-    assert legacy_prompt["prompt"].endswith("Assistant: <think>")
-
-
-@pytest.mark.parametrize(
-    "legacy_task,trigger",
-    [
-        ("it2i_think", "<think>"),
-        ("it2i_recaption", "<recaption>"),
-    ],
-)
-def test_build_multistage_generation_inputs_legacy_composite_tasks_still_work(
-    serving_chat,
-    legacy_task: str,
-    trigger: str,
-):
-    """Legacy composite task names passed through bot_task must still work."""
-    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
-
-    engine = SimpleNamespace(
-        stage_configs=[
-            SimpleNamespace(stage_type="llm", is_comprehension=True),
-            SimpleNamespace(stage_type="diffusion", is_comprehension=False),
-        ],
-        default_sampling_params_list=[
-            SamplingParams(temperature=0.0),
-            OmniDiffusionSamplingParams(),
-        ],
-    )
-    images = [Image.new("RGB", (32, 32), color="red")]
-
-    legacy_prompt, _ = OmniOpenAIServingChat._build_multistage_generation_inputs(
-        serving_chat,
-        engine=engine,
-        prompt="edit me",
-        extra_body={"bot_task": legacy_task},
-        reference_images=images,
-        gen_params=OmniDiffusionSamplingParams(),
-    )
-
-    assert legacy_prompt["prompt"].count("<img>") == 1
-    assert legacy_prompt["prompt"].endswith(f"Assistant: {trigger}")
-
-
 def test_build_multistage_generation_inputs_bot_task_semantic_changes_trigger_and_sys(serving_chat):
     """Passing bot_task=think_recaption (vs default "think") must flip the
     resolved sys_type to en_think_recaption (and trigger tag is still