diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md
index 3cd8fa01b2e..3eb3bfbff6f 100644
--- a/examples/offline_inference/hunyuan_image3/README.md
+++ b/examples/offline_inference/hunyuan_image3/README.md
@@ -135,17 +135,19 @@ python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
 
 ## Prompt Format
 
-HunyuanImage-3.0 uses a pretrain template format:
+HunyuanImage-3.0-Instruct uses an instruct chat template:
 
 ```
-<|startoftext|>{system_prompt}{<img>}{trigger_tag}{user_prompt}
+<|startoftext|>{system_prompt}\n\nUser: {<img>?}{user_prompt}\n\nAssistant: {trigger_tag?}
 ```
 
-- `<img>`: Placeholder for each input image (auto-inserted by `prompt_utils.py`)
-- Trigger tags: `<think>` (CoT), `<recaption>` (recaptioning)
+- `<img>`: Placeholder for each input image (single token; expanded by the multimodal pipeline)
+- Trigger tags: `<think>` (CoT), `<recaption>` (recaptioning) — placed AFTER `Assistant: `
 - System prompt: Auto-selected based on task
+- `t2i_vanilla` is the only task that uses the bare pretrain template (no chat structure)
 
-The `prompt_utils.build_prompt()` handles this formatting automatically.
+The shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils.build_prompt_tokens()`
+helper handles segment-by-segment tokenization (matches HF `apply_chat_template` byte-for-byte).
 
 ------
 
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 2cea303888e..fc4b75e78d5 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -16,23 +16,12 @@
 import argparse
 import os
 
-from vllm_omni.diffusion.models.hunyuan_image3.system_prompt import (
-    get_system_prompt,
+from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    build_prompt_tokens,
 )
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniPromptType
 
-# task → (sys_type, bot_task, trigger_tag)
-_TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
-    "t2t": ("en_unified", None, None),
-    "i2t": ("en_unified", None, None),
-    "it2i_think": ("en_unified", "think", "<think>"),
-    "it2i_recaption": ("en_unified", "recaption", "<recaption>"),
-    "t2i_think": ("en_unified", "think", "<think>"),
-    "t2i_recaption": ("en_unified", "recaption", "<recaption>"),
-    "t2i_vanilla": ("en_vanilla", "image", None),
-}
-
 # Modality → prompt_utils task mapping
 _MODALITY_TASK_MAP = {
     "text2img": "t2i_think",
@@ -42,36 +31,6 @@
 }
 
 
-def build_prompt(
-    user_prompt: str,
-    task: str = "it2i_think",
-    sys_type: str | None = None,
-    custom_system_prompt: str | None = None,
-) -> str:
-    """Build a HunyuanImage-3.0 prompt using pretrain template format."""
-    if task not in _TASK_PRESETS:
-        raise ValueError(f"Unknown task {task!r}. Choose from: {sorted(_TASK_PRESETS)}")
-
-    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
-    effective_sys_type = sys_type or preset_sys_type
-
-    system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt)
-    sys_text = system_prompt.strip() if system_prompt else ""
-
-    has_image_input = task.startswith("i2t") or task.startswith("it2i")
-
-    parts = ["<|startoftext|>"]
-    if sys_text:
-        parts.append(sys_text)
-    if has_image_input:
-        parts.append("<img>")
-    if trigger_tag:
-        parts.append(trigger_tag)
-    parts.append(user_prompt)
-
-    return "".join(parts)
-
-
 # Modality → default stage config
 _MODALITY_DEFAULT_CONFIG = {
     "text2img": "hunyuan_image3_t2i.yaml",
@@ -179,12 +138,18 @@ def main():
 
         input_image = Image.open(args.image_path).convert("RGB")
 
+    # Load tokenizer for segment-wise prompt tokenization (matches HF
+    # apply_chat_template byte-for-byte; see build_prompt_tokens docstring).
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+
     # Format prompts
     formatted_prompts: list[OmniPromptType] = []
     for p in prompts:
-        formatted_text = build_prompt(p, task=task, sys_type=args.sys_type)
+        token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type)
 
-        prompt_dict: dict = {"prompt": formatted_text}
+        prompt_dict: dict = {"prompt_token_ids": token_ids}
 
         if args.modality == "text2img":
             prompt_dict["modalities"] = ["image"]
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
new file mode 100644
index 00000000000..501664fe688
--- /dev/null
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -0,0 +1,292 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Regression tests for HunyuanImage3 prompt construction (PR #3243).
+
+Two layers:
+  1. Pure-logic tests with a recording fake tokenizer -- protect the
+     prompt template structure (BOS, User:/Assistant: framing, trigger
+     placement, image placeholder position) and protect the segment-
+     by-segment tokenization contract (each segment must hit
+     `tokenizer.encode` in isolation).
+  2. Real-tokenizer regression -- run when the HunyuanImage3-Instruct
+     tokenizer is in the local HF cache. Asserts the segment-tokenized
+     output diverges from the naive full-string encode, which is the
+     bug-tripping fixture for the cross-segment BPE merge fix
+     (commit 7bd429ed).
+"""
+
+from __future__ import annotations
+
+import ast
+import os
+import pathlib
+
+import pytest
+
+from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    available_tasks,
+    build_prompt,
+    build_prompt_tokens,
+)
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+# -------------------- Pure-logic structural tests --------------------
+
+
+class FakeTokenizer:
+    """Minimal tokenizer stub that records every encode() call.
+
+    Returns deterministic ids: special tokens map to small ints (1-4),
+    encode() returns one id per character starting at 100. This lets
+    tests both verify segmentation (by inspecting `encode_calls`) and
+    locate substrings inside the returned id list.
+    """
+
+    SPECIAL = {
+        "<|startoftext|>": 1,
+        "<img>": 2,
+        "<think>": 3,
+        "<recaption>": 4,
+    }
+
+    def __init__(self) -> None:
+        self.encode_calls: list[str] = []
+
+    def convert_tokens_to_ids(self, tok: str) -> int:
+        return self.SPECIAL.get(tok, 0)
+
+    def encode(self, text: str, add_special_tokens: bool = False) -> list[int]:
+        self.encode_calls.append(text)
+        return list(range(100, 100 + len(text)))
+
+
+def test_available_tasks_covers_all_modalities():
+    tasks = set(available_tasks())
+    assert tasks >= {
+        "t2t",
+        "i2t",
+        "it2i_think",
+        "it2i_recaption",
+        "t2i_think",
+        "t2i_recaption",
+        "t2i_vanilla",
+    }
+
+
+@pytest.mark.parametrize(
+    "task",
+    [
+        "t2t",
+        "i2t",
+        "it2i_think",
+        "it2i_recaption",
+        "t2i_think",
+        "t2i_recaption",
+    ],
+)
+def test_build_prompt_string_structure_chat_template(task: str):
+    """Chat-template tasks must produce <|startoftext|>...User: ...Assistant: ...
+    with image placeholder (when applicable) and trigger tag AFTER `Assistant: `."""
+    s = build_prompt("HELLO", task=task)
+
+    assert s.startswith("<|startoftext|>")
+    assert "User: " in s
+    assert "Assistant: " in s
+    assert s.index("User: ") < s.index("HELLO") < s.index("Assistant: ")
+
+    if task.startswith(("i2t", "it2i")):
+        assert s.index("User: ") < s.index("<img>") < s.index("HELLO"), (
+            "<img> placeholder must sit between `User: ` and the user prompt"
+        )
+    else:
+        assert "<img>" not in s
+
+    # Trigger tag must be the FINAL token of the prompt (after `Assistant: `).
+    # Note: the system prompt itself mentions <think>/<recaption> as mode
+    # documentation, so substring index() catches the wrong occurrence -- use
+    # endswith() which directly captures "trigger is at the tail" (the Part A
+    # fix: trigger goes AFTER `Assistant: `, not before user_prompt).
+    if task in ("it2i_think", "t2i_think"):
+        assert s.endswith("Assistant: <think>"), (
+            f"Trigger <think> must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}"
+        )
+    if task in ("it2i_recaption", "t2i_recaption"):
+        assert s.endswith("Assistant: <recaption>"), (
+            f"Trigger <recaption> must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}"
+        )
+    if task in ("t2t", "i2t"):
+        assert s.endswith("Assistant: "), "Plain (no-trigger) task must end at `Assistant: ` with no trailing tag."
+
+
+def test_build_prompt_vanilla_uses_pretrain_template():
+    """t2i_vanilla is the only task that bypasses chat structure -- direct
+    text->image generation driven by the vanilla system prompt."""
+    s = build_prompt("HELLO", task="t2i_vanilla")
+    assert s.startswith("<|startoftext|>")
+    assert "User: " not in s
+    assert "Assistant: " not in s
+    assert "<think>" not in s
+    assert "<recaption>" not in s
+    assert s.endswith("HELLO")
+
+
+def test_build_prompt_unknown_task_raises():
+    with pytest.raises(ValueError, match="Unknown task"):
+        build_prompt("x", task="bogus")
+    with pytest.raises(ValueError, match="Unknown task"):
+        build_prompt_tokens("x", FakeTokenizer(), task="bogus")
+
+
+def test_build_prompt_tokens_segments_each_boundary():
+    """Regression for cross-segment BPE merge bug (commit 7bd429ed):
+    each template segment must hit tokenizer.encode() independently;
+    user_prompt MUST NOT be concatenated with the following separator
+    in the same encode() call."""
+    tok = FakeTokenizer()
+    build_prompt_tokens("写诗。", tok, task="i2t")
+
+    # Each canonical segment is encoded in its own call.
+    assert "User: " in tok.encode_calls
+    assert "写诗。" in tok.encode_calls, (
+        "user_prompt must be encoded alone -- if it is concatenated with the "
+        "trailing separator, BPE will merge across the boundary (the PR-#3243 bug)."
+    )
+    assert "\n\nAssistant: " in tok.encode_calls
+
+    # No call must contain user_prompt glued to neighboring text.
+    for call in tok.encode_calls:
+        if call != "写诗。":
+            assert "写诗。" not in call, f"user_prompt leaked into a multi-segment encode call: {call!r}"
+
+
+def test_build_prompt_tokens_image_placeholder_present_for_image_tasks():
+    tok = FakeTokenizer()
+    ids = build_prompt_tokens("hi", tok, task="i2t")
+    assert ids[0] == 1, "BOS (<|startoftext|>) must be the first token"
+    assert 2 in ids, "<img> placeholder must be present for i2t/it2i tasks"
+
+
+def test_build_prompt_tokens_no_image_for_text_only_tasks():
+    tok = FakeTokenizer()
+    ids = build_prompt_tokens("hi", tok, task="t2t")
+    assert 2 not in ids, "<img> must NOT appear for text-only tasks"
+
+
+@pytest.mark.parametrize(
+    "task,trigger_id",
+    [("it2i_think", 3), ("t2i_think", 3), ("it2i_recaption", 4), ("t2i_recaption", 4)],
+)
+def test_build_prompt_tokens_trigger_is_last_token(task: str, trigger_id: int):
+    """Trigger tag id must be the LAST token (after `Assistant: ` segment)."""
+    tok = FakeTokenizer()
+    ids = build_prompt_tokens("hi", tok, task=task)
+    assert ids[-1] == trigger_id
+
+
+def test_build_prompt_tokens_no_trigger_for_plain_tasks():
+    """Tasks without trigger_tag (t2t / i2t) must NOT append a trigger id."""
+    tok = FakeTokenizer()
+    ids = build_prompt_tokens("hi", tok, task="t2t")
+    assert ids[-1] not in {3, 4}  # neither <think> nor <recaption>
+
+
+# -------------------- end2end.py wiring guard --------------------
+
+
+def _repo_root() -> pathlib.Path:
+    # tests/diffusion/models/hunyuan_image3/test_prompt_utils.py -> repo root
+    return pathlib.Path(__file__).resolve().parents[4]
+
+
+def test_end2end_routes_through_shared_prompt_utils():
+    """Regression for the *delivery vector* of PR #3243.
+
+    Background: the wrong-template bug that PR #3243 fixes was introduced
+    when end2end.py grew its own hand-rolled prompt builder that diverged
+    from the canonical instruct chat template. To prevent that exact
+    failure mode from recurring, end2end.py MUST:
+      1. Import the prompt builders from the shared prompt_utils module.
+      2. NOT redefine `build_prompt` or `build_prompt_tokens` locally.
+
+    A local redefinition is precisely how a future merge can silently
+    re-introduce a pretrain-style template (trigger BEFORE user_prompt,
+    no User:/Assistant: framing, etc.) without touching prompt_utils,
+    bypassing every other test in this file.
+    """
+    end2end_path = _repo_root() / "examples" / "offline_inference" / "hunyuan_image3" / "end2end.py"
+    assert end2end_path.is_file(), f"end2end.py not found at {end2end_path}"
+
+    tree = ast.parse(end2end_path.read_text(encoding="utf-8"))
+
+    local_func_names = {n.name for n in ast.walk(tree) if isinstance(n, ast.FunctionDef)}
+    forbidden = {"build_prompt", "build_prompt_tokens"}
+    redefined = local_func_names & forbidden
+    assert not redefined, (
+        f"end2end.py defines {sorted(redefined)} locally. This is exactly how "
+        "the wrong prompt template re-entered the example before PR #3243. "
+        "Use the shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils` "
+        "helpers instead."
+    )
+
+    imported_from_prompt_utils: set[str] = set()
+    for node in ast.walk(tree):
+        if isinstance(node, ast.ImportFrom) and node.module and node.module.endswith("hunyuan_image3.prompt_utils"):
+            imported_from_prompt_utils.update(alias.name for alias in node.names)
+    assert "build_prompt_tokens" in imported_from_prompt_utils, (
+        "end2end.py must import build_prompt_tokens from "
+        "vllm_omni.diffusion.models.hunyuan_image3.prompt_utils -- the shared "
+        "helper is the single source of truth for the AR-prefill template."
+    )
+
+
+# -------------------- Real-tokenizer regression --------------------
+
+
+_HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct"
+
+
+def _hf_cached(model_id: str) -> bool:
+    hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
+    snap_dir = os.path.join(hf_home, "hub", f"models--{model_id.replace('/', '--')}", "snapshots")
+    return os.path.isdir(snap_dir) and any(os.scandir(snap_dir))
+
+
+@pytest.mark.skipif(
+    not _hf_cached(_HUNYUAN_MODEL_ID),
+    reason=f"{_HUNYUAN_MODEL_ID} tokenizer not in HF cache",
+)
+def test_segment_tokenize_diverges_from_full_string_encode():
+    """Regression for PR #3243 segment-tokenization fix.
+
+    The naive `tokenizer.encode(build_prompt(...))` lets BPE merge tokens
+    across segment boundaries (notably `。\\n\\n` -> a single id), which
+    drifts the AR prefill away from HF's apply_chat_template output. The
+    segment-by-segment build_prompt_tokens must produce a STRICTLY
+    DIFFERENT id sequence on a prompt that triggers the merge.
+
+    If someone "simplifies" build_prompt_tokens to call encode() on the
+    full string, this assertion fires.
+    """
+    from transformers import AutoTokenizer
+
+    tok = AutoTokenizer.from_pretrained(_HUNYUAN_MODEL_ID, trust_remote_code=True)
+
+    user_prompt = "写一首关于夜的诗。"
+    seg_ids = build_prompt_tokens(user_prompt, tok, task="i2t")
+    full_ids = tok.encode(build_prompt(user_prompt, task="i2t"), add_special_tokens=False)
+
+    assert seg_ids != full_ids, (
+        "build_prompt_tokens output equals naive full-string encode -- "
+        "the BPE-merge-bypass behavior is no longer exercised. This means "
+        "the segment-by-segment fix from PR #3243 has been silently undone."
+    )
+
+    # Segmenting prevents merges, so the segment id list should have AT LEAST
+    # as many tokens as the merged version (a merge consumes 2+ ids -> 1).
+    assert len(seg_ids) >= len(full_ids), (
+        f"segment-encoded length ({len(seg_ids)}) shorter than full-string "
+        f"merged length ({len(full_ids)}) -- impossible if segmentation is "
+        f"genuinely bypassing merges."
+    )
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
new file mode 100644
index 00000000000..6e8efac3133
--- /dev/null
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Shared prompt-template construction for HunyuanImage-3.0-Instruct.
+
+Single source of truth for the AR-prefill prompt format used by the
+example scripts and any downstream caller that needs to build
+HunyuanImage3 chat-template token sequences without invoking the full
+diffusion pipeline tokenizer wrapper.
+
+The DiT pipeline (`pipeline_hunyuan_image3.py`) builds prompts through
+`TokenizerWrapper.apply_chat_template`, which eagerly consumes
+`JointImageInfo` objects produced by image preprocessing. The example
+flow uses an `<img>` placeholder + `multi_modal_data` instead, so it
+needs a lighter-weight builder that only requires a HF tokenizer. This
+module provides that builder; the task -> template mapping below is the
+canonical mapping for both flows.
+"""
+
+from __future__ import annotations
+
+from .system_prompt import get_system_prompt
+
+# task -> (sys_type, bot_task, trigger_tag)
+_TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
+    "t2t": ("en_unified", None, None),
+    "i2t": ("en_unified", None, None),
+    "it2i_think": ("en_unified", "think", "<think>"),
+    "it2i_recaption": ("en_unified", "recaption", "<recaption>"),
+    "t2i_think": ("en_unified", "think", "<think>"),
+    "t2i_recaption": ("en_unified", "recaption", "<recaption>"),
+    "t2i_vanilla": ("en_vanilla", "image", None),
+}
+
+
+def available_tasks() -> list[str]:
+    """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
+    return sorted(_TASK_PRESETS)
+
+
+def build_prompt(
+    user_prompt: str,
+    task: str = "it2i_think",
+    sys_type: str | None = None,
+    custom_system_prompt: str | None = None,
+) -> str:
+    """Build a HunyuanImage-3.0 prompt as a string (legacy/compat path).
+
+    NOTE: when this string is passed to the engine, the engine's tokenizer
+    will run a single BPE pass over the whole string, which can merge
+    tokens across segment boundaries (e.g. `。\\n\\n` -> id 3490). For
+    inputs that need to match HF baseline byte-for-byte, use
+    `build_prompt_tokens` instead and feed the result via prompt_token_ids.
+    """
+    if task not in _TASK_PRESETS:
+        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+
+    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+    effective_sys_type = sys_type or preset_sys_type
+
+    system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt)
+    sys_text = system_prompt.strip() if system_prompt else ""
+
+    has_image_input = task.startswith("i2t") or task.startswith("it2i")
+
+    # t2i_vanilla: pretrain mode for direct text->image generation. The
+    # vanilla system prompt drives the model with no chat structure.
+    if task == "t2i_vanilla":
+        parts = ["<|startoftext|>"]
+        if sys_text:
+            parts.append(sys_text)
+        parts.append(user_prompt)
+        return "".join(parts)
+
+    # All other tasks (t2t / i2t / t2i_think / t2i_recaption /
+    # it2i_think / it2i_recaption) use HunyuanImage3 Instruct chat template:
+    #   <|startoftext|>{system?}\n\nUser: {<img>?}{user_prompt}\n\nAssistant: {trigger?}
+    # generation_config.json declares sequence_template="instruct", so the
+    # AR prefill MUST use this template -- verified to match HF's
+    # apply_chat_template output token-for-token (modulo BPE boundary merges).
+    # The trigger_tag (e.g. <think>) MUST come AFTER the `Assistant: ` prefix:
+    # if it goes BEFORE user_prompt (the old pretrain layout) the model puts
+    # the user's instructions inside the "thinking section" and collapses
+    # into repetition garbage under greedy decoding.
+    parts = ["<|startoftext|>"]
+    if sys_text:
+        parts.append(f"{sys_text}\n\n")
+    parts.append("User: ")
+    if has_image_input:
+        parts.append("<img>")
+    parts.append(user_prompt)
+    parts.append("\n\nAssistant: ")
+    if trigger_tag:
+        parts.append(trigger_tag)
+
+    return "".join(parts)
+
+
+def build_prompt_tokens(
+    user_prompt: str,
+    tokenizer,
+    task: str = "it2i_think",
+    sys_type: str | None = None,
+    custom_system_prompt: str | None = None,
+) -> list[int]:
+    """Segment-by-segment tokenization that matches HF apply_chat_template.
+
+    Calling tokenizer.encode(build_prompt(...)) on the full string lets BPE
+    merge tokens across segment boundaries (e.g. user_prompt ends with `。`
+    and the next segment is `\\n\\n` -> they merge into a single token id
+    3490 instead of HF's [1811, 271]). HF's apply_chat_template tokenizes
+    each segment independently and concatenates token_ids, so no cross-
+    boundary merge happens. We replicate that here and feed the result to
+    Omni via OmniTokensPrompt (prompt_token_ids).
+    """
+    if task not in _TASK_PRESETS:
+        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+
+    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+    effective_sys_type = sys_type or preset_sys_type
+
+    bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>")
+    img_id = tokenizer.convert_tokens_to_ids("<img>")
+    trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None
+
+    has_image_input = task.startswith("i2t") or task.startswith("it2i")
+
+    # t2i_vanilla uses pretrain template with no chat structure; the vanilla
+    # system prompt drives the model directly. No segment boundaries to
+    # protect, fall back to whole-string encode.
+    if task == "t2i_vanilla":
+        s = build_prompt(user_prompt, task, sys_type, custom_system_prompt)
+        return tokenizer.encode(s, add_special_tokens=False)
+
+    system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt)
+    # Do NOT strip -- HF apply_chat_template keeps the system prompt's
+    # natural trailing newline; stripping it would shift one token id.
+    sys_text = system_prompt or ""
+
+    ids: list[int] = [bos_id]
+    if sys_text:
+        ids += tokenizer.encode(sys_text, add_special_tokens=False)
+        ids += tokenizer.encode("\n\n", add_special_tokens=False)
+    ids += tokenizer.encode("User: ", add_special_tokens=False)
+    if has_image_input:
+        ids += [img_id]
+    ids += tokenizer.encode(user_prompt, add_special_tokens=False)
+    ids += tokenizer.encode("\n\nAssistant: ", add_special_tokens=False)
+    if trig_id is not None:
+        ids += [trig_id]
+    return ids
+
+
+__all__ = ["build_prompt", "build_prompt_tokens", "available_tasks"]
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index e2f600eaa46..d2552bdddbf 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import gc
 import math
 import typing
 from collections.abc import Callable, Iterable, Mapping, Sequence
@@ -17,14 +18,29 @@
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.config.multimodal import BaseDummyOptions
-from vllm.distributed import get_pp_group
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.inputs import MultiModalDataDict
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import fused_moe_make_expert_params_mapping
+
+try:
+    from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
+except ImportError:
+    # PyPI vllm 0.20.x neither exports `SharedFusedMoE` from the package top-level
+    # nor ships a `shared_fused_moe.py` submodule. The functionality lives on
+    # `FusedMoE` directly (which gained a `shared_experts` parameter), so alias
+    # the symbol — call sites only use the classmethod `make_expert_params_mapping`
+    # and `__init__(shared_experts=..., ...)` which are present on `FusedMoE`.
+    from vllm.model_executor.layers.fused_moe import FusedMoE as SharedFusedMoE
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
+    ReplicatedLinear,
     RowParallelLinear,
 )
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -37,7 +53,9 @@
     maybe_remap_kv_scale_name,
 )
 from vllm.model_executor.models.hunyuan_v1 import (
+    HunYuanMLP,
     HunYuanModel,
+    HunYuanSparseMoeBlock,
     _get_cla_factor,
     _is_moe,
 )
@@ -842,8 +860,6 @@ def process_image(self, image_input: ImageInput):
         else:
             raise TypeError(f"Unsupported image type: {type(image_input)}.")
 
-        torch_dtype = getattr(self.hf_config, "torch_dtype", torch.bfloat16)
-
         batch_data = []
         for image in images:
             current_info = {}
@@ -853,20 +869,38 @@ def process_image(self, image_input: ImageInput):
 
             # VIT processing
             vit_pixel_values = self.vision_encoder_processor(image)
-            # shape: (seq_len, num_channels * patch_size * patch_size)
-            current_info["vit_pixel_values"] = vit_pixel_values["pixel_values"].squeeze(0)
-            # shape: (seq_len, )
-            current_info["vit_pixel_attention_mask"] = vit_pixel_values["pixel_attention_mask"].squeeze(0)
-            # shape: (2, )
-            current_info["vit_spatial_shapes"] = vit_pixel_values["spatial_shapes"].squeeze(0)
-
-            # VAE processing
+            # transformers>=5.x returns lists; stack to tensor when needed
+            _pv = vit_pixel_values["pixel_values"]
+            if isinstance(_pv, list):
+                _pv = torch.stack(_pv, dim=0)
+            current_info["vit_pixel_values"] = _pv.squeeze(0)
+            _pam = vit_pixel_values["pixel_attention_mask"]
+            if isinstance(_pam, list):
+                _pam = torch.stack(_pam, dim=0)
+            current_info["vit_pixel_attention_mask"] = _pam.squeeze(0)
+            _ss = vit_pixel_values["spatial_shapes"]
+            if isinstance(_ss, list):
+                _ss = torch.tensor(_ss, dtype=torch.long)
+            current_info["vit_spatial_shapes"] = _ss.squeeze(0)
+
+            # VAE processing.
+            # The resize/crop math here mirrors HF's `resize_and_crop` with
+            # crop_type="center" (hunyuan3.0_ins/image_processor.py:61). VAE
+            # normalize uses the same transforms.Compose([ToTensor,
+            # Normalize([0.5], [0.5])]) as HF's `pil_image_to_tensor`. So
+            # numerical output of this branch should match HF up to floating-
+            # point reduction order.
             image_width, image_height = self.reso_group.get_target_size(image.width, image.height)
             resized_image = self._resize_and_crop(image, (image_width, image_height))
             vae_pixel_values = self.vae_processor(resized_image)
             token_height = image_height // (self.hf_config.vae_downsample_factor[0] * self.hf_config.patch_size)
             token_width = image_width // (self.hf_config.vae_downsample_factor[1] * self.hf_config.patch_size)
-            current_info["vae_pixel_values"] = vae_pixel_values.squeeze(0).to(dtype=torch_dtype)
+            # Keep fp32 — the VAE encoder casts to model dtype at its boundary
+            # (see _vae_encode). Casting to bf16 here costs ~7e-4 mean-abs-diff
+            # bf16 quantization error on every pixel vs HF (which keeps fp32
+            # in build_cond_images), measurable as a real numerical drift in
+            # downstream image embeddings.
+            current_info["vae_pixel_values"] = vae_pixel_values.squeeze(0)
             current_info["vae_token_grid_hw"] = torch.tensor([token_height, token_width])
 
             # size
@@ -1052,6 +1086,25 @@ def get_replacement_image(item_idx: int) -> PromptUpdateDetails:
             if ratio_token_id is None:
                 raise ValueError(f"Ratio token '<img_ratio_{_ratio_index}>' not found in tokenizer vocabulary")
 
+            # NOTE on the timestep slot:
+            # HF's apply_chat_template emits the literal <timestep> token id
+            # 128017 here. HF's modeling forward (`instantiate_continuous_tokens`,
+            # see hunyuan3.0_ins/modeling_hunyuan_image_3.py:1964) then *scatter-
+            # replaces* the embedding at that position with `timestep_emb(0)`
+            # for cond images. So the wte embedding of <timestep> is irrelevant
+            # at runtime — what matters is the timestep_emb injection.
+            #
+            # vllm-omni achieves the same effect via the multimodal-embedding
+            # merger: we put an <img> (128006) placeholder here and ship a
+            # `timestep_emb(0)` tensor at the head of `embed_multimodal()`'s
+            # combined_embeddings. The merger replaces this placeholder's
+            # embedding with the timestep tensor, yielding a final hidden
+            # state numerically equivalent to HF at that position.
+            #
+            # Keep this slot as <img> (NOT <timestep>): switching to <timestep>
+            # requires either (a) a second PromptReplacement targeting 128017,
+            # or (b) the merger's embed_token_id to be a list — neither is
+            # currently supported by PromptUpdateDetails.select_token_id.
             replacement = (
                 [boi_token_id]
                 + [base_size_token_id]
@@ -1070,6 +1123,197 @@ def get_replacement_image(item_idx: int) -> PromptUpdateDetails:
         ]
 
 
+def _hunyuan_image3_unpack_packed_topk(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Unpack pre-computed ``(topk_weights, topk_indices)`` packed by
+    :class:`HunyuanImage3SparseMoeBlock` into ``gating_output``.
+
+    Used as ``custom_routing_function`` for the underlying ``SharedFusedMoE``,
+    bypassing its bf16 ``topk_softmax`` CUDA op so the routing decision can
+    be made in fp32 (matching the reference implementation).
+
+    Layout of ``gating_output`` (shape ``[num_tokens, top_k * 2]``)::
+
+        [:, :top_k]  -> topk_weights (already softmax'd + renormalized in fp32,
+                                      stored as fp32 for transport)
+        [:, top_k:]  -> topk_indices (cast to fp32 for transport, restored to int32)
+    """
+    topk_weights = gating_output[:, :topk].contiguous()
+    topk_indices = gating_output[:, topk:]
+    return topk_weights.to(torch.float32), topk_indices.to(torch.int32)
+
+
+class HunyuanImage3SparseMoeBlock(HunYuanSparseMoeBlock):
+    """MoE block with FP32 routing for byte-level alignment with HF.
+
+    The reference ``modeling_hunyuan_image_3.py`` runs the router in fp32:
+
+    - ``HunyuanTopKGate.wg`` is constructed as ``nn.Linear(..., dtype=torch.float32)``
+      and ``hidden_states`` is cast to fp32 before the matmul (line 1114-1116).
+    - ``HunyuanMoE.forward`` wraps the gate call in
+      ``with torch.autocast('cuda', enabled=False):`` to defeat any AMP cast
+      (line 1204-1205), then calls ``easy_topk`` which does
+      ``F.softmax`` → ``torch.topk`` → divide by
+      ``torch.clamp(weight_sums, min=1e-8)`` → cast back to bf16, all in fp32
+      (line 1132-1139, 1206-1207).
+
+    vLLM's stock ``HunYuanSparseMoeBlock`` instead builds the gate as a
+    default-dtype ``ReplicatedLinear`` (bf16) and lets ``SharedFusedMoE``'s
+    ``topk_softmax`` CUDA op consume bf16 logits, which can flip top-k
+    boundary decisions vs HF on close routing scores. With ``num_experts=64``,
+    ``top_k=8`` per layer × 32 MoE layers, even a small per-token flip rate
+    cascades into divergent expert outputs and KV-cache state, eventually
+    flipping the top-1 decoded token.
+
+    This subclass:
+
+    1. Replaces ``self.gate`` with a fp32 ``ReplicatedLinear``.
+    2. Replaces ``self.experts`` with a ``SharedFusedMoE`` whose routing is a
+       no-op unpack of our pre-computed (topk_weights, topk_indices) — the
+       fp32 softmax/topk/renormalize is done in :meth:`forward` here, exactly
+       mirroring HF's ``easy_topk`` math (including ``clamp(min=1e-8)``).
+    """
+
+    def __init__(
+        self,
+        config,
+        quant_config=None,
+        layer_id: int = -1,
+        prefix: str = "",
+        enable_eplb: bool = False,
+    ) -> None:
+        # Bypass ``HunYuanSparseMoeBlock.__init__`` — it would build a wasteful
+        # bf16 gate + a stub ``SharedFusedMoE`` we'd then have to del+recreate
+        # (which trips ``ValueError: Duplicate layer name`` because the stub
+        # already registered itself in ``compilation_config.static_forward_context``).
+        # Instead, set up ``nn.Module`` ourselves and construct the fp32 gate
+        # + ``custom_routing_function``-driven ``SharedFusedMoE`` directly,
+        # mirroring the parent's structure 1:1 except for the routing dtype.
+        nn.Module.__init__(self)
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = get_ep_group().rank_in_group
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts = config.num_experts
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than the number of experts {config.num_experts}."
+            )
+
+        if isinstance(config.moe_topk, list):
+            top_k = config.moe_topk[layer_id]
+        else:
+            top_k = config.moe_topk
+        self.top_k = top_k
+
+        intermediate_size = config.intermediate_size
+        if config.moe_intermediate_size is not None:
+            intermediate_size = (
+                config.moe_intermediate_size
+                if isinstance(config.moe_intermediate_size, int)
+                else config.moe_intermediate_size[layer_id]
+            )
+
+        vllm_config = get_current_vllm_config()
+        eplb_config = vllm_config.parallel_config.eplb_config
+        self.enable_eplb = enable_eplb
+        self.n_logical_experts = self.n_routed_experts
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = self.physical_expert_start + self.n_local_physical_experts
+
+        # FP32 router gate (HF: ``wg = nn.Linear(..., dtype=torch.float32)``).
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            params_dtype=torch.float32,
+            prefix=f"{prefix}.gate",
+        )
+
+        if config.use_mixed_mlp_moe > 0:
+            num_shared_expert = (
+                config.num_shared_expert[layer_id]
+                if isinstance(config.num_shared_expert, list)
+                else config.num_shared_expert
+            )
+            self.shared_mlp = HunYuanMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size * num_shared_expert,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_mlp",
+            )
+        else:
+            self.shared_mlp = None
+
+        # Experts with our ``_hunyuan_image3_unpack_packed_topk`` custom
+        # routing — we feed it (topk_weights, topk_indices) packed into
+        # ``router_logits`` in ``forward()`` so the bf16 ``topk_softmax``
+        # CUDA op is bypassed entirely. ``renormalize=False`` because we
+        # already did clamp+divide in fp32 to match HF's
+        # ``topk_weight = topk_weight_1 / clamp(sum, min=1e-8)``.
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_mlp,
+            num_experts=self.n_routed_experts,
+            top_k=top_k,
+            hidden_size=config.hidden_size,
+            intermediate_size=intermediate_size,
+            renormalize=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
+            custom_routing_function=_hunyuan_image3_unpack_packed_topk,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        orig_shape = hidden_states.shape
+        hidden_dim = hidden_states.shape[-1]
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # FP32 router (HF: `with torch.autocast('cuda', enabled=False): ...`
+        # plus `if self.wg.weight.dtype == torch.float32: hidden_states.float()`).
+        # ``self.gate.weight`` is fp32 (params_dtype=torch.float32), so the
+        # ReplicatedLinear matmul runs in fp32 once we cast the input.
+        router_logits, _ = self.gate(hidden_states.float())
+
+        # softmax + topk + clamp-divide renormalization, all in fp32 — matches
+        # ``HunyuanTopKGate.easy_topk`` exactly.
+        gates = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
+        topk_weights, topk_indices = torch.topk(gates, self.top_k, dim=-1)
+        weight_sums = topk_weights.sum(dim=-1, keepdim=True)
+        topk_weights = topk_weights / weight_sums.clamp(min=1e-8)
+
+        # Cast topk weights to model dtype for the expert MLP combine.
+        # HF: ``topk_weights = topk_weights.to(hidden_states.dtype)`` (line 1207).
+        topk_weights = topk_weights.to(hidden_states.dtype)
+
+        # Pack (weights, indices) into the ``router_logits`` slot so
+        # ``_hunyuan_image3_unpack_packed_topk`` can pull them back out
+        # inside ``SharedFusedMoE``. Both halves are stored as fp32 for
+        # transport — the indices get cast back to int32 on unpack.
+        packed_routing = torch.cat([topk_weights.float(), topk_indices.to(torch.float32)], dim=-1)
+
+        # vllm 0.20+ FusedMoE merges shared-experts internally and runs the
+        # TP all-reduce inside its forward (we no longer pass
+        # `reduce_results=False`). The tuple `(routed, shared)` return shape
+        # from the legacy SharedFusedMoE is gone; the result is the
+        # already-combined, already-reduced tensor.
+        final_hidden_states = self.experts(hidden_states=hidden_states, router_logits=packed_routing)
+        return final_hidden_states.view(orig_shape)
+
+
 class HunyuanImage3RotaryEmbedding(nn.Module):
     """Custom interleaved 2D Rotary Embedding for HunyuanImage3.
 
@@ -1273,8 +1517,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         # For comprehension mode, block image generation tokens but allow
         # text structure tokens (<think>, <answer>, etc.) so the model can
-        # follow its natural generation pattern. Stop tokens in YAML will
-        # terminate at </answer> or EOS.
+        # follow its natural generation pattern. The yaml stop_token_ids
+        # for i2t/t2t now includes </think> (128024) so the AR-only output
+        # terminates after the analysis section, matching HF's
+        # `bot_task="think"` behavior. Without that stop, the model
+        # continues into a recaption section even in comprehension mode
+        # (the stage-transition processor only fires in generation mode,
+        # but the instruct-tuned model writes recaption on its own from
+        # internal habit).
         self._blocked_token_ids: set[int] = set()
         if self._is_comprehension:
             self._blocked_token_ids.update(
@@ -1306,6 +1556,75 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self._eos_token_id: int = tokenizer.eos_token_id
 
         self._replace_rotary_embeddings()
+        self._patch_moe_blocks()
+
+    def _patch_moe_blocks(self):
+        """Replace stock ``HunYuanSparseMoeBlock`` instances with
+        :class:`HunyuanImage3SparseMoeBlock`, which routes in fp32 to match
+        the HF reference (``modeling_hunyuan_image_3.HunyuanMoE``).
+
+        Stock vLLM builds the router gate as a default-dtype (bf16)
+        ``ReplicatedLinear`` and lets ``SharedFusedMoE``'s ``topk_softmax``
+        kernel consume bf16 logits, which is the largest deterministic
+        precision gap remaining vs HF after the prompt/preprocessing
+        alignment fixes. See ``HunyuanImage3SparseMoeBlock`` docstring for
+        the full rationale.
+
+        Must run before weight loading (still inside ``__init__``) so the
+        replacement gate's fp32 ``params_dtype`` is honored when the
+        checkpoint is loaded.
+        """
+        if not _is_moe(self.config):
+            return
+        enable_eplb = getattr(self.vllm_config.parallel_config, "enable_eplb", False)
+        ccfg = self.vllm_config.compilation_config
+        replaced = 0
+        for layer_id, layer in enumerate(self.model.layers):
+            mlp = getattr(layer, "mlp", None)
+            if isinstance(mlp, HunYuanSparseMoeBlock) and not isinstance(mlp, HunyuanImage3SparseMoeBlock):
+                # Pop the OLD experts' registration from
+                # ``static_forward_context`` first — otherwise the new
+                # ``SharedFusedMoE`` built inside
+                # :class:`HunyuanImage3SparseMoeBlock` will trip
+                # ``ValueError: Duplicate layer name`` (see
+                # vllm/model_executor/layers/fused_moe/layer.py:327).
+                old_prefix = f"model.layers.{layer_id}.mlp.experts"
+                ccfg.static_forward_context.pop(old_prefix, None)
+                if old_prefix in ccfg.static_all_moe_layers:
+                    ccfg.static_all_moe_layers.remove(old_prefix)
+
+                # Free the OLD MoE block's GPU buffers BEFORE allocating
+                # the replacement. The parent ``SharedFusedMoE`` pre-
+                # allocates the full ``[num_experts, ...]`` expert weight
+                # tensors at ``__init__`` (~750 MiB per layer per worker
+                # on this 80B model with TP=2), so without this drop we
+                # transiently double the MoE footprint and OOM near the
+                # gpu_memory_utilization cap.
+                layer.mlp = None
+                del mlp
+                gc.collect()
+                torch.cuda.empty_cache()
+
+                layer.mlp = HunyuanImage3SparseMoeBlock(
+                    config=self.config,
+                    quant_config=self.quant_config,
+                    layer_id=layer_id,
+                    prefix=f"model.layers.{layer_id}.mlp",
+                    enable_eplb=enable_eplb,
+                )
+                replaced += 1
+        logger.info(
+            "Replaced %d HunYuanSparseMoeBlock layers with "
+            "HunyuanImage3SparseMoeBlock (fp32 router matching HF reference)",
+            replaced,
+        )
+        if replaced == 0:
+            logger.warning(
+                "HunyuanImage3: _patch_moe_blocks replaced 0 layers. "
+                "Routing will run in bf16 instead of fp32 — output will "
+                "diverge from the HF reference more than necessary. "
+                "Check that model.layers[*].mlp is HunYuanSparseMoeBlock."
+            )
 
     def _replace_rotary_embeddings(self):
         """Replace vLLM's standard MRotaryEmbedding with the custom
@@ -1384,6 +1703,18 @@ def _vae_encode(
         """
         config = self.vae.config
 
+        # Cast pixel input to model dtype here (at the encoder boundary)
+        # rather than inside HunyuanImage3Processor.process_image. This
+        # matches HF's path which keeps fp32 pixels in build_cond_images and
+        # only casts inside the VAE forward — preserving fp32 precision in
+        # the multimodal_data dict and minimizing precision drift vs HF.
+        # Verified by pixel-tensor diff: removing the early bf16 cast brings
+        # omni's vae_pixel_values byte-identical to HF's (within fp32 noise),
+        # whereas an early cast leaves a ~7e-4 mean-abs-diff bf16 quantization
+        # error on every element.
+        if images.dtype != self.vae.dtype:
+            images = images.to(dtype=self.vae.dtype)
+
         vae_encode_result = self.vae.encode(images)
         latents = vae_encode_result.latent_dist.sample()
 
@@ -1494,11 +1825,14 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
             "Each image should have both VAE and ViT embeddings."
         )
 
-        # Order per image: timestep -> VAE tokens -> ViT tokens
+        # Order per image: timestep -> VAE tokens -> ViT tokens.
+        # The <img> placeholder at the timestep slot (see _get_prompt_updates)
+        # gets its embedding replaced by `timestep_emb(0)` here, which is what
+        # HF achieves via instantiate_continuous_tokens at runtime.
         combined_embeddings: list[torch.Tensor] = []
         num_images = len(vae_token_embeddings)
         for img_idx in range(num_images):
-            # 1. Timestep embedding
+            # 1. Timestep embedding (cond image timestep == 0)
             timestep = torch.zeros((1,)).to(vit_embeddings.device).to(vit_embeddings.dtype)
             timestep_emb = self._timestep_encode(timestep)
 
diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml
index b68b184ec31..0614a9f1179 100644
--- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml
+++ b/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml
@@ -34,7 +34,7 @@ stage_args:
       top_p: 0.95
       top_k: 1024
       max_tokens: 2048
-      stop_token_ids: [127957, 128026]  # <|endoftext|>, </answer>
+      stop_token_ids: [127957, 128024, 128026]  # <|endoftext|>, </think>, </answer>
       detokenize: True
 
 runtime:
diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml
index a0a1a0dc1c4..c9daa5e5f39 100644
--- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml
+++ b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml
@@ -35,7 +35,7 @@ stage_args:
       top_p: 0.95
       top_k: 1024
       max_tokens: 2048
-      stop_token_ids: [127957, 128026]  # <|endoftext|>, </answer>
+      stop_token_ids: [127957, 128024, 128026]  # <|endoftext|>, </think>, </answer>
       detokenize: True
 
 runtime: