From 550f93880c7c436efa089622ee4827ffe4ae9995 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Wed, 6 May 2026 15:42:44 +0800
Subject: [PATCH 01/40] [Config] Add HunyuanImage3 deploy configs

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/README.md                  | 241 ++++++++----------
 .../hunyuan_image3/end2end.py                 |  49 +++-
 .../offline_inference/test_hunyuanimage3.py   |   5 +-
 tests/entrypoints/test_utils.py               |  39 +++
 tests/test_config_factory.py                  |  28 ++
 vllm_omni/config/pipeline_registry.py         |  12 +
 vllm_omni/config/stage_config.py              |  10 +
 vllm_omni/deploy/hunyuan_image3.yaml          | 100 ++++++++
 vllm_omni/deploy/hunyuan_image3_ar.yaml       |  47 ++++
 vllm_omni/deploy/hunyuan_image3_dit.yaml      |  64 +++++
 vllm_omni/entrypoints/cli/serve.py            |   1 +
 vllm_omni/entrypoints/utils.py                |   1 +
 .../models/hunyuan_image3/pipeline.py         |  87 +++++++
 .../stage_configs/hunyuan_image3_i2t.yaml     |  41 ---
 .../stage_configs/hunyuan_image3_it2i.yaml    |  72 ------
 .../stage_configs/hunyuan_image3_moe.yaml     |  96 -------
 .../hunyuan_image3_moe_dit_2gpu_fp8.yaml      |  32 ---
 .../stage_configs/hunyuan_image3_t2i.yaml     |  31 ---
 .../hunyuan_image3_t2i_2gpu.yaml              |  41 ---
 .../stage_configs/hunyuan_image3_t2t.yaml     |  42 ---
 .../npu/stage_configs/hunyuan_image3_t2i.yaml |  35 ---
 .../xpu/stage_configs/hunyuan_image3_t2i.yaml |  80 ------
 22 files changed, 539 insertions(+), 615 deletions(-)
 create mode 100644 vllm_omni/deploy/hunyuan_image3.yaml
 create mode 100644 vllm_omni/deploy/hunyuan_image3_ar.yaml
 create mode 100644 vllm_omni/deploy/hunyuan_image3_dit.yaml
 create mode 100644 vllm_omni/model_executor/models/hunyuan_image3/pipeline.py
 delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml
 delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i.yaml
 delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml
 delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit_2gpu_fp8.yaml
 delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i.yaml
 delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i_2gpu.yaml
 delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml
 delete mode 100644 vllm_omni/platforms/npu/stage_configs/hunyuan_image3_t2i.yaml
 delete mode 100644 vllm_omni/platforms/xpu/stage_configs/hunyuan_image3_t2i.yaml

diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md
index c1c97bfe1fa..82cca4db6db 100644
--- a/examples/offline_inference/hunyuan_image3/README.md
+++ b/examples/offline_inference/hunyuan_image3/README.md
@@ -1,172 +1,153 @@
 # HunyuanImage-3.0-Instruct
 
-## Set up
-
-Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup.
-
-## Run examples
-
-**Note**: These examples work with the default configuration on **8x NVIDIA L40S (48GB)**. For different GPU setups, modify the stage configuration to adjust device allocation and memory utilization.
-
-Get into the hunyuan_image3 folder:
+This example runs HunyuanImage-3.0-Instruct offline with the unified deploy
+YAMLs under `vllm_omni/deploy/`.
+
+## Deploy Configs
+
+| File | Topology | Default use |
+| :--- | :--- | :--- |
+| `vllm_omni/deploy/hunyuan_image3.yaml` | AR + DiT | Default for `text2img` and `img2img`. |
+| `vllm_omni/deploy/hunyuan_image3_ar.yaml` | AR only | Default for `img2text` and `text2text`. |
+| `vllm_omni/deploy/hunyuan_image3_dit.yaml` | DiT only | Standalone diffusion stage. Pass it explicitly with `--deploy-config`. |
+
+The example chooses a deploy config automatically when `--deploy-config` and
+`--stage-configs-path` are both omitted:
+
+| `--modality` | `mode` passed to Omni | Default deploy |
+| :--- | :--- | :--- |
+| `text2img` | `text-to-image` | `hunyuan_image3.yaml` |
+| `img2img` | `image-editing` | `hunyuan_image3.yaml` |
+| `img2text` | `image-to-text` | `hunyuan_image3_ar.yaml` |
+| `text2text` | `text-to-text` | `hunyuan_image3_ar.yaml` |
+
+`--modality` is an offline example convenience flag. It maps to the internal
+`mode` argument passed to `Omni(...)` by this script. HunyuanImage3 uses
+separate deploy YAMLs for AR + DiT, AR-only, and DiT-only topologies, so the
+stage topology is selected by the deploy file rather than by YAML mode
+overrides.
+
+Online serving does not expose a `--modality` flag or accept `mode` as an API
+request field. Choose the deploy topology when starting the server with
+`--deploy-config`, then use the OpenAI-compatible endpoint and request shape for
+the scenario. The `modalities` request field is used by the chat completions
+path; the image endpoints infer the image task from the endpoint and payload.
+
+| Online scenario | Server deploy | Request |
+| :--- | :--- | :--- |
+| Text to image | `--deploy-config vllm_omni/deploy/hunyuan_image3.yaml` | `POST /v1/images/generations`, or `POST /v1/chat/completions` with `"modalities": ["image"]`. |
+| Image editing | `--deploy-config vllm_omni/deploy/hunyuan_image3.yaml` | `POST /v1/images/edits`. |
+| Image/text to text | `--deploy-config vllm_omni/deploy/hunyuan_image3_ar.yaml` | `POST /v1/chat/completions` for text output, for example with `"modalities": ["text"]`. |
+| DiT-only image generation | `--deploy-config vllm_omni/deploy/hunyuan_image3_dit.yaml` | `POST /v1/images/generations`. |
+
+## Run Examples
+
+Text to image, using the default AR + DiT deploy:
 
 ```bash
-cd examples/offline_inference/hunyuan_image3
+python examples/offline_inference/hunyuan_image3/end2end.py \
+  --model tencent/HunyuanImage-3.0-Instruct \
+  --modality text2img \
+  --prompts "A cute cat sitting on a windowsill watching the sunset"
 ```
 
-### Modality Control
-
-HunyuanImage-3.0-Instruct supports multiple modality modes. You can control the mode using the `--modality` argument:
-
-#### Text to Image (text2img)
-
-- **Pipeline**: Text → AR (CoT + latent tokens) → DiT (denoise) → VAE Decode → Image
-- **Stages Used**: Stage 0 (AR) + Stage 1 (DiT)
-- **KV Transfer**: AR sends KV cache to DiT for conditioned generation
-- **Default Config**: `hunyuan_image3_t2i.yaml`
+Image editing, using the default AR + DiT deploy:
 
 ```bash
-python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
-                  --modality text2img \
-                  --prompts "A cute cat sitting on a windowsill watching the sunset"
+python examples/offline_inference/hunyuan_image3/end2end.py \
+  --model tencent/HunyuanImage-3.0-Instruct \
+  --modality img2img \
+  --image-path /path/to/image.png \
+  --prompts "Make the petals neon pink"
 ```
 
-**With VAE tiling (required on A100 GPUs):**
-```bash
-python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
-                  --modality text2img \
-                  --prompts "A cute cat sitting on a windowsill watching the sunset" \
-                  --vae-use-tiling
-```
-
-#### Image to Image (img2img)
-
-- **Pipeline**: Image + Text → AR (CoT + recaption + latent) → DiT → Edited Image
-- **Stages Used**: Stage 0 (AR) + Stage 1 (DiT)
-- **KV Transfer**: AR sends KV cache to DiT
-- **Default Config**: `hunyuan_image3_it2i.yaml`
+Image to text, using the AR-only deploy:
 
 ```bash
-python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
-                  --modality img2img \
-                  --image-path /path/to/image.png \
-                  --prompts "Make the petals neon pink"
+python examples/offline_inference/hunyuan_image3/end2end.py \
+  --model tencent/HunyuanImage-3.0-Instruct \
+  --modality img2text \
+  --image-path /path/to/image.jpg \
+  --prompts "Describe the content of the picture."
 ```
 
-#### Image to Text (img2text)
-
-- **Pipeline**: Image + Question → AR → Text description
-- **Stages Used**: Stage 0 (AR) only
-- **Default Config**: `hunyuan_image3_i2t.yaml`
+Text to text, using the AR-only deploy:
 
 ```bash
-python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
-                  --modality img2text \
-                  --image-path /path/to/image.jpg \
-                  --prompts "Describe the content of the picture."
+python examples/offline_inference/hunyuan_image3/end2end.py \
+  --model tencent/HunyuanImage-3.0-Instruct \
+  --modality text2text \
+  --prompts "What is the capital of France?"
 ```
 
-#### Text to Text (text2text)
-
-- **Pipeline**: Text → AR → Text
-- **Stages Used**: Stage 0 (AR) only
-- **Default Config**: `hunyuan_image3_t2t.yaml`
+Standalone DiT, using the DiT-only deploy explicitly:
 
 ```bash
-python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
-                  --modality text2text \
-                  --prompts "What is the capital of France?"
+python examples/offline_inference/hunyuan_image3/end2end.py \
+  --model tencent/HunyuanImage-3.0-Instruct \
+  --modality text2img \
+  --deploy-config vllm_omni/deploy/hunyuan_image3_dit.yaml \
+  --prompts "A cinematic portrait of an astronaut in a greenhouse"
 ```
 
-### Inference Steps & Guidance
-
-Control generation quality for image modalities:
+Pass the full AR + DiT deploy explicitly (the same file `text2img` uses by default):
 
 ```bash
-python end2end.py --modality text2img \
-                  --steps 50 \
-                  --guidance-scale 5.0 \
-                  --height 1024 --width 1024 \
-                  --prompts "A photo-realistic sunset over the ocean"
+python examples/offline_inference/hunyuan_image3/end2end.py \
+  --model tencent/HunyuanImage-3.0-Instruct \
+  --modality text2img \
+  --deploy-config vllm_omni/deploy/hunyuan_image3.yaml \
+  --prompts "A cute cat"
 ```
 
-### Key Arguments
-
-#### 📌 Command Line Arguments (end2end.py)
-
-| Argument               | Type   | Default                              | Description                                                  |
-| :--------------------- | :----- | :----------------------------------- | :----------------------------------------------------------- |
-| `--model`              | string | `tencent/HunyuanImage-3.0-Instruct` | Model path or name                                           |
-| `--modality`           | choice | `text2img`                           | Modality: `text2img`, `img2img`, `img2text`, `text2text`     |
-| `--prompts`            | list   | `None`                               | Input text prompts                                           |
-| `--image-path`         | string | `None`                               | Input image path (for `img2img`/`img2text`)                  |
-| `--output`             | string | `.`                                  | Output directory for saved images                            |
-| `--steps`              | int    | `50`                                 | Number of inference steps                                    |
-| `--guidance-scale`     | float  | `5.0`                                | Classifier-free guidance scale                               |
-| `--seed`               | int    | `42`                                 | Random seed                                                  |
-| `--height`             | int    | `1024`                               | Output image height                                          |
-| `--width`              | int    | `1024`                               | Output image width                                           |
-| `--bot-task`           | string | auto                                 | Override prompt task (e.g. `it2i_think`, `t2i_recaption`)    |
-| `--sys-type`           | string | auto                                 | Override system prompt type (e.g. `en_unified`, `en_vanilla`) |
-| `--stage-configs-path` | string | auto                                 | Custom stage config YAML path                                |
-| `--enforce-eager`      | flag   | `False`                              | Disable torch.compile                                        |
-| `--init-timeout`       | int    | `300`                                | Initialization timeout (seconds)                             |
-| `--vae-use-tiling`     | flag   | `False`                              | Enable VAE tiling for memory optimization (required to avoid OOM on A100) |
-
-------
-
-#### ⚙️ Stage Configurations
-
-| Config YAML                         | Modality  | Stages | GPUs   | Description                           |
-| :---------------------------------- | :-------- | :----- | :----- | :------------------------------------ |
-| `hunyuan_image3_t2i.yaml`           | text2img  | 2      | 8      | T2I with AR→DiT, 4 GPU each          |
-| `hunyuan_image3_it2i.yaml`          | img2img   | 2      | 8      | IT2I with AR→DiT, 4 GPU each         |
-| `hunyuan_image3_i2t.yaml`           | img2text  | 1      | 4      | I2T (AR only)                         |
-| `hunyuan_image3_t2t.yaml`           | text2text | 1      | 4      | T2T (AR only)                         |
-| `hunyuan_image3_t2i_2gpu.yaml`      | text2img  | 2      | 2      | T2I for 2-GPU setups                  |
-| `hunyuan_image3_moe.yaml`           | text2img  | 2      | 8      | T2I with MoE AR→DiT KV reuse          |
-| `hunyuan_image3_moe_dit_2gpu_fp8.yaml` | text2img | 2   | 2      | T2I with FP8 quantization             |
-
-------
-
-## Using MoE Config
-
-The `hunyuan_image3_moe.yaml` config enables AR→DiT KV cache reuse with 8 GPUs (4 for AR + 4 for DiT).
+## Key Arguments
 
-```bash
-python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
-                  --modality text2img \
-                  --stage-configs-path hunyuan_image3_moe.yaml \
-                  --prompts "A cute cat"
-```
+| Argument | Description |
+| :--- | :--- |
+| `--deploy-config` | Path to a unified deploy YAML (preferred). Mutually exclusive with `--stage-configs-path`. |
+| `--stage-configs-path` | Legacy stage config path, kept only for compatibility. Prefer `--deploy-config`. |
+| `--modality` | Offline-only convenience flag. One of `text2img`, `img2img`, `img2text`, `text2text`. It selects prompt formatting, internal `mode`, and default deploy config for this script. Online serving uses `--deploy-config` plus the endpoint and, for chat completions, request `modalities` instead. |
+| `--steps` | Number of diffusion inference steps for image generation. |
+| `--guidance-scale` | Classifier-free guidance scale for image generation. |
+| `--height`, `--width` | Output image size for `text2img`. |
+| `--bot-task` | Override the prompt task, for example `t2i_think` or `t2i_recaption`. |
+| `--sys-type` | Override the system prompt type, for example `en_unified` or `en_vanilla`. |
+| `--vae-use-tiling` | Enable VAE tiling for memory reduction. |
 
-------
+## Notes
+
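+- `hunyuan_image3_ar.yaml` is an AR-only text/comprehension deploy: 4 cards by default, 8 cards under the NPU and XPU platform overrides. Its stage is configured for text output (`engine_output_type: text`, `final_output_type: text`), and the YAML supplies text sampling defaults.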
+- `hunyuan_image3_dit.yaml` is a single-stage DiT deploy with `stage_id: 0`; it does not require stage 1 or a running AR stage.
+- The old HunyuanImage3 YAMLs under `model_executor/stage_configs/` and `platforms/*/stage_configs/` have been folded into the deploy YAMLs.
+- The deploy YAMLs do not include the HunyuanImage3 AR-to-DiT KV reuse wiring; they describe the topology and platform settings only.
 
 ## Prompt Format
 
 HunyuanImage-3.0-Instruct uses an instruct chat template:
 
-```
-<|startoftext|>{system_prompt}\n\nUser: {<img>?}{user_prompt}\n\nAssistant: {trigger_tag?}
+```text
+<|startoftext|>{system_prompt}
+
+User: {<img>?}{user_prompt}
+
+Assistant: {trigger_tag?}
 ```
 
-- `<img>`: Placeholder for each input image (single token; expanded by the multimodal pipeline)
-- Trigger tags: `<think>` (CoT), `<recaption>` (recaptioning) — placed AFTER `Assistant: `
-- System prompt: Auto-selected based on task
-- `t2i_vanilla` is the only task that uses the bare pretrain template (no chat structure)
+- `<img>`: Placeholder for each input image (single token; expanded by the multimodal pipeline).
+- Trigger tags: `<think>` for CoT and `<recaption>` for recaptioning, placed after `Assistant: `.
+- System prompt: Auto-selected based on task.
+- `t2i_vanilla` is the only task that uses the bare pretrain template without chat structure.
 
 The shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils.build_prompt_tokens()`
-helper handles segment-by-segment tokenization (matches HF `apply_chat_template` byte-for-byte).
-
-------
+helper handles segment-by-segment tokenization and matches HF `apply_chat_template`.
 
 ## FAQ
 
-- **OOM errors**: Decrease `gpu_memory_utilization` in the YAML stage config, use a smaller `max_num_batched_tokens`, or enable VAE tiling with `--vae-use-tiling` (required on A100 GPUs).
+- **OOM errors**: Decrease `gpu_memory_utilization` in the deploy YAML, use a smaller `max_num_batched_tokens`, or enable VAE tiling with `--vae-use-tiling`.
 - **Custom image sizes**: Use `--height` and `--width` flags (multiples of 16 recommended).
 
-| Stage             | VRAM (approx)        |
-| :---------------- | :------------------- |
-| Stage 0 (AR)      | ~15 GiB + KV Cache   |
-| Stage 1 (DiT)     | ~30 GiB              |
-| Total (8-GPU)     | ~45 GiB + KV Cache   |
+| Stage | VRAM (approx) |
+| :--- | :--- |
+| Stage 0 (AR) | ~15 GiB + KV Cache |
+| Stage 1 (DiT) | ~30 GiB |
+| Total (8-GPU) | ~45 GiB + KV Cache |
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index f8f92944f71..1eaa669c53a 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -15,6 +15,7 @@
 
 import argparse
 import os
+from pathlib import Path
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
     build_prompt_tokens,
@@ -42,12 +43,23 @@
 }
 
 
-# Modality → default stage config
-_MODALITY_DEFAULT_CONFIG = {
-    "text2img": "hunyuan_image3_t2i.yaml",
-    "img2img": "hunyuan_image3_it2i.yaml",
-    "img2text": "hunyuan_image3_i2t.yaml",
-    "text2text": "hunyuan_image3_t2t.yaml",
+# Default deploy config paths are resolved from the repo root (relative to this file), so this example works from any cwd.
+_REPO_ROOT = Path(__file__).resolve().parents[3]
+_DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml")
+_DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml")
+
+_MODALITY_DEFAULT_DEPLOY_CONFIG = {
+    "text2img": _DEFAULT_DEPLOY_CONFIG,
+    "img2img": _DEFAULT_DEPLOY_CONFIG,
+    "img2text": _DEFAULT_AR_DEPLOY_CONFIG,
+    "text2text": _DEFAULT_AR_DEPLOY_CONFIG,
+}
+
+_MODALITY_MODE = {
+    "text2img": "text-to-image",
+    "img2img": "image-editing",
+    "img2text": "image-to-text",
+    "text2text": "text-to-text",
 }
 
 
@@ -105,7 +117,8 @@ def parse_args():
     )
 
     # Omni init args
-    parser.add_argument("--stage-configs-path", type=str, default=None, help="Custom stage config YAML path.")
+    parser.add_argument("--deploy-config", type=str, default=None, help="Custom deploy YAML path.")
+    parser.add_argument("--stage-configs-path", type=str, default=None, help="Custom legacy stage config YAML path.")
     parser.add_argument("--log-stats", action="store_true", default=False)
     parser.add_argument("--init-timeout", type=int, default=300, help="Initialization timeout in seconds.")
     parser.add_argument("--enforce-eager", action="store_true", help="Disable torch.compile.")
@@ -123,20 +136,27 @@ def main():
     # Determine task for prompt formatting
     task = args.bot_task or _MODALITY_TASK_MAP[args.modality]
 
-    # Determine stage config
-    stage_configs_path = args.stage_configs_path or _MODALITY_DEFAULT_CONFIG[args.modality]
+    if args.deploy_config is not None and args.stage_configs_path is not None:
+        raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.")
+
+    deploy_config = args.deploy_config
+    stage_configs_path = args.stage_configs_path
+    if deploy_config is None and stage_configs_path is None:
+        deploy_config = _MODALITY_DEFAULT_DEPLOY_CONFIG[args.modality]
 
     # Build Omni
     omni_kwargs = {
         "model": args.model,
         "vae_use_tiling": args.vae_use_tiling,
-        "stage_configs_path": stage_configs_path,
         "log_stats": args.log_stats,
         "init_timeout": args.init_timeout,
         "enforce_eager": args.enforce_eager,
     }
-    if args.modality in ("text2img", "img2img"):
-        omni_kwargs["mode"] = "text-to-image"
+    if deploy_config is not None:
+        omni_kwargs["deploy_config"] = deploy_config
+    else:
+        omni_kwargs["stage_configs_path"] = stage_configs_path
+    omni_kwargs["mode"] = _MODALITY_MODE[args.modality]
 
     omni = Omni(**omni_kwargs)
 
@@ -215,7 +235,10 @@ def main():
     print("HunyuanImage-3.0 Generation Configuration:")
     print(f"  Model: {args.model}")
     print(f"  Modality: {args.modality}")
-    print(f"  Stage config: {stage_configs_path}")
+    if deploy_config is not None:
+        print(f"  Deploy config: {deploy_config}")
+    else:
+        print(f"  Stage config: {stage_configs_path}")
     print(f"  Num stages: {omni.num_stages}")
     if args.modality in ("text2img", "img2img"):
         print(f"  Inference steps: {args.steps}")
diff --git a/tests/e2e/offline_inference/test_hunyuanimage3.py b/tests/e2e/offline_inference/test_hunyuanimage3.py
index 5b34faa988e..2a385f6a4c0 100644
--- a/tests/e2e/offline_inference/test_hunyuanimage3.py
+++ b/tests/e2e/offline_inference/test_hunyuanimage3.py
@@ -17,7 +17,7 @@
 MODEL_NAME = "tencent/HunyuanImage-3.0"
 LOCAL_CLIP_PATH = "openai/clip-vit-base-patch32"
 REPO_ROOT = Path(__file__).resolve().parents[3]
-STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image3_t2i.yaml"
+DEPLOY_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml"
 
 pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion]
 
@@ -274,7 +274,8 @@ def clip_bundle() -> tuple[CLIPModel, CLIPProcessor]:
 def omni() -> Generator[Omni, None, None]:
     with OmniRunner(
         MODEL_NAME,
-        stage_configs_path=str(STAGE_CONFIG_PATH),
+        deploy_config=str(DEPLOY_CONFIG_PATH),
+        mode="text-to-image",
     ) as runner:
         yield runner.omni
 
diff --git a/tests/entrypoints/test_utils.py b/tests/entrypoints/test_utils.py
index 98ecc8ae586..b52b49d68f5 100644
--- a/tests/entrypoints/test_utils.py
+++ b/tests/entrypoints/test_utils.py
@@ -18,6 +18,7 @@
     _filter_dict_like_object,
     coerce_param_message_types,
     filter_dataclass_kwargs,
+    filter_stages,
     load_and_resolve_stage_configs,
     load_stage_configs_from_yaml,
     resolve_model_config_path,
@@ -401,6 +402,44 @@ def test_stage_configs_path_promotes_new_deploy_yaml_without_expanding_replicas(
         assert stage_configs[1].runtime.num_replicas == 3
         assert stage_configs[1].runtime.devices == "1,2,3"
 
+    def test_filter_stages_selects_mode_stages_without_mutating_stage_config(self, tmp_path):
+        config_path = tmp_path / "deploy.yaml"
+        config_path.write_text(
+            """modes:
+  - mode: text-to-text
+    stages: [0]
+  - mode: text-to-image
+    stages: [0, 1]
+""",
+            encoding="utf-8",
+        )
+        stages = [
+            create_config(
+                {
+                    "stage_id": 0,
+                    "runtime": {"requires_multimodal_data": True},
+                    "final_output": False,
+                    "final_output_type": None,
+                }
+            ),
+            create_config(
+                {
+                    "stage_id": 1,
+                    "runtime": {"requires_multimodal_data": True},
+                    "final_output": True,
+                    "final_output_type": "image",
+                }
+            ),
+        ]
+
+        filtered = filter_stages(str(config_path), stages, {"mode": "text-to-text"})
+
+        assert len(filtered) == 1
+        assert filtered[0].stage_id == 0
+        assert filtered[0].runtime.requires_multimodal_data is True
+        assert filtered[0].final_output is False
+        assert filtered[0].final_output_type is None
+
 
 class TestLoadStageConfigsFromYaml:
     """Regression tests for stage-config loading and merging."""
diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py
index 7b620ea6e80..57313fe3be2 100644
--- a/tests/test_config_factory.py
+++ b/tests/test_config_factory.py
@@ -941,6 +941,34 @@ def test_merge_pipeline_deploy_preserves_num_replicas(self, tmp_path):
         assert stages[1].yaml_runtime["devices"] == "1,2"
         assert stages[1].yaml_runtime["num_replicas"] == 2
 
+    def test_merge_pipeline_deploy_preserves_requires_multimodal_data(self):
+        from vllm_omni.config.stage_config import (
+            DeployConfig,
+            PipelineConfig,
+            StageDeployConfig,
+            StageExecutionType,
+            StagePipelineConfig,
+            merge_pipeline_deploy,
+        )
+
+        pipeline = PipelineConfig(
+            model_type="test_mm",
+            model_arch="TestModel",
+            stages=(
+                StagePipelineConfig(
+                    stage_id=0,
+                    model_stage="ar",
+                    execution_type=StageExecutionType.LLM_AR,
+                    requires_multimodal_data=True,
+                ),
+            ),
+        )
+        deploy = DeployConfig(async_chunk=False, stages=[StageDeployConfig(stage_id=0)])
+
+        stages = merge_pipeline_deploy(pipeline, deploy)
+
+        assert stages[0].yaml_runtime["requires_multimodal_data"] is True
+
 
 class TestQwen3OmniPipeline:
     def test_registered(self):
diff --git a/vllm_omni/config/pipeline_registry.py b/vllm_omni/config/pipeline_registry.py
index 1a129cad8c0..3d44d1bff93 100644
--- a/vllm_omni/config/pipeline_registry.py
+++ b/vllm_omni/config/pipeline_registry.py
@@ -65,6 +65,18 @@
         "vllm_omni.model_executor.models.glm_image.pipeline",
         "GLM_IMAGE_PIPELINE",
     ),
+    "hunyuan_image3": (
+        "vllm_omni.model_executor.models.hunyuan_image3.pipeline",
+        "HUNYUAN_IMAGE3_PIPELINE",
+    ),
+    "hunyuan_image3_ar": (
+        "vllm_omni.model_executor.models.hunyuan_image3.pipeline",
+        "HUNYUAN_IMAGE3_AR_PIPELINE",
+    ),
+    "hunyuan_image3_dit": (
+        "vllm_omni.model_executor.models.hunyuan_image3.pipeline",
+        "HUNYUAN_IMAGE3_DIT_PIPELINE",
+    ),
     "voxcpm2": (
         "vllm_omni.model_executor.models.voxcpm2.pipeline",
         "VOXCPM2_PIPELINE",
diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index 17c70302312..0bd1f2b7f8f 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -863,6 +863,7 @@ def merge_pipeline_deploy(
             if ds.devices is not None:
                 runtime["devices"] = ds.devices
             runtime["num_replicas"] = ds.num_replicas
+        runtime["requires_multimodal_data"] = ps.requires_multimodal_data
 
         result.append(
             StageConfig(
@@ -1078,6 +1079,15 @@ def create_from_model(
         if model_type and model_type in _PIPELINE_REGISTRY:
             return cls._create_from_registry(model_type, cli_overrides, deploy_config_path)
 
+        if deploy_config_path is not None:
+            deploy_cfg = load_deploy_config(deploy_config_path)
+            if deploy_cfg.pipeline and deploy_cfg.pipeline in _PIPELINE_REGISTRY:
+                return cls._create_from_registry(
+                    deploy_cfg.pipeline,
+                    cli_overrides,
+                    deploy_config_path,
+                )
+
         # --- HF architecture fallback: some models report a generic
         # model_type that collides with another model. Match by the
         # hf_architectures declared on each registered PipelineConfig.
diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
new file mode 100644
index 00000000000..dd176fe3d51
--- /dev/null
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -0,0 +1,100 @@
+# HunyuanImage-3.0-Instruct deploy: AR (stage 0) + DiT (stage 1).
+# The base CUDA layout follows the existing 8-GPU AR->DiT config
+# (4 GPUs for AR, 4 GPUs for DiT). Platform overrides below fold in the
+# verified NPU/XPU stage configs that previously lived under stage_configs/.
+pipeline: hunyuan_image3
+
+stages:
+  - stage_id: 0
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.9
+    enforce_eager: true
+    max_num_batched_tokens: 32768
+    devices: "0,1,2,3"
+    tensor_parallel_size: 4
+    hf_overrides:
+      rope_parameters:
+        mrope_section: [0, 32, 32]
+        rope_type: default
+    default_sampling_params:
+      temperature: 0.6
+      top_p: 0.95
+      top_k: 1024
+      max_tokens: 4096
+      stop_token_ids: [127957]
+      detokenize: false
+
+  - stage_id: 1
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.9
+    enforce_eager: true
+    devices: "4,5,6,7"
+    vae_use_slicing: false
+    vae_use_tiling: false
+    cache_backend:
+    cache_config:
+    enable_cache_dit_summary: false
+    parallel_config:
+      pipeline_parallel_size: 1
+      data_parallel_size: 1
+      tensor_parallel_size: 4
+      enable_expert_parallel: true
+      sequence_parallel_size: 1
+      ulysses_degree: 1
+      ring_degree: 1
+      cfg_parallel_size: 1
+      vae_patch_parallel_size: 1
+      use_hsdp: false
+      hsdp_shard_size: -1
+      hsdp_replicate_size: 1
+    default_sampling_params:
+      seed: 42
+
+edges:
+  - from: 0
+    to: 1
+    window_size: -1
+    max_inflight: 1
+
+platforms:
+  npu:
+    stages:
+      - stage_id: 0
+        gpu_memory_utilization: 0.65
+        devices: "0,1,2,3"
+        tensor_parallel_size: 4
+      - stage_id: 1
+        gpu_memory_utilization: 0.65
+        devices: "4,5,6,7"
+        max_num_batched_tokens: 32768
+        parallel_config:
+          tensor_parallel_size: 4
+          enable_expert_parallel: false
+
+  xpu:
+    stages:
+      - stage_id: 0
+        gpu_memory_utilization: 0.95
+        devices: "0,1,2,3,4,5,6,7"
+        tensor_parallel_size: 8
+        max_num_batched_tokens: 32784
+        quantization: fp8
+        enable_expert_parallel: true
+        worker_cls: vllm_omni.platforms.xpu.worker.xpu_ar_worker.XPUARWorker
+      - stage_id: 1
+        gpu_memory_utilization: 0.9
+        devices: "0,1,2,3,4,5,6,7"
+        quantization: fp8
+        parallel_config:
+          pipeline_parallel_size: 1
+          data_parallel_size: 1
+          tensor_parallel_size: 8
+          enable_expert_parallel: true
+          sequence_parallel_size: 1
+          ulysses_degree: 1
+          ring_degree: 1
+          cfg_parallel_size: 1
+          vae_patch_parallel_size: 1
+          use_hsdp: false
+          hsdp_shard_size: -1
+          hsdp_replicate_size: 1
diff --git a/vllm_omni/deploy/hunyuan_image3_ar.yaml b/vllm_omni/deploy/hunyuan_image3_ar.yaml
new file mode 100644
index 00000000000..44cd96b72ce
--- /dev/null
+++ b/vllm_omni/deploy/hunyuan_image3_ar.yaml
@@ -0,0 +1,47 @@
+# HunyuanImage-3.0-Instruct AR-only deploy.
+#
+# Use this when AR and DiT are deployed as independent services. This file
+# resolves to stage 0 only, avoiding the default two-stage topology.
+pipeline: hunyuan_image3_ar
+async_chunk: false
+
+stages:
+  - stage_id: 0
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.75
+    trust_remote_code: true
+    enforce_eager: true
+    enable_prefix_caching: false
+    max_num_batched_tokens: 32768
+    devices: "0,1,2,3"
+    tensor_parallel_size: 4
+    hf_overrides:
+      rope_parameters:
+        mrope_section: [0, 32, 32]
+        rope_type: default
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 0.95
+      top_k: 1024
+      max_tokens: 1024
+      stop_token_ids: [127957, 128026]
+      detokenize: true
+
+platforms:
+  npu:
+    stages:
+      - stage_id: 0
+        gpu_memory_utilization: 0.75
+        devices: "0,1,2,3,4,5,6,7"
+        tensor_parallel_size: 8
+
+  xpu:
+    stages:
+      - stage_id: 0
+        gpu_memory_utilization: 0.95
+        devices: "0,1,2,3,4,5,6,7"
+        tensor_parallel_size: 8
+        max_num_batched_tokens: 32784
+        quantization: fp8
+        enable_expert_parallel: true
+        worker_cls: vllm_omni.platforms.xpu.worker.xpu_ar_worker.XPUARWorker
diff --git a/vllm_omni/deploy/hunyuan_image3_dit.yaml b/vllm_omni/deploy/hunyuan_image3_dit.yaml
new file mode 100644
index 00000000000..3c0ba190101
--- /dev/null
+++ b/vllm_omni/deploy/hunyuan_image3_dit.yaml
@@ -0,0 +1,64 @@
+# HunyuanImage-3.0-Instruct DiT-only deploy.
+#
+# Use this for standalone DiT/offline execution. This file resolves to one
+# diffusion stage with stage_id 0 and does not depend on the AR stage.
+pipeline: hunyuan_image3_dit
+async_chunk: false
+
+stages:
+  - stage_id: 0
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.9
+    enforce_eager: true
+    devices: "0,1,2,3"
+    vae_use_slicing: false
+    vae_use_tiling: false
+    cache_backend:
+    cache_config:
+    enable_cache_dit_summary: false
+    parallel_config:
+      pipeline_parallel_size: 1
+      data_parallel_size: 1
+      tensor_parallel_size: 4
+      enable_expert_parallel: true
+      sequence_parallel_size: 1
+      ulysses_degree: 1
+      ring_degree: 1
+      cfg_parallel_size: 1
+      vae_patch_parallel_size: 1
+      use_hsdp: false
+      hsdp_shard_size: -1
+      hsdp_replicate_size: 1
+    default_sampling_params:
+      seed: 42
+
+platforms:
+  npu:
+    stages:
+      - stage_id: 0
+        gpu_memory_utilization: 0.65
+        devices: "0,1,2,3"
+        max_num_batched_tokens: 32768
+        parallel_config:
+          tensor_parallel_size: 4
+          enable_expert_parallel: true
+
+  xpu:
+    stages:
+      - stage_id: 0
+        gpu_memory_utilization: 0.9
+        devices: "0,1,2,3,4,5,6,7"
+        quantization: fp8
+        parallel_config:
+          pipeline_parallel_size: 1
+          data_parallel_size: 1
+          tensor_parallel_size: 8
+          enable_expert_parallel: true
+          sequence_parallel_size: 1
+          ulysses_degree: 1
+          ring_degree: 1
+          cfg_parallel_size: 1
+          vae_patch_parallel_size: 1
+          use_hsdp: false
+          hsdp_shard_size: -1
+          hsdp_replicate_size: 1
diff --git a/vllm_omni/entrypoints/cli/serve.py b/vllm_omni/entrypoints/cli/serve.py
index 540d5c0cfdf..b4293d59fd7 100644
--- a/vllm_omni/entrypoints/cli/serve.py
+++ b/vllm_omni/entrypoints/cli/serve.py
@@ -593,6 +593,7 @@ def run_headless(args: argparse.Namespace) -> None:
         model,
         args_dict.get("stage_configs_path"),
         args_dict,
+        deploy_config_path=args_dict.get("deploy_config"),
     )
 
     # Locate the stage config that matches stage_id.
diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py
index d728e76417c..460c6985b0c 100644
--- a/vllm_omni/entrypoints/utils.py
+++ b/vllm_omni/entrypoints/utils.py
@@ -340,6 +340,7 @@ def resolve_model_config_path(model: str) -> str:
         normalized_model_type = _DIFFUSERS_CLASS_TO_CONFIG[model_type]
     else:
         normalized_model_type = model_type.replace("-", "_")
+
     model_type_str = f"{normalized_model_type}.yaml"
     complete_config_path = PROJECT_ROOT / default_config_path / model_type_str
     if os.path.exists(complete_config_path):
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/pipeline.py b/vllm_omni/model_executor/models/hunyuan_image3/pipeline.py
new file mode 100644
index 00000000000..3ff53af6292
--- /dev/null
+++ b/vllm_omni/model_executor/models/hunyuan_image3/pipeline.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""HunyuanImage3 pipeline topology."""
+
+from vllm_omni.config.stage_config import (
+    PipelineConfig,
+    StageExecutionType,
+    StagePipelineConfig,
+)
+
+_HUNYUAN_IMAGE3_HF_ARCHS = (
+    "HunyuanImage3ForConditionalGeneration",
+    "HunyuanImage3ForCausalMM",
+)
+_HUNYUAN_IMAGE3_MODEL_ARCH = "HunyuanImage3ForCausalMM"
+_HUNYUAN_IMAGE3_INPUT_PROCESSOR = "vllm_omni.model_executor.stage_input_processors.hunyuan_image3"
+
+
+HUNYUAN_IMAGE3_PIPELINE = PipelineConfig(
+    model_type="hunyuan_image3",
+    model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH,
+    hf_architectures=_HUNYUAN_IMAGE3_HF_ARCHS,
+    stages=(
+        StagePipelineConfig(
+            stage_id=0,
+            model_stage="AR",
+            execution_type=StageExecutionType.LLM_AR,
+            input_sources=(),
+            final_output=False,
+            owns_tokenizer=False,
+            requires_multimodal_data=True,
+            model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH,
+            engine_output_type="latent",
+        ),
+        StagePipelineConfig(
+            stage_id=1,
+            model_stage="dit",
+            execution_type=StageExecutionType.DIFFUSION,
+            input_sources=(0,),
+            final_output=True,
+            final_output_type="image",
+            requires_multimodal_data=True,
+            model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH,
+            custom_process_input_func=f"{_HUNYUAN_IMAGE3_INPUT_PROCESSOR}.ar2diffusion",
+        ),
+    ),
+)
+
+
+HUNYUAN_IMAGE3_AR_PIPELINE = PipelineConfig(
+    model_type="hunyuan_image3_ar",
+    model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH,
+    hf_architectures=(),
+    stages=(
+        StagePipelineConfig(
+            stage_id=0,
+            model_stage="AR",
+            execution_type=StageExecutionType.LLM_AR,
+            input_sources=(),
+            final_output=True,
+            final_output_type="text",
+            owns_tokenizer=False,
+            requires_multimodal_data=True,
+            model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH,
+            engine_output_type="latent",
+        ),
+    ),
+)
+
+
+HUNYUAN_IMAGE3_DIT_PIPELINE = PipelineConfig(
+    model_type="hunyuan_image3_dit",
+    model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH,
+    hf_architectures=(),
+    stages=(
+        StagePipelineConfig(
+            stage_id=0,
+            model_stage="dit",
+            execution_type=StageExecutionType.DIFFUSION,
+            input_sources=(),
+            final_output=True,
+            final_output_type="image",
+            requires_multimodal_data=True,
+            model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH,
+        ),
+    ),
+)
diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml
deleted file mode 100644
index 0614a9f1179..00000000000
--- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-# Stage config for HunyuanImage-3.0 Image-to-Text (I2T / image understanding).
-# Single LLM stage: AR model reads image + text prompt, generates text output.
-
-stage_args:
-  - stage_id: 0
-    stage_type: llm
-    runtime:
-      process: true
-      devices: "0,1,2,3"
-      max_batch_size: 1
-      requires_multimodal_data: true
-    engine_args:
-      model_stage: AR
-      max_num_seqs: 1
-      model_arch: HunyuanImage3ForCausalMM
-      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
-      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.95
-      enforce_eager: true
-      trust_remote_code: true
-      enable_prefix_caching: false
-      max_num_batched_tokens: 32768
-      tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      hf_overrides:
-        rope_parameters:
-          mrope_section: [0, 32, 32]
-          rope_type: default
-    is_comprehension: true
-    final_output: true
-    final_output_type: text
-    default_sampling_params:
-      temperature: 0.0
-      top_p: 0.95
-      top_k: 1024
-      max_tokens: 2048
-      stop_token_ids: [127957, 128024, 128026]  # <|endoftext|>, </think>, </answer>
-      detokenize: True
-
-runtime:
-  enabled: true
diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i.yaml
deleted file mode 100644
index 31511697371..00000000000
--- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-# Stage config for HunyuanImage-3.0 Image+Text-to-Image (image editing).
-# Stage 0: AR (HunyuanImage3ForConditionalGeneration) — reads (image, text), emits latent tokens
-# Stage 1: Diffusion (HunyuanImage3Pipeline / DiT + VAE) — denoise + decode latents → image
-
-stage_args:
-  # Stage 0: AR Model
-  - stage_id: 0
-    stage_type: llm
-    runtime:
-      process: true
-      devices: "0,1,2,3"
-      max_batch_size: 1
-      requires_multimodal_data: true  # AR needs the original image
-    engine_args:
-      model_stage: AR
-      model_arch: HunyuanImage3ForCausalMM
-      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
-      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.95
-      enforce_eager: true
-      trust_remote_code: true
-      engine_output_type: latent  # AR outputs latent for DiT
-      enable_prefix_caching: false
-      max_num_batched_tokens: 32768
-      tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      hf_overrides:
-        rope_parameters:
-          mrope_section: [0, 32, 32]
-          rope_type: default
-    is_comprehension: false  # Generation task, not comprehension
-    final_output: false  # AR is not the final output
-    default_sampling_params:
-      temperature: 0.6
-      top_p: 0.95
-      top_k: 1024
-      max_tokens: 4096
-      stop_token_ids: [127957]  # <|endoftext|>
-      detokenize: true  # DiT bridge consumes ar_generated_text; let the AR engine produce it
-
-  # Stage 1: Diffusion (DiT + VAE)
-  # Receives latents from AR stage, performs denoising + VAE decode
-  - stage_id: 1
-    stage_type: diffusion
-    runtime:
-      process: true
-      devices: "4,5,6,7"
-      max_batch_size: 1
-      requires_multimodal_data: true  # May need condition images
-    engine_args:
-      model_stage: dit
-      model_arch: HunyuanImage3ForCausalMM
-      enforce_eager: true
-      trust_remote_code: true
-      distributed_executor_backend: "mp"
-      parallel_config:
-        tensor_parallel_size: 4
-        enable_expert_parallel: true
-    engine_input_source: [0]  # Input from AR stage
-    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.hunyuan_image3.ar2diffusion
-    final_output: true
-    final_output_type: image
-    default_sampling_params:
-      num_inference_steps: 50
-      guidance_scale: 2.5
-
-# Top-level runtime config
-runtime:
-  enabled: true
-  edges:
-    - from: 0  # AR → Diffusion
-      to: 1
diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml
deleted file mode 100644
index f0797c63270..00000000000
--- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml
+++ /dev/null
@@ -1,96 +0,0 @@
-# Stage config for running Hunyuan-Image3.0 with AR→DiT KV reuse.
-# Stage 0: AR Model (vLLM implementation)
-# Stage 1: DiT Model (diffusion)
-#
-# text-to-image flow: AR (stage 0) → KV transfer → DiT (stage 1)
-# image-to-text flow: AR (stage 0) only
-#
-# Compared to hunyuan_image3_t2i.yaml, this config:
-#   1. Enables both stages [0, 1] for text-to-image (AR prefill + DiT denoising)
-#   2. Adds omni_kv_config to send/receive KV cache between stages
-
-# The following config has been verified on 8x L40S-48G GPU (4 for AR + 4 for DiT).
-stage_args:
-  - stage_id: 0
-    stage_type: llm  # Use llm stage type for AR stages
-    runtime:
-      process: true  # Run this stage in a separate process
-      devices: "0,1,2,3"  # AR stage uses GPU 0-3
-    engine_args:
-      model_stage: AR
-      max_num_seqs: 1
-      model_arch: HunyuanImage3ForCausalMM
-      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
-      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.9
-      enforce_eager: true  # Now we only support eager mode
-      trust_remote_code: true
-      engine_output_type: latent
-      enable_prefix_caching: false
-      max_num_batched_tokens: 32768
-      tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      hf_overrides:
-        rope_parameters:
-          mrope_section: [0, 32, 32]
-          rope_type: default
-      omni_kv_config:
-        need_send_cache: true
-        kv_transfer_criteria:
-          type: prefill_finished  # Send KV cache after AR prefill completes
-    is_comprehension: true
-    final_output: true
-    final_output_type: text
-    default_sampling_params:
-      temperature: 0.0
-      top_p: 1.0
-      top_k: -1
-      max_tokens: 2048
-      seed: 42
-      detokenize: True
-      repetition_penalty: 1.1
-  - stage_id: 1
-    stage_type: diffusion
-    runtime:
-      process: true
-      devices: "4,5,6,7"  # DiT stage uses GPU 4-7
-      max_batch_size: 1
-    engine_args:
-      model_stage: diffusion
-      enforce_eager: true
-      distributed_executor_backend: "mp"
-      vae_use_slicing: false
-      vae_use_tiling: false
-      cache_backend: null
-      cache_config: null
-      enable_cache_dit_summary: false
-      omni_kv_config:
-        need_recv_cache: true  # Receive AR KV cache from stage 0
-      parallel_config:
-        pipeline_parallel_size: 1
-        data_parallel_size: 1
-        tensor_parallel_size: 4
-        enable_expert_parallel: false
-        sequence_parallel_size: 1
-        ulysses_degree: 1
-        ring_degree: 1
-        cfg_parallel_size: 1
-        vae_patch_parallel_size: 1
-        use_hsdp: false
-        hsdp_shard_size: -1
-        hsdp_replicate_size: 1
-    engine_input_source: [0]  # Receive input (including KV) from stage 0
-    final_output: true
-    final_output_type: image
-
-# Top-level runtime config: windows, edges, and connectors
-runtime:
-  enabled: true
-  defaults:
-    window_size: -1  # Trigger downstream only after full upstream completion
-    max_inflight: 1  # Process serially within each stage
-
-  edges:
-    - from: 0
-      to: 1
-      window_size: -1
diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit_2gpu_fp8.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit_2gpu_fp8.yaml
deleted file mode 100644
index 586b601bc5a..00000000000
--- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit_2gpu_fp8.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-# Stage config for running Hunyuan-Image3.0 DiT with FP8 online quantization.
-# The following config is for 2x H200 GPU.
-
-# Stage 0:  Diffusion (DiT + VAE)
-# This stage receives noise and timesteps and performs denoising + VAE decode
-stage_args:
-  - stage_id: 0
-    stage_type: diffusion
-    runtime:
-      devices: "0,1"
-      max_batch_size: 1
-    engine_args:
-      model_stage: dit
-      enforce_eager: true
-      trust_remote_code: true
-      distributed_executor_backend: "mp"
-      quantization: "fp8"
-      parallel_config:
-        tensor_parallel_size: 2
-        enable_expert_parallel: true
-      omni_kv_config:
-        need_recv_cache: true
-
-    final_output: true
-    final_output_type: image
-    is_comprehension: false
-    default_sampling_params:
-      seed: 42
-
-# Runtime edges
-runtime:
-  enabled: true
diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i.yaml
deleted file mode 100644
index 1d8c7f4812d..00000000000
--- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-# Stage config for running Hunyuan-Image3.0 DiT.
-# The following config has been verified on 4x H20 GPU.
-
-# Stage 0:  Diffusion (DiT + VAE)
-# This stage receives noise and timesteps and performs denoising + VAE decode
-stage_args:
-  - stage_id: 0
-    stage_type: diffusion
-    runtime:
-      devices: "0,1,2,3"
-    engine_args:
-      max_num_seqs: 1
-      model_stage: dit
-      enforce_eager: true
-      trust_remote_code: true
-      distributed_executor_backend: "mp"
-      parallel_config:
-        tensor_parallel_size: 4
-        enable_expert_parallel: true
-      omni_kv_config:
-        need_recv_cache: true
-
-    final_output: true
-    final_output_type: image
-    is_comprehension: false
-    default_sampling_params:
-      seed: 42
-
-# Runtime edges
-runtime:
-  enabled: true
diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i_2gpu.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i_2gpu.yaml
deleted file mode 100644
index 41ed74ba62a..00000000000
--- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i_2gpu.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-# Stage config for running Hunyuan-Image3.0 on 2 GPUs with FP8.
-# Stage 0: AR Model (vLLM implementation)
-
-stage_args:
-  - stage_id: 0
-    stage_type: llm
-    runtime:
-      process: true
-      devices: "0,1"
-    engine_args:
-      model_stage: AR
-      max_num_seqs: 1
-      model_arch: HunyuanImage3ForCausalMM
-      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
-      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.9
-      enforce_eager: true
-      trust_remote_code: true
-      engine_output_type: latent
-      enable_prefix_caching: false
-      max_num_batched_tokens: 32768
-      tensor_parallel_size: 2
-      pipeline_parallel_size: 1
-      hf_overrides:
-        rope_parameters:
-          mrope_section: [0, 32, 32]
-          rope_type: default
-    is_comprehension: true
-    final_output: true
-    final_output_type: text
-    default_sampling_params:
-      temperature: 0.0
-      top_p: 1.0
-      top_k: -1
-      max_tokens: 2048
-      seed: 42
-      detokenize: True
-      repetition_penalty: 1.1
-
-runtime:
-  enabled: true
diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml
deleted file mode 100644
index c9daa5e5f39..00000000000
--- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml
+++ /dev/null
@@ -1,42 +0,0 @@
-# Stage config for HunyuanImage-3.0 Text-to-Text (T2T / pure text generation).
-# Single LLM stage: AR model reads text prompt only, generates text output.
-# Sampling params aligned with official generation_config.json.
-
-stage_args:
-  - stage_id: 0
-    stage_type: llm
-    runtime:
-      process: true
-      devices: "0,1,2,3"
-      max_batch_size: 1
-      requires_multimodal_data: false
-    engine_args:
-      model_stage: AR
-      max_num_seqs: 1
-      model_arch: HunyuanImage3ForCausalMM
-      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
-      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.95
-      enforce_eager: true
-      trust_remote_code: true
-      enable_prefix_caching: false
-      max_num_batched_tokens: 32768
-      tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      hf_overrides:
-        rope_parameters:
-          mrope_section: [0, 32, 32]
-          rope_type: default
-    is_comprehension: true
-    final_output: true
-    final_output_type: text
-    default_sampling_params:
-      temperature: 0.0
-      top_p: 0.95
-      top_k: 1024
-      max_tokens: 2048
-      stop_token_ids: [127957, 128024, 128026]  # <|endoftext|>, </think>, </answer>
-      detokenize: True
-
-runtime:
-  enabled: true
diff --git a/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_t2i.yaml b/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_t2i.yaml
deleted file mode 100644
index 0fd03949d11..00000000000
--- a/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_t2i.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-# Stage config for running Hunyuan-Image3.0 DiT on NPU.
-# The following config has been verified on 8x A3-64G NPUs.
-
-# Stage 0: Diffusion (DiT + VAE)
-# This stage receives noise and timesteps and performs denoising + VAE decode.
-stage_args:
-  - stage_id: 0
-    stage_type: diffusion
-    runtime:
-      devices: "0,1,2,3,4,5,6,7"
-    engine_args:
-      max_num_seqs: 1
-      model_stage: dit
-      gpu_memory_utilization: 0.65
-      enforce_eager: true
-      trust_remote_code: true
-      engine_output_type: image
-      distributed_executor_backend: "mp"
-      enable_prefix_caching: false
-      max_num_batched_tokens: 32768
-      parallel_config:
-        tensor_parallel_size: 8
-        enable_expert_parallel: true
-      omni_kv_config:
-        need_recv_cache: true
-
-    final_output: true
-    final_output_type: image
-    is_comprehension: false
-    default_sampling_params:
-      seed: 42
-
-# Runtime defaults
-runtime:
-  enabled: true
diff --git a/vllm_omni/platforms/xpu/stage_configs/hunyuan_image3_t2i.yaml b/vllm_omni/platforms/xpu/stage_configs/hunyuan_image3_t2i.yaml
deleted file mode 100644
index 4e0005f82a1..00000000000
--- a/vllm_omni/platforms/xpu/stage_configs/hunyuan_image3_t2i.yaml
+++ /dev/null
@@ -1,80 +0,0 @@
-# Stage config for running Hunyuan-Image3.0 with architecture of OmniLLM.
-# Stage 0: AR Model (vLLM implementation)
-
-# The following config has been verified on 8x Max 1550 GPU.
-modes:
-  - mode: text-to-image
-    stages: [1]
-  - mode: image-to-text
-    stages: [0]
-stage_args:
-  - stage_id: 0
-    stage_type: llm  # Use llm stage type to launch OmniLLM
-    runtime:
-      process: true  # Run this stage in a separate process
-      devices: "0,1,2,3,4,5,6,7"  # Visible devices for this stage
-      max_batch_size: 1
-    engine_args:
-      model_stage: AR
-      model_arch: HunyuanImage3ForCausalMM
-      worker_cls: vllm_omni.platforms.xpu.worker.xpu_ar_worker.XPUARWorker
-      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.95
-      enforce_eager: true  # Now we only support eager mode
-      trust_remote_code: true
-      engine_output_type: latent
-      enable_prefix_caching: false
-      max_num_batched_tokens: 32784
-      tensor_parallel_size: 8
-      pipeline_parallel_size: 1
-      enable_expert_parallel: true
-      quantization: "fp8"
-    is_comprehension: true
-    final_output: true
-    final_output_type: text
-    default_sampling_params:
-      temperature: 0.0
-      top_p: 1.0
-      top_k: -1
-      max_tokens: 2048
-      seed: 42
-      detokenize: True
-      repetition_penalty: 1.1
-  - stage_id: 1
-    stage_type: diffusion
-    runtime:
-      process: true
-      devices: "0,1,2,3,4,5,6,7"
-      max_batch_size: 1
-    engine_args:
-      model_stage: diffusion
-      gpu_memory_utilization: 0.9
-      enforce_eager: true
-      engine_output_type: image
-      distributed_executor_backend: "mp"
-      enable_prefix_caching: false
-      vae_use_slicing: false
-      vae_use_tiling: false
-      cache_backend: null
-      cache_config: null
-      enable_cache_dit_summary: false
-      quantization: "fp8"
-      parallel_config:
-        pipeline_parallel_size: 1
-        data_parallel_size: 1
-        tensor_parallel_size: 8
-        enable_expert_parallel: true
-        sequence_parallel_size: 1
-        ulysses_degree: 1
-        ring_degree: 1
-        cfg_parallel_size: 1
-        vae_patch_parallel_size: 1
-        use_hsdp: false
-        hsdp_shard_size: -1
-        hsdp_replicate_size: 1
-    final_output: true
-    final_output_type: image
-
-# Top-level runtime config (concise): default windows and stage edges
-runtime:
-  enabled: true

From 6fddd0e7925b28eb6593ee43c6fd49abcb0ffbc8 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Thu, 7 May 2026 10:52:03 +0800
Subject: [PATCH 02/40] Add request-level HunyuanImage3 bot task controls

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/end2end.py                 |   9 +-
 .../hunyuan_image3/test_prompt_utils.py       |  75 ++++++++-
 vllm_omni/deploy/hunyuan_image3.yaml          |   1 -
 vllm_omni/deploy/hunyuan_image3_ar.yaml       |   1 -
 .../models/hunyuan_image3/prompt_utils.py     | 117 ++++++++++++-
 vllm_omni/entrypoints/openai/api_server.py    |   5 +
 .../entrypoints/openai/protocol/images.py     |   4 +
 vllm_omni/entrypoints/openai/serving_chat.py  | 155 ++++++++++++++++--
 .../models/hunyuan_image3/hunyuan_image3.py   |  11 +-
 9 files changed, 353 insertions(+), 25 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 1eaa669c53a..b6ffa535463 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -18,7 +18,9 @@
 from pathlib import Path
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    bot_task_for_task,
     build_prompt_tokens,
+    stop_token_ids_for_bot_task,
 )
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniPromptType
@@ -135,6 +137,7 @@ def main():
 
     # Determine task for prompt formatting
     task = args.bot_task or _MODALITY_TASK_MAP[args.modality]
+    bot_task = bot_task_for_task(task)
 
     if args.deploy_config is not None and args.stage_configs_path is not None:
         raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.")
@@ -219,7 +222,8 @@ def main():
     # Override diffusion params if applicable
     from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
-    for i, sp in enumerate(params_list):
+    ar_stop_token_ids = stop_token_ids_for_bot_task(tokenizer, bot_task)
+    for sp in params_list:
         if isinstance(sp, OmniDiffusionSamplingParams):
             sp.num_inference_steps = args.steps
             sp.guidance_scale = args.guidance_scale
@@ -229,12 +233,15 @@ def main():
             if args.modality in ("text2img",):
                 sp.height = args.height
                 sp.width = args.width
+        elif hasattr(sp, "stop_token_ids"):
+            sp.stop_token_ids = ar_stop_token_ids
 
     # Print configuration
     print(f"\n{'=' * 60}")
     print("HunyuanImage-3.0 Generation Configuration:")
     print(f"  Model: {args.model}")
     print(f"  Modality: {args.modality}")
+    print(f"  Bot task: {bot_task}")
     if deploy_config is not None:
         print(f"  Deploy config: {deploy_config}")
     else:
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 501664fe688..62beb45a1f6 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -24,9 +24,12 @@
 import pytest
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    apply_bot_task_to_sampling_params,
     available_tasks,
+    bot_task_for_task,
     build_prompt,
     build_prompt_tokens,
+    stop_token_ids_for_bot_task,
 )
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
@@ -49,10 +52,16 @@ class FakeTokenizer:
         "<img>": 2,
         "<think>": 3,
         "<recaption>": 4,
+        "<|endoftext|>": 5,
+        "</recaption>": 6,
+        "</answer>": 7,
+        "<boi>": 8,
+        **{f"<img_ratio_{i}>": 1000 + i for i in range(33)},
     }
 
     def __init__(self) -> None:
         self.encode_calls: list[str] = []
+        self.eos_token_id = self.SPECIAL["<|endoftext|>"]
 
     def convert_tokens_to_ids(self, tok: str) -> int:
         return self.SPECIAL.get(tok, 0)
@@ -75,6 +84,60 @@ def test_available_tasks_covers_all_modalities():
     }
 
 
+@pytest.mark.parametrize(
+    "task,expected_bot_task",
+    [
+        ("t2t", "auto"),
+        ("i2t", "auto"),
+        ("it2i_think", "think_recaption"),
+        ("it2i_recaption", "recaption"),
+        ("t2i_think", "think_recaption"),
+        ("t2i_recaption", "recaption"),
+        ("t2i_vanilla", "image"),
+    ],
+)
+def test_bot_task_for_task_matches_prompt_presets(task: str, expected_bot_task: str):
+    assert bot_task_for_task(task) == expected_bot_task
+
+
+def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer():
+    tok = FakeTokenizer()
+
+    assert stop_token_ids_for_bot_task(tok, "auto") == [5, 8]
+    assert stop_token_ids_for_bot_task(tok, "image") == [5]
+    assert stop_token_ids_for_bot_task(tok, "think_recaption") == [6, 7, 5]
+    assert stop_token_ids_for_bot_task(tok, "recaption") == [6, 7, 5]
+    assert stop_token_ids_for_bot_task(tok, "auto", image_size="auto") == [
+        5,
+        *range(1000, 1033),
+    ]
+
+
+class FakeSamplingParams:
+
+    def __init__(self, stop_token_ids: list[int] | None = None, max_tokens: int = 16) -> None:
+        self.stop_token_ids = stop_token_ids
+        self.max_tokens = max_tokens
+
+
+def test_apply_bot_task_to_sampling_params_updates_only_target_stage():
+    tok = FakeTokenizer()
+    stage0 = FakeSamplingParams(stop_token_ids=[999])
+    stage1 = FakeSamplingParams(stop_token_ids=[888])
+
+    updated = apply_bot_task_to_sampling_params(
+        [stage0, stage1],
+        tok,
+        "think_recaption",
+        stage_index=0,
+    )
+
+    assert updated[0] is stage0
+    assert updated[0].stop_token_ids == [6, 7, 5]
+    assert updated[1] is stage1
+    assert stage0.stop_token_ids == [6, 7, 5]
+
+
 @pytest.mark.parametrize(
     "task",
     [
@@ -234,10 +297,16 @@ def test_end2end_routes_through_shared_prompt_utils():
     for node in ast.walk(tree):
         if isinstance(node, ast.ImportFrom) and node.module and node.module.endswith("hunyuan_image3.prompt_utils"):
             imported_from_prompt_utils.update(alias.name for alias in node.names)
-    assert "build_prompt_tokens" in imported_from_prompt_utils, (
-        "end2end.py must import build_prompt_tokens from "
+    expected_imports = {
+        "bot_task_for_task",
+        "build_prompt_tokens",
+        "stop_token_ids_for_bot_task",
+    }
+    assert expected_imports <= imported_from_prompt_utils, (
+        "end2end.py must import the HunyuanImage3 prompt and stop-token helpers from "
         "vllm_omni.diffusion.models.hunyuan_image3.prompt_utils -- the shared "
-        "helper is the single source of truth for the AR-prefill template."
+        "module is the single source of truth for the AR-prefill template and "
+        "bot_task-derived AR stop token ids."
     )
 
 
diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index dd176fe3d51..b5238169786 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -21,7 +21,6 @@ stages:
       top_p: 0.95
       top_k: 1024
       max_tokens: 4096
-      stop_token_ids: [127957]
       detokenize: false
 
   - stage_id: 1
diff --git a/vllm_omni/deploy/hunyuan_image3_ar.yaml b/vllm_omni/deploy/hunyuan_image3_ar.yaml
index 44cd96b72ce..27cbf0f9a60 100644
--- a/vllm_omni/deploy/hunyuan_image3_ar.yaml
+++ b/vllm_omni/deploy/hunyuan_image3_ar.yaml
@@ -24,7 +24,6 @@ stages:
       top_p: 0.95
       top_k: 1024
       max_tokens: 1024
-      stop_token_ids: [127957, 128026]
       detokenize: true
 
 platforms:
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 6e8efac3133..a92b4a0848c 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -17,12 +17,24 @@
 
 from __future__ import annotations
 
+from typing import Any
+
 from .system_prompt import get_system_prompt
 
+BOT_TASKS = ("auto", "image", "recaption", "think_recaption")
+_BOT_TASK_TO_TOKENIZER_TASK = {
+    "auto": "auto",
+    "image": "image",
+    "recaption": "recaption",
+    "think_recaption": "think",
+}
+
 # task -> (sys_type, bot_task, trigger_tag)
 _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
     "t2t": ("en_unified", None, None),
+    "t2t_think": ("en_unified", "think", "<think>"),
     "i2t": ("en_unified", None, None),
+    "i2t_think": ("en_unified", "think", "<think>"),
     "it2i_think": ("en_unified", "think", "<think>"),
     "it2i_recaption": ("en_unified", "recaption", "<recaption>"),
     "t2i_think": ("en_unified", "think", "<think>"),
@@ -36,6 +48,100 @@ def available_tasks() -> list[str]:
     return sorted(_TASK_PRESETS)
 
 
+def bot_task_for_task(task: str) -> str:
+    """Return the HunyuanImage3 bot_task associated with a prompt task."""
+    if task not in _TASK_PRESETS:
+        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+
+    _, preset_bot_task, _ = _TASK_PRESETS[task]
+    if preset_bot_task == "think":
+        return "think_recaption"
+    return preset_bot_task or "auto"
+
+
+def tokenizer_bot_task_for_bot_task(bot_task: str) -> str:
+    """Map the public HunyuanImage3 bot_task to tokenizer-internal task."""
+    if bot_task not in _BOT_TASK_TO_TOKENIZER_TASK:
+        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}")
+    return _BOT_TASK_TO_TOKENIZER_TASK[bot_task]
+
+
+def _token_id(tokenizer, token: str) -> int:
+    token_id = tokenizer.convert_tokens_to_ids(token)
+    if token_id is None:
+        raise ValueError(f"Tokenizer does not know special token {token!r}")
+    return int(token_id)
+
+
+def _eos_token_id(tokenizer) -> int:
+    token_id = getattr(tokenizer, "eos_token_id", None)
+    if token_id is not None:
+        return int(token_id)
+    return _token_id(tokenizer, "<|endoftext|>")
+
+
+def stop_token_ids_for_bot_task(
+    tokenizer,
+    bot_task: str,
+    image_size: int | str | None = None,
+) -> list[int]:
+    """Return AR stop token ids for a HunyuanImage3 bot_task.
+
+    Mirrors the official HunyuanImage-3.0 generation logic: `auto`
+    additionally stops on image-start markers, text/image tasks stop on
+    their structural end tokens, and all ids are resolved from the
+    tokenizer instead of being hard-coded in deploy YAML.
+    """
+    eos_id = _eos_token_id(tokenizer)
+
+    if image_size == "auto":
+        extra_auto_stops = [
+            _token_id(tokenizer, f"<img_ratio_{i}>") for i in range(33)
+        ]
+    else:
+        extra_auto_stops = [_token_id(tokenizer, "<boi>")]
+
+    tokenizer_bot_task = tokenizer_bot_task_for_bot_task(bot_task)
+    stop_token_id = {
+        "auto": [eos_id] + extra_auto_stops,
+        "image": [eos_id],
+        "recaption": [
+            _token_id(tokenizer, "</recaption>"),
+            _token_id(tokenizer, "</answer>"),
+            eos_id,
+        ],
+        "think": [
+            _token_id(tokenizer, "</recaption>"),
+            _token_id(tokenizer, "</answer>"),
+            eos_id,
+        ],
+    }
+    return stop_token_id[tokenizer_bot_task]
+
+
+def apply_bot_task_to_sampling_params(
+    sampling_params_list: list[Any],
+    tokenizer: Any,
+    bot_task: str,
+    *,
+    stage_index: int = 0,
+    image_size: int | str | None = None,
+) -> list[Any]:
+    """Apply a per-request HunyuanImage3 bot_task to one AR stage."""
+    if stage_index < 0 or stage_index >= len(sampling_params_list):
+        raise IndexError(
+            f"stage_index {stage_index} is out of range for "
+            f"{len(sampling_params_list)} sampling params"
+        )
+
+    updated_params_list = list(sampling_params_list)
+    params = updated_params_list[stage_index]
+    params.stop_token_ids = stop_token_ids_for_bot_task(tokenizer, bot_task, image_size=image_size)
+
+    updated_params_list[stage_index] = params
+    return updated_params_list
+
+
 def build_prompt(
     user_prompt: str,
     task: str = "it2i_think",
@@ -149,4 +255,13 @@ def build_prompt_tokens(
     return ids
 
 
-__all__ = ["build_prompt", "build_prompt_tokens", "available_tasks"]
+__all__ = [
+    "available_tasks",
+    "apply_bot_task_to_sampling_params",
+    "bot_task_for_task",
+    "BOT_TASKS",
+    "build_prompt",
+    "build_prompt_tokens",
+    "stop_token_ids_for_bot_task",
+    "tokenizer_bot_task_for_bot_task",
+]
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 06fb0a7f4cb..9b3aec58f21 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1527,6 +1527,8 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request)
                 extra_body["guidance_scale"] = request.guidance_scale
             if request.true_cfg_scale is not None:
                 extra_body["true_cfg_scale"] = request.true_cfg_scale
+            if request.bot_task is not None:
+                extra_body["bot_task"] = request.bot_task
             if request.generator_device is not None:
                 extra_body["generator_device"] = request.generator_device
             if request.lora is not None:
@@ -1693,6 +1695,7 @@ async def edit_images(
     guidance_scale: float | None = Form(None),
     strength: float | None = Form(None),
     true_cfg_scale: float | None = Form(None),
+    bot_task: str | None = Form(None),
     seed: int | None = Form(None),
     generator_device: str | None = Form(None),
     # vllm-omni extension for per-request LoRA.
@@ -1896,6 +1899,8 @@ async def edit_images(
                 extra_body["strength"] = strength
             if true_cfg_scale is not None:
                 extra_body["true_cfg_scale"] = true_cfg_scale
+            if bot_task is not None:
+                extra_body["bot_task"] = bot_task
             if layers is not None:
                 extra_body["layers"] = layers
             if resolution is not None:
diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py
index 0fb22a548cf..c78a95de058 100644
--- a/vllm_omni/entrypoints/openai/protocol/images.py
+++ b/vllm_omni/entrypoints/openai/protocol/images.py
@@ -117,6 +117,10 @@ def validate_use_system_prompt(cls, v):
         le=20.0,
         description="True CFG scale (model-specific parameter, may be ignored if not supported)",
     )
+    bot_task: str | None = Field(
+        default=None,
+        description="HunyuanImage3 AR bot_task for this request: auto, image, recaption, or think_recaption.",
+    )
     seed: int | None = Field(default=None, description="Random seed for reproducibility")
     generator_device: str | None = Field(
         default=None,
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 7558e85aaac..b2375fd38b4 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -197,10 +197,25 @@ async def create_chat_completion(
             if tokenizer is None:
                 tokenizer = await self.engine_client.get_tokenizer()
 
+            extra_body = self._get_extra_body_from_request(request)
+            bot_task = (
+                extra_body.get("bot_task")
+                if self._get_hunyuan_image3_ar_stage_index(list(getattr(self.engine_client, "stage_configs", []) or []))
+                is not None
+                else None
+            )
+            request_chat_template_kwargs = request.chat_template_kwargs or {}
+            if bot_task is not None:
+                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task
+
+                tokenizer_bot_task = tokenizer_bot_task_for_bot_task(bot_task)
+                request_chat_template_kwargs = dict(request_chat_template_kwargs)
+                request_chat_template_kwargs["bot_task"] = tokenizer_bot_task
+
             reasoning_parser: ReasoningParser | None = None
             if self.reasoning_parser_cls:
                 chat_template_kwargs = self._prepare_extra_chat_template_kwargs(
-                    request.chat_template_kwargs,
+                    request_chat_template_kwargs,
                     self.default_chat_template_kwargs,
                 )
                 reasoning_parser = self.reasoning_parser_cls(
@@ -248,13 +263,13 @@ async def create_chat_completion(
             if not self.use_harmony:
                 error_check_ret = self._validate_chat_template(
                     request_chat_template=request.chat_template,
-                    chat_template_kwargs=request.chat_template_kwargs,
+                    chat_template_kwargs=request_chat_template_kwargs,
                     trust_request_chat_template=self.trust_request_chat_template,
                 )
                 if error_check_ret is not None:
                     return error_check_ret
 
-                chat_template_kwargs = request.chat_template_kwargs or {}
+                chat_template_kwargs = dict(request_chat_template_kwargs)
                 chat_template_kwargs.update(reasoning_effort=request.reasoning_effort)
 
                 # Merge chat_template_kwargs with defaults
@@ -321,9 +336,7 @@ async def create_chat_completion(
                 #   `extra_body` is flattented and merged into the payload's root.
                 #   These extra fields are accessible via `model_extra` property (from Pydantic base class).
                 #   When sending raw request with curl, no flattening happens. Directly read the `extra_body` dict.
-                extra_body = getattr(request, "extra_body", None)
-                if not extra_body:
-                    extra_body = request.model_extra or {}
+                extra_body = self._get_extra_body_from_request(request)
 
                 height, width = self._resolve_height_width_from_extra_body(extra_body)
 
@@ -367,6 +380,9 @@ async def create_chat_completion(
                     mm_processor_kwargs["target_h"] = height
                 if width is not None:
                     mm_processor_kwargs["target_w"] = width
+                if bot_task is not None:
+                    mm_processor_kwargs["bot_task"] = tokenizer_bot_task
+                    tprompt["bot_task"] = bot_task
                 tprompt["mm_processor_kwargs"] = mm_processor_kwargs
                 if engine_prompt_image is not None:
                     tprompt["multi_modal_data"] = engine_prompt_image
@@ -404,6 +420,12 @@ async def create_chat_completion(
                 # to delta to ensure emitted outputs are correctly drained. Otherwise
                 # convert cumulative to Final Only to ensure the output is correct.
                 sampling_params_list = coerce_param_message_types(sampling_params_list, request.stream)
+                sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params(
+                    engine=self.engine_client,
+                    sampling_params_list=sampling_params_list,
+                    bot_task=bot_task,
+                    tokenizer=tokenizer,
+                )
 
                 # Apply user-specified overrides to diffusion stage(s) for image generation
                 for idx, sp in enumerate(sampling_params_list):
@@ -685,6 +707,89 @@ def _to_sampling_params_list(self, sampling_params_list: list[dict]) -> list[Sam
                 raise ValueError(f"Invalid sampling params: {sampling_params}")
         return final_sampling_params_list
 
+    @staticmethod
+    def _get_extra_body_from_request(request: Any) -> dict[str, Any]:
+        body: dict[str, Any] = {}
+        model_extra = getattr(request, "model_extra", None)
+        if isinstance(model_extra, dict):
+            body.update(model_extra)
+        extra_body = getattr(request, "extra_body", None)
+        if isinstance(extra_body, dict):
+            body.update(extra_body)
+        return body
+
+    @staticmethod
+    def _stage_config_get(stage_config: Any, key: str) -> Any:
+        if isinstance(stage_config, dict):
+            return stage_config.get(key)
+        if hasattr(stage_config, "get"):
+            try:
+                return stage_config.get(key)
+            except Exception:
+                pass
+        return getattr(stage_config, key, None)
+
+    @classmethod
+    def _is_hunyuan_image3_stage(cls, stage_config: Any) -> bool:
+        model_arch = cls._stage_config_get(stage_config, "model_arch")
+        if model_arch == "HunyuanImage3ForCausalMM":
+            return True
+
+        engine_args = cls._stage_config_get(stage_config, "engine_args")
+        if isinstance(engine_args, dict):
+            return engine_args.get("model_arch") == "HunyuanImage3ForCausalMM"
+        if engine_args is not None and hasattr(engine_args, "get"):
+            try:
+                return engine_args.get("model_arch") == "HunyuanImage3ForCausalMM"
+            except Exception:
+                pass
+        return getattr(engine_args, "model_arch", None) == "HunyuanImage3ForCausalMM"
+
+    @classmethod
+    def _get_hunyuan_image3_ar_stage_index(cls, stage_configs: list[Any]) -> int | None:
+        for idx, stage_config in enumerate(stage_configs):
+            if cls._is_hunyuan_image3_stage(stage_config) and get_stage_type(stage_config) != "diffusion":
+                return idx
+        return None
+
+    async def _apply_hunyuan_image3_bot_task_sampling_params(
+        self,
+        *,
+        engine: Any,
+        sampling_params_list: list[Any],
+        bot_task: Any,
+        tokenizer: Any | None = None,
+    ) -> list[Any]:
+        if bot_task is None:
+            return sampling_params_list
+
+        stage_configs = list(getattr(engine, "stage_configs", []) or [])
+        stage_index = self._get_hunyuan_image3_ar_stage_index(stage_configs)
+        if stage_index is None:
+            return sampling_params_list
+
+        from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+            BOT_TASKS,
+            apply_bot_task_to_sampling_params,
+            tokenizer_bot_task_for_bot_task,
+        )
+
+        if bot_task not in BOT_TASKS:
+            raise ValueError(f"Unknown HunyuanImage3 bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}")
+        tokenizer_bot_task_for_bot_task(bot_task)
+
+        if tokenizer is None and hasattr(engine, "get_tokenizer"):
+            tokenizer = await engine.get_tokenizer()
+        if tokenizer is None:
+            raise ValueError("Cannot resolve tokenizer to apply HunyuanImage3 bot_task stop tokens.")
+
+        return apply_bot_task_to_sampling_params(
+            sampling_params_list,
+            tokenizer,
+            bot_task,
+            stage_index=stage_index,
+        )
+
     def _get_comprehension_stage_index(self) -> int:
         for idx, stage in enumerate(self.engine_client.stage_configs):
             if stage.is_comprehension:
@@ -2149,7 +2254,11 @@ def _build_multistage_generation_inputs(
         lora_body = extra_body.get("lora")
         layers = extra_body.get("layers")
         resolution = extra_body.get("resolution")
-        bot_task = extra_body.get("bot_task")
+        bot_task = (
+            extra_body.get("bot_task")
+            if self._get_hunyuan_image3_ar_stage_index(list(stage_configs)) is not None
+            else None
+        )
 
         engine_prompt_data: dict[str, Any] | None = None
         modalities = ["image"]
@@ -2188,6 +2297,11 @@ def _build_multistage_generation_inputs(
             mm_processor_kwargs["target_h"] = height
         if width is not None:
             mm_processor_kwargs["target_w"] = width
+        if bot_task is not None:
+            from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task
+
+            mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_bot_task(bot_task)
+            engine_prompt["bot_task"] = bot_task
         if mm_processor_kwargs:
             engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs
         if engine_prompt_data is not None:
@@ -2284,6 +2398,7 @@ async def generate_diffusion_images(
         negative_prompt = extra_body.get("negative_prompt")
         num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1)
         lora_body = extra_body.get("lora")
+        bot_task = extra_body.get("bot_task")
 
         pil_images: list[Image.Image] = []
         for img_b64 in reference_images:
@@ -2367,6 +2482,12 @@ async def generate_diffusion_images(
                 engine_prompt = gen_prompt
                 sampling_params_list = [gen_params]
 
+            sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params(
+                engine=diffusion_engine,
+                sampling_params_list=sampling_params_list,
+                bot_task=bot_task,
+            )
+
             result = None
             async for output in diffusion_engine.generate(
                 prompt=engine_prompt,
@@ -2435,9 +2556,7 @@ async def _create_diffusion_chat_completion(
             #   `extra_body` is flattented and merged into the payload's root.
             #   These extra fields are accessible via `model_extra` property (from Pydantic base class).
             #   When sending raw request with curl, no flattening happens. Directly read the `extra_body` dict.
-            extra_body = getattr(request, "extra_body", None)
-            if not extra_body:
-                extra_body = request.model_extra or {}
+            extra_body = self._get_extra_body_from_request(request)
 
             # Parse size if provided (supports "1024x1024" format)
             height, width = self._resolve_height_width_from_extra_body(extra_body)
@@ -2456,6 +2575,7 @@ async def _create_diffusion_chat_completion(
                 seed = getattr(request, "seed", None)
             negative_prompt = extra_body.get("negative_prompt")
             num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1)
+            bot_task = extra_body.get("bot_task")
 
             # Text-to-video parameters (ref: text_to_video.py)
             num_frames = extra_body.get("num_frames")
@@ -2569,6 +2689,15 @@ async def _create_diffusion_chat_completion(
             # Generate image or audio (e.g. AudioX) via AsyncOmni
             diffusion_engine = cast(AsyncOmni, self._diffusion_engine)
             stage_configs = list(getattr(diffusion_engine, "stage_configs", []) or [])
+            if self._get_hunyuan_image3_ar_stage_index(stage_configs) is None:
+                bot_task = None
+            elif bot_task is not None:
+                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task
+
+                gen_prompt["bot_task"] = bot_task
+                gen_prompt["mm_processor_kwargs"] = {
+                    "bot_task": tokenizer_bot_task_for_bot_task(bot_task),
+                }
             sampling_params_list = build_stage_sampling_params_list(
                 stage_configs,
                 get_default_sampling_params_list(diffusion_engine),
@@ -2579,6 +2708,12 @@ async def _create_diffusion_chat_completion(
             if not sampling_params_list:
                 sampling_params_list = [gen_params]
 
+            sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params(
+                engine=diffusion_engine,
+                sampling_params_list=sampling_params_list,
+                bot_task=bot_task,
+            )
+
             result = None
             async for output in diffusion_engine.generate(
                 prompt=gen_prompt,
diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
index 88494eda456..1e057a71efa 100644
--- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
+++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py
@@ -1517,14 +1517,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         # For comprehension mode, block image generation tokens but allow
         # text structure tokens (<think>, <answer>, etc.) so the model can
-        # follow its natural generation pattern. The yaml stop_token_ids
-        # for i2t/t2t now includes </think> (128024) so the AR-only output
-        # terminates after the analysis section, matching HF's
-        # `bot_task="think"` behavior. Without that stop, the model
-        # continues into a recaption section even in comprehension mode
-        # (the stage-transition processor only fires in generation mode,
-        # but the instruct-tuned model writes recaption on its own from
-        # internal habit).
+        # follow its natural generation pattern. Runtime sampling params
+        # decide stop tokens from the active bot_task, matching the official
+        # HunyuanImage3 generation path without hard-coded YAML token ids.
         self._blocked_token_ids: set[int] = set()
         if self._is_comprehension:
             self._blocked_token_ids.update(

From f032d5f12a7e7de012651d7d72a26ae950a04c6e Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Thu, 7 May 2026 15:53:37 +0800
Subject: [PATCH 03/40] Apply ruff format for HunyuanImage3 files

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../diffusion/models/hunyuan_image3/test_prompt_utils.py | 1 -
 .../diffusion/models/hunyuan_image3/prompt_utils.py      | 9 ++-------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 62beb45a1f6..50bbf9b704c 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -114,7 +114,6 @@ def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer():
 
 
 class FakeSamplingParams:
-
     def __init__(self, stop_token_ids: list[int] | None = None, max_tokens: int = 16) -> None:
         self.stop_token_ids = stop_token_ids
         self.max_tokens = max_tokens
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index a92b4a0848c..079f14b9fda 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -95,9 +95,7 @@ def stop_token_ids_for_bot_task(
     eos_id = _eos_token_id(tokenizer)
 
     if image_size == "auto":
-        extra_auto_stops = [
-            _token_id(tokenizer, f"<img_ratio_{i}>") for i in range(33)
-        ]
+        extra_auto_stops = [_token_id(tokenizer, f"<img_ratio_{i}>") for i in range(33)]
     else:
         extra_auto_stops = [_token_id(tokenizer, "<boi>")]
 
@@ -129,10 +127,7 @@ def apply_bot_task_to_sampling_params(
 ) -> list[Any]:
     """Apply a per-request HunyuanImage3 bot_task to one AR stage."""
     if stage_index < 0 or stage_index >= len(sampling_params_list):
-        raise IndexError(
-            f"stage_index {stage_index} is out of range for "
-            f"{len(sampling_params_list)} sampling params"
-        )
+        raise IndexError(f"stage_index {stage_index} is out of range for {len(sampling_params_list)} sampling params")
 
     updated_params_list = list(sampling_params_list)
     params = updated_params_list[stage_index]

From 851baf60694bb133966e860cd81d12509d275620 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Thu, 7 May 2026 16:22:00 +0800
Subject: [PATCH 04/40] Refine HunyuanImage3 prompt task composition

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/README.md                  |   5 +-
 .../hunyuan_image3/end2end.py                 |  44 ++----
 .../hunyuan_image3/test_prompt_utils.py       |  70 ++++++++-
 .../models/hunyuan_image3/prompt_utils.py     | 144 ++++++++++++++++--
 4 files changed, 218 insertions(+), 45 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md
index 82cca4db6db..98908ace0d7 100644
--- a/examples/offline_inference/hunyuan_image3/README.md
+++ b/examples/offline_inference/hunyuan_image3/README.md
@@ -110,7 +110,7 @@ python examples/offline_inference/hunyuan_image3/end2end.py \
 | `--steps` | Number of diffusion inference steps for image generation. |
 | `--guidance-scale` | Classifier-free guidance scale for image generation. |
 | `--height`, `--width` | Output image size for `text2img`. |
-| `--bot-task` | Override the prompt task, for example `t2i_think` or `t2i_recaption`. |
+| `--bot-task` | Prompt behavior. `auto` selects the default from `--modality`; `think` adds `<think>`; `recaption` adds `<recaption>`; `vanilla` uses the text-to-image pretrain template; `none` selects the plain task without a trigger tag (for example, `text2text` resolves to `t2t`). |
 | `--sys-type` | Override the system prompt type, for example `en_unified` or `en_vanilla`. |
 | `--vae-use-tiling` | Enable VAE tiling for memory reduction. |
 
@@ -137,6 +137,9 @@ Assistant: {trigger_tag?}
 - Trigger tags: `<think>` for CoT and `<recaption>` for recaptioning, placed after `Assistant: `.
 - System prompt: Auto-selected based on task.
 - `t2i_vanilla` is the only task that uses the bare pretrain template without chat structure.
+- The example composes the internal prompt task from `--modality` and `--bot-task`
+  before calling `prompt_utils`; for example, `--modality img2text --bot-task think`
+  becomes `i2t_think`, which is used for both prompt building and stop-token lookup.
 
 The shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils.build_prompt_tokens()`
 helper handles segment-by-segment tokenization and matches HF `apply_chat_template`.
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index b6ffa535463..8233e2bf820 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -18,33 +18,16 @@
 from pathlib import Path
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    available_prompt_bot_tasks,
     bot_task_for_task,
     build_prompt_tokens,
-    stop_token_ids_for_bot_task,
+    stop_token_ids_for_task,
+    sys_type_for_task,
+    task_for_modality_and_bot_task,
 )
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniPromptType
 
-# task -> (sys_type, bot_task, trigger_tag)
-_TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
-    "t2t": ("en_unified", None, None),
-    "i2t": ("en_unified", None, None),
-    "it2i_think": ("en_unified", "think", "<think>"),
-    "it2i_recaption": ("en_unified", "recaption", "<recaption>"),
-    "t2i_think": ("en_unified", "think", "<think>"),
-    "t2i_recaption": ("en_unified", "recaption", "<recaption>"),
-    "t2i_vanilla": ("en_vanilla", "image", None),
-}
-
-# Modality → prompt_utils task mapping
-_MODALITY_TASK_MAP = {
-    "text2img": "t2i_think",
-    "img2img": "it2i_think",
-    "img2text": "i2t",
-    "text2text": "t2t",
-}
-
-
 # Default deploy configs are absolute so this example works from any cwd.
 _REPO_ROOT = Path(__file__).resolve().parents[3]
 _DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml")
@@ -108,8 +91,13 @@ def parse_args():
     parser.add_argument(
         "--bot-task",
         type=str,
-        default=None,
-        help="Override prompt task (e.g. it2i_think, t2i_recaption). Default: auto from modality.",
+        default="auto",
+        choices=available_prompt_bot_tasks(),
+        help=(
+            "Prompt behavior. 'auto' selects the default for the modality; "
+            "'think' adds <think>; 'recaption' adds <recaption>; "
+            "'vanilla' uses the t2i pretrain template."
+        ),
     )
     parser.add_argument(
         "--sys-type",
@@ -135,8 +123,8 @@ def main():
     args = parse_args()
     os.makedirs(args.output, exist_ok=True)
 
-    # Determine task for prompt formatting
-    task = args.bot_task or _MODALITY_TASK_MAP[args.modality]
+    # Determine task for prompt formatting from modality + bot behavior.
+    task = task_for_modality_and_bot_task(args.modality, args.bot_task)
     bot_task = bot_task_for_task(task)
 
     if args.deploy_config is not None and args.stage_configs_path is not None:
@@ -188,8 +176,7 @@ def main():
     formatted_prompts: list[OmniPromptType] = []
     for p in prompts:
         token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type)
-        preset_sys_type, _, _ = _TASK_PRESETS[task]
-        effective_sys_type = args.sys_type or preset_sys_type
+        effective_sys_type = args.sys_type or sys_type_for_task(task)
 
         # `prompt_token_ids` drives the AR stage (matches HF byte-for-byte).
         # `prompt` and `use_system_prompt` are forwarded by ar2diffusion to
@@ -222,7 +209,7 @@ def main():
     # Override diffusion params if applicable
     from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
-    ar_stop_token_ids = stop_token_ids_for_bot_task(tokenizer, bot_task)
+    ar_stop_token_ids = stop_token_ids_for_task(tokenizer, task)
     for sp in params_list:
         if isinstance(sp, OmniDiffusionSamplingParams):
             sp.num_inference_steps = args.steps
@@ -241,6 +228,7 @@ def main():
     print("HunyuanImage-3.0 Generation Configuration:")
     print(f"  Model: {args.model}")
     print(f"  Modality: {args.modality}")
+    print(f"  Prompt task: {task}")
     print(f"  Bot task: {bot_task}")
     if deploy_config is not None:
         print(f"  Deploy config: {deploy_config}")
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 50bbf9b704c..e634fdb09aa 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -25,11 +25,15 @@
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
     apply_bot_task_to_sampling_params,
+    available_prompt_bot_tasks,
     available_tasks,
     bot_task_for_task,
     build_prompt,
     build_prompt_tokens,
     stop_token_ids_for_bot_task,
+    stop_token_ids_for_task,
+    sys_type_for_task,
+    task_for_modality_and_bot_task,
 )
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
@@ -75,7 +79,9 @@ def test_available_tasks_covers_all_modalities():
     tasks = set(available_tasks())
     assert tasks >= {
         "t2t",
+        "t2t_think",
         "i2t",
+        "i2t_think",
         "it2i_think",
         "it2i_recaption",
         "t2i_think",
@@ -88,7 +94,9 @@ def test_available_tasks_covers_all_modalities():
     "task,expected_bot_task",
     [
         ("t2t", "auto"),
+        ("t2t_think", "think_recaption"),
         ("i2t", "auto"),
+        ("i2t_think", "think_recaption"),
         ("it2i_think", "think_recaption"),
         ("it2i_recaption", "recaption"),
         ("t2i_think", "think_recaption"),
@@ -100,6 +108,37 @@ def test_bot_task_for_task_matches_prompt_presets(task: str, expected_bot_task:
     assert bot_task_for_task(task) == expected_bot_task
 
 
+@pytest.mark.parametrize(
+    "modality,bot_task,expected_task",
+    [
+        ("text2text", "auto", "t2t"),
+        ("img2text", "auto", "i2t"),
+        ("text2img", "auto", "t2i_think"),
+        ("img2img", "auto", "it2i_think"),
+        ("i2t", "think", "i2t_think"),
+        ("ti2i", "recaption", "it2i_recaption"),
+        ("t2i", "vanilla", "t2i_vanilla"),
+        ("text2text", "none", "t2t"),
+    ],
+)
+def test_task_for_modality_and_bot_task_composes_prompt_task(
+    modality: str,
+    bot_task: str,
+    expected_task: str,
+):
+    assert task_for_modality_and_bot_task(modality, bot_task) == expected_task
+
+
+def test_task_for_modality_and_bot_task_rejects_invalid_combinations():
+    assert available_prompt_bot_tasks() == ["auto", "none", "recaption", "think", "vanilla"]
+
+    with pytest.raises(ValueError, match="not supported"):
+        task_for_modality_and_bot_task("img2text", "recaption")
+
+    with pytest.raises(ValueError, match="not supported"):
+        task_for_modality_and_bot_task("img2img", "vanilla")
+
+
 def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer():
     tok = FakeTokenizer()
 
@@ -113,6 +152,19 @@ def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer():
     ]
 
 
+def test_stop_token_ids_for_task_are_resolved_from_prompt_task():
+    tok = FakeTokenizer()
+
+    assert stop_token_ids_for_task(tok, "i2t") == [5, 8]
+    assert stop_token_ids_for_task(tok, "i2t_think") == [6, 7, 5]
+    assert stop_token_ids_for_task(tok, "t2i_vanilla") == [5]
+
+
+def test_sys_type_for_task_returns_prompt_preset_default():
+    assert sys_type_for_task("i2t_think") == "en_unified"
+    assert sys_type_for_task("t2i_vanilla") == "en_vanilla"
+
+
 class FakeSamplingParams:
     def __init__(self, stop_token_ids: list[int] | None = None, max_tokens: int = 16) -> None:
         self.stop_token_ids = stop_token_ids
@@ -141,7 +193,9 @@ def test_apply_bot_task_to_sampling_params_updates_only_target_stage():
     "task",
     [
         "t2t",
+        "t2t_think",
         "i2t",
+        "i2t_think",
         "it2i_think",
         "it2i_recaption",
         "t2i_think",
@@ -170,7 +224,7 @@ def test_build_prompt_string_structure_chat_template(task: str):
     # documentation, so substring index() catches the wrong occurrence -- use
     # endswith() which directly captures "trigger is at the tail" (the Part A
     # fix: trigger goes AFTER `Assistant: `, not before user_prompt).
-    if task in ("it2i_think", "t2i_think"):
+    if task in ("t2t_think", "i2t_think", "it2i_think", "t2i_think"):
         assert s.endswith("Assistant: <think>"), (
             f"Trigger <think> must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}"
         )
@@ -238,7 +292,14 @@ def test_build_prompt_tokens_no_image_for_text_only_tasks():
 
 @pytest.mark.parametrize(
     "task,trigger_id",
-    [("it2i_think", 3), ("t2i_think", 3), ("it2i_recaption", 4), ("t2i_recaption", 4)],
+    [
+        ("t2t_think", 3),
+        ("i2t_think", 3),
+        ("it2i_think", 3),
+        ("t2i_think", 3),
+        ("it2i_recaption", 4),
+        ("t2i_recaption", 4),
+    ],
 )
 def test_build_prompt_tokens_trigger_is_last_token(task: str, trigger_id: int):
     """Trigger tag id must be the LAST token (after `Assistant: ` segment)."""
@@ -297,9 +358,12 @@ def test_end2end_routes_through_shared_prompt_utils():
         if isinstance(node, ast.ImportFrom) and node.module and node.module.endswith("hunyuan_image3.prompt_utils"):
             imported_from_prompt_utils.update(alias.name for alias in node.names)
     expected_imports = {
+        "available_prompt_bot_tasks",
         "bot_task_for_task",
         "build_prompt_tokens",
-        "stop_token_ids_for_bot_task",
+        "stop_token_ids_for_task",
+        "sys_type_for_task",
+        "task_for_modality_and_bot_task",
     }
     assert expected_imports <= imported_from_prompt_utils, (
         "end2end.py must import the HunyuanImage3 prompt and stop-token helpers from "
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 079f14b9fda..b22acbdaf7a 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -22,6 +22,7 @@
 from .system_prompt import get_system_prompt
 
 BOT_TASKS = ("auto", "image", "recaption", "think_recaption")
+PROMPT_BOT_TASKS = ("auto", "none", "think", "recaption", "vanilla")
 _BOT_TASK_TO_TOKENIZER_TASK = {
     "auto": "auto",
     "image": "image",
@@ -42,18 +43,123 @@
     "t2i_vanilla": ("en_vanilla", "image", None),
 }
 
+_MODALITY_TO_TASK_PREFIX = {
+    "text2text": "t2t",
+    "t2t": "t2t",
+    "img2text": "i2t",
+    "image2text": "i2t",
+    "i2t": "i2t",
+    "text2img": "t2i",
+    "text2image": "t2i",
+    "t2i": "t2i",
+    "img2img": "it2i",
+    "image2image": "it2i",
+    "it2i": "it2i",
+    "ti2i": "it2i",
+}
+
+_DEFAULT_BOT_TASK_BY_PREFIX: dict[str, str | None] = {
+    "t2t": None,
+    "i2t": None,
+    "t2i": "think",
+    "it2i": "think",
+}
+
+_TASK_BY_PREFIX_AND_BOT_TASK: dict[tuple[str, str | None], str] = {
+    ("t2t", None): "t2t",
+    ("t2t", "think"): "t2t_think",
+    ("i2t", None): "i2t",
+    ("i2t", "think"): "i2t_think",
+    ("t2i", "think"): "t2i_think",
+    ("t2i", "recaption"): "t2i_recaption",
+    ("t2i", "vanilla"): "t2i_vanilla",
+    ("it2i", "think"): "it2i_think",
+    ("it2i", "recaption"): "it2i_recaption",
+}
+
+_PROMPT_BOT_TASK_ALIASES: dict[str, str | None] = {
+    "auto": "auto",
+    "default": "auto",
+    "none": None,
+    "no": None,
+    "false": None,
+    "think": "think",
+    "think_recaption": "think",
+    "recaption": "recaption",
+    "image": "vanilla",
+    "vanilla": "vanilla",
+}
+
 
 def available_tasks() -> list[str]:
     """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
     return sorted(_TASK_PRESETS)
 
 
-def bot_task_for_task(task: str) -> str:
-    """Return the HunyuanImage3 bot_task associated with a prompt task."""
+def available_prompt_bot_tasks() -> list[str]:
+    """Sorted public bot_task values accepted by `task_for_modality_and_bot_task`."""
+    return sorted(PROMPT_BOT_TASKS)
+
+
+def _task_preset(task: str) -> tuple[str, str | None, str | None]:
     if task not in _TASK_PRESETS:
         raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+    return _TASK_PRESETS[task]
+
+
+def _task_has_image_input(task: str) -> bool:
+    return task.startswith(("i2t", "it2i"))
+
+
+def _normalize_prompt_bot_task(bot_task: str | None) -> str | None:
+    if bot_task is None:
+        return "auto"
+
+    normalized = bot_task.strip().lower()
+    if normalized not in _PROMPT_BOT_TASK_ALIASES:
+        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_prompt_bot_tasks()}")
+    return _PROMPT_BOT_TASK_ALIASES[normalized]
+
+
+def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") -> str:
+    """Return the canonical prompt task for an input/output modality.
 
-    _, preset_bot_task, _ = _TASK_PRESETS[task]
+    `modality` chooses the base route (t2t, t2i, i2t, or it2i/ti2i), while
+    `bot_task` chooses the prompt behavior such as thinking, recaptioning,
+    or the vanilla text-to-image template.
+    """
+    modality_key = modality.strip().lower()
+    if modality_key not in _MODALITY_TO_TASK_PREFIX:
+        raise ValueError(f"Unknown modality {modality!r}. Choose from: {sorted(_MODALITY_TO_TASK_PREFIX)}")
+
+    task_prefix = _MODALITY_TO_TASK_PREFIX[modality_key]
+    normalized_bot_task = _normalize_prompt_bot_task(bot_task)
+    if normalized_bot_task == "auto":
+        normalized_bot_task = _DEFAULT_BOT_TASK_BY_PREFIX[task_prefix]
+
+    task_key = (task_prefix, normalized_bot_task)
+    if task_key not in _TASK_BY_PREFIX_AND_BOT_TASK:
+        valid_bot_tasks = sorted(
+            "none" if candidate is None else candidate
+            for prefix, candidate in _TASK_BY_PREFIX_AND_BOT_TASK
+            if prefix == task_prefix
+        )
+        raise ValueError(
+            f"bot_task {bot_task!r} is not supported for modality {modality!r}. Choose from: {valid_bot_tasks}"
+        )
+
+    return _TASK_BY_PREFIX_AND_BOT_TASK[task_key]
+
+
+def sys_type_for_task(task: str) -> str:
+    """Return the default system prompt type for a canonical prompt task."""
+    preset_sys_type, _, _ = _task_preset(task)
+    return preset_sys_type
+
+
+def bot_task_for_task(task: str) -> str:
+    """Return the HunyuanImage3 bot_task associated with a prompt task."""
+    _, preset_bot_task, _ = _task_preset(task)
     if preset_bot_task == "think":
         return "think_recaption"
     return preset_bot_task or "auto"
@@ -117,6 +223,19 @@ def stop_token_ids_for_bot_task(
     return stop_token_id[tokenizer_bot_task]
 
 
+def stop_token_ids_for_task(
+    tokenizer,
+    task: str,
+    image_size: int | str | None = None,
+) -> list[int]:
+    """Return AR stop token ids for a canonical prompt task."""
+    return stop_token_ids_for_bot_task(
+        tokenizer,
+        bot_task_for_task(task),
+        image_size=image_size,
+    )
+
+
 def apply_bot_task_to_sampling_params(
     sampling_params_list: list[Any],
     tokenizer: Any,
@@ -151,16 +270,13 @@ def build_prompt(
     inputs that need to match HF baseline byte-for-byte, use
     `build_prompt_tokens` instead and feed the result via prompt_token_ids.
     """
-    if task not in _TASK_PRESETS:
-        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
-
-    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+    preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task)
     effective_sys_type = sys_type or preset_sys_type
 
     system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt)
     sys_text = system_prompt.strip() if system_prompt else ""
 
-    has_image_input = task.startswith("i2t") or task.startswith("it2i")
+    has_image_input = _task_has_image_input(task)
 
     # t2i_vanilla: pretrain mode for direct text->image generation. The
     # vanilla system prompt drives the model with no chat structure.
@@ -212,17 +328,14 @@ def build_prompt_tokens(
     boundary merge happens. We replicate that here and feed the result to
     Omni via OmniTokensPrompt (prompt_token_ids).
     """
-    if task not in _TASK_PRESETS:
-        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
-
-    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+    preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task)
     effective_sys_type = sys_type or preset_sys_type
 
     bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>")
     img_id = tokenizer.convert_tokens_to_ids("<img>")
     trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None
 
-    has_image_input = task.startswith("i2t") or task.startswith("it2i")
+    has_image_input = _task_has_image_input(task)
 
     # t2i_vanilla uses pretrain template with no chat structure; the vanilla
     # system prompt drives the model directly. No segment boundaries to
@@ -252,11 +365,16 @@ def build_prompt_tokens(
 
 __all__ = [
     "available_tasks",
+    "available_prompt_bot_tasks",
     "apply_bot_task_to_sampling_params",
     "bot_task_for_task",
     "BOT_TASKS",
     "build_prompt",
     "build_prompt_tokens",
+    "PROMPT_BOT_TASKS",
     "stop_token_ids_for_bot_task",
+    "stop_token_ids_for_task",
+    "sys_type_for_task",
+    "task_for_modality_and_bot_task",
     "tokenizer_bot_task_for_bot_task",
 ]

From d6ed92fa2a21c40ef528879d9fed43b1aef0e189 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Thu, 7 May 2026 16:51:34 +0800
Subject: [PATCH 05/40] Unify online HunyuanImage3 bot task handling

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/test_prompt_utils.py       |  43 +++++
 .../openai_api/test_image_server.py           | 102 +++++++++++
 ...test_serving_chat_multistage_generation.py |  69 ++++++++
 .../models/hunyuan_image3/prompt_utils.py     |  56 ++++++
 .../entrypoints/openai/protocol/images.py     |  28 ++-
 vllm_omni/entrypoints/openai/serving_chat.py  | 163 +++++++++++++-----
 6 files changed, 413 insertions(+), 48 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index e634fdb09aa..6a69888684f 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -25,6 +25,7 @@
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
     apply_bot_task_to_sampling_params,
+    apply_task_to_sampling_params,
     available_prompt_bot_tasks,
     available_tasks,
     bot_task_for_task,
@@ -34,6 +35,8 @@
     stop_token_ids_for_task,
     sys_type_for_task,
     task_for_modality_and_bot_task,
+    task_for_modality_and_request_bot_task,
+    tokenizer_bot_task_for_task,
 )
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
@@ -139,6 +142,24 @@ def test_task_for_modality_and_bot_task_rejects_invalid_combinations():
         task_for_modality_and_bot_task("img2img", "vanilla")
 
 
+@pytest.mark.parametrize(
+    "modality,bot_task,expected_task",
+    [
+        ("text2img", "think", "t2i_think"),
+        ("text2img", "think_recaption", "t2i_think"),
+        ("text2img", "image", "t2i_vanilla"),
+        ("img2img", "recaption", "it2i_recaption"),
+        ("img2text", "auto", "i2t"),
+    ],
+)
+def test_task_for_modality_and_request_bot_task_accepts_legacy_and_unified_values(
+    modality: str,
+    bot_task: str,
+    expected_task: str,
+):
+    assert task_for_modality_and_request_bot_task(modality, bot_task) == expected_task
+
+
 def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer():
     tok = FakeTokenizer()
 
@@ -165,6 +186,11 @@ def test_sys_type_for_task_returns_prompt_preset_default():
     assert sys_type_for_task("t2i_vanilla") == "en_vanilla"
 
 
+def test_tokenizer_bot_task_for_task_returns_internal_task_name():
+    assert tokenizer_bot_task_for_task("t2i_think") == "think"
+    assert tokenizer_bot_task_for_task("t2i_vanilla") == "image"
+
+
 class FakeSamplingParams:
     def __init__(self, stop_token_ids: list[int] | None = None, max_tokens: int = 16) -> None:
         self.stop_token_ids = stop_token_ids
@@ -189,6 +215,23 @@ def test_apply_bot_task_to_sampling_params_updates_only_target_stage():
     assert stage0.stop_token_ids == [6, 7, 5]
 
 
+def test_apply_task_to_sampling_params_updates_only_target_stage():
+    tok = FakeTokenizer()
+    stage0 = FakeSamplingParams(stop_token_ids=[999])
+    stage1 = FakeSamplingParams(stop_token_ids=[888])
+
+    updated = apply_task_to_sampling_params(
+        [stage0, stage1],
+        tok,
+        "i2t_think",
+        stage_index=0,
+    )
+
+    assert updated[0] is stage0
+    assert updated[0].stop_token_ids == [6, 7, 5]
+    assert updated[1] is stage1
+
+
 @pytest.mark.parametrize(
     "task",
     [
diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py
index b5ff891f8f6..81d4aa0ad19 100644
--- a/tests/entrypoints/openai_api/test_image_server.py
+++ b/tests/entrypoints/openai_api/test_image_server.py
@@ -578,6 +578,103 @@ def test_multistage_images_async_omni_construction(async_omni_test_client):
     assert captured[1].guidance_scale == 6.5
 
 
+def test_multistage_hunyuan_images_accept_unified_bot_task():
+    """Regression: /v1/images/generations maps unified bot_task values for HunyuanImage3."""
+
+    class FakeTokenizer:
+        eos_token_id = 5
+
+        def convert_tokens_to_ids(self, token):
+            mapping = {
+                "</recaption>": 6,
+                "</answer>": 7,
+                "<boi>": 8,
+            }
+            mapping.update({f"<img_ratio_{i}>": 1000 + i for i in range(33)})
+            return mapping[token]
+
+    class FakeAsyncOmniClass(AsyncOmni):
+        def __init__(self):
+            stage_configs = [
+                SimpleNamespace(
+                    stage_type="llm",
+                    is_comprehension=True,
+                    model_arch="HunyuanImage3ForCausalMM",
+                ),
+                SimpleNamespace(
+                    stage_type="diffusion",
+                    is_comprehension=False,
+                    model_arch="HunyuanImage3Pipeline",
+                ),
+            ]
+            default_sampling_params_list = [
+                SamplingParams(temperature=0.1),
+                OmniDiffusionSamplingParams(),
+            ]
+            self.engine = SimpleNamespace(
+                stage_configs=stage_configs,
+                default_sampling_params_list=default_sampling_params_list,
+            )
+            self.default_sampling_params_list = default_sampling_params_list
+            self.captured_sampling_params_list = None
+            self.captured_prompt = None
+            self._images = [Image.new("RGB", (64, 64), color="green")]
+            self.od_config = SimpleNamespace(supports_multimodal_inputs=True)
+
+        async def generate(self, prompt, request_id, sampling_params=None, sampling_params_list=None):
+            self.captured_sampling_params_list = (
+                sampling_params_list if sampling_params_list is not None else [sampling_params]
+            )
+            self.captured_prompt = prompt
+            yield MockGenerationResult([img.copy() for img in self._images])
+
+        async def get_tokenizer(self):
+            return FakeTokenizer()
+
+        def __class_getitem__(cls, item):
+            return cls
+
+        def get_diffusion_od_config(self):
+            return self.od_config
+
+    app = FastAPI()
+    app.include_router(router)
+
+    engine = FakeAsyncOmniClass()
+    chat_handler = object.__new__(OmniOpenAIServingChat)
+    chat_handler.engine_client = engine
+    chat_handler._diffusion_engine = None
+    app.state.openai_serving_chat = chat_handler
+    app.state.engine_client = engine
+    app.state.stage_configs = engine.engine.stage_configs
+    app.state.args = Namespace(
+        default_sampling_params='{"1": {"num_inference_steps":4, "guidance_scale":7.5, "generator_device":"cpu"}}',
+        max_generated_image_size=1048576,
+    )
+    app.state.openai_serving_models = _DiffusionServingModels(
+        [BaseModelPath(name="tencent/HunyuanImage-3.0-Instruct", model_path="tencent/HunyuanImage-3.0-Instruct")]
+    )
+    client = TestClient(app)
+
+    response = client.post(
+        "/v1/images/generations",
+        json={
+            "prompt": "a cat",
+            "bot_task": "think",
+            "size": "128x256",
+        },
+    )
+    assert response.status_code == 200
+
+    captured_prompt = engine.captured_prompt
+    assert captured_prompt["bot_task"] == "think_recaption"
+    assert captured_prompt["mm_processor_kwargs"]["bot_task"] == "think"
+
+    captured = engine.captured_sampling_params_list
+    assert captured is not None
+    assert captured[0].stop_token_ids == [6, 7, 5]
+
+
 def test_generate_images_async_omni_glm_image_sets_stage0_max_tokens():
     """GLM-Image multistage: stage-0 gets target_h/w from requested size.
 
@@ -906,6 +1003,8 @@ def test_parameter_validation():
     assert req.size is None  # Engine will use model defaults
     assert req.num_inference_steps is None  # Engine will use model defaults
     assert req.true_cfg_scale is None  # Engine will use model defaults
+    assert ImageGenerationRequest(prompt="test", bot_task="think").bot_task == "think"
+    assert ImageGenerationRequest(prompt="test", bot_task="think_recaption").bot_task == "think_recaption"
 
     # Invalid num_inference_steps (out of range)
     with pytest.raises(ValueError):
@@ -928,6 +1027,9 @@ def test_parameter_validation():
     with pytest.raises(ValueError):
         ImageGenerationRequest(prompt="test", layers=11)
 
+    with pytest.raises(ValueError):
+        ImageGenerationRequest(prompt="test", bot_task="bogus")
+
 
 # Pass-Through Tests
 
diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index 144a0e97a6c..45eee6eb04a 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -91,3 +91,72 @@ def test_build_multistage_generation_inputs_applies_stage_specific_overrides(ser
     assert engine.default_sampling_params_list[1].lora_request is None
     assert engine.default_sampling_params_list[2].resolution == 640
     assert engine.default_sampling_params_list[2].lora_request is None
+
+
+@pytest.mark.parametrize(
+    "output_modalities,messages,bot_task,expected_task",
+    [
+        (["image"], [{"role": "user", "content": "draw a cat"}], "think", "t2i_think"),
+        (
+            ["image"],
+            [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}]}],
+            "recaption",
+            "it2i_recaption",
+        ),
+        (
+            ["text"],
+            [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}]}],
+            "think_recaption",
+            "i2t_think",
+        ),
+        (["text"], [{"role": "user", "content": "describe"}], "none", "t2t"),
+    ],
+)
+def test_resolve_hunyuan_image3_request_task(serving_chat, output_modalities, messages, bot_task, expected_task):
+    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+    stage_configs = [SimpleNamespace(stage_type="llm", model_arch="HunyuanImage3ForCausalMM", is_comprehension=True)]
+    task = OmniOpenAIServingChat._resolve_hunyuan_image3_request_task(
+        stage_configs=stage_configs,
+        output_modalities=output_modalities,
+        messages=messages,
+        bot_task=bot_task,
+    )
+
+    assert task == expected_task
+
+
+def test_build_multistage_generation_inputs_maps_unified_bot_task_for_hunyuan(serving_chat):
+    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
+
+    engine = SimpleNamespace(
+        stage_configs=[
+            SimpleNamespace(
+                stage_type="llm",
+                is_comprehension=True,
+                model_arch="HunyuanImage3ForCausalMM",
+            ),
+            SimpleNamespace(
+                stage_type="diffusion",
+                is_comprehension=False,
+                model_arch="HunyuanImage3Pipeline",
+            ),
+        ],
+        default_sampling_params_list=[
+            SamplingParams(temperature=0.2, seed=11),
+            OmniDiffusionSamplingParams(),
+        ],
+    )
+
+    engine_prompt, _sampling_params_list = OmniOpenAIServingChat._build_multistage_generation_inputs(
+        serving_chat,
+        engine=engine,
+        prompt="draw a robot",
+        extra_body={"bot_task": "think"},
+        reference_images=[],
+        gen_params=OmniDiffusionSamplingParams(height=768, width=1024, seed=0, num_outputs_per_prompt=1),
+    )
+
+    assert engine_prompt["modalities"] == ["image"]
+    assert engine_prompt["bot_task"] == "think_recaption"
+    assert engine_prompt["mm_processor_kwargs"]["bot_task"] == "think"
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index b22acbdaf7a..d9d0d508288 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -90,6 +90,11 @@
     "vanilla": "vanilla",
 }
 
+_REQUEST_BOT_TASK_ALIASES: dict[str, str | None] = {
+    **_PROMPT_BOT_TASK_ALIASES,
+    "image": "vanilla",
+}
+
 
 def available_tasks() -> list[str]:
     """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
@@ -121,6 +126,17 @@ def _normalize_prompt_bot_task(bot_task: str | None) -> str | None:
     return _PROMPT_BOT_TASK_ALIASES[normalized]
 
 
+def _normalize_request_bot_task(bot_task: str | None) -> str | None:
+    if bot_task is None:
+        return "auto"
+
+    normalized = bot_task.strip().lower()
+    if normalized not in _REQUEST_BOT_TASK_ALIASES:
+        valid_bot_tasks = sorted(set(PROMPT_BOT_TASKS) | set(BOT_TASKS))
+        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {valid_bot_tasks}")
+    return _REQUEST_BOT_TASK_ALIASES[normalized]
+
+
 def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") -> str:
     """Return the canonical prompt task for an input/output modality.
 
@@ -151,6 +167,20 @@ def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto")
     return _TASK_BY_PREFIX_AND_BOT_TASK[task_key]
 
 
+def task_for_modality_and_request_bot_task(modality: str, bot_task: str | None = "auto") -> str:
+    """Resolve a request bot_task into a canonical prompt task.
+
+    Request values accept both the unified public bot_task vocabulary
+    (`think`, `recaption`, `vanilla`, `none`, `auto`) and the legacy HunyuanImage3
+    values (`auto`, `image`, `recaption`, `think_recaption`). An explicit `none`
+    is forwarded as `none` so it is never mistaken for a missing value (`auto`).
+    """
+    # The normalizer maps `none` to None; the callee would re-read None as `auto`.
+    # Re-spell it as "none" so an explicit opt-out is not replaced by the default.
+    normalized_bot_task = _normalize_request_bot_task(bot_task) or "none"
+    return task_for_modality_and_bot_task(modality, normalized_bot_task)
+
+
 def sys_type_for_task(task: str) -> str:
     """Return the default system prompt type for a canonical prompt task."""
     preset_sys_type, _, _ = _task_preset(task)
@@ -236,6 +266,11 @@ def stop_token_ids_for_task(
     )
 
 
+def tokenizer_bot_task_for_task(task: str) -> str:
+    """Return the tokenizer-internal bot_task for a canonical prompt task."""
+    return tokenizer_bot_task_for_bot_task(bot_task_for_task(task))
+
+
 def apply_bot_task_to_sampling_params(
     sampling_params_list: list[Any],
     tokenizer: Any,
@@ -256,6 +291,24 @@ def apply_bot_task_to_sampling_params(
     return updated_params_list
 
 
+def apply_task_to_sampling_params(
+    sampling_params_list: list[Any],
+    tokenizer: Any,
+    task: str,
+    *,
+    stage_index: int = 0,
+    image_size: int | str | None = None,
+) -> list[Any]:
+    """Apply a canonical prompt task to one AR stage's stop tokens."""
+    return apply_bot_task_to_sampling_params(
+        sampling_params_list,
+        tokenizer,
+        bot_task_for_task(task),
+        stage_index=stage_index,
+        image_size=image_size,
+    )
+
+
 def build_prompt(
     user_prompt: str,
     task: str = "it2i_think",
@@ -367,6 +420,7 @@ def build_prompt_tokens(
     "available_tasks",
     "available_prompt_bot_tasks",
     "apply_bot_task_to_sampling_params",
+    "apply_task_to_sampling_params",
     "bot_task_for_task",
     "BOT_TASKS",
     "build_prompt",
@@ -376,5 +430,7 @@ def build_prompt_tokens(
     "stop_token_ids_for_task",
     "sys_type_for_task",
     "task_for_modality_and_bot_task",
+    "task_for_modality_and_request_bot_task",
+    "tokenizer_bot_task_for_task",
     "tokenizer_bot_task_for_bot_task",
 ]
diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py
index c78a95de058..548fe55fe30 100644
--- a/vllm_omni/entrypoints/openai/protocol/images.py
+++ b/vllm_omni/entrypoints/openai/protocol/images.py
@@ -119,8 +119,34 @@ def validate_use_system_prompt(cls, v):
     )
     bot_task: str | None = Field(
         default=None,
-        description="HunyuanImage3 AR bot_task for this request: auto, image, recaption, or think_recaption.",
+        description=(
+            "HunyuanImage3 prompt behavior for this request. Preferred values: "
+            "auto, none, think, recaption, vanilla. Legacy values auto, image, "
+            "recaption, and think_recaption are also accepted."
+        ),
     )
+
+    @field_validator("bot_task")
+    @classmethod
+    def validate_bot_task(cls, v):
+        """Validate HunyuanImage3 bot_task / prompt behavior."""
+        if v is None:
+            return None
+
+        normalized = v.strip().lower()
+        valid_values = {
+            "auto",
+            "none",
+            "think",
+            "recaption",
+            "vanilla",
+            "image",
+            "think_recaption",
+        }
+        if normalized not in valid_values:
+            raise ValueError(f"Invalid bot_task: {v}. Must be one of: {sorted(valid_values)}")
+        return normalized
+
     seed: int | None = Field(default=None, description="Random seed for reproducibility")
     generator_device: str | None = Field(
         default=None,
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index b2375fd38b4..cef07b0ac18 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -198,19 +198,24 @@ async def create_chat_completion(
                 tokenizer = await self.engine_client.get_tokenizer()
 
             extra_body = self._get_extra_body_from_request(request)
-            bot_task = (
-                extra_body.get("bot_task")
-                if self._get_hunyuan_image3_ar_stage_index(list(getattr(self.engine_client, "stage_configs", []) or []))
-                is not None
-                else None
+            output_modalities = getattr(request, "modalities", self.engine_client.output_modalities)
+            hunyuan_task = self._resolve_hunyuan_image3_request_task(
+                stage_configs=list(getattr(self.engine_client, "stage_configs", []) or []),
+                output_modalities=output_modalities,
+                messages=request.messages,
+                bot_task=extra_body.get("bot_task"),
             )
+            hunyuan_bot_task = None
             request_chat_template_kwargs = request.chat_template_kwargs or {}
-            if bot_task is not None:
-                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task
+            if hunyuan_task is not None:
+                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+                    bot_task_for_task,
+                    tokenizer_bot_task_for_task,
+                )
 
-                tokenizer_bot_task = tokenizer_bot_task_for_bot_task(bot_task)
+                hunyuan_bot_task = bot_task_for_task(hunyuan_task)
                 request_chat_template_kwargs = dict(request_chat_template_kwargs)
-                request_chat_template_kwargs["bot_task"] = tokenizer_bot_task
+                request_chat_template_kwargs["bot_task"] = tokenizer_bot_task_for_task(hunyuan_task)
 
             reasoning_parser: ReasoningParser | None = None
             if self.reasoning_parser_cls:
@@ -311,7 +316,6 @@ async def create_chat_completion(
         if raw_request:
             raw_request.state.request_metadata = request_metadata
 
-        output_modalities = getattr(request, "modalities", self.engine_client.output_modalities)
         request.modalities = (
             output_modalities if output_modalities is not None else self.engine_client.output_modalities
         )
@@ -380,9 +384,11 @@ async def create_chat_completion(
                     mm_processor_kwargs["target_h"] = height
                 if width is not None:
                     mm_processor_kwargs["target_w"] = width
-                if bot_task is not None:
-                    mm_processor_kwargs["bot_task"] = tokenizer_bot_task
-                    tprompt["bot_task"] = bot_task
+                if hunyuan_task is not None:
+                    from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_task
+
+                    mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_task(hunyuan_task)
+                    tprompt["bot_task"] = hunyuan_bot_task
                 tprompt["mm_processor_kwargs"] = mm_processor_kwargs
                 if engine_prompt_image is not None:
                     tprompt["multi_modal_data"] = engine_prompt_image
@@ -420,10 +426,10 @@ async def create_chat_completion(
                 # to delta to ensure emitted outputs are correctly drained. Otherwise
                 # convert cumulative to Final Only to ensure the output is correct.
                 sampling_params_list = coerce_param_message_types(sampling_params_list, request.stream)
-                sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params(
+                sampling_params_list = await self._apply_hunyuan_image3_task_sampling_params(
                     engine=self.engine_client,
                     sampling_params_list=sampling_params_list,
-                    bot_task=bot_task,
+                    task=hunyuan_task,
                     tokenizer=tokenizer,
                 )
 
@@ -752,15 +758,69 @@ def _get_hunyuan_image3_ar_stage_index(cls, stage_configs: list[Any]) -> int | N
                 return idx
         return None
 
-    async def _apply_hunyuan_image3_bot_task_sampling_params(
+    @staticmethod
+    def _infer_hunyuan_image3_request_modality(
+        output_modalities: list[str] | None,
+        has_reference_images: bool,
+    ) -> str:
+        image_output_requested = bool(output_modalities) and "image" in output_modalities
+        if image_output_requested:
+            return "img2img" if has_reference_images else "text2img"
+        return "img2text" if has_reference_images else "text2text"
+
+    @classmethod
+    def _resolve_hunyuan_image3_request_task(
+        cls,
+        *,
+        stage_configs: list[Any],
+        output_modalities: list[str] | None,
+        bot_task: str | None,
+        messages: list[Any] | None = None,
+        reference_images: list[Any] | None = None,
+    ) -> str | None:
+        if bot_task is None:
+            return None
+
+        if cls._get_hunyuan_image3_ar_stage_index(stage_configs) is None:
+            return None
+
+        has_reference_images = False
+        if reference_images is not None:
+            has_reference_images = len(reference_images) > 0
+        elif messages is not None:
+            normalized_messages = cls._messages_to_dicts(messages)
+            for message in normalized_messages:
+                if message.get("role", "") != "user":
+                    continue
+                content = message.get("content", "")
+                if isinstance(content, list):
+                    if any(
+                        (isinstance(item, dict) and (item.get("type") == "image_url" or "image" in item))
+                        for item in content
+                    ):
+                        has_reference_images = True
+                        break
+                elif isinstance(content, dict) and (content.get("type") == "image_url" or "image" in content):
+                    has_reference_images = True
+                    break
+
+        modality = cls._infer_hunyuan_image3_request_modality(output_modalities, has_reference_images)
+
+        from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+            task_for_modality_and_request_bot_task,
+        )
+
+        return task_for_modality_and_request_bot_task(modality, bot_task)
+
+    async def _apply_hunyuan_image3_task_sampling_params(
         self,
         *,
         engine: Any,
         sampling_params_list: list[Any],
-        bot_task: Any,
+        task: str | None,
         tokenizer: Any | None = None,
     ) -> list[Any]:
-        if bot_task is None:
+        if task is None:
             return sampling_params_list
 
         stage_configs = list(getattr(engine, "stage_configs", []) or [])
@@ -769,24 +829,18 @@ async def _apply_hunyuan_image3_bot_task_sampling_params(
             return sampling_params_list
 
         from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-            BOT_TASKS,
-            apply_bot_task_to_sampling_params,
-            tokenizer_bot_task_for_bot_task,
+            apply_task_to_sampling_params,
         )
 
-        if bot_task not in BOT_TASKS:
-            raise ValueError(f"Unknown HunyuanImage3 bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}")
-        tokenizer_bot_task_for_bot_task(bot_task)
-
         if tokenizer is None and hasattr(engine, "get_tokenizer"):
             tokenizer = await engine.get_tokenizer()
         if tokenizer is None:
             raise ValueError("Cannot resolve tokenizer to apply HunyuanImage3 bot_task stop tokens.")
 
-        return apply_bot_task_to_sampling_params(
+        return apply_task_to_sampling_params(
             sampling_params_list,
             tokenizer,
-            bot_task,
+            task,
             stage_index=stage_index,
         )
 
@@ -2254,10 +2308,11 @@ def _build_multistage_generation_inputs(
         lora_body = extra_body.get("lora")
         layers = extra_body.get("layers")
         resolution = extra_body.get("resolution")
-        bot_task = (
-            extra_body.get("bot_task")
-            if self._get_hunyuan_image3_ar_stage_index(list(stage_configs)) is not None
-            else None
+        hunyuan_task = self._resolve_hunyuan_image3_request_task(
+            stage_configs=list(stage_configs),
+            output_modalities=["image"],
+            reference_images=reference_images,
+            bot_task=extra_body.get("bot_task"),
         )
 
         engine_prompt_data: dict[str, Any] | None = None
@@ -2297,11 +2352,14 @@ def _build_multistage_generation_inputs(
             mm_processor_kwargs["target_h"] = height
         if width is not None:
             mm_processor_kwargs["target_w"] = width
-        if bot_task is not None:
-            from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task
+        if hunyuan_task is not None:
+            from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+                bot_task_for_task,
+                tokenizer_bot_task_for_task,
+            )
 
-            mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_bot_task(bot_task)
-            engine_prompt["bot_task"] = bot_task
+            mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_task(hunyuan_task)
+            engine_prompt["bot_task"] = bot_task_for_task(hunyuan_task)
         if mm_processor_kwargs:
             engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs
         if engine_prompt_data is not None:
@@ -2398,7 +2456,12 @@ async def generate_diffusion_images(
         negative_prompt = extra_body.get("negative_prompt")
         num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1)
         lora_body = extra_body.get("lora")
-        bot_task = extra_body.get("bot_task")
+        hunyuan_task = self._resolve_hunyuan_image3_request_task(
+            stage_configs=list(getattr(engine, "stage_configs", None) or []),
+            output_modalities=["image"],
+            reference_images=reference_images,
+            bot_task=extra_body.get("bot_task"),
+        )
 
         pil_images: list[Image.Image] = []
         for img_b64 in reference_images:
@@ -2482,10 +2545,10 @@ async def generate_diffusion_images(
                 engine_prompt = gen_prompt
                 sampling_params_list = [gen_params]
 
-            sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params(
+            sampling_params_list = await self._apply_hunyuan_image3_task_sampling_params(
                 engine=diffusion_engine,
                 sampling_params_list=sampling_params_list,
-                bot_task=bot_task,
+                task=hunyuan_task,
             )
 
             result = None
@@ -2575,7 +2638,12 @@ async def _create_diffusion_chat_completion(
                 seed = getattr(request, "seed", None)
             negative_prompt = extra_body.get("negative_prompt")
             num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1)
-            bot_task = extra_body.get("bot_task")
+            hunyuan_task = self._resolve_hunyuan_image3_request_task(
+                stage_configs=list(getattr(self._diffusion_engine, "stage_configs", []) or []),
+                output_modalities=["image"],
+                reference_images=reference_images,
+                bot_task=extra_body.get("bot_task"),
+            )
 
             # Text-to-video parameters (ref: text_to_video.py)
             num_frames = extra_body.get("num_frames")
@@ -2689,14 +2757,15 @@ async def _create_diffusion_chat_completion(
             # Generate image or audio (e.g. AudioX) via AsyncOmni
             diffusion_engine = cast(AsyncOmni, self._diffusion_engine)
             stage_configs = list(getattr(diffusion_engine, "stage_configs", []) or [])
-            if self._get_hunyuan_image3_ar_stage_index(stage_configs) is None:
-                bot_task = None
-            elif bot_task is not None:
-                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task
+            if hunyuan_task is not None:
+                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+                    bot_task_for_task,
+                    tokenizer_bot_task_for_task,
+                )
 
-                gen_prompt["bot_task"] = bot_task
+                gen_prompt["bot_task"] = bot_task_for_task(hunyuan_task)
                 gen_prompt["mm_processor_kwargs"] = {
-                    "bot_task": tokenizer_bot_task_for_bot_task(bot_task),
+                    "bot_task": tokenizer_bot_task_for_task(hunyuan_task),
                 }
             sampling_params_list = build_stage_sampling_params_list(
                 stage_configs,
@@ -2708,10 +2777,10 @@ async def _create_diffusion_chat_completion(
             if not sampling_params_list:
                 sampling_params_list = [gen_params]
 
-            sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params(
+            sampling_params_list = await self._apply_hunyuan_image3_task_sampling_params(
                 engine=diffusion_engine,
                 sampling_params_list=sampling_params_list,
-                bot_task=bot_task,
+                task=hunyuan_task,
             )
 
             result = None

From a10219d615927c000964621296ca06a5dde4fca6 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Thu, 7 May 2026 17:50:31 +0800
Subject: [PATCH 06/40] Revert "Unify online HunyuanImage3 bot task handling"

This reverts commit 6b67a5f426a3d02e263d0014cff917b171f39943.

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/test_prompt_utils.py       |  43 -----
 .../openai_api/test_image_server.py           | 102 -----------
 ...test_serving_chat_multistage_generation.py |  69 --------
 .../models/hunyuan_image3/prompt_utils.py     |  56 ------
 .../entrypoints/openai/protocol/images.py     |  28 +--
 vllm_omni/entrypoints/openai/serving_chat.py  | 163 +++++-------------
 6 files changed, 48 insertions(+), 413 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 6a69888684f..e634fdb09aa 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -25,7 +25,6 @@
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
     apply_bot_task_to_sampling_params,
-    apply_task_to_sampling_params,
     available_prompt_bot_tasks,
     available_tasks,
     bot_task_for_task,
@@ -35,8 +34,6 @@
     stop_token_ids_for_task,
     sys_type_for_task,
     task_for_modality_and_bot_task,
-    task_for_modality_and_request_bot_task,
-    tokenizer_bot_task_for_task,
 )
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
@@ -142,24 +139,6 @@ def test_task_for_modality_and_bot_task_rejects_invalid_combinations():
         task_for_modality_and_bot_task("img2img", "vanilla")
 
 
-@pytest.mark.parametrize(
-    "modality,bot_task,expected_task",
-    [
-        ("text2img", "think", "t2i_think"),
-        ("text2img", "think_recaption", "t2i_think"),
-        ("text2img", "image", "t2i_vanilla"),
-        ("img2img", "recaption", "it2i_recaption"),
-        ("img2text", "auto", "i2t"),
-    ],
-)
-def test_task_for_modality_and_request_bot_task_accepts_legacy_and_unified_values(
-    modality: str,
-    bot_task: str,
-    expected_task: str,
-):
-    assert task_for_modality_and_request_bot_task(modality, bot_task) == expected_task
-
-
 def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer():
     tok = FakeTokenizer()
 
@@ -186,11 +165,6 @@ def test_sys_type_for_task_returns_prompt_preset_default():
     assert sys_type_for_task("t2i_vanilla") == "en_vanilla"
 
 
-def test_tokenizer_bot_task_for_task_returns_internal_task_name():
-    assert tokenizer_bot_task_for_task("t2i_think") == "think"
-    assert tokenizer_bot_task_for_task("t2i_vanilla") == "image"
-
-
 class FakeSamplingParams:
     def __init__(self, stop_token_ids: list[int] | None = None, max_tokens: int = 16) -> None:
         self.stop_token_ids = stop_token_ids
@@ -215,23 +189,6 @@ def test_apply_bot_task_to_sampling_params_updates_only_target_stage():
     assert stage0.stop_token_ids == [6, 7, 5]
 
 
-def test_apply_task_to_sampling_params_updates_only_target_stage():
-    tok = FakeTokenizer()
-    stage0 = FakeSamplingParams(stop_token_ids=[999])
-    stage1 = FakeSamplingParams(stop_token_ids=[888])
-
-    updated = apply_task_to_sampling_params(
-        [stage0, stage1],
-        tok,
-        "i2t_think",
-        stage_index=0,
-    )
-
-    assert updated[0] is stage0
-    assert updated[0].stop_token_ids == [6, 7, 5]
-    assert updated[1] is stage1
-
-
 @pytest.mark.parametrize(
     "task",
     [
diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py
index 81d4aa0ad19..b5ff891f8f6 100644
--- a/tests/entrypoints/openai_api/test_image_server.py
+++ b/tests/entrypoints/openai_api/test_image_server.py
@@ -578,103 +578,6 @@ def test_multistage_images_async_omni_construction(async_omni_test_client):
     assert captured[1].guidance_scale == 6.5
 
 
-def test_multistage_hunyuan_images_accept_unified_bot_task():
-    """Regression: /v1/images/generations maps unified bot_task values for HunyuanImage3."""
-
-    class FakeTokenizer:
-        eos_token_id = 5
-
-        def convert_tokens_to_ids(self, token):
-            mapping = {
-                "</recaption>": 6,
-                "</answer>": 7,
-                "<boi>": 8,
-            }
-            mapping.update({f"<img_ratio_{i}>": 1000 + i for i in range(33)})
-            return mapping[token]
-
-    class FakeAsyncOmniClass(AsyncOmni):
-        def __init__(self):
-            stage_configs = [
-                SimpleNamespace(
-                    stage_type="llm",
-                    is_comprehension=True,
-                    model_arch="HunyuanImage3ForCausalMM",
-                ),
-                SimpleNamespace(
-                    stage_type="diffusion",
-                    is_comprehension=False,
-                    model_arch="HunyuanImage3Pipeline",
-                ),
-            ]
-            default_sampling_params_list = [
-                SamplingParams(temperature=0.1),
-                OmniDiffusionSamplingParams(),
-            ]
-            self.engine = SimpleNamespace(
-                stage_configs=stage_configs,
-                default_sampling_params_list=default_sampling_params_list,
-            )
-            self.default_sampling_params_list = default_sampling_params_list
-            self.captured_sampling_params_list = None
-            self.captured_prompt = None
-            self._images = [Image.new("RGB", (64, 64), color="green")]
-            self.od_config = SimpleNamespace(supports_multimodal_inputs=True)
-
-        async def generate(self, prompt, request_id, sampling_params=None, sampling_params_list=None):
-            self.captured_sampling_params_list = (
-                sampling_params_list if sampling_params_list is not None else [sampling_params]
-            )
-            self.captured_prompt = prompt
-            yield MockGenerationResult([img.copy() for img in self._images])
-
-        async def get_tokenizer(self):
-            return FakeTokenizer()
-
-        def __class_getitem__(cls, item):
-            return cls
-
-        def get_diffusion_od_config(self):
-            return self.od_config
-
-    app = FastAPI()
-    app.include_router(router)
-
-    engine = FakeAsyncOmniClass()
-    chat_handler = object.__new__(OmniOpenAIServingChat)
-    chat_handler.engine_client = engine
-    chat_handler._diffusion_engine = None
-    app.state.openai_serving_chat = chat_handler
-    app.state.engine_client = engine
-    app.state.stage_configs = engine.engine.stage_configs
-    app.state.args = Namespace(
-        default_sampling_params='{"1": {"num_inference_steps":4, "guidance_scale":7.5, "generator_device":"cpu"}}',
-        max_generated_image_size=1048576,
-    )
-    app.state.openai_serving_models = _DiffusionServingModels(
-        [BaseModelPath(name="tencent/HunyuanImage-3.0-Instruct", model_path="tencent/HunyuanImage-3.0-Instruct")]
-    )
-    client = TestClient(app)
-
-    response = client.post(
-        "/v1/images/generations",
-        json={
-            "prompt": "a cat",
-            "bot_task": "think",
-            "size": "128x256",
-        },
-    )
-    assert response.status_code == 200
-
-    captured_prompt = engine.captured_prompt
-    assert captured_prompt["bot_task"] == "think_recaption"
-    assert captured_prompt["mm_processor_kwargs"]["bot_task"] == "think"
-
-    captured = engine.captured_sampling_params_list
-    assert captured is not None
-    assert captured[0].stop_token_ids == [6, 7, 5]
-
-
 def test_generate_images_async_omni_glm_image_sets_stage0_max_tokens():
     """GLM-Image multistage: stage-0 gets target_h/w from requested size.
 
@@ -1003,8 +906,6 @@ def test_parameter_validation():
     assert req.size is None  # Engine will use model defaults
     assert req.num_inference_steps is None  # Engine will use model defaults
     assert req.true_cfg_scale is None  # Engine will use model defaults
-    assert ImageGenerationRequest(prompt="test", bot_task="think").bot_task == "think"
-    assert ImageGenerationRequest(prompt="test", bot_task="think_recaption").bot_task == "think_recaption"
 
     # Invalid num_inference_steps (out of range)
     with pytest.raises(ValueError):
@@ -1027,9 +928,6 @@ def test_parameter_validation():
     with pytest.raises(ValueError):
         ImageGenerationRequest(prompt="test", layers=11)
 
-    with pytest.raises(ValueError):
-        ImageGenerationRequest(prompt="test", bot_task="bogus")
-
 
 # Pass-Through Tests
 
diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
index 45eee6eb04a..144a0e97a6c 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py
@@ -91,72 +91,3 @@ def test_build_multistage_generation_inputs_applies_stage_specific_overrides(ser
     assert engine.default_sampling_params_list[1].lora_request is None
     assert engine.default_sampling_params_list[2].resolution == 640
     assert engine.default_sampling_params_list[2].lora_request is None
-
-
-@pytest.mark.parametrize(
-    "output_modalities,messages,bot_task,expected_task",
-    [
-        (["image"], [{"role": "user", "content": "draw a cat"}], "think", "t2i_think"),
-        (
-            ["image"],
-            [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}]}],
-            "recaption",
-            "it2i_recaption",
-        ),
-        (
-            ["text"],
-            [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}]}],
-            "think_recaption",
-            "i2t_think",
-        ),
-        (["text"], [{"role": "user", "content": "describe"}], "none", "t2t"),
-    ],
-)
-def test_resolve_hunyuan_image3_request_task(serving_chat, output_modalities, messages, bot_task, expected_task):
-    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
-
-    stage_configs = [SimpleNamespace(stage_type="llm", model_arch="HunyuanImage3ForCausalMM", is_comprehension=True)]
-    task = OmniOpenAIServingChat._resolve_hunyuan_image3_request_task(
-        stage_configs=stage_configs,
-        output_modalities=output_modalities,
-        messages=messages,
-        bot_task=bot_task,
-    )
-
-    assert task == expected_task
-
-
-def test_build_multistage_generation_inputs_maps_unified_bot_task_for_hunyuan(serving_chat):
-    from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat
-
-    engine = SimpleNamespace(
-        stage_configs=[
-            SimpleNamespace(
-                stage_type="llm",
-                is_comprehension=True,
-                model_arch="HunyuanImage3ForCausalMM",
-            ),
-            SimpleNamespace(
-                stage_type="diffusion",
-                is_comprehension=False,
-                model_arch="HunyuanImage3Pipeline",
-            ),
-        ],
-        default_sampling_params_list=[
-            SamplingParams(temperature=0.2, seed=11),
-            OmniDiffusionSamplingParams(),
-        ],
-    )
-
-    engine_prompt, _sampling_params_list = OmniOpenAIServingChat._build_multistage_generation_inputs(
-        serving_chat,
-        engine=engine,
-        prompt="draw a robot",
-        extra_body={"bot_task": "think"},
-        reference_images=[],
-        gen_params=OmniDiffusionSamplingParams(height=768, width=1024, seed=0, num_outputs_per_prompt=1),
-    )
-
-    assert engine_prompt["modalities"] == ["image"]
-    assert engine_prompt["bot_task"] == "think_recaption"
-    assert engine_prompt["mm_processor_kwargs"]["bot_task"] == "think"
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index d9d0d508288..b22acbdaf7a 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -90,11 +90,6 @@
     "vanilla": "vanilla",
 }
 
-_REQUEST_BOT_TASK_ALIASES: dict[str, str | None] = {
-    **_PROMPT_BOT_TASK_ALIASES,
-    "image": "vanilla",
-}
-
 
 def available_tasks() -> list[str]:
     """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
@@ -126,17 +121,6 @@ def _normalize_prompt_bot_task(bot_task: str | None) -> str | None:
     return _PROMPT_BOT_TASK_ALIASES[normalized]
 
 
-def _normalize_request_bot_task(bot_task: str | None) -> str | None:
-    if bot_task is None:
-        return "auto"
-
-    normalized = bot_task.strip().lower()
-    if normalized not in _REQUEST_BOT_TASK_ALIASES:
-        valid_bot_tasks = sorted(set(PROMPT_BOT_TASKS) | set(BOT_TASKS))
-        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {valid_bot_tasks}")
-    return _REQUEST_BOT_TASK_ALIASES[normalized]
-
-
 def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") -> str:
     """Return the canonical prompt task for an input/output modality.
 
@@ -167,20 +151,6 @@ def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto")
     return _TASK_BY_PREFIX_AND_BOT_TASK[task_key]
 
 
-def task_for_modality_and_request_bot_task(modality: str, bot_task: str | None = "auto") -> str:
-    """Resolve a request bot_task into a canonical prompt task.
-
-    Request values accept both the unified public bot_task vocabulary
-    (`think`, `recaption`, `vanilla`, `none`, `auto`) and the legacy
-    HunyuanImage3 values (`auto`, `image`, `recaption`,
-    `think_recaption`).
-    """
-    return task_for_modality_and_bot_task(
-        modality,
-        _normalize_request_bot_task(bot_task),
-    )
-
-
 def sys_type_for_task(task: str) -> str:
     """Return the default system prompt type for a canonical prompt task."""
     preset_sys_type, _, _ = _task_preset(task)
@@ -266,11 +236,6 @@ def stop_token_ids_for_task(
     )
 
 
-def tokenizer_bot_task_for_task(task: str) -> str:
-    """Return the tokenizer-internal bot_task for a canonical prompt task."""
-    return tokenizer_bot_task_for_bot_task(bot_task_for_task(task))
-
-
 def apply_bot_task_to_sampling_params(
     sampling_params_list: list[Any],
     tokenizer: Any,
@@ -291,24 +256,6 @@ def apply_bot_task_to_sampling_params(
     return updated_params_list
 
 
-def apply_task_to_sampling_params(
-    sampling_params_list: list[Any],
-    tokenizer: Any,
-    task: str,
-    *,
-    stage_index: int = 0,
-    image_size: int | str | None = None,
-) -> list[Any]:
-    """Apply a canonical prompt task to one AR stage's stop tokens."""
-    return apply_bot_task_to_sampling_params(
-        sampling_params_list,
-        tokenizer,
-        bot_task_for_task(task),
-        stage_index=stage_index,
-        image_size=image_size,
-    )
-
-
 def build_prompt(
     user_prompt: str,
     task: str = "it2i_think",
@@ -420,7 +367,6 @@ def build_prompt_tokens(
     "available_tasks",
     "available_prompt_bot_tasks",
     "apply_bot_task_to_sampling_params",
-    "apply_task_to_sampling_params",
     "bot_task_for_task",
     "BOT_TASKS",
     "build_prompt",
@@ -430,7 +376,5 @@ def build_prompt_tokens(
     "stop_token_ids_for_task",
     "sys_type_for_task",
     "task_for_modality_and_bot_task",
-    "task_for_modality_and_request_bot_task",
-    "tokenizer_bot_task_for_task",
     "tokenizer_bot_task_for_bot_task",
 ]
diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py
index 548fe55fe30..c78a95de058 100644
--- a/vllm_omni/entrypoints/openai/protocol/images.py
+++ b/vllm_omni/entrypoints/openai/protocol/images.py
@@ -119,34 +119,8 @@ def validate_use_system_prompt(cls, v):
     )
     bot_task: str | None = Field(
         default=None,
-        description=(
-            "HunyuanImage3 prompt behavior for this request. Preferred values: "
-            "auto, none, think, recaption, vanilla. Legacy values auto, image, "
-            "recaption, and think_recaption are also accepted."
-        ),
+        description="HunyuanImage3 AR bot_task for this request: auto, image, recaption, or think_recaption.",
     )
-
-    @field_validator("bot_task")
-    @classmethod
-    def validate_bot_task(cls, v):
-        """Validate HunyuanImage3 bot_task / prompt behavior."""
-        if v is None:
-            return None
-
-        normalized = v.strip().lower()
-        valid_values = {
-            "auto",
-            "none",
-            "think",
-            "recaption",
-            "vanilla",
-            "image",
-            "think_recaption",
-        }
-        if normalized not in valid_values:
-            raise ValueError(f"Invalid bot_task: {v}. Must be one of: {sorted(valid_values)}")
-        return normalized
-
     seed: int | None = Field(default=None, description="Random seed for reproducibility")
     generator_device: str | None = Field(
         default=None,
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index cef07b0ac18..b2375fd38b4 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -198,24 +198,19 @@ async def create_chat_completion(
                 tokenizer = await self.engine_client.get_tokenizer()
 
             extra_body = self._get_extra_body_from_request(request)
-            output_modalities = getattr(request, "modalities", self.engine_client.output_modalities)
-            hunyuan_task = self._resolve_hunyuan_image3_request_task(
-                stage_configs=list(getattr(self.engine_client, "stage_configs", []) or []),
-                output_modalities=output_modalities,
-                messages=request.messages,
-                bot_task=extra_body.get("bot_task"),
+            bot_task = (
+                extra_body.get("bot_task")
+                if self._get_hunyuan_image3_ar_stage_index(list(getattr(self.engine_client, "stage_configs", []) or []))
+                is not None
+                else None
             )
-            hunyuan_bot_task = None
             request_chat_template_kwargs = request.chat_template_kwargs or {}
-            if hunyuan_task is not None:
-                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-                    bot_task_for_task,
-                    tokenizer_bot_task_for_task,
-                )
+            if bot_task is not None:
+                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task
 
-                hunyuan_bot_task = bot_task_for_task(hunyuan_task)
+                tokenizer_bot_task = tokenizer_bot_task_for_bot_task(bot_task)
                 request_chat_template_kwargs = dict(request_chat_template_kwargs)
-                request_chat_template_kwargs["bot_task"] = tokenizer_bot_task_for_task(hunyuan_task)
+                request_chat_template_kwargs["bot_task"] = tokenizer_bot_task
 
             reasoning_parser: ReasoningParser | None = None
             if self.reasoning_parser_cls:
@@ -316,6 +311,7 @@ async def create_chat_completion(
         if raw_request:
             raw_request.state.request_metadata = request_metadata
 
+        output_modalities = getattr(request, "modalities", self.engine_client.output_modalities)
         request.modalities = (
             output_modalities if output_modalities is not None else self.engine_client.output_modalities
         )
@@ -384,11 +380,9 @@ async def create_chat_completion(
                     mm_processor_kwargs["target_h"] = height
                 if width is not None:
                     mm_processor_kwargs["target_w"] = width
-                if hunyuan_task is not None:
-                    from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_task
-
-                    mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_task(hunyuan_task)
-                    tprompt["bot_task"] = hunyuan_bot_task
+                if bot_task is not None:
+                    mm_processor_kwargs["bot_task"] = tokenizer_bot_task
+                    tprompt["bot_task"] = bot_task
                 tprompt["mm_processor_kwargs"] = mm_processor_kwargs
                 if engine_prompt_image is not None:
                     tprompt["multi_modal_data"] = engine_prompt_image
@@ -426,10 +420,10 @@ async def create_chat_completion(
                 # to delta to ensure emitted outputs are correctly drained. Otherwise
                 # convert cumulative to Final Only to ensure the output is correct.
                 sampling_params_list = coerce_param_message_types(sampling_params_list, request.stream)
-                sampling_params_list = await self._apply_hunyuan_image3_task_sampling_params(
+                sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params(
                     engine=self.engine_client,
                     sampling_params_list=sampling_params_list,
-                    task=hunyuan_task,
+                    bot_task=bot_task,
                     tokenizer=tokenizer,
                 )
 
@@ -758,69 +752,15 @@ def _get_hunyuan_image3_ar_stage_index(cls, stage_configs: list[Any]) -> int | N
                 return idx
         return None
 
-    @staticmethod
-    def _infer_hunyuan_image3_request_modality(
-        output_modalities: list[str] | None,
-        has_reference_images: bool,
-    ) -> str:
-        image_output_requested = bool(output_modalities) and "image" in output_modalities
-        if image_output_requested:
-            return "img2img" if has_reference_images else "text2img"
-        return "img2text" if has_reference_images else "text2text"
-
-    @classmethod
-    def _resolve_hunyuan_image3_request_task(
-        cls,
-        *,
-        stage_configs: list[Any],
-        output_modalities: list[str] | None,
-        bot_task: str | None,
-        messages: list[Any] | None = None,
-        reference_images: list[Any] | None = None,
-    ) -> str | None:
-        if bot_task is None:
-            return None
-
-        if cls._get_hunyuan_image3_ar_stage_index(stage_configs) is None:
-            return None
-
-        has_reference_images = False
-        if reference_images is not None:
-            has_reference_images = len(reference_images) > 0
-        elif messages is not None:
-            normalized_messages = cls._messages_to_dicts(messages)
-            for message in normalized_messages:
-                if message.get("role", "") != "user":
-                    continue
-                content = message.get("content", "")
-                if isinstance(content, list):
-                    if any(
-                        (isinstance(item, dict) and (item.get("type") == "image_url" or "image" in item))
-                        for item in content
-                    ):
-                        has_reference_images = True
-                        break
-                elif isinstance(content, dict) and (content.get("type") == "image_url" or "image" in content):
-                    has_reference_images = True
-                    break
-
-        modality = cls._infer_hunyuan_image3_request_modality(output_modalities, has_reference_images)
-
-        from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-            task_for_modality_and_request_bot_task,
-        )
-
-        return task_for_modality_and_request_bot_task(modality, bot_task)
-
-    async def _apply_hunyuan_image3_task_sampling_params(
+    async def _apply_hunyuan_image3_bot_task_sampling_params(
         self,
         *,
         engine: Any,
         sampling_params_list: list[Any],
-        task: str | None,
+        bot_task: Any,
         tokenizer: Any | None = None,
     ) -> list[Any]:
-        if task is None:
+        if bot_task is None:
             return sampling_params_list
 
         stage_configs = list(getattr(engine, "stage_configs", []) or [])
@@ -829,18 +769,24 @@ async def _apply_hunyuan_image3_task_sampling_params(
             return sampling_params_list
 
         from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-            apply_task_to_sampling_params,
+            BOT_TASKS,
+            apply_bot_task_to_sampling_params,
+            tokenizer_bot_task_for_bot_task,
         )
 
+        if bot_task not in BOT_TASKS:
+            raise ValueError(f"Unknown HunyuanImage3 bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}")
+        tokenizer_bot_task_for_bot_task(bot_task)
+
         if tokenizer is None and hasattr(engine, "get_tokenizer"):
             tokenizer = await engine.get_tokenizer()
         if tokenizer is None:
             raise ValueError("Cannot resolve tokenizer to apply HunyuanImage3 bot_task stop tokens.")
 
-        return apply_task_to_sampling_params(
+        return apply_bot_task_to_sampling_params(
             sampling_params_list,
             tokenizer,
-            task,
+            bot_task,
             stage_index=stage_index,
         )
 
@@ -2308,11 +2254,10 @@ def _build_multistage_generation_inputs(
         lora_body = extra_body.get("lora")
         layers = extra_body.get("layers")
         resolution = extra_body.get("resolution")
-        hunyuan_task = self._resolve_hunyuan_image3_request_task(
-            stage_configs=list(stage_configs),
-            output_modalities=["image"],
-            reference_images=reference_images,
-            bot_task=extra_body.get("bot_task"),
+        bot_task = (
+            extra_body.get("bot_task")
+            if self._get_hunyuan_image3_ar_stage_index(list(stage_configs)) is not None
+            else None
         )
 
         engine_prompt_data: dict[str, Any] | None = None
@@ -2352,14 +2297,11 @@ def _build_multistage_generation_inputs(
             mm_processor_kwargs["target_h"] = height
         if width is not None:
             mm_processor_kwargs["target_w"] = width
-        if hunyuan_task is not None:
-            from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-                bot_task_for_task,
-                tokenizer_bot_task_for_task,
-            )
+        if bot_task is not None:
+            from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task
 
-            mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_task(hunyuan_task)
-            engine_prompt["bot_task"] = bot_task_for_task(hunyuan_task)
+            mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_bot_task(bot_task)
+            engine_prompt["bot_task"] = bot_task
         if mm_processor_kwargs:
             engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs
         if engine_prompt_data is not None:
@@ -2456,12 +2398,7 @@ async def generate_diffusion_images(
         negative_prompt = extra_body.get("negative_prompt")
         num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1)
         lora_body = extra_body.get("lora")
-        hunyuan_task = self._resolve_hunyuan_image3_request_task(
-            stage_configs=list(getattr(engine, "stage_configs", None) or []),
-            output_modalities=["image"],
-            reference_images=reference_images,
-            bot_task=extra_body.get("bot_task"),
-        )
+        bot_task = extra_body.get("bot_task")
 
         pil_images: list[Image.Image] = []
         for img_b64 in reference_images:
@@ -2545,10 +2482,10 @@ async def generate_diffusion_images(
                 engine_prompt = gen_prompt
                 sampling_params_list = [gen_params]
 
-            sampling_params_list = await self._apply_hunyuan_image3_task_sampling_params(
+            sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params(
                 engine=diffusion_engine,
                 sampling_params_list=sampling_params_list,
-                task=hunyuan_task,
+                bot_task=bot_task,
             )
 
             result = None
@@ -2638,12 +2575,7 @@ async def _create_diffusion_chat_completion(
                 seed = getattr(request, "seed", None)
             negative_prompt = extra_body.get("negative_prompt")
             num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1)
-            hunyuan_task = self._resolve_hunyuan_image3_request_task(
-                stage_configs=list(getattr(self._diffusion_engine, "stage_configs", []) or []),
-                output_modalities=["image"],
-                reference_images=reference_images,
-                bot_task=extra_body.get("bot_task"),
-            )
+            bot_task = extra_body.get("bot_task")
 
             # Text-to-video parameters (ref: text_to_video.py)
             num_frames = extra_body.get("num_frames")
@@ -2757,15 +2689,14 @@ async def _create_diffusion_chat_completion(
             # Generate image or audio (e.g. AudioX) via AsyncOmni
             diffusion_engine = cast(AsyncOmni, self._diffusion_engine)
             stage_configs = list(getattr(diffusion_engine, "stage_configs", []) or [])
-            if hunyuan_task is not None:
-                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-                    bot_task_for_task,
-                    tokenizer_bot_task_for_task,
-                )
+            if self._get_hunyuan_image3_ar_stage_index(stage_configs) is None:
+                bot_task = None
+            elif bot_task is not None:
+                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task
 
-                gen_prompt["bot_task"] = bot_task_for_task(hunyuan_task)
+                gen_prompt["bot_task"] = bot_task
                 gen_prompt["mm_processor_kwargs"] = {
-                    "bot_task": tokenizer_bot_task_for_task(hunyuan_task),
+                    "bot_task": tokenizer_bot_task_for_bot_task(bot_task),
                 }
             sampling_params_list = build_stage_sampling_params_list(
                 stage_configs,
@@ -2777,10 +2708,10 @@ async def _create_diffusion_chat_completion(
             if not sampling_params_list:
                 sampling_params_list = [gen_params]
 
-            sampling_params_list = await self._apply_hunyuan_image3_task_sampling_params(
+            sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params(
                 engine=diffusion_engine,
                 sampling_params_list=sampling_params_list,
-                task=hunyuan_task,
+                bot_task=bot_task,
             )
 
             result = None

From 441145c1de3983ca79e45ea8acec21a0d126b340 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Thu, 7 May 2026 19:02:29 +0800
Subject: [PATCH 07/40] Consolidate HunyuanImage3 bot task resolution

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/end2end.py                 |  13 +-
 .../hunyuan_image3/test_prompt_utils.py       |  32 ++--
 .../models/hunyuan_image3/prompt_utils.py     | 174 +++++++++++++-----
 3 files changed, 156 insertions(+), 63 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 8233e2bf820..9b717e198b8 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -19,11 +19,9 @@
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
     available_prompt_bot_tasks,
-    bot_task_for_task,
     build_prompt_tokens,
-    stop_token_ids_for_task,
+    resolve_bot_task,
     sys_type_for_task,
-    task_for_modality_and_bot_task,
 )
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniPromptType
@@ -124,8 +122,10 @@ def main():
     os.makedirs(args.output, exist_ok=True)
 
     # Determine task for prompt formatting from modality + bot behavior.
-    task = task_for_modality_and_bot_task(args.modality, args.bot_task)
-    bot_task = bot_task_for_task(task)
+    bot_task_resolution = resolve_bot_task(args.bot_task, modality=args.modality)
+    task = bot_task_resolution.task
+    assert task is not None
+    bot_task = bot_task_resolution.bot_task
 
     if args.deploy_config is not None and args.stage_configs_path is not None:
         raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.")
@@ -209,7 +209,8 @@ def main():
     # Override diffusion params if applicable
     from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
-    ar_stop_token_ids = stop_token_ids_for_task(tokenizer, task)
+    ar_stop_token_ids = resolve_bot_task(task=task, tokenizer=tokenizer).stop_token_ids
+    assert ar_stop_token_ids is not None
     for sp in params_list:
         if isinstance(sp, OmniDiffusionSamplingParams):
             sp.num_inference_steps = args.steps
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index e634fdb09aa..6c1f277b366 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -27,13 +27,12 @@
     apply_bot_task_to_sampling_params,
     available_prompt_bot_tasks,
     available_tasks,
-    bot_task_for_task,
     build_prompt,
     build_prompt_tokens,
+    resolve_bot_task,
     stop_token_ids_for_bot_task,
     stop_token_ids_for_task,
     sys_type_for_task,
-    task_for_modality_and_bot_task,
 )
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
@@ -104,8 +103,10 @@ def test_available_tasks_covers_all_modalities():
         ("t2i_vanilla", "image"),
     ],
 )
-def test_bot_task_for_task_matches_prompt_presets(task: str, expected_bot_task: str):
-    assert bot_task_for_task(task) == expected_bot_task
+def test_resolve_bot_task_matches_prompt_presets(task: str, expected_bot_task: str):
+    resolution = resolve_bot_task(task=task)
+    assert resolution.task == task
+    assert resolution.bot_task == expected_bot_task
 
 
 @pytest.mark.parametrize(
@@ -126,17 +127,28 @@ def test_task_for_modality_and_bot_task_composes_prompt_task(
     bot_task: str,
     expected_task: str,
 ):
-    assert task_for_modality_and_bot_task(modality, bot_task) == expected_task
+    assert resolve_bot_task(bot_task, modality=modality).task == expected_task
 
 
-def test_task_for_modality_and_bot_task_rejects_invalid_combinations():
+def test_resolve_bot_task_rejects_invalid_combinations():
     assert available_prompt_bot_tasks() == ["auto", "none", "recaption", "think", "vanilla"]
 
     with pytest.raises(ValueError, match="not supported"):
-        task_for_modality_and_bot_task("img2text", "recaption")
+        resolve_bot_task("recaption", modality="img2text")
 
     with pytest.raises(ValueError, match="not supported"):
-        task_for_modality_and_bot_task("img2img", "vanilla")
+        resolve_bot_task("vanilla", modality="img2img")
+
+
+def test_resolve_bot_task_maps_tokenizer_task_and_stop_ids():
+    tok = FakeTokenizer()
+
+    resolution = resolve_bot_task("think_recaption", tokenizer=tok)
+
+    assert resolution.task is None
+    assert resolution.bot_task == "think_recaption"
+    assert resolution.tokenizer_bot_task == "think"
+    assert resolution.stop_token_ids == [6, 7, 5]
 
 
 def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer():
@@ -359,11 +371,9 @@ def test_end2end_routes_through_shared_prompt_utils():
             imported_from_prompt_utils.update(alias.name for alias in node.names)
     expected_imports = {
         "available_prompt_bot_tasks",
-        "bot_task_for_task",
         "build_prompt_tokens",
-        "stop_token_ids_for_task",
+        "resolve_bot_task",
         "sys_type_for_task",
-        "task_for_modality_and_bot_task",
     }
     assert expected_imports <= imported_from_prompt_utils, (
         "end2end.py must import the HunyuanImage3 prompt and stop-token helpers from "
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index b22acbdaf7a..a3c19c7c28d 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -17,6 +17,7 @@
 
 from __future__ import annotations
 
+from dataclasses import dataclass
 from typing import Any
 
 from .system_prompt import get_system_prompt
@@ -91,13 +92,26 @@
 }
 
 
+@dataclass(frozen=True)
+class BotTaskResolution:
+    """Resolved HunyuanImage3 prompt/bot-task settings."""
+
+    task: str | None
+    sys_type: str | None
+    prompt_bot_task: str | None
+    bot_task: str
+    tokenizer_bot_task: str
+    trigger_tag: str | None
+    stop_token_ids: list[int] | None = None
+
+
 def available_tasks() -> list[str]:
     """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
     return sorted(_TASK_PRESETS)
 
 
 def available_prompt_bot_tasks() -> list[str]:
-    """Sorted public bot_task values accepted by `task_for_modality_and_bot_task`."""
+    """Sorted public bot_task values accepted by `resolve_bot_task` with modality."""
     return sorted(PROMPT_BOT_TASKS)
 
 
@@ -121,13 +135,7 @@ def _normalize_prompt_bot_task(bot_task: str | None) -> str | None:
     return _PROMPT_BOT_TASK_ALIASES[normalized]
 
 
-def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") -> str:
-    """Return the canonical prompt task for an input/output modality.
-
-    `modality` chooses the base route (t2t, t2i, i2t, or it2i/ti2i), while
-    `bot_task` chooses the prompt behavior such as thinking, recaptioning,
-    or the vanilla text-to-image template.
-    """
+def _task_for_modality_and_prompt_bot_task(modality: str, bot_task: str | None = "auto") -> str:
     modality_key = modality.strip().lower()
     if modality_key not in _MODALITY_TO_TASK_PREFIX:
         raise ValueError(f"Unknown modality {modality!r}. Choose from: {sorted(_MODALITY_TO_TASK_PREFIX)}")
@@ -151,25 +159,117 @@ def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto")
     return _TASK_BY_PREFIX_AND_BOT_TASK[task_key]
 
 
+def _bot_task_for_preset_bot_task(preset_bot_task: str | None) -> str:
+    if preset_bot_task == "think":
+        return "think_recaption"
+    return preset_bot_task or "auto"
+
+
+def _tokenizer_bot_task_for_bot_task(bot_task: str) -> str:
+    if bot_task not in _BOT_TASK_TO_TOKENIZER_TASK:
+        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}")
+    return _BOT_TASK_TO_TOKENIZER_TASK[bot_task]
+
+
+def _stop_token_ids_for_tokenizer_bot_task(
+    tokenizer,
+    tokenizer_bot_task: str,
+    image_size: int | str | None = None,
+) -> list[int]:
+    eos_id = _eos_token_id(tokenizer)
+
+    if image_size == "auto":
+        extra_auto_stops = [_token_id(tokenizer, f"<img_ratio_{i}>") for i in range(33)]
+    else:
+        extra_auto_stops = [_token_id(tokenizer, "<boi>")]
+
+    stop_token_id = {
+        "auto": [eos_id] + extra_auto_stops,
+        "image": [eos_id],
+        "recaption": [
+            _token_id(tokenizer, "</recaption>"),
+            _token_id(tokenizer, "</answer>"),
+            eos_id,
+        ],
+        "think": [
+            _token_id(tokenizer, "</recaption>"),
+            _token_id(tokenizer, "</answer>"),
+            eos_id,
+        ],
+    }
+    return stop_token_id[tokenizer_bot_task]
+
+
+def resolve_bot_task(
+    bot_task: str | None = "auto",
+    *,
+    modality: str | None = None,
+    task: str | None = None,
+    tokenizer: Any | None = None,
+    image_size: int | str | None = None,
+) -> BotTaskResolution:
+    """Resolve HunyuanImage3 bot-task related prompt settings.
+
+    Pass `modality + bot_task` for CLI/request-level behavior, `task` for a
+    canonical prompt task, or only `bot_task` to validate/map a pipeline
+    HunyuanImage3 bot_task.
+    """
+    if task is not None and modality is not None:
+        raise ValueError("Pass either task or modality, not both.")
+
+    if task is None and modality is not None:
+        task = _task_for_modality_and_prompt_bot_task(modality, bot_task)
+
+    if task is not None:
+        sys_type, preset_bot_task, trigger_tag = _task_preset(task)
+        resolved_bot_task = _bot_task_for_preset_bot_task(preset_bot_task)
+        prompt_bot_task = preset_bot_task
+    else:
+        sys_type = None
+        trigger_tag = None
+        prompt_bot_task = None
+        resolved_bot_task = "auto" if bot_task is None else bot_task.strip().lower()
+
+    tokenizer_bot_task = _tokenizer_bot_task_for_bot_task(resolved_bot_task)
+    stop_token_ids = (
+        _stop_token_ids_for_tokenizer_bot_task(tokenizer, tokenizer_bot_task, image_size=image_size)
+        if tokenizer is not None
+        else None
+    )
+
+    return BotTaskResolution(
+        task=task,
+        sys_type=sys_type,
+        prompt_bot_task=prompt_bot_task,
+        bot_task=resolved_bot_task,
+        tokenizer_bot_task=tokenizer_bot_task,
+        trigger_tag=trigger_tag,
+        stop_token_ids=stop_token_ids,
+    )
+
+
+def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") -> str:
+    """Return the canonical prompt task for an input/output modality."""
+    task = resolve_bot_task(bot_task, modality=modality).task
+    assert task is not None
+    return task
+
+
 def sys_type_for_task(task: str) -> str:
     """Return the default system prompt type for a canonical prompt task."""
-    preset_sys_type, _, _ = _task_preset(task)
-    return preset_sys_type
+    sys_type = resolve_bot_task(task=task).sys_type
+    assert sys_type is not None
+    return sys_type
 
 
 def bot_task_for_task(task: str) -> str:
     """Return the HunyuanImage3 bot_task associated with a prompt task."""
-    _, preset_bot_task, _ = _task_preset(task)
-    if preset_bot_task == "think":
-        return "think_recaption"
-    return preset_bot_task or "auto"
+    return resolve_bot_task(task=task).bot_task
 
 
 def tokenizer_bot_task_for_bot_task(bot_task: str) -> str:
     """Map the public HunyuanImage3 bot_task to tokenizer-internal task."""
-    if bot_task not in _BOT_TASK_TO_TOKENIZER_TASK:
-        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}")
-    return _BOT_TASK_TO_TOKENIZER_TASK[bot_task]
+    return resolve_bot_task(bot_task).tokenizer_bot_task
 
 
 def _token_id(tokenizer, token: str) -> int:
@@ -198,29 +298,9 @@ def stop_token_ids_for_bot_task(
     their structural end tokens, and all ids are resolved from the
     tokenizer instead of being hard-coded in deploy YAML.
     """
-    eos_id = _eos_token_id(tokenizer)
-
-    if image_size == "auto":
-        extra_auto_stops = [_token_id(tokenizer, f"<img_ratio_{i}>") for i in range(33)]
-    else:
-        extra_auto_stops = [_token_id(tokenizer, "<boi>")]
-
-    tokenizer_bot_task = tokenizer_bot_task_for_bot_task(bot_task)
-    stop_token_id = {
-        "auto": [eos_id] + extra_auto_stops,
-        "image": [eos_id],
-        "recaption": [
-            _token_id(tokenizer, "</recaption>"),
-            _token_id(tokenizer, "</answer>"),
-            eos_id,
-        ],
-        "think": [
-            _token_id(tokenizer, "</recaption>"),
-            _token_id(tokenizer, "</answer>"),
-            eos_id,
-        ],
-    }
-    return stop_token_id[tokenizer_bot_task]
+    stop_token_ids = resolve_bot_task(bot_task, tokenizer=tokenizer, image_size=image_size).stop_token_ids
+    assert stop_token_ids is not None
+    return stop_token_ids
 
 
 def stop_token_ids_for_task(
@@ -229,11 +309,9 @@ def stop_token_ids_for_task(
     image_size: int | str | None = None,
 ) -> list[int]:
     """Return AR stop token ids for a canonical prompt task."""
-    return stop_token_ids_for_bot_task(
-        tokenizer,
-        bot_task_for_task(task),
-        image_size=image_size,
-    )
+    stop_token_ids = resolve_bot_task(task=task, tokenizer=tokenizer, image_size=image_size).stop_token_ids
+    assert stop_token_ids is not None
+    return stop_token_ids
 
 
 def apply_bot_task_to_sampling_params(
@@ -250,7 +328,9 @@ def apply_bot_task_to_sampling_params(
 
     updated_params_list = list(sampling_params_list)
     params = updated_params_list[stage_index]
-    params.stop_token_ids = stop_token_ids_for_bot_task(tokenizer, bot_task, image_size=image_size)
+    stop_token_ids = resolve_bot_task(bot_task, tokenizer=tokenizer, image_size=image_size).stop_token_ids
+    assert stop_token_ids is not None
+    params.stop_token_ids = stop_token_ids
 
     updated_params_list[stage_index] = params
     return updated_params_list
@@ -364,6 +444,7 @@ def build_prompt_tokens(
 
 
 __all__ = [
+    "BotTaskResolution",
     "available_tasks",
     "available_prompt_bot_tasks",
     "apply_bot_task_to_sampling_params",
@@ -372,6 +453,7 @@ def build_prompt_tokens(
     "build_prompt",
     "build_prompt_tokens",
     "PROMPT_BOT_TASKS",
+    "resolve_bot_task",
     "stop_token_ids_for_bot_task",
     "stop_token_ids_for_task",
     "sys_type_for_task",

From 5d88d160f7ac8720d5253b6e4037fc1c1fee558c Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Thu, 7 May 2026 19:07:02 +0800
Subject: [PATCH 08/40] Remove legacy HunyuanImage3 bot task helpers

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/test_prompt_utils.py       | 24 +++----
 .../models/hunyuan_image3/prompt_utils.py     | 68 ++-----------------
 vllm_omni/entrypoints/openai/serving_chat.py  | 16 ++---
 3 files changed, 23 insertions(+), 85 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 6c1f277b366..4298a37870e 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -30,8 +30,6 @@
     build_prompt,
     build_prompt_tokens,
     resolve_bot_task,
-    stop_token_ids_for_bot_task,
-    stop_token_ids_for_task,
     sys_type_for_task,
 )
 
@@ -122,7 +120,7 @@ def test_resolve_bot_task_matches_prompt_presets(task: str, expected_bot_task: s
         ("text2text", "none", "t2t"),
     ],
 )
-def test_task_for_modality_and_bot_task_composes_prompt_task(
+def test_resolve_bot_task_composes_prompt_task(
     modality: str,
     bot_task: str,
     expected_task: str,
@@ -151,25 +149,25 @@ def test_resolve_bot_task_maps_tokenizer_task_and_stop_ids():
     assert resolution.stop_token_ids == [6, 7, 5]
 
 
-def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer():
+def test_resolve_bot_task_resolves_stop_ids_from_bot_task():
     tok = FakeTokenizer()
 
-    assert stop_token_ids_for_bot_task(tok, "auto") == [5, 8]
-    assert stop_token_ids_for_bot_task(tok, "image") == [5]
-    assert stop_token_ids_for_bot_task(tok, "think_recaption") == [6, 7, 5]
-    assert stop_token_ids_for_bot_task(tok, "recaption") == [6, 7, 5]
-    assert stop_token_ids_for_bot_task(tok, "auto", image_size="auto") == [
+    assert resolve_bot_task("auto", tokenizer=tok).stop_token_ids == [5, 8]
+    assert resolve_bot_task("image", tokenizer=tok).stop_token_ids == [5]
+    assert resolve_bot_task("think_recaption", tokenizer=tok).stop_token_ids == [6, 7, 5]
+    assert resolve_bot_task("recaption", tokenizer=tok).stop_token_ids == [6, 7, 5]
+    assert resolve_bot_task("auto", tokenizer=tok, image_size="auto").stop_token_ids == [
         5,
         *range(1000, 1033),
     ]
 
 
-def test_stop_token_ids_for_task_are_resolved_from_prompt_task():
+def test_resolve_bot_task_resolves_stop_ids_from_prompt_task():
     tok = FakeTokenizer()
 
-    assert stop_token_ids_for_task(tok, "i2t") == [5, 8]
-    assert stop_token_ids_for_task(tok, "i2t_think") == [6, 7, 5]
-    assert stop_token_ids_for_task(tok, "t2i_vanilla") == [5]
+    assert resolve_bot_task(task="i2t", tokenizer=tok).stop_token_ids == [5, 8]
+    assert resolve_bot_task(task="i2t_think", tokenizer=tok).stop_token_ids == [6, 7, 5]
+    assert resolve_bot_task(task="t2i_vanilla", tokenizer=tok).stop_token_ids == [5]
 
 
 def test_sys_type_for_task_returns_prompt_preset_default():
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index a3c19c7c28d..c62e8a39437 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -159,18 +159,6 @@ def _task_for_modality_and_prompt_bot_task(modality: str, bot_task: str | None =
     return _TASK_BY_PREFIX_AND_BOT_TASK[task_key]
 
 
-def _bot_task_for_preset_bot_task(preset_bot_task: str | None) -> str:
-    if preset_bot_task == "think":
-        return "think_recaption"
-    return preset_bot_task or "auto"
-
-
-def _tokenizer_bot_task_for_bot_task(bot_task: str) -> str:
-    if bot_task not in _BOT_TASK_TO_TOKENIZER_TASK:
-        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}")
-    return _BOT_TASK_TO_TOKENIZER_TASK[bot_task]
-
-
 def _stop_token_ids_for_tokenizer_bot_task(
     tokenizer,
     tokenizer_bot_task: str,
@@ -222,7 +210,7 @@ def resolve_bot_task(
 
     if task is not None:
         sys_type, preset_bot_task, trigger_tag = _task_preset(task)
-        resolved_bot_task = _bot_task_for_preset_bot_task(preset_bot_task)
+        resolved_bot_task = "think_recaption" if preset_bot_task == "think" else preset_bot_task or "auto"
         prompt_bot_task = preset_bot_task
     else:
         sys_type = None
@@ -230,7 +218,9 @@ def resolve_bot_task(
         prompt_bot_task = None
         resolved_bot_task = "auto" if bot_task is None else bot_task.strip().lower()
 
-    tokenizer_bot_task = _tokenizer_bot_task_for_bot_task(resolved_bot_task)
+    if resolved_bot_task not in _BOT_TASK_TO_TOKENIZER_TASK:
+        raise ValueError(f"Unknown bot_task {resolved_bot_task!r}. Choose from: {list(BOT_TASKS)}")
+    tokenizer_bot_task = _BOT_TASK_TO_TOKENIZER_TASK[resolved_bot_task]
     stop_token_ids = (
         _stop_token_ids_for_tokenizer_bot_task(tokenizer, tokenizer_bot_task, image_size=image_size)
         if tokenizer is not None
@@ -248,13 +238,6 @@ def resolve_bot_task(
     )
 
 
-def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") -> str:
-    """Return the canonical prompt task for an input/output modality."""
-    task = resolve_bot_task(bot_task, modality=modality).task
-    assert task is not None
-    return task
-
-
 def sys_type_for_task(task: str) -> str:
     """Return the default system prompt type for a canonical prompt task."""
     sys_type = resolve_bot_task(task=task).sys_type
@@ -262,16 +245,6 @@ def sys_type_for_task(task: str) -> str:
     return sys_type
 
 
-def bot_task_for_task(task: str) -> str:
-    """Return the HunyuanImage3 bot_task associated with a prompt task."""
-    return resolve_bot_task(task=task).bot_task
-
-
-def tokenizer_bot_task_for_bot_task(bot_task: str) -> str:
-    """Map the public HunyuanImage3 bot_task to tokenizer-internal task."""
-    return resolve_bot_task(bot_task).tokenizer_bot_task
-
-
 def _token_id(tokenizer, token: str) -> int:
     token_id = tokenizer.convert_tokens_to_ids(token)
     if token_id is None:
@@ -286,34 +259,6 @@ def _eos_token_id(tokenizer) -> int:
     return _token_id(tokenizer, "<|endoftext|>")
 
 
-def stop_token_ids_for_bot_task(
-    tokenizer,
-    bot_task: str,
-    image_size: int | str | None = None,
-) -> list[int]:
-    """Return AR stop token ids for a HunyuanImage3 bot_task.
-
-    Mirrors the official HunyuanImage-3.0 generation logic: `auto`
-    additionally stops on image-start markers, text/image tasks stop on
-    their structural end tokens, and all ids are resolved from the
-    tokenizer instead of being hard-coded in deploy YAML.
-    """
-    stop_token_ids = resolve_bot_task(bot_task, tokenizer=tokenizer, image_size=image_size).stop_token_ids
-    assert stop_token_ids is not None
-    return stop_token_ids
-
-
-def stop_token_ids_for_task(
-    tokenizer,
-    task: str,
-    image_size: int | str | None = None,
-) -> list[int]:
-    """Return AR stop token ids for a canonical prompt task."""
-    stop_token_ids = resolve_bot_task(task=task, tokenizer=tokenizer, image_size=image_size).stop_token_ids
-    assert stop_token_ids is not None
-    return stop_token_ids
-
-
 def apply_bot_task_to_sampling_params(
     sampling_params_list: list[Any],
     tokenizer: Any,
@@ -448,15 +393,10 @@ def build_prompt_tokens(
     "available_tasks",
     "available_prompt_bot_tasks",
     "apply_bot_task_to_sampling_params",
-    "bot_task_for_task",
     "BOT_TASKS",
     "build_prompt",
     "build_prompt_tokens",
     "PROMPT_BOT_TASKS",
     "resolve_bot_task",
-    "stop_token_ids_for_bot_task",
-    "stop_token_ids_for_task",
     "sys_type_for_task",
-    "task_for_modality_and_bot_task",
-    "tokenizer_bot_task_for_bot_task",
 ]
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index b2375fd38b4..03214d21612 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -206,9 +206,9 @@ async def create_chat_completion(
             )
             request_chat_template_kwargs = request.chat_template_kwargs or {}
             if bot_task is not None:
-                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task
+                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_bot_task
 
-                tokenizer_bot_task = tokenizer_bot_task_for_bot_task(bot_task)
+                tokenizer_bot_task = resolve_bot_task(bot_task).tokenizer_bot_task
                 request_chat_template_kwargs = dict(request_chat_template_kwargs)
                 request_chat_template_kwargs["bot_task"] = tokenizer_bot_task
 
@@ -771,12 +771,12 @@ async def _apply_hunyuan_image3_bot_task_sampling_params(
         from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
             BOT_TASKS,
             apply_bot_task_to_sampling_params,
-            tokenizer_bot_task_for_bot_task,
+            resolve_bot_task,
         )
 
         if bot_task not in BOT_TASKS:
             raise ValueError(f"Unknown HunyuanImage3 bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}")
-        tokenizer_bot_task_for_bot_task(bot_task)
+        resolve_bot_task(bot_task)
 
         if tokenizer is None and hasattr(engine, "get_tokenizer"):
             tokenizer = await engine.get_tokenizer()
@@ -2298,9 +2298,9 @@ def _build_multistage_generation_inputs(
         if width is not None:
             mm_processor_kwargs["target_w"] = width
         if bot_task is not None:
-            from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task
+            from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_bot_task
 
-            mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_bot_task(bot_task)
+            mm_processor_kwargs["bot_task"] = resolve_bot_task(bot_task).tokenizer_bot_task
             engine_prompt["bot_task"] = bot_task
         if mm_processor_kwargs:
             engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs
@@ -2692,11 +2692,11 @@ async def _create_diffusion_chat_completion(
             if self._get_hunyuan_image3_ar_stage_index(stage_configs) is None:
                 bot_task = None
             elif bot_task is not None:
-                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task
+                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_bot_task
 
                 gen_prompt["bot_task"] = bot_task
                 gen_prompt["mm_processor_kwargs"] = {
-                    "bot_task": tokenizer_bot_task_for_bot_task(bot_task),
+                    "bot_task": resolve_bot_task(bot_task).tokenizer_bot_task,
                 }
             sampling_params_list = build_stage_sampling_params_list(
                 stage_configs,

From 7d70ae5723a2cfcb64b3d333ac62298a8dbc4c99 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Thu, 7 May 2026 19:08:49 +0800
Subject: [PATCH 09/40] Remove online HunyuanImage3 bot task changes

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/test_prompt_utils.py       |  25 ---
 .../models/hunyuan_image3/prompt_utils.py     |  23 ---
 vllm_omni/entrypoints/openai/api_server.py    |   5 -
 .../entrypoints/openai/protocol/images.py     |   4 -
 vllm_omni/entrypoints/openai/serving_chat.py  | 154 +-----------------
 5 files changed, 9 insertions(+), 202 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 4298a37870e..8d9448ea6b9 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -24,7 +24,6 @@
 import pytest
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-    apply_bot_task_to_sampling_params,
     available_prompt_bot_tasks,
     available_tasks,
     build_prompt,
@@ -175,30 +174,6 @@ def test_sys_type_for_task_returns_prompt_preset_default():
     assert sys_type_for_task("t2i_vanilla") == "en_vanilla"
 
 
-class FakeSamplingParams:
-    def __init__(self, stop_token_ids: list[int] | None = None, max_tokens: int = 16) -> None:
-        self.stop_token_ids = stop_token_ids
-        self.max_tokens = max_tokens
-
-
-def test_apply_bot_task_to_sampling_params_updates_only_target_stage():
-    tok = FakeTokenizer()
-    stage0 = FakeSamplingParams(stop_token_ids=[999])
-    stage1 = FakeSamplingParams(stop_token_ids=[888])
-
-    updated = apply_bot_task_to_sampling_params(
-        [stage0, stage1],
-        tok,
-        "think_recaption",
-        stage_index=0,
-    )
-
-    assert updated[0] is stage0
-    assert updated[0].stop_token_ids == [6, 7, 5]
-    assert updated[1] is stage1
-    assert stage0.stop_token_ids == [6, 7, 5]
-
-
 @pytest.mark.parametrize(
     "task",
     [
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index c62e8a39437..f6a622180a0 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -259,28 +259,6 @@ def _eos_token_id(tokenizer) -> int:
     return _token_id(tokenizer, "<|endoftext|>")
 
 
-def apply_bot_task_to_sampling_params(
-    sampling_params_list: list[Any],
-    tokenizer: Any,
-    bot_task: str,
-    *,
-    stage_index: int = 0,
-    image_size: int | str | None = None,
-) -> list[Any]:
-    """Apply a per-request HunyuanImage3 bot_task to one AR stage."""
-    if stage_index < 0 or stage_index >= len(sampling_params_list):
-        raise IndexError(f"stage_index {stage_index} is out of range for {len(sampling_params_list)} sampling params")
-
-    updated_params_list = list(sampling_params_list)
-    params = updated_params_list[stage_index]
-    stop_token_ids = resolve_bot_task(bot_task, tokenizer=tokenizer, image_size=image_size).stop_token_ids
-    assert stop_token_ids is not None
-    params.stop_token_ids = stop_token_ids
-
-    updated_params_list[stage_index] = params
-    return updated_params_list
-
-
 def build_prompt(
     user_prompt: str,
     task: str = "it2i_think",
@@ -392,7 +370,6 @@ def build_prompt_tokens(
     "BotTaskResolution",
     "available_tasks",
     "available_prompt_bot_tasks",
-    "apply_bot_task_to_sampling_params",
     "BOT_TASKS",
     "build_prompt",
     "build_prompt_tokens",
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 9b3aec58f21..06fb0a7f4cb 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1527,8 +1527,6 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request)
                 extra_body["guidance_scale"] = request.guidance_scale
             if request.true_cfg_scale is not None:
                 extra_body["true_cfg_scale"] = request.true_cfg_scale
-            if request.bot_task is not None:
-                extra_body["bot_task"] = request.bot_task
             if request.generator_device is not None:
                 extra_body["generator_device"] = request.generator_device
             if request.lora is not None:
@@ -1695,7 +1693,6 @@ async def edit_images(
     guidance_scale: float | None = Form(None),
     strength: float | None = Form(None),
     true_cfg_scale: float | None = Form(None),
-    bot_task: str | None = Form(None),
     seed: int | None = Form(None),
     generator_device: str | None = Form(None),
     # vllm-omni extension for per-request LoRA.
@@ -1899,8 +1896,6 @@ async def edit_images(
                 extra_body["strength"] = strength
             if true_cfg_scale is not None:
                 extra_body["true_cfg_scale"] = true_cfg_scale
-            if bot_task is not None:
-                extra_body["bot_task"] = bot_task
             if layers is not None:
                 extra_body["layers"] = layers
             if resolution is not None:
diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py
index c78a95de058..0fb22a548cf 100644
--- a/vllm_omni/entrypoints/openai/protocol/images.py
+++ b/vllm_omni/entrypoints/openai/protocol/images.py
@@ -117,10 +117,6 @@ def validate_use_system_prompt(cls, v):
         le=20.0,
         description="True CFG scale (model-specific parameter, may be ignored if not supported)",
     )
-    bot_task: str | None = Field(
-        default=None,
-        description="HunyuanImage3 AR bot_task for this request: auto, image, recaption, or think_recaption.",
-    )
     seed: int | None = Field(default=None, description="Random seed for reproducibility")
     generator_device: str | None = Field(
         default=None,
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 03214d21612..09b62bf8972 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -197,25 +197,10 @@ async def create_chat_completion(
             if tokenizer is None:
                 tokenizer = await self.engine_client.get_tokenizer()
 
-            extra_body = self._get_extra_body_from_request(request)
-            bot_task = (
-                extra_body.get("bot_task")
-                if self._get_hunyuan_image3_ar_stage_index(list(getattr(self.engine_client, "stage_configs", []) or []))
-                is not None
-                else None
-            )
-            request_chat_template_kwargs = request.chat_template_kwargs or {}
-            if bot_task is not None:
-                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_bot_task
-
-                tokenizer_bot_task = resolve_bot_task(bot_task).tokenizer_bot_task
-                request_chat_template_kwargs = dict(request_chat_template_kwargs)
-                request_chat_template_kwargs["bot_task"] = tokenizer_bot_task
-
             reasoning_parser: ReasoningParser | None = None
             if self.reasoning_parser_cls:
                 chat_template_kwargs = self._prepare_extra_chat_template_kwargs(
-                    request_chat_template_kwargs,
+                    request.chat_template_kwargs,
                     self.default_chat_template_kwargs,
                 )
                 reasoning_parser = self.reasoning_parser_cls(
@@ -263,13 +248,13 @@ async def create_chat_completion(
             if not self.use_harmony:
                 error_check_ret = self._validate_chat_template(
                     request_chat_template=request.chat_template,
-                    chat_template_kwargs=request_chat_template_kwargs,
+                    chat_template_kwargs=request.chat_template_kwargs,
                     trust_request_chat_template=self.trust_request_chat_template,
                 )
                 if error_check_ret is not None:
                     return error_check_ret
 
-                chat_template_kwargs = dict(request_chat_template_kwargs)
+                chat_template_kwargs = request.chat_template_kwargs or {}
                 chat_template_kwargs.update(reasoning_effort=request.reasoning_effort)
 
                 # Merge chat_template_kwargs with defaults
@@ -336,7 +321,9 @@ async def create_chat_completion(
                 #   `extra_body` is flattented and merged into the payload's root.
                 #   These extra fields are accessible via `model_extra` property (from Pydantic base class).
                 #   When sending raw request with curl, no flattening happens. Directly read the `extra_body` dict.
-                extra_body = self._get_extra_body_from_request(request)
+                extra_body = getattr(request, "extra_body", None)
+                if not extra_body:
+                    extra_body = request.model_extra or {}
 
                 height, width = self._resolve_height_width_from_extra_body(extra_body)
 
@@ -380,9 +367,6 @@ async def create_chat_completion(
                     mm_processor_kwargs["target_h"] = height
                 if width is not None:
                     mm_processor_kwargs["target_w"] = width
-                if bot_task is not None:
-                    mm_processor_kwargs["bot_task"] = tokenizer_bot_task
-                    tprompt["bot_task"] = bot_task
                 tprompt["mm_processor_kwargs"] = mm_processor_kwargs
                 if engine_prompt_image is not None:
                     tprompt["multi_modal_data"] = engine_prompt_image
@@ -420,12 +404,6 @@ async def create_chat_completion(
                 # to delta to ensure emitted outputs are correctly drained. Otherwise
                 # convert cumulative to Final Only to ensure the output is correct.
                 sampling_params_list = coerce_param_message_types(sampling_params_list, request.stream)
-                sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params(
-                    engine=self.engine_client,
-                    sampling_params_list=sampling_params_list,
-                    bot_task=bot_task,
-                    tokenizer=tokenizer,
-                )
 
                 # Apply user-specified overrides to diffusion stage(s) for image generation
                 for idx, sp in enumerate(sampling_params_list):
@@ -707,89 +685,6 @@ def _to_sampling_params_list(self, sampling_params_list: list[dict]) -> list[Sam
                 raise ValueError(f"Invalid sampling params: {sampling_params}")
         return final_sampling_params_list
 
-    @staticmethod
-    def _get_extra_body_from_request(request: Any) -> dict[str, Any]:
-        body: dict[str, Any] = {}
-        model_extra = getattr(request, "model_extra", None)
-        if isinstance(model_extra, dict):
-            body.update(model_extra)
-        extra_body = getattr(request, "extra_body", None)
-        if isinstance(extra_body, dict):
-            body.update(extra_body)
-        return body
-
-    @staticmethod
-    def _stage_config_get(stage_config: Any, key: str) -> Any:
-        if isinstance(stage_config, dict):
-            return stage_config.get(key)
-        if hasattr(stage_config, "get"):
-            try:
-                return stage_config.get(key)
-            except Exception:
-                pass
-        return getattr(stage_config, key, None)
-
-    @classmethod
-    def _is_hunyuan_image3_stage(cls, stage_config: Any) -> bool:
-        model_arch = cls._stage_config_get(stage_config, "model_arch")
-        if model_arch == "HunyuanImage3ForCausalMM":
-            return True
-
-        engine_args = cls._stage_config_get(stage_config, "engine_args")
-        if isinstance(engine_args, dict):
-            return engine_args.get("model_arch") == "HunyuanImage3ForCausalMM"
-        if engine_args is not None and hasattr(engine_args, "get"):
-            try:
-                return engine_args.get("model_arch") == "HunyuanImage3ForCausalMM"
-            except Exception:
-                pass
-        return getattr(engine_args, "model_arch", None) == "HunyuanImage3ForCausalMM"
-
-    @classmethod
-    def _get_hunyuan_image3_ar_stage_index(cls, stage_configs: list[Any]) -> int | None:
-        for idx, stage_config in enumerate(stage_configs):
-            if cls._is_hunyuan_image3_stage(stage_config) and get_stage_type(stage_config) != "diffusion":
-                return idx
-        return None
-
-    async def _apply_hunyuan_image3_bot_task_sampling_params(
-        self,
-        *,
-        engine: Any,
-        sampling_params_list: list[Any],
-        bot_task: Any,
-        tokenizer: Any | None = None,
-    ) -> list[Any]:
-        if bot_task is None:
-            return sampling_params_list
-
-        stage_configs = list(getattr(engine, "stage_configs", []) or [])
-        stage_index = self._get_hunyuan_image3_ar_stage_index(stage_configs)
-        if stage_index is None:
-            return sampling_params_list
-
-        from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-            BOT_TASKS,
-            apply_bot_task_to_sampling_params,
-            resolve_bot_task,
-        )
-
-        if bot_task not in BOT_TASKS:
-            raise ValueError(f"Unknown HunyuanImage3 bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}")
-        resolve_bot_task(bot_task)
-
-        if tokenizer is None and hasattr(engine, "get_tokenizer"):
-            tokenizer = await engine.get_tokenizer()
-        if tokenizer is None:
-            raise ValueError("Cannot resolve tokenizer to apply HunyuanImage3 bot_task stop tokens.")
-
-        return apply_bot_task_to_sampling_params(
-            sampling_params_list,
-            tokenizer,
-            bot_task,
-            stage_index=stage_index,
-        )
-
     def _get_comprehension_stage_index(self) -> int:
         for idx, stage in enumerate(self.engine_client.stage_configs):
             if stage.is_comprehension:
@@ -2254,11 +2149,6 @@ def _build_multistage_generation_inputs(
         lora_body = extra_body.get("lora")
         layers = extra_body.get("layers")
         resolution = extra_body.get("resolution")
-        bot_task = (
-            extra_body.get("bot_task")
-            if self._get_hunyuan_image3_ar_stage_index(list(stage_configs)) is not None
-            else None
-        )
 
         engine_prompt_data: dict[str, Any] | None = None
         modalities = ["image"]
@@ -2297,11 +2187,6 @@ def _build_multistage_generation_inputs(
             mm_processor_kwargs["target_h"] = height
         if width is not None:
             mm_processor_kwargs["target_w"] = width
-        if bot_task is not None:
-            from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_bot_task
-
-            mm_processor_kwargs["bot_task"] = resolve_bot_task(bot_task).tokenizer_bot_task
-            engine_prompt["bot_task"] = bot_task
         if mm_processor_kwargs:
             engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs
         if engine_prompt_data is not None:
@@ -2398,7 +2283,6 @@ async def generate_diffusion_images(
         negative_prompt = extra_body.get("negative_prompt")
         num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1)
         lora_body = extra_body.get("lora")
-        bot_task = extra_body.get("bot_task")
 
         pil_images: list[Image.Image] = []
         for img_b64 in reference_images:
@@ -2482,12 +2366,6 @@ async def generate_diffusion_images(
                 engine_prompt = gen_prompt
                 sampling_params_list = [gen_params]
 
-            sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params(
-                engine=diffusion_engine,
-                sampling_params_list=sampling_params_list,
-                bot_task=bot_task,
-            )
-
             result = None
             async for output in diffusion_engine.generate(
                 prompt=engine_prompt,
@@ -2556,7 +2434,9 @@ async def _create_diffusion_chat_completion(
             #   `extra_body` is flattented and merged into the payload's root.
             #   These extra fields are accessible via `model_extra` property (from Pydantic base class).
             #   When sending raw request with curl, no flattening happens. Directly read the `extra_body` dict.
-            extra_body = self._get_extra_body_from_request(request)
+            extra_body = getattr(request, "extra_body", None)
+            if not extra_body:
+                extra_body = request.model_extra or {}
 
             # Parse size if provided (supports "1024x1024" format)
             height, width = self._resolve_height_width_from_extra_body(extra_body)
@@ -2575,7 +2455,6 @@ async def _create_diffusion_chat_completion(
                 seed = getattr(request, "seed", None)
             negative_prompt = extra_body.get("negative_prompt")
             num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1)
-            bot_task = extra_body.get("bot_task")
 
             # Text-to-video parameters (ref: text_to_video.py)
             num_frames = extra_body.get("num_frames")
@@ -2689,15 +2568,6 @@ async def _create_diffusion_chat_completion(
             # Generate image or audio (e.g. AudioX) via AsyncOmni
             diffusion_engine = cast(AsyncOmni, self._diffusion_engine)
             stage_configs = list(getattr(diffusion_engine, "stage_configs", []) or [])
-            if self._get_hunyuan_image3_ar_stage_index(stage_configs) is None:
-                bot_task = None
-            elif bot_task is not None:
-                from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_bot_task
-
-                gen_prompt["bot_task"] = bot_task
-                gen_prompt["mm_processor_kwargs"] = {
-                    "bot_task": resolve_bot_task(bot_task).tokenizer_bot_task,
-                }
             sampling_params_list = build_stage_sampling_params_list(
                 stage_configs,
                 get_default_sampling_params_list(diffusion_engine),
@@ -2708,12 +2578,6 @@ async def _create_diffusion_chat_completion(
             if not sampling_params_list:
                 sampling_params_list = [gen_params]
 
-            sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params(
-                engine=diffusion_engine,
-                sampling_params_list=sampling_params_list,
-                bot_task=bot_task,
-            )
-
             result = None
             async for output in diffusion_engine.generate(
                 prompt=gen_prompt,

From 09a025993982b49c86331a523cbf110a4624bcf9 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Fri, 8 May 2026 09:33:57 +0800
Subject: [PATCH 10/40] Hardcode HunyuanImage3 offline control token ids

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../models/hunyuan_image3/prompt_utils.py     | 42 +++++++++++++++----
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index f6a622180a0..248a13943fe 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -24,6 +24,29 @@
 
 BOT_TASKS = ("auto", "image", "recaption", "think_recaption")
 PROMPT_BOT_TASKS = ("auto", "none", "think", "recaption", "vanilla")
+
+# HunyuanImage-3.0-Instruct special token ids from tokenizer.json.
+# Keep offline AR prompt/stop-token behavior independent of runtime
+# tokenizer lookup for these fixed control tokens.
+HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS: dict[str, int] = {
+    "<|endoftext|>": 127957,
+    "<|startoftext|>": 127958,
+    "<boi>": 128000,
+    "<eoi>": 128001,
+    "<img>": 128006,
+    "<cfg>": 128010,
+    "<recaption>": 128018,
+    "</recaption>": 128019,
+    "<think>": 128023,
+    "</think>": 128024,
+    "<answer>": 128025,
+    "</answer>": 128026,
+    "<img_size_1024>": 128037,
+    "<img_ratio_0>": 128044,
+    "<img_ratio_32>": 128076,
+    "<img_ratio_33>": 130103,
+    "<img_ratio_36>": 130106,
+}
 _BOT_TASK_TO_TOKENIZER_TASK = {
     "auto": "auto",
     "image": "image",
@@ -167,7 +190,9 @@ def _stop_token_ids_for_tokenizer_bot_task(
     eos_id = _eos_token_id(tokenizer)
 
     if image_size == "auto":
-        extra_auto_stops = [_token_id(tokenizer, f"<img_ratio_{i}>") for i in range(33)]
+        start_ratio_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_0>"]
+        end_ratio_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_32>"]
+        extra_auto_stops = list(range(start_ratio_id, end_ratio_id + 1))
     else:
         extra_auto_stops = [_token_id(tokenizer, "<boi>")]
 
@@ -246,6 +271,9 @@ def sys_type_for_task(task: str) -> str:
 
 
 def _token_id(tokenizer, token: str) -> int:
+    if token in HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS:
+        return HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[token]
+
     token_id = tokenizer.convert_tokens_to_ids(token)
     if token_id is None:
         raise ValueError(f"Tokenizer does not know special token {token!r}")
@@ -253,10 +281,7 @@ def _token_id(tokenizer, token: str) -> int:
 
 
 def _eos_token_id(tokenizer) -> int:
-    token_id = getattr(tokenizer, "eos_token_id", None)
-    if token_id is not None:
-        return int(token_id)
-    return _token_id(tokenizer, "<|endoftext|>")
+    return HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
 
 
 def build_prompt(
@@ -334,9 +359,9 @@ def build_prompt_tokens(
     preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task)
     effective_sys_type = sys_type or preset_sys_type
 
-    bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>")
-    img_id = tokenizer.convert_tokens_to_ids("<img>")
-    trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None
+    bos_id = _token_id(tokenizer, "<|startoftext|>")
+    img_id = _token_id(tokenizer, "<img>")
+    trig_id = _token_id(tokenizer, trigger_tag) if trigger_tag else None
 
     has_image_input = _task_has_image_input(task)
 
@@ -373,6 +398,7 @@ def build_prompt_tokens(
     "BOT_TASKS",
     "build_prompt",
     "build_prompt_tokens",
+    "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS",
     "PROMPT_BOT_TASKS",
     "resolve_bot_task",
     "sys_type_for_task",

From 2cc6ad75f92b90b97c60a6c004e69949e21a1ac1 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Fri, 8 May 2026 09:33:57 +0800
Subject: [PATCH 11/40] Hardcode HunyuanImage3 offline control token ids (update tests)

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/test_prompt_utils.py       | 80 ++++++++++++++-----
 1 file changed, 59 insertions(+), 21 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 8d9448ea6b9..e858944c0a4 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -24,6 +24,7 @@
 import pytest
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS,
     available_prompt_bot_tasks,
     available_tasks,
     build_prompt,
@@ -41,7 +42,7 @@
 class FakeTokenizer:
     """Minimal tokenizer stub that records every encode() call.
 
-    Returns deterministic ids: special tokens map to small ints (1-4),
+    Returns deterministic ids from convert_tokens_to_ids while
     encode() returns one id per character starting at 100. This lets
     tests both verify segmentation (by inspecting `encode_calls`) and
     locate substrings inside the returned id list.
@@ -145,28 +146,56 @@ def test_resolve_bot_task_maps_tokenizer_task_and_stop_ids():
     assert resolution.task is None
     assert resolution.bot_task == "think_recaption"
     assert resolution.tokenizer_bot_task == "think"
-    assert resolution.stop_token_ids == [6, 7, 5]
+    assert resolution.stop_token_ids == [
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"],
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</answer>"],
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"],
+    ]
 
 
 def test_resolve_bot_task_resolves_stop_ids_from_bot_task():
     tok = FakeTokenizer()
 
-    assert resolve_bot_task("auto", tokenizer=tok).stop_token_ids == [5, 8]
-    assert resolve_bot_task("image", tokenizer=tok).stop_token_ids == [5]
-    assert resolve_bot_task("think_recaption", tokenizer=tok).stop_token_ids == [6, 7, 5]
-    assert resolve_bot_task("recaption", tokenizer=tok).stop_token_ids == [6, 7, 5]
+    eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
+    boi_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<boi>"]
+    end_recaption_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"]
+    end_answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</answer>"]
+
+    assert resolve_bot_task("auto", tokenizer=tok).stop_token_ids == [eos_id, boi_id]
+    assert resolve_bot_task("image", tokenizer=tok).stop_token_ids == [eos_id]
+    assert resolve_bot_task("think_recaption", tokenizer=tok).stop_token_ids == [
+        end_recaption_id,
+        end_answer_id,
+        eos_id,
+    ]
+    assert resolve_bot_task("recaption", tokenizer=tok).stop_token_ids == [
+        end_recaption_id,
+        end_answer_id,
+        eos_id,
+    ]
     assert resolve_bot_task("auto", tokenizer=tok, image_size="auto").stop_token_ids == [
-        5,
-        *range(1000, 1033),
+        eos_id,
+        *range(
+            HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_0>"],
+            HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_32>"] + 1,
+        ),
     ]
 
 
 def test_resolve_bot_task_resolves_stop_ids_from_prompt_task():
     tok = FakeTokenizer()
 
-    assert resolve_bot_task(task="i2t", tokenizer=tok).stop_token_ids == [5, 8]
-    assert resolve_bot_task(task="i2t_think", tokenizer=tok).stop_token_ids == [6, 7, 5]
-    assert resolve_bot_task(task="t2i_vanilla", tokenizer=tok).stop_token_ids == [5]
+    eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
+    assert resolve_bot_task(task="i2t", tokenizer=tok).stop_token_ids == [
+        eos_id,
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<boi>"],
+    ]
+    assert resolve_bot_task(task="i2t_think", tokenizer=tok).stop_token_ids == [
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"],
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</answer>"],
+        eos_id,
+    ]
+    assert resolve_bot_task(task="t2i_vanilla", tokenizer=tok).stop_token_ids == [eos_id]
 
 
 def test_sys_type_for_task_returns_prompt_preset_default():
@@ -265,25 +294,31 @@ def test_build_prompt_tokens_segments_each_boundary():
 def test_build_prompt_tokens_image_placeholder_present_for_image_tasks():
     tok = FakeTokenizer()
     ids = build_prompt_tokens("hi", tok, task="i2t")
-    assert ids[0] == 1, "BOS (<|startoftext|>) must be the first token"
-    assert 2 in ids, "<img> placeholder must be present for i2t/it2i tasks"
+    assert ids[0] == HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|startoftext|>"], (
+        "BOS (<|startoftext|>) must be the first token"
+    )
+    assert HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img>"] in ids, (
+        "<img> placeholder must be present for i2t/it2i tasks"
+    )
 
 
 def test_build_prompt_tokens_no_image_for_text_only_tasks():
     tok = FakeTokenizer()
     ids = build_prompt_tokens("hi", tok, task="t2t")
-    assert 2 not in ids, "<img> must NOT appear for text-only tasks"
+    assert HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img>"] not in ids, (
+        "<img> must NOT appear for text-only tasks"
+    )
 
 
 @pytest.mark.parametrize(
     "task,trigger_id",
     [
-        ("t2t_think", 3),
-        ("i2t_think", 3),
-        ("it2i_think", 3),
-        ("t2i_think", 3),
-        ("it2i_recaption", 4),
-        ("t2i_recaption", 4),
+        ("t2t_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<think>"]),
+        ("i2t_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<think>"]),
+        ("it2i_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<think>"]),
+        ("t2i_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<think>"]),
+        ("it2i_recaption", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<recaption>"]),
+        ("t2i_recaption", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<recaption>"]),
     ],
 )
 def test_build_prompt_tokens_trigger_is_last_token(task: str, trigger_id: int):
@@ -297,7 +332,10 @@ def test_build_prompt_tokens_no_trigger_for_plain_tasks():
     """Tasks without trigger_tag (t2t / i2t) must NOT append a trigger id."""
     tok = FakeTokenizer()
     ids = build_prompt_tokens("hi", tok, task="t2t")
-    assert ids[-1] not in {3, 4}  # neither <think> nor <recaption>
+    assert ids[-1] not in {
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<think>"],
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<recaption>"],
+    }
 
 
 # -------------------- end2end.py wiring guard --------------------

From 12a77da318d0c78be64877689c77642e45187d41 Mon Sep 17 00:00:00 2001
From: "Y. Fisher" <yukexiong1@huawei.com>
Date: Fri, 8 May 2026 11:37:22 +0800
Subject: [PATCH 12/40] Refactor prompt_utils.py

Signed-off-by: Y. Fisher <yukexiong1@huawei.com>
Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../models/hunyuan_image3/prompt_utils.py     | 261 ++----------------
 1 file changed, 24 insertions(+), 237 deletions(-)

diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 248a13943fe..9754cf2c82f 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -22,8 +22,6 @@
 
 from .system_prompt import get_system_prompt
 
-BOT_TASKS = ("auto", "image", "recaption", "think_recaption")
-PROMPT_BOT_TASKS = ("auto", "none", "think", "recaption", "vanilla")
 
 # HunyuanImage-3.0-Instruct special token ids from tokenizer.json.
 # Keep offline AR prompt/stop-token behavior independent of runtime
@@ -47,17 +45,10 @@
     "<img_ratio_33>": 130103,
     "<img_ratio_36>": 130106,
 }
-_BOT_TASK_TO_TOKENIZER_TASK = {
-    "auto": "auto",
-    "image": "image",
-    "recaption": "recaption",
-    "think_recaption": "think",
-}
 
 # task -> (sys_type, bot_task, trigger_tag)
 _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
     "t2t": ("en_unified", None, None),
-    "t2t_think": ("en_unified", "think", "<think>"),
     "i2t": ("en_unified", None, None),
     "i2t_think": ("en_unified", "think", "<think>"),
     "it2i_think": ("en_unified", "think", "<think>"),
@@ -67,221 +58,22 @@
     "t2i_vanilla": ("en_vanilla", "image", None),
 }
 
-_MODALITY_TO_TASK_PREFIX = {
-    "text2text": "t2t",
-    "t2t": "t2t",
-    "img2text": "i2t",
-    "image2text": "i2t",
-    "i2t": "i2t",
-    "text2img": "t2i",
-    "text2image": "t2i",
-    "t2i": "t2i",
-    "img2img": "it2i",
-    "image2image": "it2i",
-    "it2i": "it2i",
-    "ti2i": "it2i",
-}
-
-_DEFAULT_BOT_TASK_BY_PREFIX: dict[str, str | None] = {
-    "t2t": None,
-    "i2t": None,
-    "t2i": "think",
-    "it2i": "think",
-}
-
-_TASK_BY_PREFIX_AND_BOT_TASK: dict[tuple[str, str | None], str] = {
-    ("t2t", None): "t2t",
-    ("t2t", "think"): "t2t_think",
-    ("i2t", None): "i2t",
-    ("i2t", "think"): "i2t_think",
-    ("t2i", "think"): "t2i_think",
-    ("t2i", "recaption"): "t2i_recaption",
-    ("t2i", "vanilla"): "t2i_vanilla",
-    ("it2i", "think"): "it2i_think",
-    ("it2i", "recaption"): "it2i_recaption",
-}
-
-_PROMPT_BOT_TASK_ALIASES: dict[str, str | None] = {
-    "auto": "auto",
-    "default": "auto",
-    "none": None,
-    "no": None,
-    "false": None,
-    "think": "think",
-    "think_recaption": "think",
-    "recaption": "recaption",
-    "image": "vanilla",
-    "vanilla": "vanilla",
-}
-
-
-@dataclass(frozen=True)
-class BotTaskResolution:
-    """Resolved HunyuanImage3 prompt/bot-task settings."""
-
-    task: str | None
-    sys_type: str | None
-    prompt_bot_task: str | None
-    bot_task: str
-    tokenizer_bot_task: str
-    trigger_tag: str | None
-    stop_token_ids: list[int] | None = None
-
 
 def available_tasks() -> list[str]:
     """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
     return sorted(_TASK_PRESETS)
 
+def resolve_stop_token_ids(
+    task: str = "it2i_think",
+    bot_task: str = "think", 
+    tokenizer: Any | None = None):
+    tkw = tokenizer
+    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+    stop_token_ids = [127957]
+    if trigger_tag:
+        stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag))
+    return stop_token_ids
 
-def available_prompt_bot_tasks() -> list[str]:
-    """Sorted public bot_task values accepted by `resolve_bot_task` with modality."""
-    return sorted(PROMPT_BOT_TASKS)
-
-
-def _task_preset(task: str) -> tuple[str, str | None, str | None]:
-    if task not in _TASK_PRESETS:
-        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
-    return _TASK_PRESETS[task]
-
-
-def _task_has_image_input(task: str) -> bool:
-    return task.startswith(("i2t", "it2i"))
-
-
-def _normalize_prompt_bot_task(bot_task: str | None) -> str | None:
-    if bot_task is None:
-        return "auto"
-
-    normalized = bot_task.strip().lower()
-    if normalized not in _PROMPT_BOT_TASK_ALIASES:
-        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_prompt_bot_tasks()}")
-    return _PROMPT_BOT_TASK_ALIASES[normalized]
-
-
-def _task_for_modality_and_prompt_bot_task(modality: str, bot_task: str | None = "auto") -> str:
-    modality_key = modality.strip().lower()
-    if modality_key not in _MODALITY_TO_TASK_PREFIX:
-        raise ValueError(f"Unknown modality {modality!r}. Choose from: {sorted(_MODALITY_TO_TASK_PREFIX)}")
-
-    task_prefix = _MODALITY_TO_TASK_PREFIX[modality_key]
-    normalized_bot_task = _normalize_prompt_bot_task(bot_task)
-    if normalized_bot_task == "auto":
-        normalized_bot_task = _DEFAULT_BOT_TASK_BY_PREFIX[task_prefix]
-
-    task_key = (task_prefix, normalized_bot_task)
-    if task_key not in _TASK_BY_PREFIX_AND_BOT_TASK:
-        valid_bot_tasks = sorted(
-            "none" if candidate is None else candidate
-            for prefix, candidate in _TASK_BY_PREFIX_AND_BOT_TASK
-            if prefix == task_prefix
-        )
-        raise ValueError(
-            f"bot_task {bot_task!r} is not supported for modality {modality!r}. Choose from: {valid_bot_tasks}"
-        )
-
-    return _TASK_BY_PREFIX_AND_BOT_TASK[task_key]
-
-
-def _stop_token_ids_for_tokenizer_bot_task(
-    tokenizer,
-    tokenizer_bot_task: str,
-    image_size: int | str | None = None,
-) -> list[int]:
-    eos_id = _eos_token_id(tokenizer)
-
-    if image_size == "auto":
-        start_ratio_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_0>"]
-        end_ratio_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_32>"]
-        extra_auto_stops = list(range(start_ratio_id, end_ratio_id + 1))
-    else:
-        extra_auto_stops = [_token_id(tokenizer, "<boi>")]
-
-    stop_token_id = {
-        "auto": [eos_id] + extra_auto_stops,
-        "image": [eos_id],
-        "recaption": [
-            _token_id(tokenizer, "</recaption>"),
-            _token_id(tokenizer, "</answer>"),
-            eos_id,
-        ],
-        "think": [
-            _token_id(tokenizer, "</recaption>"),
-            _token_id(tokenizer, "</answer>"),
-            eos_id,
-        ],
-    }
-    return stop_token_id[tokenizer_bot_task]
-
-
-def resolve_bot_task(
-    bot_task: str | None = "auto",
-    *,
-    modality: str | None = None,
-    task: str | None = None,
-    tokenizer: Any | None = None,
-    image_size: int | str | None = None,
-) -> BotTaskResolution:
-    """Resolve HunyuanImage3 bot-task related prompt settings.
-
-    Pass `modality + bot_task` for CLI/request-level behavior, `task` for a
-    canonical prompt task, or only `bot_task` to validate/map a pipeline
-    HunyuanImage3 bot_task.
-    """
-    if task is not None and modality is not None:
-        raise ValueError("Pass either task or modality, not both.")
-
-    if task is None and modality is not None:
-        task = _task_for_modality_and_prompt_bot_task(modality, bot_task)
-
-    if task is not None:
-        sys_type, preset_bot_task, trigger_tag = _task_preset(task)
-        resolved_bot_task = "think_recaption" if preset_bot_task == "think" else preset_bot_task or "auto"
-        prompt_bot_task = preset_bot_task
-    else:
-        sys_type = None
-        trigger_tag = None
-        prompt_bot_task = None
-        resolved_bot_task = "auto" if bot_task is None else bot_task.strip().lower()
-
-    if resolved_bot_task not in _BOT_TASK_TO_TOKENIZER_TASK:
-        raise ValueError(f"Unknown bot_task {resolved_bot_task!r}. Choose from: {list(BOT_TASKS)}")
-    tokenizer_bot_task = _BOT_TASK_TO_TOKENIZER_TASK[resolved_bot_task]
-    stop_token_ids = (
-        _stop_token_ids_for_tokenizer_bot_task(tokenizer, tokenizer_bot_task, image_size=image_size)
-        if tokenizer is not None
-        else None
-    )
-
-    return BotTaskResolution(
-        task=task,
-        sys_type=sys_type,
-        prompt_bot_task=prompt_bot_task,
-        bot_task=resolved_bot_task,
-        tokenizer_bot_task=tokenizer_bot_task,
-        trigger_tag=trigger_tag,
-        stop_token_ids=stop_token_ids,
-    )
-
-
-def sys_type_for_task(task: str) -> str:
-    """Return the default system prompt type for a canonical prompt task."""
-    sys_type = resolve_bot_task(task=task).sys_type
-    assert sys_type is not None
-    return sys_type
-
-
-def _token_id(tokenizer, token: str) -> int:
-    if token in HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS:
-        return HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[token]
-
-    token_id = tokenizer.convert_tokens_to_ids(token)
-    if token_id is None:
-        raise ValueError(f"Tokenizer does not know special token {token!r}")
-    return int(token_id)
-
-
-def _eos_token_id(tokenizer) -> int:
-    return HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
 
 
 def build_prompt(
@@ -298,13 +90,16 @@ def build_prompt(
     inputs that need to match HF baseline byte-for-byte, use
     `build_prompt_tokens` instead and feed the result via prompt_token_ids.
     """
-    preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task)
+    if task not in _TASK_PRESETS:
+        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+
+    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
     effective_sys_type = sys_type or preset_sys_type
 
     system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt)
     sys_text = system_prompt.strip() if system_prompt else ""
 
-    has_image_input = _task_has_image_input(task)
+    has_image_input = task.startswith("i2t") or task.startswith("it2i")
 
     # t2i_vanilla: pretrain mode for direct text->image generation. The
     # vanilla system prompt drives the model with no chat structure.
@@ -356,14 +151,17 @@ def build_prompt_tokens(
     boundary merge happens. We replicate that here and feed the result to
     Omni via OmniTokensPrompt (prompt_token_ids).
     """
-    preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task)
+    if task not in _TASK_PRESETS:
+        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+
+    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
     effective_sys_type = sys_type or preset_sys_type
 
-    bos_id = _token_id(tokenizer, "<|startoftext|>")
-    img_id = _token_id(tokenizer, "<img>")
-    trig_id = _token_id(tokenizer, trigger_tag) if trigger_tag else None
+    bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>")
+    img_id = tokenizer.convert_tokens_to_ids("<img>")
+    trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None
 
-    has_image_input = _task_has_image_input(task)
+    has_image_input = task.startswith("i2t") or task.startswith("it2i")
 
     # t2i_vanilla uses pretrain template with no chat structure; the vanilla
     # system prompt drives the model directly. No segment boundaries to
@@ -391,15 +189,4 @@ def build_prompt_tokens(
     return ids
 
 
-__all__ = [
-    "BotTaskResolution",
-    "available_tasks",
-    "available_prompt_bot_tasks",
-    "BOT_TASKS",
-    "build_prompt",
-    "build_prompt_tokens",
-    "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS",
-    "PROMPT_BOT_TASKS",
-    "resolve_bot_task",
-    "sys_type_for_task",
-]
+__all__ = ["build_prompt", "build_prompt_tokens", "resolve_stop_token_ids", _TASK_PRESETS]

From 2612670ae02358816ce25a8d929a941468edfdb1 Mon Sep 17 00:00:00 2001
From: "Y. Fisher" <yukexiong1@huawei.com>
Date: Fri, 8 May 2026 11:38:18 +0800
Subject: [PATCH 13/40] Adjust end2end.py to the refactored prompt_utils API

Signed-off-by: Y. Fisher <yukexiong1@huawei.com>
Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/end2end.py                 | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 9b717e198b8..ceebc2d3f39 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -18,10 +18,9 @@
 from pathlib import Path
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-    available_prompt_bot_tasks,
     build_prompt_tokens,
-    resolve_bot_task,
-    sys_type_for_task,
+    resolve_stop_token_ids,
+    _TASK_PRESETS
 )
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniPromptType
@@ -45,6 +44,12 @@
     "text2text": "text-to-text",
 }
 
+_MODALITY_TASK_MAP = {
+    "text2img": "t2i",
+    "img2img": "it2i",
+    "img2text": "i2t",
+    "text2text": "t2t",
+}
 
 def parse_args():
     parser = argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.")
@@ -90,7 +95,7 @@ def parse_args():
         "--bot-task",
         type=str,
         default="auto",
-        choices=available_prompt_bot_tasks(),
+        choices=["auto", "think", "recaption", "vanilla"],
         help=(
             "Prompt behavior. 'auto' selects the default for the modality; "
             "'think' adds <think>; 'recaption' adds <recaption>; "
@@ -122,10 +127,11 @@ def main():
     os.makedirs(args.output, exist_ok=True)
 
     # Determine task for prompt formatting from modality + bot behavior.
-    bot_task_resolution = resolve_bot_task(args.bot_task, modality=args.modality)
-    task = bot_task_resolution.task
+    task = _MODALITY_TASK_MAP[args.modality]
     assert task is not None
-    bot_task = bot_task_resolution.bot_task
+    bot_task = args.bot_task
+    if bot_task != "auto":
+        task = task + "_" + bot_task
 
     if args.deploy_config is not None and args.stage_configs_path is not None:
         raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.")
@@ -176,7 +182,8 @@ def main():
     formatted_prompts: list[OmniPromptType] = []
     for p in prompts:
         token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type)
-        effective_sys_type = args.sys_type or sys_type_for_task(task)
+        preset_sys_type, _, _ = _TASK_PRESETS[task]
+        effective_sys_type = args.sys_type or preset_sys_type
 
         # `prompt_token_ids` drives the AR stage (matches HF byte-for-byte).
         # `prompt` and `use_system_prompt` are forwarded by ar2diffusion to
@@ -209,7 +216,7 @@ def main():
     # Override diffusion params if applicable
     from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
-    ar_stop_token_ids = resolve_bot_task(task=task, tokenizer=tokenizer).stop_token_ids
+    ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer)
     assert ar_stop_token_ids is not None
     for sp in params_list:
         if isinstance(sp, OmniDiffusionSamplingParams):

From 1dab1f0bf21304394cc6ceb7370715a2ae91edab Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Fri, 8 May 2026 16:11:02 +0800
Subject: [PATCH 14/40] Fix HunyuanImage3 i2t think stop tokens

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/end2end.py                 |  25 +-
 .../hunyuan_image3/test_prompt_utils.py       |   3 +-
 .../models/hunyuan_image3/prompt_utils.py     | 285 +++++++++++++++---
 3 files changed, 255 insertions(+), 58 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index ceebc2d3f39..9b717e198b8 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -18,9 +18,10 @@
 from pathlib import Path
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    available_prompt_bot_tasks,
     build_prompt_tokens,
-    resolve_stop_token_ids,
-    _TASK_PRESETS
+    resolve_bot_task,
+    sys_type_for_task,
 )
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniPromptType
@@ -44,12 +45,6 @@
     "text2text": "text-to-text",
 }
 
-_MODALITY_TASK_MAP = {
-    "text2img": "t2i",
-    "img2img": "it2i",
-    "img2text": "i2t",
-    "text2text": "t2t",
-}
 
 def parse_args():
     parser = argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.")
@@ -95,7 +90,7 @@ def parse_args():
         "--bot-task",
         type=str,
         default="auto",
-        choices=["auto", "think", "recaption", "vanilla"],
+        choices=available_prompt_bot_tasks(),
         help=(
             "Prompt behavior. 'auto' selects the default for the modality; "
             "'think' adds <think>; 'recaption' adds <recaption>; "
@@ -127,11 +122,10 @@ def main():
     os.makedirs(args.output, exist_ok=True)
 
     # Determine task for prompt formatting from modality + bot behavior.
-    task = _MODALITY_TASK_MAP[args.modality]
+    bot_task_resolution = resolve_bot_task(args.bot_task, modality=args.modality)
+    task = bot_task_resolution.task
     assert task is not None
-    bot_task = args.bot_task
-    if bot_task != "auto":
-        task = task + "_" + bot_task
+    bot_task = bot_task_resolution.bot_task
 
     if args.deploy_config is not None and args.stage_configs_path is not None:
         raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.")
@@ -182,8 +176,7 @@ def main():
     formatted_prompts: list[OmniPromptType] = []
     for p in prompts:
         token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type)
-        preset_sys_type, _, _ = _TASK_PRESETS[task]
-        effective_sys_type = args.sys_type or preset_sys_type
+        effective_sys_type = args.sys_type or sys_type_for_task(task)
 
         # `prompt_token_ids` drives the AR stage (matches HF byte-for-byte).
         # `prompt` and `use_system_prompt` are forwarded by ar2diffusion to
@@ -216,7 +209,7 @@ def main():
     # Override diffusion params if applicable
     from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
-    ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer)
+    ar_stop_token_ids = resolve_bot_task(task=task, tokenizer=tokenizer).stop_token_ids
     assert ar_stop_token_ids is not None
     for sp in params_list:
         if isinstance(sp, OmniDiffusionSamplingParams):
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index e858944c0a4..664975d87fd 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -191,9 +191,10 @@ def test_resolve_bot_task_resolves_stop_ids_from_prompt_task():
         HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<boi>"],
     ]
     assert resolve_bot_task(task="i2t_think", tokenizer=tok).stop_token_ids == [
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"],
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"],
         HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</answer>"],
         eos_id,
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<boi>"],
     ]
     assert resolve_bot_task(task="t2i_vanilla", tokenizer=tok).stop_token_ids == [eos_id]
 
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 9754cf2c82f..231c965c3a7 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -22,6 +22,8 @@
 
 from .system_prompt import get_system_prompt
 
+BOT_TASKS = ("auto", "image", "recaption", "think_recaption")
+PROMPT_BOT_TASKS = ("auto", "none", "think", "recaption", "vanilla")
 
 # HunyuanImage-3.0-Instruct special token ids from tokenizer.json.
 # Keep offline AR prompt/stop-token behavior independent of runtime
@@ -46,9 +48,17 @@
     "<img_ratio_36>": 130106,
 }
 
+_BOT_TASK_TO_TOKENIZER_TASK = {
+    "auto": "auto",
+    "image": "image",
+    "recaption": "recaption",
+    "think_recaption": "think",
+}
+
 # task -> (sys_type, bot_task, trigger_tag)
 _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
     "t2t": ("en_unified", None, None),
+    "t2t_think": ("en_unified", "think", "<think>"),
     "i2t": ("en_unified", None, None),
     "i2t_think": ("en_unified", "think", "<think>"),
     "it2i_think": ("en_unified", "think", "<think>"),
@@ -58,23 +68,228 @@
     "t2i_vanilla": ("en_vanilla", "image", None),
 }
 
+_MODALITY_TO_TASK_PREFIX = {
+    "text2text": "t2t",
+    "t2t": "t2t",
+    "img2text": "i2t",
+    "image2text": "i2t",
+    "i2t": "i2t",
+    "text2img": "t2i",
+    "text2image": "t2i",
+    "t2i": "t2i",
+    "img2img": "it2i",
+    "image2image": "it2i",
+    "it2i": "it2i",
+    "ti2i": "it2i",
+}
+
+_DEFAULT_BOT_TASK_BY_PREFIX: dict[str, str | None] = {
+    "t2t": None,
+    "i2t": None,
+    "t2i": "think",
+    "it2i": "think",
+}
+
+_TASK_BY_PREFIX_AND_BOT_TASK: dict[tuple[str, str | None], str] = {
+    ("t2t", None): "t2t",
+    ("t2t", "think"): "t2t_think",
+    ("i2t", None): "i2t",
+    ("i2t", "think"): "i2t_think",
+    ("t2i", "think"): "t2i_think",
+    ("t2i", "recaption"): "t2i_recaption",
+    ("t2i", "vanilla"): "t2i_vanilla",
+    ("it2i", "think"): "it2i_think",
+    ("it2i", "recaption"): "it2i_recaption",
+}
+
+_PROMPT_BOT_TASK_ALIASES: dict[str, str | None] = {
+    "auto": "auto",
+    "default": "auto",
+    "none": None,
+    "no": None,
+    "false": None,
+    "think": "think",
+    "think_recaption": "think",
+    "recaption": "recaption",
+    "image": "vanilla",
+    "vanilla": "vanilla",
+}
+
+
+@dataclass(frozen=True)
+class BotTaskResolution:
+    """Resolved HunyuanImage3 prompt/bot-task settings."""
+
+    task: str | None
+    sys_type: str | None
+    prompt_bot_task: str | None
+    bot_task: str
+    tokenizer_bot_task: str
+    trigger_tag: str | None
+    stop_token_ids: list[int] | None = None
+
 
 def available_tasks() -> list[str]:
     """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
     return sorted(_TASK_PRESETS)
 
-def resolve_stop_token_ids(
-    task: str = "it2i_think",
-    bot_task: str = "think", 
-    tokenizer: Any | None = None):
-    tkw = tokenizer
-    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
-    stop_token_ids = [127957]
-    if trigger_tag:
-        stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag))
-    return stop_token_ids
+
+def available_prompt_bot_tasks() -> list[str]:
+    """Sorted public bot_task values accepted by `resolve_bot_task` with modality."""
+    return sorted(PROMPT_BOT_TASKS)
+
+
+def _task_preset(task: str) -> tuple[str, str | None, str | None]:
+    if task not in _TASK_PRESETS:
+        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+    return _TASK_PRESETS[task]
+
+
+def _task_has_image_input(task: str) -> bool:
+    return task.startswith(("i2t", "it2i"))
 
 
+def _normalize_prompt_bot_task(bot_task: str | None) -> str | None:
+    if bot_task is None:
+        return "auto"
+
+    normalized = bot_task.strip().lower()
+    if normalized not in _PROMPT_BOT_TASK_ALIASES:
+        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_prompt_bot_tasks()}")
+    return _PROMPT_BOT_TASK_ALIASES[normalized]
+
+
+def _task_for_modality_and_prompt_bot_task(modality: str, bot_task: str | None = "auto") -> str:
+    modality_key = modality.strip().lower()
+    if modality_key not in _MODALITY_TO_TASK_PREFIX:
+        raise ValueError(f"Unknown modality {modality!r}. Choose from: {sorted(_MODALITY_TO_TASK_PREFIX)}")
+
+    task_prefix = _MODALITY_TO_TASK_PREFIX[modality_key]
+    normalized_bot_task = _normalize_prompt_bot_task(bot_task)
+    if normalized_bot_task == "auto":
+        normalized_bot_task = _DEFAULT_BOT_TASK_BY_PREFIX[task_prefix]
+
+    task_key = (task_prefix, normalized_bot_task)
+    if task_key not in _TASK_BY_PREFIX_AND_BOT_TASK:
+        valid_bot_tasks = sorted(
+            "none" if candidate is None else candidate
+            for prefix, candidate in _TASK_BY_PREFIX_AND_BOT_TASK
+            if prefix == task_prefix
+        )
+        raise ValueError(
+            f"bot_task {bot_task!r} is not supported for modality {modality!r}. Choose from: {valid_bot_tasks}"
+        )
+
+    return _TASK_BY_PREFIX_AND_BOT_TASK[task_key]
+
+
+def _special_token_id(token: str) -> int:
+    return HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[token]
+
+
+def _generic_stop_token_ids_for_tokenizer_bot_task(
+    tokenizer_bot_task: str,
+    image_size: int | str | None = None,
+) -> list[int]:
+    eos_id = _special_token_id("<|endoftext|>")
+
+    if image_size == "auto":
+        start_ratio_id = _special_token_id("<img_ratio_0>")
+        end_ratio_id = _special_token_id("<img_ratio_32>")
+        extra_auto_stops = list(range(start_ratio_id, end_ratio_id + 1))
+    else:
+        extra_auto_stops = [_special_token_id("<boi>")]
+
+    stop_token_ids = {
+        "auto": [eos_id] + extra_auto_stops,
+        "image": [eos_id],
+        "recaption": [
+            _special_token_id("</recaption>"),
+            _special_token_id("</answer>"),
+            eos_id,
+        ],
+        "think": [
+            _special_token_id("</recaption>"),
+            _special_token_id("</answer>"),
+            eos_id,
+        ],
+    }
+    return stop_token_ids[tokenizer_bot_task]
+
+
+def _stop_token_ids_for_task(
+    task: str,
+    tokenizer_bot_task: str,
+    image_size: int | str | None = None,
+) -> list[int]:
+    if task in ("t2t_think", "i2t_think"):
+        return [
+            _special_token_id("</think>"),
+            _special_token_id("</answer>"),
+            _special_token_id("<|endoftext|>"),
+            _special_token_id("<boi>"),
+        ]
+    return _generic_stop_token_ids_for_tokenizer_bot_task(tokenizer_bot_task, image_size=image_size)
+
+
+def resolve_bot_task(
+    bot_task: str | None = "auto",
+    *,
+    modality: str | None = None,
+    task: str | None = None,
+    tokenizer: Any | None = None,
+    image_size: int | str | None = None,
+) -> BotTaskResolution:
+    """Resolve HunyuanImage3 bot-task related prompt settings.
+
+    Pass `modality + bot_task` for CLI/request-level behavior, `task` for a
+    canonical prompt task, or only `bot_task` to validate/map a pipeline
+    HunyuanImage3 bot_task.
+    """
+    del tokenizer  # Stop tokens are fixed HunyuanImage3 control ids.
+
+    if task is not None and modality is not None:
+        raise ValueError("Pass either task or modality, not both.")
+
+    if task is None and modality is not None:
+        task = _task_for_modality_and_prompt_bot_task(modality, bot_task)
+
+    if task is not None:
+        sys_type, preset_bot_task, trigger_tag = _task_preset(task)
+        resolved_bot_task = "think_recaption" if preset_bot_task == "think" else preset_bot_task or "auto"
+        prompt_bot_task = preset_bot_task
+    else:
+        sys_type = None
+        trigger_tag = None
+        prompt_bot_task = None
+        resolved_bot_task = "auto" if bot_task is None else bot_task.strip().lower()
+
+    if resolved_bot_task not in _BOT_TASK_TO_TOKENIZER_TASK:
+        raise ValueError(f"Unknown bot_task {resolved_bot_task!r}. Choose from: {list(BOT_TASKS)}")
+    tokenizer_bot_task = _BOT_TASK_TO_TOKENIZER_TASK[resolved_bot_task]
+    stop_token_ids = (
+        _stop_token_ids_for_task(task, tokenizer_bot_task, image_size=image_size)
+        if task is not None
+        else _generic_stop_token_ids_for_tokenizer_bot_task(tokenizer_bot_task, image_size=image_size)
+    )
+
+    return BotTaskResolution(
+        task=task,
+        sys_type=sys_type,
+        prompt_bot_task=prompt_bot_task,
+        bot_task=resolved_bot_task,
+        tokenizer_bot_task=tokenizer_bot_task,
+        trigger_tag=trigger_tag,
+        stop_token_ids=stop_token_ids,
+    )
+
+
+def sys_type_for_task(task: str) -> str:
+    """Return the default system prompt type for a canonical prompt task."""
+    sys_type = resolve_bot_task(task=task).sys_type
+    assert sys_type is not None
+    return sys_type
+
 
 def build_prompt(
     user_prompt: str,
@@ -86,20 +301,17 @@ def build_prompt(
 
     NOTE: when this string is passed to the engine, the engine's tokenizer
     will run a single BPE pass over the whole string, which can merge
-    tokens across segment boundaries (e.g. `。\\n\\n` -> id 3490). For
+    tokens across segment boundaries (e.g. `X\\n\\n` into one token). For
     inputs that need to match HF baseline byte-for-byte, use
     `build_prompt_tokens` instead and feed the result via prompt_token_ids.
     """
-    if task not in _TASK_PRESETS:
-        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
-
-    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+    preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task)
     effective_sys_type = sys_type or preset_sys_type
 
     system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt)
     sys_text = system_prompt.strip() if system_prompt else ""
 
-    has_image_input = task.startswith("i2t") or task.startswith("it2i")
+    has_image_input = _task_has_image_input(task)
 
     # t2i_vanilla: pretrain mode for direct text->image generation. The
     # vanilla system prompt drives the model with no chat structure.
@@ -110,16 +322,8 @@ def build_prompt(
         parts.append(user_prompt)
         return "".join(parts)
 
-    # All other tasks (t2t / i2t / t2i_think / t2i_recaption /
-    # it2i_think / it2i_recaption) use HunyuanImage3 Instruct chat template:
+    # All other tasks use HunyuanImage3 Instruct chat template:
     #   <|startoftext|>{system?}\n\nUser: {<img>?}{user_prompt}\n\nAssistant: {trigger?}
-    # generation_config.json declares sequence_template="instruct", so the
-    # AR prefill MUST use this template -- verified to match HF's
-    # apply_chat_template output token-for-token (modulo BPE boundary merges).
-    # The trigger_tag (e.g. <think>) MUST come AFTER the `Assistant: ` prefix:
-    # if it goes BEFORE user_prompt (the old pretrain layout) the model puts
-    # the user's instructions inside the "thinking section" and collapses
-    # into repetition garbage under greedy decoding.
     parts = ["<|startoftext|>"]
     if sys_text:
         parts.append(f"{sys_text}\n\n")
@@ -141,27 +345,15 @@ def build_prompt_tokens(
     sys_type: str | None = None,
     custom_system_prompt: str | None = None,
 ) -> list[int]:
-    """Segment-by-segment tokenization that matches HF apply_chat_template.
-
-    Calling tokenizer.encode(build_prompt(...)) on the full string lets BPE
-    merge tokens across segment boundaries (e.g. user_prompt ends with `。`
-    and the next segment is `\\n\\n` -> they merge into a single token id
-    3490 instead of HF's [1811, 271]). HF's apply_chat_template tokenizes
-    each segment independently and concatenates token_ids, so no cross-
-    boundary merge happens. We replicate that here and feed the result to
-    Omni via OmniTokensPrompt (prompt_token_ids).
-    """
-    if task not in _TASK_PRESETS:
-        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
-
-    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+    """Segment-by-segment tokenization that matches HF apply_chat_template."""
+    preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task)
     effective_sys_type = sys_type or preset_sys_type
 
     bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>")
     img_id = tokenizer.convert_tokens_to_ids("<img>")
     trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None
 
-    has_image_input = task.startswith("i2t") or task.startswith("it2i")
+    has_image_input = _task_has_image_input(task)
 
     # t2i_vanilla uses pretrain template with no chat structure; the vanilla
     # system prompt drives the model directly. No segment boundaries to
@@ -189,4 +381,15 @@ def build_prompt_tokens(
     return ids
 
 
-__all__ = ["build_prompt", "build_prompt_tokens", "resolve_stop_token_ids", _TASK_PRESETS]
+__all__ = [
+    "BotTaskResolution",
+    "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS",
+    "available_prompt_bot_tasks",
+    "available_tasks",
+    "BOT_TASKS",
+    "build_prompt",
+    "build_prompt_tokens",
+    "PROMPT_BOT_TASKS",
+    "resolve_bot_task",
+    "sys_type_for_task",
+]

From 5c3eda0c26db2c48b6fab8e646bb1d24a424c796 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Fri, 8 May 2026 16:12:49 +0800
Subject: [PATCH 15/40] Revert "Fix HunyuanImage3 i2t think stop tokens"

This reverts commit e527e7bdd59d5d67064c3e74823d38f574f08f71.

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/end2end.py                 |  25 +-
 .../hunyuan_image3/test_prompt_utils.py       |   3 +-
 .../models/hunyuan_image3/prompt_utils.py     | 285 +++---------------
 3 files changed, 58 insertions(+), 255 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 9b717e198b8..ceebc2d3f39 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -18,10 +18,9 @@
 from pathlib import Path
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-    available_prompt_bot_tasks,
     build_prompt_tokens,
-    resolve_bot_task,
-    sys_type_for_task,
+    resolve_stop_token_ids,
+    _TASK_PRESETS
 )
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniPromptType
@@ -45,6 +44,12 @@
     "text2text": "text-to-text",
 }
 
+_MODALITY_TASK_MAP = {
+    "text2img": "t2i",
+    "img2img": "it2i",
+    "img2text": "i2t",
+    "text2text": "t2t",
+}
 
 def parse_args():
     parser = argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.")
@@ -90,7 +95,7 @@ def parse_args():
         "--bot-task",
         type=str,
         default="auto",
-        choices=available_prompt_bot_tasks(),
+        choices=["auto", "think", "recaption", "vanilla"],
         help=(
             "Prompt behavior. 'auto' selects the default for the modality; "
             "'think' adds <think>; 'recaption' adds <recaption>; "
@@ -122,10 +127,11 @@ def main():
     os.makedirs(args.output, exist_ok=True)
 
     # Determine task for prompt formatting from modality + bot behavior.
-    bot_task_resolution = resolve_bot_task(args.bot_task, modality=args.modality)
-    task = bot_task_resolution.task
+    task = _MODALITY_TASK_MAP[args.modality]
     assert task is not None
-    bot_task = bot_task_resolution.bot_task
+    bot_task = args.bot_task
+    if bot_task != "auto":
+        task = task + "_" + bot_task
 
     if args.deploy_config is not None and args.stage_configs_path is not None:
         raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.")
@@ -176,7 +182,8 @@ def main():
     formatted_prompts: list[OmniPromptType] = []
     for p in prompts:
         token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type)
-        effective_sys_type = args.sys_type or sys_type_for_task(task)
+        preset_sys_type, _, _ = _TASK_PRESETS[task]
+        effective_sys_type = args.sys_type or preset_sys_type
 
         # `prompt_token_ids` drives the AR stage (matches HF byte-for-byte).
         # `prompt` and `use_system_prompt` are forwarded by ar2diffusion to
@@ -209,7 +216,7 @@ def main():
     # Override diffusion params if applicable
     from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
-    ar_stop_token_ids = resolve_bot_task(task=task, tokenizer=tokenizer).stop_token_ids
+    ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer)
     assert ar_stop_token_ids is not None
     for sp in params_list:
         if isinstance(sp, OmniDiffusionSamplingParams):
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 664975d87fd..e858944c0a4 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -191,10 +191,9 @@ def test_resolve_bot_task_resolves_stop_ids_from_prompt_task():
         HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<boi>"],
     ]
     assert resolve_bot_task(task="i2t_think", tokenizer=tok).stop_token_ids == [
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"],
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"],
         HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</answer>"],
         eos_id,
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<boi>"],
     ]
     assert resolve_bot_task(task="t2i_vanilla", tokenizer=tok).stop_token_ids == [eos_id]
 
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 231c965c3a7..9754cf2c82f 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -22,8 +22,6 @@
 
 from .system_prompt import get_system_prompt
 
-BOT_TASKS = ("auto", "image", "recaption", "think_recaption")
-PROMPT_BOT_TASKS = ("auto", "none", "think", "recaption", "vanilla")
 
 # HunyuanImage-3.0-Instruct special token ids from tokenizer.json.
 # Keep offline AR prompt/stop-token behavior independent of runtime
@@ -48,17 +46,9 @@
     "<img_ratio_36>": 130106,
 }
 
-_BOT_TASK_TO_TOKENIZER_TASK = {
-    "auto": "auto",
-    "image": "image",
-    "recaption": "recaption",
-    "think_recaption": "think",
-}
-
 # task -> (sys_type, bot_task, trigger_tag)
 _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
     "t2t": ("en_unified", None, None),
-    "t2t_think": ("en_unified", "think", "<think>"),
     "i2t": ("en_unified", None, None),
     "i2t_think": ("en_unified", "think", "<think>"),
     "it2i_think": ("en_unified", "think", "<think>"),
@@ -68,227 +58,22 @@
     "t2i_vanilla": ("en_vanilla", "image", None),
 }
 
-_MODALITY_TO_TASK_PREFIX = {
-    "text2text": "t2t",
-    "t2t": "t2t",
-    "img2text": "i2t",
-    "image2text": "i2t",
-    "i2t": "i2t",
-    "text2img": "t2i",
-    "text2image": "t2i",
-    "t2i": "t2i",
-    "img2img": "it2i",
-    "image2image": "it2i",
-    "it2i": "it2i",
-    "ti2i": "it2i",
-}
-
-_DEFAULT_BOT_TASK_BY_PREFIX: dict[str, str | None] = {
-    "t2t": None,
-    "i2t": None,
-    "t2i": "think",
-    "it2i": "think",
-}
-
-_TASK_BY_PREFIX_AND_BOT_TASK: dict[tuple[str, str | None], str] = {
-    ("t2t", None): "t2t",
-    ("t2t", "think"): "t2t_think",
-    ("i2t", None): "i2t",
-    ("i2t", "think"): "i2t_think",
-    ("t2i", "think"): "t2i_think",
-    ("t2i", "recaption"): "t2i_recaption",
-    ("t2i", "vanilla"): "t2i_vanilla",
-    ("it2i", "think"): "it2i_think",
-    ("it2i", "recaption"): "it2i_recaption",
-}
-
-_PROMPT_BOT_TASK_ALIASES: dict[str, str | None] = {
-    "auto": "auto",
-    "default": "auto",
-    "none": None,
-    "no": None,
-    "false": None,
-    "think": "think",
-    "think_recaption": "think",
-    "recaption": "recaption",
-    "image": "vanilla",
-    "vanilla": "vanilla",
-}
-
-
-@dataclass(frozen=True)
-class BotTaskResolution:
-    """Resolved HunyuanImage3 prompt/bot-task settings."""
-
-    task: str | None
-    sys_type: str | None
-    prompt_bot_task: str | None
-    bot_task: str
-    tokenizer_bot_task: str
-    trigger_tag: str | None
-    stop_token_ids: list[int] | None = None
-
 
 def available_tasks() -> list[str]:
     """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
     return sorted(_TASK_PRESETS)
 
+def resolve_stop_token_ids(
+    task: str = "it2i_think",
+    bot_task: str = "think", 
+    tokenizer: Any | None = None):
+    tkw = tokenizer
+    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+    stop_token_ids = [127957]
+    if trigger_tag:
+        stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag))
+    return stop_token_ids
 
-def available_prompt_bot_tasks() -> list[str]:
-    """Sorted public bot_task values accepted by `resolve_bot_task` with modality."""
-    return sorted(PROMPT_BOT_TASKS)
-
-
-def _task_preset(task: str) -> tuple[str, str | None, str | None]:
-    if task not in _TASK_PRESETS:
-        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
-    return _TASK_PRESETS[task]
-
-
-def _task_has_image_input(task: str) -> bool:
-    return task.startswith(("i2t", "it2i"))
-
-
-def _normalize_prompt_bot_task(bot_task: str | None) -> str | None:
-    if bot_task is None:
-        return "auto"
-
-    normalized = bot_task.strip().lower()
-    if normalized not in _PROMPT_BOT_TASK_ALIASES:
-        raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_prompt_bot_tasks()}")
-    return _PROMPT_BOT_TASK_ALIASES[normalized]
-
-
-def _task_for_modality_and_prompt_bot_task(modality: str, bot_task: str | None = "auto") -> str:
-    modality_key = modality.strip().lower()
-    if modality_key not in _MODALITY_TO_TASK_PREFIX:
-        raise ValueError(f"Unknown modality {modality!r}. Choose from: {sorted(_MODALITY_TO_TASK_PREFIX)}")
-
-    task_prefix = _MODALITY_TO_TASK_PREFIX[modality_key]
-    normalized_bot_task = _normalize_prompt_bot_task(bot_task)
-    if normalized_bot_task == "auto":
-        normalized_bot_task = _DEFAULT_BOT_TASK_BY_PREFIX[task_prefix]
-
-    task_key = (task_prefix, normalized_bot_task)
-    if task_key not in _TASK_BY_PREFIX_AND_BOT_TASK:
-        valid_bot_tasks = sorted(
-            "none" if candidate is None else candidate
-            for prefix, candidate in _TASK_BY_PREFIX_AND_BOT_TASK
-            if prefix == task_prefix
-        )
-        raise ValueError(
-            f"bot_task {bot_task!r} is not supported for modality {modality!r}. Choose from: {valid_bot_tasks}"
-        )
-
-    return _TASK_BY_PREFIX_AND_BOT_TASK[task_key]
-
-
-def _special_token_id(token: str) -> int:
-    return HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[token]
-
-
-def _generic_stop_token_ids_for_tokenizer_bot_task(
-    tokenizer_bot_task: str,
-    image_size: int | str | None = None,
-) -> list[int]:
-    eos_id = _special_token_id("<|endoftext|>")
-
-    if image_size == "auto":
-        start_ratio_id = _special_token_id("<img_ratio_0>")
-        end_ratio_id = _special_token_id("<img_ratio_32>")
-        extra_auto_stops = list(range(start_ratio_id, end_ratio_id + 1))
-    else:
-        extra_auto_stops = [_special_token_id("<boi>")]
-
-    stop_token_ids = {
-        "auto": [eos_id] + extra_auto_stops,
-        "image": [eos_id],
-        "recaption": [
-            _special_token_id("</recaption>"),
-            _special_token_id("</answer>"),
-            eos_id,
-        ],
-        "think": [
-            _special_token_id("</recaption>"),
-            _special_token_id("</answer>"),
-            eos_id,
-        ],
-    }
-    return stop_token_ids[tokenizer_bot_task]
-
-
-def _stop_token_ids_for_task(
-    task: str,
-    tokenizer_bot_task: str,
-    image_size: int | str | None = None,
-) -> list[int]:
-    if task in ("t2t_think", "i2t_think"):
-        return [
-            _special_token_id("</think>"),
-            _special_token_id("</answer>"),
-            _special_token_id("<|endoftext|>"),
-            _special_token_id("<boi>"),
-        ]
-    return _generic_stop_token_ids_for_tokenizer_bot_task(tokenizer_bot_task, image_size=image_size)
-
-
-def resolve_bot_task(
-    bot_task: str | None = "auto",
-    *,
-    modality: str | None = None,
-    task: str | None = None,
-    tokenizer: Any | None = None,
-    image_size: int | str | None = None,
-) -> BotTaskResolution:
-    """Resolve HunyuanImage3 bot-task related prompt settings.
-
-    Pass `modality + bot_task` for CLI/request-level behavior, `task` for a
-    canonical prompt task, or only `bot_task` to validate/map a pipeline
-    HunyuanImage3 bot_task.
-    """
-    del tokenizer  # Stop tokens are fixed HunyuanImage3 control ids.
-
-    if task is not None and modality is not None:
-        raise ValueError("Pass either task or modality, not both.")
-
-    if task is None and modality is not None:
-        task = _task_for_modality_and_prompt_bot_task(modality, bot_task)
-
-    if task is not None:
-        sys_type, preset_bot_task, trigger_tag = _task_preset(task)
-        resolved_bot_task = "think_recaption" if preset_bot_task == "think" else preset_bot_task or "auto"
-        prompt_bot_task = preset_bot_task
-    else:
-        sys_type = None
-        trigger_tag = None
-        prompt_bot_task = None
-        resolved_bot_task = "auto" if bot_task is None else bot_task.strip().lower()
-
-    if resolved_bot_task not in _BOT_TASK_TO_TOKENIZER_TASK:
-        raise ValueError(f"Unknown bot_task {resolved_bot_task!r}. Choose from: {list(BOT_TASKS)}")
-    tokenizer_bot_task = _BOT_TASK_TO_TOKENIZER_TASK[resolved_bot_task]
-    stop_token_ids = (
-        _stop_token_ids_for_task(task, tokenizer_bot_task, image_size=image_size)
-        if task is not None
-        else _generic_stop_token_ids_for_tokenizer_bot_task(tokenizer_bot_task, image_size=image_size)
-    )
-
-    return BotTaskResolution(
-        task=task,
-        sys_type=sys_type,
-        prompt_bot_task=prompt_bot_task,
-        bot_task=resolved_bot_task,
-        tokenizer_bot_task=tokenizer_bot_task,
-        trigger_tag=trigger_tag,
-        stop_token_ids=stop_token_ids,
-    )
-
-
-def sys_type_for_task(task: str) -> str:
-    """Return the default system prompt type for a canonical prompt task."""
-    sys_type = resolve_bot_task(task=task).sys_type
-    assert sys_type is not None
-    return sys_type
 
 
 def build_prompt(
@@ -301,17 +86,20 @@ def build_prompt(
 
     NOTE: when this string is passed to the engine, the engine's tokenizer
     will run a single BPE pass over the whole string, which can merge
-    tokens across segment boundaries (e.g. `X\n\n` into one token). For
+    tokens across segment boundaries (e.g. `。\\n\\n` -> id 3490). For
     inputs that need to match HF baseline byte-for-byte, use
     `build_prompt_tokens` instead and feed the result via prompt_token_ids.
     """
-    preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task)
+    if task not in _TASK_PRESETS:
+        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+
+    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
     effective_sys_type = sys_type or preset_sys_type
 
     system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt)
     sys_text = system_prompt.strip() if system_prompt else ""
 
-    has_image_input = _task_has_image_input(task)
+    has_image_input = task.startswith("i2t") or task.startswith("it2i")
 
     # t2i_vanilla: pretrain mode for direct text->image generation. The
     # vanilla system prompt drives the model with no chat structure.
@@ -322,8 +110,16 @@ def build_prompt(
         parts.append(user_prompt)
         return "".join(parts)
 
-    # All other tasks use HunyuanImage3 Instruct chat template:
+    # All other tasks (t2t / i2t / t2i_think / t2i_recaption /
+    # it2i_think / it2i_recaption) use HunyuanImage3 Instruct chat template:
     #   <|startoftext|>{system?}\n\nUser: {<img>?}{user_prompt}\n\nAssistant: {trigger?}
+    # generation_config.json declares sequence_template="instruct", so the
+    # AR prefill MUST use this template -- verified to match HF's
+    # apply_chat_template output token-for-token (modulo BPE boundary merges).
+    # The trigger_tag (e.g. <think>) MUST come AFTER the `Assistant: ` prefix:
+    # if it goes BEFORE user_prompt (the old pretrain layout) the model puts
+    # the user's instructions inside the "thinking section" and collapses
+    # into repetition garbage under greedy decoding.
     parts = ["<|startoftext|>"]
     if sys_text:
         parts.append(f"{sys_text}\n\n")
@@ -345,15 +141,27 @@ def build_prompt_tokens(
     sys_type: str | None = None,
     custom_system_prompt: str | None = None,
 ) -> list[int]:
-    """Segment-by-segment tokenization that matches HF apply_chat_template."""
-    preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task)
+    """Segment-by-segment tokenization that matches HF apply_chat_template.
+
+    Calling tokenizer.encode(build_prompt(...)) on the full string lets BPE
+    merge tokens across segment boundaries (e.g. user_prompt ends with `。`
+    and the next segment is `\\n\\n` -> they merge into a single token id
+    3490 instead of HF's [1811, 271]). HF's apply_chat_template tokenizes
+    each segment independently and concatenates token_ids, so no cross-
+    boundary merge happens. We replicate that here and feed the result to
+    Omni via OmniTokensPrompt (prompt_token_ids).
+    """
+    if task not in _TASK_PRESETS:
+        raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}")
+
+    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
     effective_sys_type = sys_type or preset_sys_type
 
     bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>")
     img_id = tokenizer.convert_tokens_to_ids("<img>")
     trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None
 
-    has_image_input = _task_has_image_input(task)
+    has_image_input = task.startswith("i2t") or task.startswith("it2i")
 
     # t2i_vanilla uses pretrain template with no chat structure; the vanilla
     # system prompt drives the model directly. No segment boundaries to
@@ -381,15 +189,4 @@ def build_prompt_tokens(
     return ids
 
 
-__all__ = [
-    "BotTaskResolution",
-    "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS",
-    "available_prompt_bot_tasks",
-    "available_tasks",
-    "BOT_TASKS",
-    "build_prompt",
-    "build_prompt_tokens",
-    "PROMPT_BOT_TASKS",
-    "resolve_bot_task",
-    "sys_type_for_task",
-]
+__all__ = ["build_prompt", "build_prompt_tokens", "resolve_stop_token_ids", _TASK_PRESETS]

From 8d2970b4bcd997029546b46dce9e291e24bb226d Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Fri, 8 May 2026 16:13:34 +0800
Subject: [PATCH 16/40] Fix HunyuanImage3 i2t think stop token

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../diffusion/models/hunyuan_image3/prompt_utils.py  | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 9754cf2c82f..577f8de196e 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -65,12 +65,14 @@ def available_tasks() -> list[str]:
 
 def resolve_stop_token_ids(
     task: str = "it2i_think",
-    bot_task: str = "think", 
-    tokenizer: Any | None = None):
-    tkw = tokenizer
-    preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task]
+    bot_task: str = "think",
+    tokenizer: Any | None = None,
+):
+    _, _, trigger_tag = _TASK_PRESETS[task]
     stop_token_ids = [127957]
-    if trigger_tag:
+    if task in ("t2t_think", "i2t_think"):
+        stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"])
+    elif trigger_tag:
         stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag))
     return stop_token_ids
 

From 85881e8a5b47d4479d58695d55033e624dba4358 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Fri, 8 May 2026 16:23:22 +0800
Subject: [PATCH 17/40] Align HunyuanImage3 prompt utils tests

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../hunyuan_image3/test_prompt_utils.py       | 149 +++---------------
 1 file changed, 21 insertions(+), 128 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index e858944c0a4..984377f802f 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -25,12 +25,10 @@
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
     HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS,
-    available_prompt_bot_tasks,
     available_tasks,
     build_prompt,
     build_prompt_tokens,
-    resolve_bot_task,
-    sys_type_for_task,
+    resolve_stop_token_ids,
 )
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
@@ -76,7 +74,6 @@ def test_available_tasks_covers_all_modalities():
     tasks = set(available_tasks())
     assert tasks >= {
         "t2t",
-        "t2t_think",
         "i2t",
         "i2t_think",
         "it2i_think",
@@ -87,127 +84,31 @@ def test_available_tasks_covers_all_modalities():
     }
 
 
-@pytest.mark.parametrize(
-    "task,expected_bot_task",
-    [
-        ("t2t", "auto"),
-        ("t2t_think", "think_recaption"),
-        ("i2t", "auto"),
-        ("i2t_think", "think_recaption"),
-        ("it2i_think", "think_recaption"),
-        ("it2i_recaption", "recaption"),
-        ("t2i_think", "think_recaption"),
-        ("t2i_recaption", "recaption"),
-        ("t2i_vanilla", "image"),
-    ],
-)
-def test_resolve_bot_task_matches_prompt_presets(task: str, expected_bot_task: str):
-    resolution = resolve_bot_task(task=task)
-    assert resolution.task == task
-    assert resolution.bot_task == expected_bot_task
-
-
-@pytest.mark.parametrize(
-    "modality,bot_task,expected_task",
-    [
-        ("text2text", "auto", "t2t"),
-        ("img2text", "auto", "i2t"),
-        ("text2img", "auto", "t2i_think"),
-        ("img2img", "auto", "it2i_think"),
-        ("i2t", "think", "i2t_think"),
-        ("ti2i", "recaption", "it2i_recaption"),
-        ("t2i", "vanilla", "t2i_vanilla"),
-        ("text2text", "none", "t2t"),
-    ],
-)
-def test_resolve_bot_task_composes_prompt_task(
-    modality: str,
-    bot_task: str,
-    expected_task: str,
-):
-    assert resolve_bot_task(bot_task, modality=modality).task == expected_task
-
-
-def test_resolve_bot_task_rejects_invalid_combinations():
-    assert available_prompt_bot_tasks() == ["auto", "none", "recaption", "think", "vanilla"]
-
-    with pytest.raises(ValueError, match="not supported"):
-        resolve_bot_task("recaption", modality="img2text")
-
-    with pytest.raises(ValueError, match="not supported"):
-        resolve_bot_task("vanilla", modality="img2img")
-
-
-def test_resolve_bot_task_maps_tokenizer_task_and_stop_ids():
-    tok = FakeTokenizer()
-
-    resolution = resolve_bot_task("think_recaption", tokenizer=tok)
-
-    assert resolution.task is None
-    assert resolution.bot_task == "think_recaption"
-    assert resolution.tokenizer_bot_task == "think"
-    assert resolution.stop_token_ids == [
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"],
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</answer>"],
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"],
-    ]
-
-
-def test_resolve_bot_task_resolves_stop_ids_from_bot_task():
+def test_resolve_stop_token_ids_uses_end_think_for_i2t_think():
     tok = FakeTokenizer()
 
     eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
-    boi_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<boi>"]
-    end_recaption_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"]
-    end_answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</answer>"]
-
-    assert resolve_bot_task("auto", tokenizer=tok).stop_token_ids == [eos_id, boi_id]
-    assert resolve_bot_task("image", tokenizer=tok).stop_token_ids == [eos_id]
-    assert resolve_bot_task("think_recaption", tokenizer=tok).stop_token_ids == [
-        end_recaption_id,
-        end_answer_id,
-        eos_id,
-    ]
-    assert resolve_bot_task("recaption", tokenizer=tok).stop_token_ids == [
-        end_recaption_id,
-        end_answer_id,
-        eos_id,
-    ]
-    assert resolve_bot_task("auto", tokenizer=tok, image_size="auto").stop_token_ids == [
+    assert resolve_stop_token_ids(task="i2t_think", tokenizer=tok) == [
         eos_id,
-        *range(
-            HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_0>"],
-            HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img_ratio_32>"] + 1,
-        ),
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"],
     ]
 
 
-def test_resolve_bot_task_resolves_stop_ids_from_prompt_task():
+def test_resolve_stop_token_ids_uses_trigger_for_generation_tasks():
     tok = FakeTokenizer()
 
     eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
-    assert resolve_bot_task(task="i2t", tokenizer=tok).stop_token_ids == [
-        eos_id,
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<boi>"],
-    ]
-    assert resolve_bot_task(task="i2t_think", tokenizer=tok).stop_token_ids == [
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"],
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</answer>"],
+    assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [eos_id, FakeTokenizer.SPECIAL["<think>"]]
+    assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [
         eos_id,
+        FakeTokenizer.SPECIAL["<recaption>"],
     ]
-    assert resolve_bot_task(task="t2i_vanilla", tokenizer=tok).stop_token_ids == [eos_id]
-
-
-def test_sys_type_for_task_returns_prompt_preset_default():
-    assert sys_type_for_task("i2t_think") == "en_unified"
-    assert sys_type_for_task("t2i_vanilla") == "en_vanilla"
 
 
 @pytest.mark.parametrize(
     "task",
     [
         "t2t",
-        "t2t_think",
         "i2t",
         "i2t_think",
         "it2i_think",
@@ -238,7 +139,7 @@ def test_build_prompt_string_structure_chat_template(task: str):
     # documentation, so substring index() catches the wrong occurrence -- use
     # endswith() which directly captures "trigger is at the tail" (the Part A
     # fix: trigger goes AFTER `Assistant: `, not before user_prompt).
-    if task in ("t2t_think", "i2t_think", "it2i_think", "t2i_think"):
+    if task in ("i2t_think", "it2i_think", "t2i_think"):
         assert s.endswith("Assistant: <think>"), (
             f"Trigger <think> must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}"
         )
@@ -294,31 +195,24 @@ def test_build_prompt_tokens_segments_each_boundary():
 def test_build_prompt_tokens_image_placeholder_present_for_image_tasks():
     tok = FakeTokenizer()
     ids = build_prompt_tokens("hi", tok, task="i2t")
-    assert ids[0] == HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|startoftext|>"], (
-        "BOS (<|startoftext|>) must be the first token"
-    )
-    assert HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img>"] in ids, (
-        "<img> placeholder must be present for i2t/it2i tasks"
-    )
+    assert ids[0] == FakeTokenizer.SPECIAL["<|startoftext|>"], "BOS (<|startoftext|>) must be the first token"
+    assert FakeTokenizer.SPECIAL["<img>"] in ids, "<img> placeholder must be present for i2t/it2i tasks"
 
 
 def test_build_prompt_tokens_no_image_for_text_only_tasks():
     tok = FakeTokenizer()
     ids = build_prompt_tokens("hi", tok, task="t2t")
-    assert HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<img>"] not in ids, (
-        "<img> must NOT appear for text-only tasks"
-    )
+    assert FakeTokenizer.SPECIAL["<img>"] not in ids, "<img> must NOT appear for text-only tasks"
 
 
 @pytest.mark.parametrize(
     "task,trigger_id",
     [
-        ("t2t_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<think>"]),
-        ("i2t_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<think>"]),
-        ("it2i_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<think>"]),
-        ("t2i_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<think>"]),
-        ("it2i_recaption", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<recaption>"]),
-        ("t2i_recaption", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<recaption>"]),
+        ("i2t_think", FakeTokenizer.SPECIAL["<think>"]),
+        ("it2i_think", FakeTokenizer.SPECIAL["<think>"]),
+        ("t2i_think", FakeTokenizer.SPECIAL["<think>"]),
+        ("it2i_recaption", FakeTokenizer.SPECIAL["<recaption>"]),
+        ("t2i_recaption", FakeTokenizer.SPECIAL["<recaption>"]),
     ],
 )
 def test_build_prompt_tokens_trigger_is_last_token(task: str, trigger_id: int):
@@ -333,8 +227,8 @@ def test_build_prompt_tokens_no_trigger_for_plain_tasks():
     tok = FakeTokenizer()
     ids = build_prompt_tokens("hi", tok, task="t2t")
     assert ids[-1] not in {
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<think>"],
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<recaption>"],
+        FakeTokenizer.SPECIAL["<think>"],
+        FakeTokenizer.SPECIAL["<recaption>"],
     }
 
 
@@ -381,10 +275,9 @@ def test_end2end_routes_through_shared_prompt_utils():
         if isinstance(node, ast.ImportFrom) and node.module and node.module.endswith("hunyuan_image3.prompt_utils"):
             imported_from_prompt_utils.update(alias.name for alias in node.names)
     expected_imports = {
-        "available_prompt_bot_tasks",
+        "_TASK_PRESETS",
         "build_prompt_tokens",
-        "resolve_bot_task",
-        "sys_type_for_task",
+        "resolve_stop_token_ids",
     }
     assert expected_imports <= imported_from_prompt_utils, (
         "end2end.py must import the HunyuanImage3 prompt and stop-token helpers from "

From a72f4578ddd2b9ba190a4b1baa6ac20337ddc7e6 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Fri, 8 May 2026 17:26:36 +0800
Subject: [PATCH 18/40] Remove unsupported HunyuanImage3 comprehension think
 tasks

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../offline_inference/hunyuan_image3/end2end.py   | 13 ++++++++++++-
 .../models/hunyuan_image3/test_prompt_utils.py    | 15 +--------------
 .../models/hunyuan_image3/prompt_utils.py         |  7 ++-----
 3 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index ceebc2d3f39..09533a67ff0 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -18,9 +18,9 @@
 from pathlib import Path
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
+    _TASK_PRESETS,
     build_prompt_tokens,
     resolve_stop_token_ids,
-    _TASK_PRESETS
 )
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniPromptType
@@ -51,6 +51,7 @@
     "text2text": "t2t",
 }
 
+
 def parse_args():
     parser = argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.")
     parser.add_argument(
@@ -132,6 +133,16 @@ def main():
     bot_task = args.bot_task
     if bot_task != "auto":
         task = task + "_" + bot_task
+    if task not in _TASK_PRESETS:
+        valid_bot_tasks = {
+            "text2img": ["think", "recaption", "vanilla"],
+            "img2img": ["think", "recaption"],
+            "img2text": ["auto"],
+            "text2text": ["auto"],
+        }[args.modality]
+        raise ValueError(
+            f"--bot-task {bot_task!r} is not supported for {args.modality}. Choose from: {valid_bot_tasks}"
+        )
 
     if args.deploy_config is not None and args.stage_configs_path is not None:
         raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.")
diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 984377f802f..bb24797f44c 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -75,7 +75,6 @@ def test_available_tasks_covers_all_modalities():
     assert tasks >= {
         "t2t",
         "i2t",
-        "i2t_think",
         "it2i_think",
         "it2i_recaption",
         "t2i_think",
@@ -84,16 +83,6 @@ def test_available_tasks_covers_all_modalities():
     }
 
 
-def test_resolve_stop_token_ids_uses_end_think_for_i2t_think():
-    tok = FakeTokenizer()
-
-    eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
-    assert resolve_stop_token_ids(task="i2t_think", tokenizer=tok) == [
-        eos_id,
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"],
-    ]
-
-
 def test_resolve_stop_token_ids_uses_trigger_for_generation_tasks():
     tok = FakeTokenizer()
 
@@ -110,7 +99,6 @@ def test_resolve_stop_token_ids_uses_trigger_for_generation_tasks():
     [
         "t2t",
         "i2t",
-        "i2t_think",
         "it2i_think",
         "it2i_recaption",
         "t2i_think",
@@ -139,7 +127,7 @@ def test_build_prompt_string_structure_chat_template(task: str):
     # documentation, so substring index() catches the wrong occurrence -- use
     # endswith() which directly captures "trigger is at the tail" (the Part A
     # fix: trigger goes AFTER `Assistant: `, not before user_prompt).
-    if task in ("i2t_think", "it2i_think", "t2i_think"):
+    if task in ("it2i_think", "t2i_think"):
         assert s.endswith("Assistant: <think>"), (
             f"Trigger <think> must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}"
         )
@@ -208,7 +196,6 @@ def test_build_prompt_tokens_no_image_for_text_only_tasks():
 @pytest.mark.parametrize(
     "task,trigger_id",
     [
-        ("i2t_think", FakeTokenizer.SPECIAL["<think>"]),
         ("it2i_think", FakeTokenizer.SPECIAL["<think>"]),
         ("t2i_think", FakeTokenizer.SPECIAL["<think>"]),
         ("it2i_recaption", FakeTokenizer.SPECIAL["<recaption>"]),
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 577f8de196e..2ca7f4c77cd 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -50,7 +50,6 @@
 _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
     "t2t": ("en_unified", None, None),
     "i2t": ("en_unified", None, None),
-    "i2t_think": ("en_unified", "think", "<think>"),
     "it2i_think": ("en_unified", "think", "<think>"),
     "it2i_recaption": ("en_unified", "recaption", "<recaption>"),
     "t2i_think": ("en_unified", "think", "<think>"),
@@ -63,6 +62,7 @@ def available_tasks() -> list[str]:
     """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
     return sorted(_TASK_PRESETS)
 
+
 def resolve_stop_token_ids(
     task: str = "it2i_think",
     bot_task: str = "think",
@@ -70,14 +70,11 @@ def resolve_stop_token_ids(
 ):
     _, _, trigger_tag = _TASK_PRESETS[task]
     stop_token_ids = [127957]
-    if task in ("t2t_think", "i2t_think"):
-        stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"])
-    elif trigger_tag:
+    if trigger_tag:
         stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag))
     return stop_token_ids
 
 
-
 def build_prompt(
     user_prompt: str,
     task: str = "it2i_think",

From 596148bf012113b76c460000ffa2140d11677bd0 Mon Sep 17 00:00:00 2001
From: "Y. Fisher" <yukexiong1@huawei.com>
Date: Fri, 8 May 2026 18:41:06 +0800
Subject: [PATCH 19/40] Restore i2t_think preset and add t2i preset

Signed-off-by: Y. Fisher <yukexiong1@huawei.com>
---
 vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index 2ca7f4c77cd..d0137001034 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -50,8 +50,10 @@
 _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
     "t2t": ("en_unified", None, None),
     "i2t": ("en_unified", None, None),
+    "i2t_think": ("en_unified", "think", "<think>"),
     "it2i_think": ("en_unified", "think", "<think>"),
     "it2i_recaption": ("en_unified", "recaption", "<recaption>"),
+    "t2i": ("en_unified", "image", None),
     "t2i_think": ("en_unified", "think", "<think>"),
     "t2i_recaption": ("en_unified", "recaption", "<recaption>"),
     "t2i_vanilla": ("en_vanilla", "image", None),
@@ -62,7 +64,6 @@ def available_tasks() -> list[str]:
     """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
     return sorted(_TASK_PRESETS)
 
-
 def resolve_stop_token_ids(
     task: str = "it2i_think",
     bot_task: str = "think",
@@ -70,11 +71,14 @@ def resolve_stop_token_ids(
 ):
     _, _, trigger_tag = _TASK_PRESETS[task]
     stop_token_ids = [127957]
-    if trigger_tag:
+    if task in ("t2t_think", "i2t_think"):
+        stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"])
+    elif trigger_tag:
         stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag))
     return stop_token_ids
 
 
+
 def build_prompt(
     user_prompt: str,
     task: str = "it2i_think",

From 29e9f945e9287e42546104ce038497c9d75d579a Mon Sep 17 00:00:00 2001
From: "Y. Fisher" <yukexiong1@huawei.com>
Date: Fri, 8 May 2026 18:43:04 +0800
Subject: [PATCH 20/40] Simplify HunyuanImage3 deploy parallel_config

Signed-off-by: Y. Fisher <yukexiong1@huawei.com>
---
 vllm_omni/deploy/hunyuan_image3.yaml | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index b5238169786..505f4ed5919 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -3,11 +3,13 @@
 # (4 GPUs for AR, 4 GPUs for DiT). Platform overrides below fold in the
 # verified NPU/XPU stage configs that previously lived under stage_configs/.
 pipeline: hunyuan_image3
+async_chunk: false
 
 stages:
   - stage_id: 0
     max_num_seqs: 1
     gpu_memory_utilization: 0.9
+    trust_remote_code: true
     enforce_eager: true
     max_num_batched_tokens: 32768
     devices: "0,1,2,3"
@@ -34,18 +36,12 @@ stages:
     cache_config:
     enable_cache_dit_summary: false
     parallel_config:
-      pipeline_parallel_size: 1
-      data_parallel_size: 1
       tensor_parallel_size: 4
       enable_expert_parallel: true
       sequence_parallel_size: 1
       ulysses_degree: 1
-      ring_degree: 1
       cfg_parallel_size: 1
       vae_patch_parallel_size: 1
-      use_hsdp: false
-      hsdp_shard_size: -1
-      hsdp_replicate_size: 1
     default_sampling_params:
       seed: 42
 

From 1ccedc6e26f0535b09c3768f694001c3a20b5e04 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Sat, 9 May 2026 10:52:03 +0800
Subject: [PATCH 21/40] Update HunyuanImage3 stop token handling

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 examples/offline_inference/hunyuan_image3/end2end.py   |  2 +-
 vllm_omni/config/stage_config.py                       |  8 --------
 .../diffusion/models/hunyuan_image3/prompt_utils.py    | 10 ++++------
 vllm_omni/entrypoints/openai/serving_chat.py           |  1 +
 vllm_omni/entrypoints/utils.py                         |  1 -
 5 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 09533a67ff0..7fb267ab6cc 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -96,7 +96,7 @@ def parse_args():
         "--bot-task",
         type=str,
         default="auto",
-        choices=["auto", "think", "recaption", "vanilla"],
+        choices=["auto", "think", "recaption", "think_recaption", "vanilla"],
         help=(
             "Prompt behavior. 'auto' selects the default for the modality; "
             "'think' adds <think>; 'recaption' adds <recaption>; "
diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index 0bd1f2b7f8f..a879a9a0cda 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -1079,14 +1079,6 @@ def create_from_model(
         if model_type and model_type in _PIPELINE_REGISTRY:
             return cls._create_from_registry(model_type, cli_overrides, deploy_config_path)
 
-        if deploy_config_path is not None:
-            deploy_cfg = load_deploy_config(deploy_config_path)
-            if deploy_cfg.pipeline and deploy_cfg.pipeline in _PIPELINE_REGISTRY:
-                return cls._create_from_registry(
-                    deploy_cfg.pipeline,
-                    cli_overrides,
-                    deploy_config_path,
-                )
 
         # --- HF architecture fallback: some models report a generic
         # model_type that collides with another model. Match by the
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index d0137001034..c14ae8ced23 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -50,13 +50,12 @@
 _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
     "t2t": ("en_unified", None, None),
     "i2t": ("en_unified", None, None),
-    "i2t_think": ("en_unified", "think", "<think>"),
     "it2i_think": ("en_unified", "think", "<think>"),
     "it2i_recaption": ("en_unified", "recaption", "<recaption>"),
     "t2i": ("en_unified", "image", None),
+    "t2i_vanilla": ("en_vanilla", "image", None),
     "t2i_think": ("en_unified", "think", "<think>"),
     "t2i_recaption": ("en_unified", "recaption", "<recaption>"),
-    "t2i_vanilla": ("en_vanilla", "image", None),
 }
 
 
@@ -69,12 +68,11 @@ def resolve_stop_token_ids(
     bot_task: str = "think",
     tokenizer: Any | None = None,
 ):
-    _, _, trigger_tag = _TASK_PRESETS[task]
     stop_token_ids = [127957]
-    if task in ("t2t_think", "i2t_think"):
+    if "recaption" in task:
         stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"])
-    elif trigger_tag:
-        stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag))
+    if "think" in task:
+        stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"])
     return stop_token_ids
 
 
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 09b62bf8972..7558e85aaac 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2149,6 +2149,7 @@ def _build_multistage_generation_inputs(
         lora_body = extra_body.get("lora")
         layers = extra_body.get("layers")
         resolution = extra_body.get("resolution")
+        bot_task = extra_body.get("bot_task")
 
         engine_prompt_data: dict[str, Any] | None = None
         modalities = ["image"]
diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py
index 460c6985b0c..d728e76417c 100644
--- a/vllm_omni/entrypoints/utils.py
+++ b/vllm_omni/entrypoints/utils.py
@@ -340,7 +340,6 @@ def resolve_model_config_path(model: str) -> str:
         normalized_model_type = _DIFFUSERS_CLASS_TO_CONFIG[model_type]
     else:
         normalized_model_type = model_type.replace("-", "_")
-
     model_type_str = f"{normalized_model_type}.yaml"
     complete_config_path = PROJECT_ROOT / default_config_path / model_type_str
     if os.path.exists(complete_config_path):

From a63b9ffcdcea2bd13dcbaf3928853e66c984c301 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Sat, 9 May 2026 11:03:57 +0800
Subject: [PATCH 22/40] Fix HunyuanImage3 pre-commit formatting

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 vllm_omni/config/stage_config.py                          | 1 -
 vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py | 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index a879a9a0cda..dcc4d5ec9d6 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -1079,7 +1079,6 @@ def create_from_model(
         if model_type and model_type in _PIPELINE_REGISTRY:
             return cls._create_from_registry(model_type, cli_overrides, deploy_config_path)
 
-
         # --- HF architecture fallback: some models report a generic
         # model_type that collides with another model. Match by the
         # hf_architectures declared on each registered PipelineConfig.
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index c14ae8ced23..d0b9f2ca40f 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -17,12 +17,10 @@
 
 from __future__ import annotations
 
-from dataclasses import dataclass
 from typing import Any
 
 from .system_prompt import get_system_prompt
 
-
 # HunyuanImage-3.0-Instruct special token ids from tokenizer.json.
 # Keep offline AR prompt/stop-token behavior independent of runtime
 # tokenizer lookup for these fixed control tokens.
@@ -63,6 +61,7 @@ def available_tasks() -> list[str]:
     """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`."""
     return sorted(_TASK_PRESETS)
 
+
 def resolve_stop_token_ids(
     task: str = "it2i_think",
     bot_task: str = "think",
@@ -76,7 +75,6 @@ def resolve_stop_token_ids(
     return stop_token_ids
 
 
-
 def build_prompt(
     user_prompt: str,
     task: str = "it2i_think",

From 21e16afa2310825a4fd9d28001a85faca25cb529 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Sat, 9 May 2026 11:07:48 +0800
Subject: [PATCH 23/40] Add HunyuanImage3 KV reuse deploy config

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 vllm_omni/deploy/hunyuan_image3.yaml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index 505f4ed5919..2a331af5186 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -5,6 +5,17 @@
 pipeline: hunyuan_image3
 async_chunk: false
 
+connectors:
+  rdma_connector:
+    name: MooncakeTransferEngineConnector
+    extra:
+      host: "auto"
+      zmq_port: 50051
+      protocol: "rdma"
+      device_name: ""
+      memory_pool_size: 4294967296
+      memory_pool_device: "cpu"
+
 stages:
   - stage_id: 0
     max_num_seqs: 1
@@ -18,6 +29,10 @@ stages:
       rope_parameters:
         mrope_section: [0, 32, 32]
         rope_type: default
+    omni_kv_config:
+      need_send_cache: true
+    output_connectors:
+      to_stage_1: rdma_connector
     default_sampling_params:
       temperature: 0.6
       top_p: 0.95
@@ -35,6 +50,8 @@ stages:
     cache_backend:
     cache_config:
     enable_cache_dit_summary: false
+    omni_kv_config:
+      need_recv_cache: true
     parallel_config:
       tensor_parallel_size: 4
       enable_expert_parallel: true
@@ -44,6 +61,8 @@ stages:
       vae_patch_parallel_size: 1
     default_sampling_params:
       seed: 42
+    input_connectors:
+      from_stage_0: rdma_connector
 
 edges:
   - from: 0

From 6ae5389a89d353c7dca01ef5a035ff8e80b7ba11 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Sat, 9 May 2026 11:14:07 +0800
Subject: [PATCH 24/40] Address HunyuanImage3 deploy path review

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 tests/e2e/offline_inference/test_hunyuanimage3.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/e2e/offline_inference/test_hunyuanimage3.py b/tests/e2e/offline_inference/test_hunyuanimage3.py
index 2a385f6a4c0..ac1cb13cba7 100644
--- a/tests/e2e/offline_inference/test_hunyuanimage3.py
+++ b/tests/e2e/offline_inference/test_hunyuanimage3.py
@@ -1,6 +1,5 @@
 # ruff: noqa: E501
 from collections.abc import Generator
-from pathlib import Path
 
 import pytest
 import torch
@@ -9,6 +8,7 @@
 from transformers import CLIPModel, CLIPProcessor
 
 from tests.helpers.runtime import OmniRunner
+from tests.helpers.stage_config import get_deploy_config_path
 from vllm_omni import Omni
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 from vllm_omni.platforms import current_omni_platform
@@ -16,8 +16,7 @@
 PROMPT = "A brown and white dog is running on the grass"
 MODEL_NAME = "tencent/HunyuanImage-3.0"
 LOCAL_CLIP_PATH = "openai/clip-vit-base-patch32"
-REPO_ROOT = Path(__file__).resolve().parents[3]
-DEPLOY_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml"
+DEPLOY_CONFIG_PATH = get_deploy_config_path("hunyuan_image3.yaml")
 
 pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion]
 

From 02a83784af53577e5d618ec53a67c406518ca3d5 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Sat, 9 May 2026 14:41:14 +0800
Subject: [PATCH 25/40] Limit HunyuanImage3 images per prompt

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 examples/offline_inference/hunyuan_image3/README.md  | 1 +
 examples/offline_inference/hunyuan_image3/end2end.py | 8 ++++++++
 vllm_omni/deploy/hunyuan_image3.yaml                 | 1 +
 vllm_omni/deploy/hunyuan_image3_dit.yaml             | 1 +
 4 files changed, 11 insertions(+)

diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md
index 98908ace0d7..431f081300d 100644
--- a/examples/offline_inference/hunyuan_image3/README.md
+++ b/examples/offline_inference/hunyuan_image3/README.md
@@ -108,6 +108,7 @@ python examples/offline_inference/hunyuan_image3/end2end.py \
 | `--stage-configs-path` | Legacy stage config path, kept only for compatibility. Prefer `--deploy-config`. |
 | `--modality` | Offline-only convenience flag. One of `text2img`, `img2img`, `img2text`, `text2text`. It selects prompt formatting, internal `mode`, and default deploy config for this script. Online serving uses `--deploy-config` plus the endpoint and, for chat completions, request `modalities` instead. |
 | `--steps` | Number of diffusion inference steps for image generation. |
+| `--num-outputs-per-prompt` | Number of images to generate for each prompt. Defaults to `1`. |
 | `--guidance-scale` | Classifier-free guidance scale for image generation. |
 | `--height`, `--width` | Output image size for `text2img`. |
 | `--bot-task` | Prompt behavior. `auto` selects the default from `--modality`; `think` adds `<think>`; `recaption` adds `<recaption>`; `vanilla` uses the text-to-image pretrain template. |
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 7fb267ab6cc..87f5c62a2a0 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -85,6 +85,12 @@ def parse_args():
     parser.add_argument("--seed", type=int, default=42, help="Random seed.")
     parser.add_argument("--height", type=int, default=1024, help="Output image height.")
     parser.add_argument("--width", type=int, default=1024, help="Output image width.")
+    parser.add_argument(
+        "--num-outputs-per-prompt",
+        type=int,
+        default=1,
+        help="Number of images to generate for each prompt.",
+    )
     parser.add_argument(
         "--vae-use-tiling",
         action="store_true",
@@ -232,6 +238,7 @@ def main():
     for sp in params_list:
         if isinstance(sp, OmniDiffusionSamplingParams):
             sp.num_inference_steps = args.steps
+            sp.num_outputs_per_prompt = args.num_outputs_per_prompt
             sp.guidance_scale = args.guidance_scale
             sp.guidance_scale_provided = True
             if args.seed is not None:
@@ -256,6 +263,7 @@ def main():
     print(f"  Num stages: {omni.num_stages}")
     if args.modality in ("text2img", "img2img"):
         print(f"  Inference steps: {args.steps}")
+        print(f"  Outputs per prompt: {args.num_outputs_per_prompt}")
         print(f"  Guidance scale: {args.guidance_scale}")
         print(f"  Seed: {args.seed}")
     if args.modality == "text2img":
diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index 2a331af5186..d49800f72c8 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -61,6 +61,7 @@ stages:
       vae_patch_parallel_size: 1
     default_sampling_params:
       seed: 42
+      num_outputs_per_prompt: 1
     input_connectors:
       from_stage_0: rdma_connector
 
diff --git a/vllm_omni/deploy/hunyuan_image3_dit.yaml b/vllm_omni/deploy/hunyuan_image3_dit.yaml
index 3c0ba190101..3b922df20ea 100644
--- a/vllm_omni/deploy/hunyuan_image3_dit.yaml
+++ b/vllm_omni/deploy/hunyuan_image3_dit.yaml
@@ -31,6 +31,7 @@ stages:
       hsdp_replicate_size: 1
     default_sampling_params:
       seed: 42
+      num_outputs_per_prompt: 1
 
 platforms:
   npu:

From 476a7f03e3191a465ff52fb959a3b90159c14784 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Sat, 9 May 2026 15:43:14 +0800
Subject: [PATCH 26/40] Revert "Limit HunyuanImage3 images per prompt"

This reverts commit dac00c4b7a7b24dd5e2fbfa987062a0bb9dcc3be.

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 examples/offline_inference/hunyuan_image3/README.md  | 1 -
 examples/offline_inference/hunyuan_image3/end2end.py | 8 --------
 vllm_omni/deploy/hunyuan_image3.yaml                 | 1 -
 vllm_omni/deploy/hunyuan_image3_dit.yaml             | 1 -
 4 files changed, 11 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md
index 431f081300d..98908ace0d7 100644
--- a/examples/offline_inference/hunyuan_image3/README.md
+++ b/examples/offline_inference/hunyuan_image3/README.md
@@ -108,7 +108,6 @@ python examples/offline_inference/hunyuan_image3/end2end.py \
 | `--stage-configs-path` | Legacy stage config path, kept only for compatibility. Prefer `--deploy-config`. |
 | `--modality` | Offline-only convenience flag. One of `text2img`, `img2img`, `img2text`, `text2text`. It selects prompt formatting, internal `mode`, and default deploy config for this script. Online serving uses `--deploy-config` plus the endpoint and, for chat completions, request `modalities` instead. |
 | `--steps` | Number of diffusion inference steps for image generation. |
-| `--num-outputs-per-prompt` | Number of images to generate for each prompt. Defaults to `1`. |
 | `--guidance-scale` | Classifier-free guidance scale for image generation. |
 | `--height`, `--width` | Output image size for `text2img`. |
 | `--bot-task` | Prompt behavior. `auto` selects the default from `--modality`; `think` adds `<think>`; `recaption` adds `<recaption>`; `vanilla` uses the text-to-image pretrain template. |
diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 87f5c62a2a0..7fb267ab6cc 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -85,12 +85,6 @@ def parse_args():
     parser.add_argument("--seed", type=int, default=42, help="Random seed.")
     parser.add_argument("--height", type=int, default=1024, help="Output image height.")
     parser.add_argument("--width", type=int, default=1024, help="Output image width.")
-    parser.add_argument(
-        "--num-outputs-per-prompt",
-        type=int,
-        default=1,
-        help="Number of images to generate for each prompt.",
-    )
     parser.add_argument(
         "--vae-use-tiling",
         action="store_true",
@@ -238,7 +232,6 @@ def main():
     for sp in params_list:
         if isinstance(sp, OmniDiffusionSamplingParams):
             sp.num_inference_steps = args.steps
-            sp.num_outputs_per_prompt = args.num_outputs_per_prompt
             sp.guidance_scale = args.guidance_scale
             sp.guidance_scale_provided = True
             if args.seed is not None:
@@ -263,7 +256,6 @@ def main():
     print(f"  Num stages: {omni.num_stages}")
     if args.modality in ("text2img", "img2img"):
         print(f"  Inference steps: {args.steps}")
-        print(f"  Outputs per prompt: {args.num_outputs_per_prompt}")
         print(f"  Guidance scale: {args.guidance_scale}")
         print(f"  Seed: {args.seed}")
     if args.modality == "text2img":
diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index d49800f72c8..2a331af5186 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -61,7 +61,6 @@ stages:
       vae_patch_parallel_size: 1
     default_sampling_params:
       seed: 42
-      num_outputs_per_prompt: 1
     input_connectors:
       from_stage_0: rdma_connector
 
diff --git a/vllm_omni/deploy/hunyuan_image3_dit.yaml b/vllm_omni/deploy/hunyuan_image3_dit.yaml
index 3b922df20ea..3c0ba190101 100644
--- a/vllm_omni/deploy/hunyuan_image3_dit.yaml
+++ b/vllm_omni/deploy/hunyuan_image3_dit.yaml
@@ -31,7 +31,6 @@ stages:
       hsdp_replicate_size: 1
     default_sampling_params:
       seed: 42
-      num_outputs_per_prompt: 1
 
 platforms:
   npu:

From 8f594ee92243e9baa01f96549b638335048511b1 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Sat, 9 May 2026 16:21:33 +0800
Subject: [PATCH 27/40] Fix HunyuanImage3 stop token mapping

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index d0b9f2ca40f..c975bc3ab61 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -69,9 +69,9 @@ def resolve_stop_token_ids(
 ):
     stop_token_ids = [127957]
     if "recaption" in task:
-        stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"])
-    if "think" in task:
         stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"])
+    if "think" in task:
+        stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"])
     return stop_token_ids
 
 

From 5c03b7ca807c7df0c708dc1dd1f45619f8668104 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Sun, 10 May 2026 14:19:46 +0800
Subject: [PATCH 28/40] Enable model sampler for NPU AR runner

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../npu/worker/npu_ar_model_runner.py         | 78 +++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
index 8cff1849aa5..a4acd421cf8 100644
--- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
+++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
@@ -5,6 +5,7 @@
 
 import time
 from copy import copy, deepcopy
+from dataclasses import replace
 from typing import Any, NamedTuple
 
 import numpy as np
@@ -92,6 +93,83 @@ def _make_buffer(self, *size, dtype, numpy=True):
         with maybe_disable_pin_memory_for_ray(self, total_bytes):
             return super()._make_buffer(*size, dtype=dtype, numpy=numpy)
 
+    def _build_model_sampler_output_token_ids(self) -> list[list[int]]:
+        """Build decoded-token history for custom model samplers.
+
+        vLLM only populates sampling_metadata.output_token_ids when penalties or
+        logits processors require it. HunyuanImage3's custom sampler needs this
+        history to force transitions such as </think> -> <recaption>, so mirror
+        the GPU AR runner behavior for prefer_model_sampler models.
+        """
+        req_output_token_ids = getattr(self.input_batch, "req_output_token_ids", [])
+        req_ids = list(getattr(self.input_batch, "req_ids", []))
+        output_token_ids = [list(req_output_token_ids[idx] or []) for idx in range(len(req_ids))]  # list() copies
+
+        sampled_token_ids_cpu = getattr(self.input_batch, "sampled_token_ids_cpu", None)
+        async_copy_ready_event = getattr(self.input_batch, "async_copy_ready_event", None)
+        prev_req_id_to_index = getattr(self.input_batch, "prev_req_id_to_index", None)
+        if sampled_token_ids_cpu is None or not output_token_ids or prev_req_id_to_index is None:
+            return output_token_ids  # nothing to back-fill from; return the copies as-is
+
+        sampled_token_ids: list[list[int]] | None = None  # converted lazily, at most once
+        for index, req_id in enumerate(req_ids):
+            prev_index = prev_req_id_to_index.get(req_id)
+            if prev_index is None:
+                continue  # req_id is not present in prev_req_id_to_index
+            req_history = output_token_ids[index]
+            if not req_history or req_history[-1] != -1:
+                continue  # history is empty or does not end in a -1 placeholder
+            if sampled_token_ids is None:  # first request that needs the sampled ids
+                assert async_copy_ready_event is not None
+                async_copy_ready_event.synchronize()  # presumably guards sampled_token_ids_cpu; confirm
+                sampled_token_ids = sampled_token_ids_cpu.tolist()
+            new_ids = list(sampled_token_ids[prev_index])
+            if not new_ids:
+                continue  # empty row: nothing to copy in
+            num_sampled_ids = len(new_ids) if new_ids[-1] != -1 else new_ids.index(-1)  # cut at first -1 if padded
+            first_placeholder = req_history.index(-1)
+            num_placeholders = len(req_history) - first_placeholder
+            num_to_replace = min(num_sampled_ids, num_placeholders)  # keeps len(req_history) unchanged
+            req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace]
+
+        return output_token_ids
+
+    def _sampling_metadata_for_model_sampler(self, sampling_metadata):
+        output_token_ids = self._build_model_sampler_output_token_ids()  # rebuilt history for the model sampler
+        if output_token_ids == sampling_metadata.output_token_ids:
+            return sampling_metadata  # history already matches; reuse the object unchanged
+        return replace(sampling_metadata, output_token_ids=output_token_ids)  # new object; input not mutated
+
+    def _sample(
+        self,
+        logits: torch.Tensor | None,
+        spec_decode_metadata: Any,
+    ):
+        sampling_metadata = self.input_batch.sampling_metadata
+        if spec_decode_metadata is None:  # otherwise defer to super()._sample at the bottom
+            model_sample = getattr(self.model, "sample", None)  # optional sampler provided by the model
+            if logits is not None and callable(model_sample) and getattr(self.model, "prefer_model_sampler", False):
+                if hasattr(self.sampler, "logit_bias_state"):  # applied only when the sampler exposes it
+                    self.sampler.logit_bias_state.apply_logit_bias(
+                        logits,
+                        self.input_batch.expanded_idx_mapping,
+                        self.input_batch.idx_mapping_np,
+                        self.input_batch.positions[self.input_batch.logits_indices],
+                    )
+                sampler_output = model_sample(
+                    logits,
+                    self._sampling_metadata_for_model_sampler(sampling_metadata),  # metadata with rebuilt history
+                )
+                if sampler_output is not None:  # a None result falls through to self.sampler below
+                    return sampler_output
+            self.input_batch.update_async_output_token_ids()  # fallback path only; skipped on early return
+            return self.sampler(
+                logits=logits,
+                sampling_metadata=sampling_metadata,  # the unmodified batch metadata, not the rebuilt copy
+            )
+
+        return super()._sample(logits, spec_decode_metadata)  # spec_decode_metadata given: parent handles it
+
     #  -------------------------------------- Omni-new -------------------------------------------------
     def capture_model(self) -> int:
         npugraph_memory_bytes = super().capture_model()

From 32ea60f470cbcf08c96ce27ae7a26f445f5174ee Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Sun, 10 May 2026 14:31:25 +0800
Subject: [PATCH 29/40] Update HunyuanImage3 KV reuse deploy config

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 vllm_omni/deploy/hunyuan_image3.yaml | 59 +++++++++++++---------------
 1 file changed, 28 insertions(+), 31 deletions(-)

diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index 2a331af5186..775b0c0f34a 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -1,7 +1,7 @@
-# HunyuanImage-3.0-Instruct deploy: AR (stage 0) + DiT (stage 1).
-# The base CUDA layout follows the existing 8-GPU AR->DiT config
-# (4 GPUs for AR, 4 GPUs for DiT). Platform overrides below fold in the
-# verified NPU/XPU stage configs that previously lived under stage_configs/.
+# HunyuanImage-3.0-Instruct deploy: AR (stage 0) + DiT (stage 1)
+# with AR-to-DiT KV reuse. The base CUDA layout follows
+# model_executor/stage_configs/hunyuan_image3_it2i_kv_reuse.yaml from
+# PR #3346: 2 GPUs for AR and 2 GPUs for DiT.
 pipeline: hunyuan_image3
 async_chunk: false
 
@@ -19,12 +19,11 @@ connectors:
 stages:
   - stage_id: 0
     max_num_seqs: 1
-    gpu_memory_utilization: 0.9
-    trust_remote_code: true
+    gpu_memory_utilization: 0.95
     enforce_eager: true
     max_num_batched_tokens: 32768
-    devices: "0,1,2,3"
-    tensor_parallel_size: 4
+    devices: "0,1"
+    tensor_parallel_size: 2
     hf_overrides:
       rope_parameters:
         mrope_section: [0, 32, 32]
@@ -34,35 +33,29 @@ stages:
     output_connectors:
       to_stage_1: rdma_connector
     default_sampling_params:
-      temperature: 0.6
-      top_p: 0.95
-      top_k: 1024
-      max_tokens: 4096
-      detokenize: false
+      temperature: 0.0
+      top_p: 1
+      top_k: -1
+      max_tokens: 8192
+      stop_token_ids: [128025]
+      detokenize: true
+      skip_special_tokens: false
 
   - stage_id: 1
     max_num_seqs: 1
-    gpu_memory_utilization: 0.9
     enforce_eager: true
-    devices: "4,5,6,7"
-    vae_use_slicing: false
-    vae_use_tiling: false
-    cache_backend:
-    cache_config:
-    enable_cache_dit_summary: false
+    devices: "2,3"
+    distributed_executor_backend: "mp"
     omni_kv_config:
       need_recv_cache: true
     parallel_config:
-      tensor_parallel_size: 4
+      tensor_parallel_size: 2
       enable_expert_parallel: true
-      sequence_parallel_size: 1
-      ulysses_degree: 1
-      cfg_parallel_size: 1
-      vae_patch_parallel_size: 1
-    default_sampling_params:
-      seed: 42
     input_connectors:
       from_stage_0: rdma_connector
+    default_sampling_params:
+      num_inference_steps: 50
+      guidance_scale: 0
 
 edges:
   - from: 0
@@ -75,15 +68,15 @@ platforms:
     stages:
       - stage_id: 0
         gpu_memory_utilization: 0.65
-        devices: "0,1,2,3"
-        tensor_parallel_size: 4
+        devices: "0,1,2,3,4,5,6,7"
+        tensor_parallel_size: 8
       - stage_id: 1
         gpu_memory_utilization: 0.65
-        devices: "4,5,6,7"
+        devices: "8,9,10,11"
         max_num_batched_tokens: 32768
         parallel_config:
           tensor_parallel_size: 4
-          enable_expert_parallel: false
+          enable_expert_parallel: true
 
   xpu:
     stages:
@@ -95,6 +88,10 @@ platforms:
         quantization: fp8
         enable_expert_parallel: true
         worker_cls: vllm_omni.platforms.xpu.worker.xpu_ar_worker.XPUARWorker
+        default_sampling_params:
+          max_tokens: 2048
+          seed: 42
+          repetition_penalty: 1.1
       - stage_id: 1
         gpu_memory_utilization: 0.9
         devices: "0,1,2,3,4,5,6,7"

From c7643df0891b08d406e3b5e826bda1a2c5f09318 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Sun, 10 May 2026 15:27:44 +0800
Subject: [PATCH 30/40] Fix HunyuanImage3 stop token unit test

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 tests/diffusion/models/hunyuan_image3/test_prompt_utils.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index bb24797f44c..a9d570936bf 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -55,6 +55,7 @@ class FakeTokenizer:
         "</recaption>": 6,
         "</answer>": 7,
         "<boi>": 8,
+        "</think>": 9,
         **{f"<img_ratio_{i}>": 1000 + i for i in range(33)},
     }
 
@@ -83,14 +84,14 @@ def test_available_tasks_covers_all_modalities():
     }
 
 
-def test_resolve_stop_token_ids_uses_trigger_for_generation_tasks():
+def test_resolve_stop_token_ids_uses_end_tags_for_generation_tasks():
     tok = FakeTokenizer()
 
     eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
-    assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [eos_id, FakeTokenizer.SPECIAL["<think>"]]
+    assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [eos_id, FakeTokenizer.SPECIAL["</think>"]]
     assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [
         eos_id,
-        FakeTokenizer.SPECIAL["<recaption>"],
+        FakeTokenizer.SPECIAL["</recaption>"],
     ]
 
 

From 553bd8b9623394bcb847bb3e3ef8970b9608beeb Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Sun, 10 May 2026 16:26:26 +0800
Subject: [PATCH 31/40] Update HunyuanImage3 deploy config

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 vllm_omni/deploy/hunyuan_image3.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index 775b0c0f34a..b414f6eb78a 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -4,6 +4,7 @@
 # PR #3346: 2 GPUs for AR and 2 GPUs for DiT.
 pipeline: hunyuan_image3
 async_chunk: false
+trust_remote_code: true
 
 connectors:
   rdma_connector:
@@ -67,7 +68,7 @@ platforms:
   npu:
     stages:
       - stage_id: 0
-        gpu_memory_utilization: 0.65
+        gpu_memory_utilization: 0.75
         devices: "0,1,2,3,4,5,6,7"
         tensor_parallel_size: 8
       - stage_id: 1

From badd206bac720377378039704a34edbea5f209d5 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Sun, 10 May 2026 17:22:00 +0800
Subject: [PATCH 32/40] Fix HunyuanImage3 stop token test ids

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 tests/diffusion/models/hunyuan_image3/test_prompt_utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index a9d570936bf..83280ebefd5 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -88,10 +88,13 @@ def test_resolve_stop_token_ids_uses_end_tags_for_generation_tasks():
     tok = FakeTokenizer()
 
     eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
-    assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [eos_id, FakeTokenizer.SPECIAL["</think>"]]
+    assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [
+        eos_id,
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"],
+    ]
     assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [
         eos_id,
-        FakeTokenizer.SPECIAL["</recaption>"],
+        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"],
     ]
 
 

From 3975e50c552bfbf26dc7c838fc4df86d344812a4 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Mon, 11 May 2026 15:05:47 +0800
Subject: [PATCH 33/40] Print HunyuanImage3 AR generated text

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 examples/offline_inference/hunyuan_image3/end2end.py  | 11 +++++++++--
 .../models/hunyuan_image3/pipeline_hunyuan_image3.py  |  4 ++++
 .../diffusion/models/hunyuan_image3/prompt_utils.py   |  2 +-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py
index 7fb267ab6cc..0f41150b65d 100644
--- a/examples/offline_inference/hunyuan_image3/end2end.py
+++ b/examples/offline_inference/hunyuan_image3/end2end.py
@@ -273,10 +273,17 @@ def main():
     for req_output in omni_outputs:
         # Text output (AR stage or text-only)
         ro = getattr(req_output, "request_output", None)
+        txt = ""
         if ro and getattr(ro, "outputs", None):
             txt = "".join(getattr(o, "text", "") or "" for o in ro.outputs)
-            if txt:
-                print(f"[Output] Text:\n{txt}")
+        if not txt:
+            ar_text = getattr(req_output, "custom_output", {}).get("ar_generated_text")
+            if isinstance(ar_text, list):
+                txt = "\n".join(text for text in ar_text if text)
+            else:
+                txt = ar_text or ""
+        if txt:
+            print(f"[Output] Text:\n{txt}")
 
         # Image output (DiT stage)
         images = getattr(req_output, "images", None)
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
index 7a8be07456d..5c0ef163506 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py
@@ -1424,7 +1424,11 @@ def forward(
         model_inputs.update(ar_kv_kwargs)
 
         outputs = self._generate(**model_inputs, **kwargs)
+        custom_output = {}
+        if any(t is not None for t in cot_text_list):
+            custom_output["ar_generated_text"] = cot_text_list[0] if len(cot_text_list) == 1 else cot_text_list
         return DiffusionOutput(
             output=outputs[0],
+            custom_output=custom_output,
             stage_durations=self.stage_durations if hasattr(self, "stage_durations") else None,
         )
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index c975bc3ab61..a8b2c743a82 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -67,7 +67,7 @@ def resolve_stop_token_ids(
     bot_task: str = "think",
     tokenizer: Any | None = None,
 ):
-    stop_token_ids = [127957]
+    stop_token_ids = [128025]
     if "recaption" in task:
         stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"])
     if "think" in task:

From 015b34ffb82ec33c16a4aa9ed745fa4c124867c3 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Mon, 11 May 2026 15:20:37 +0800
Subject: [PATCH 34/40] Preserve HunyuanImage3 AR tag output

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../models/hunyuan_image3/test_prompt_utils.py     | 14 ++++----------
 .../models/hunyuan_image3/prompt_utils.py          |  7 +------
 2 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
index 83280ebefd5..f899e5a65d1 100644
--- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
+++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py
@@ -84,18 +84,12 @@ def test_available_tasks_covers_all_modalities():
     }
 
 
-def test_resolve_stop_token_ids_uses_end_tags_for_generation_tasks():
+def test_resolve_stop_token_ids_uses_answer_for_generation_tasks():
     tok = FakeTokenizer()
 
-    eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"]
-    assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [
-        eos_id,
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"],
-    ]
-    assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [
-        eos_id,
-        HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"],
-    ]
+    answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]
+    assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [answer_id]
+    assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [answer_id]
 
 
 @pytest.mark.parametrize(
diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
index a8b2c743a82..bfc0146d8e8 100644
--- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
+++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py
@@ -67,12 +67,7 @@ def resolve_stop_token_ids(
     bot_task: str = "think",
     tokenizer: Any | None = None,
 ):
-    stop_token_ids = [128025]
-    if "recaption" in task:
-        stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</recaption>"])
-    if "think" in task:
-        stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["</think>"])
-    return stop_token_ids
+    return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<answer>"]]
 
 
 def build_prompt(

From 4f6b5732fb5fa7705c28866fe9f87f5d11dceba6 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Mon, 11 May 2026 15:38:42 +0800
Subject: [PATCH 35/40] Fix HunyuanImage3 NPU AR output flow

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 vllm_omni/deploy/hunyuan_image3.yaml            | 17 ++++++++++++-----
 .../platforms/npu/worker/npu_ar_model_runner.py | 16 ++++++++++------
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index b414f6eb78a..fce8d71dc58 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -16,9 +16,16 @@ connectors:
       device_name: ""
       memory_pool_size: 4294967296
       memory_pool_device: "cpu"
+  shared_memory_connector:
+    name: SharedMemoryConnector
+    extra:
+      shm_threshold_bytes: 65536
 
 stages:
   - stage_id: 0
+    is_comprehension: false
+    final_output: true
+    final_output_type: text
     max_num_seqs: 1
     gpu_memory_utilization: 0.95
     enforce_eager: true
@@ -32,13 +39,12 @@ stages:
     omni_kv_config:
       need_send_cache: true
     output_connectors:
-      to_stage_1: rdma_connector
+      to_stage_1: shared_memory_connector
     default_sampling_params:
       temperature: 0.0
       top_p: 1
       top_k: -1
       max_tokens: 8192
-      stop_token_ids: [128025]
       detokenize: true
       skip_special_tokens: false
 
@@ -53,7 +59,7 @@ stages:
       tensor_parallel_size: 2
       enable_expert_parallel: true
     input_connectors:
-      from_stage_0: rdma_connector
+      from_stage_0: shared_memory_connector
     default_sampling_params:
       num_inference_steps: 50
       guidance_scale: 0
@@ -68,13 +74,14 @@ platforms:
   npu:
     stages:
       - stage_id: 0
-        gpu_memory_utilization: 0.75
+        gpu_memory_utilization: 0.6
         devices: "0,1,2,3,4,5,6,7"
         tensor_parallel_size: 8
+        max_num_batched_tokens: 8192
       - stage_id: 1
         gpu_memory_utilization: 0.65
         devices: "8,9,10,11"
-        max_num_batched_tokens: 32768
+        max_num_batched_tokens: 8192
         parallel_config:
           tensor_parallel_size: 4
           enable_expert_parallel: true
diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
index a4acd421cf8..e78e57101f5 100644
--- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
+++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
@@ -117,8 +117,6 @@ def _build_model_sampler_output_token_ids(self) -> list[list[int]]:
             if prev_index is None:
                 continue
             req_history = output_token_ids[index]
-            if not req_history or req_history[-1] != -1:
-                continue
             if sampled_token_ids is None:
                 assert async_copy_ready_event is not None
                 async_copy_ready_event.synchronize()
@@ -127,10 +125,16 @@ def _build_model_sampler_output_token_ids(self) -> list[list[int]]:
             if not new_ids:
                 continue
             num_sampled_ids = len(new_ids) if new_ids[-1] != -1 else new_ids.index(-1)
-            first_placeholder = req_history.index(-1)
-            num_placeholders = len(req_history) - first_placeholder
-            num_to_replace = min(num_sampled_ids, num_placeholders)
-            req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace]
+            new_ids = new_ids[:num_sampled_ids]
+            if not new_ids:
+                continue
+            if req_history and req_history[-1] == -1:
+                first_placeholder = req_history.index(-1)
+                num_placeholders = len(req_history) - first_placeholder
+                num_to_replace = min(len(new_ids), num_placeholders)
+                req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace]
+            elif req_history[-len(new_ids) :] != new_ids:
+                req_history.extend(new_ids)
 
         return output_token_ids
 

From 4807452a3794416821226c99459c7dffd0860c7d Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Mon, 11 May 2026 16:04:30 +0800
Subject: [PATCH 36/40] Fix NPU AR sampler history fallback

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 vllm_omni/platforms/npu/worker/npu_ar_model_runner.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
index e78e57101f5..4962fe728fa 100644
--- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
+++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
@@ -108,12 +108,12 @@ def _build_model_sampler_output_token_ids(self) -> list[list[int]]:
         sampled_token_ids_cpu = getattr(self.input_batch, "sampled_token_ids_cpu", None)
         async_copy_ready_event = getattr(self.input_batch, "async_copy_ready_event", None)
         prev_req_id_to_index = getattr(self.input_batch, "prev_req_id_to_index", None)
-        if sampled_token_ids_cpu is None or not output_token_ids or prev_req_id_to_index is None:
+        if sampled_token_ids_cpu is None or not output_token_ids:
             return output_token_ids
 
         sampled_token_ids: list[list[int]] | None = None
         for index, req_id in enumerate(req_ids):
-            prev_index = prev_req_id_to_index.get(req_id)
+            prev_index = prev_req_id_to_index.get(req_id) if prev_req_id_to_index is not None else index
             if prev_index is None:
                 continue
             req_history = output_token_ids[index]
@@ -121,6 +121,8 @@ def _build_model_sampler_output_token_ids(self) -> list[list[int]]:
                 assert async_copy_ready_event is not None
                 async_copy_ready_event.synchronize()
                 sampled_token_ids = sampled_token_ids_cpu.tolist()
+            if prev_index >= len(sampled_token_ids):
+                continue
             new_ids = list(sampled_token_ids[prev_index])
             if not new_ids:
                 continue

From a0dd770b796e491d9f9ddbeb887cc9bd20a2b14a Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Mon, 11 May 2026 16:36:15 +0800
Subject: [PATCH 37/40] Revert NPU AR sampler history fallback

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../npu/worker/npu_ar_model_runner.py         | 22 +++++++------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
index 4962fe728fa..a4acd421cf8 100644
--- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
+++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
@@ -108,35 +108,29 @@ def _build_model_sampler_output_token_ids(self) -> list[list[int]]:
         sampled_token_ids_cpu = getattr(self.input_batch, "sampled_token_ids_cpu", None)
         async_copy_ready_event = getattr(self.input_batch, "async_copy_ready_event", None)
         prev_req_id_to_index = getattr(self.input_batch, "prev_req_id_to_index", None)
-        if sampled_token_ids_cpu is None or not output_token_ids:
+        if sampled_token_ids_cpu is None or not output_token_ids or prev_req_id_to_index is None:
             return output_token_ids
 
         sampled_token_ids: list[list[int]] | None = None
         for index, req_id in enumerate(req_ids):
-            prev_index = prev_req_id_to_index.get(req_id) if prev_req_id_to_index is not None else index
+            prev_index = prev_req_id_to_index.get(req_id)
             if prev_index is None:
                 continue
             req_history = output_token_ids[index]
+            if not req_history or req_history[-1] != -1:
+                continue
             if sampled_token_ids is None:
                 assert async_copy_ready_event is not None
                 async_copy_ready_event.synchronize()
                 sampled_token_ids = sampled_token_ids_cpu.tolist()
-            if prev_index >= len(sampled_token_ids):
-                continue
             new_ids = list(sampled_token_ids[prev_index])
             if not new_ids:
                 continue
             num_sampled_ids = len(new_ids) if new_ids[-1] != -1 else new_ids.index(-1)
-            new_ids = new_ids[:num_sampled_ids]
-            if not new_ids:
-                continue
-            if req_history and req_history[-1] == -1:
-                first_placeholder = req_history.index(-1)
-                num_placeholders = len(req_history) - first_placeholder
-                num_to_replace = min(len(new_ids), num_placeholders)
-                req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace]
-            elif req_history[-len(new_ids) :] != new_ids:
-                req_history.extend(new_ids)
+            first_placeholder = req_history.index(-1)
+            num_placeholders = len(req_history) - first_placeholder
+            num_to_replace = min(num_sampled_ids, num_placeholders)
+            req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace]
 
         return output_token_ids
 

From 64a65c7c9b960b52b61e360df069cd6d7cd7583d Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Mon, 11 May 2026 18:44:33 +0800
Subject: [PATCH 38/40] Revert NPU AR model sampler override

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 .../npu/worker/npu_ar_model_runner.py         | 78 -------------------
 1 file changed, 78 deletions(-)

diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
index a4acd421cf8..8cff1849aa5 100644
--- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
+++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py
@@ -5,7 +5,6 @@
 
 import time
 from copy import copy, deepcopy
-from dataclasses import replace
 from typing import Any, NamedTuple
 
 import numpy as np
@@ -93,83 +92,6 @@ def _make_buffer(self, *size, dtype, numpy=True):
         with maybe_disable_pin_memory_for_ray(self, total_bytes):
             return super()._make_buffer(*size, dtype=dtype, numpy=numpy)
 
-    def _build_model_sampler_output_token_ids(self) -> list[list[int]]:
-        """Build decoded-token history for custom model samplers.
-
-        vLLM only populates sampling_metadata.output_token_ids when penalties or
-        logits processors require it. HunyuanImage3's custom sampler needs this
-        history to force transitions such as </think> -> <recaption>, so mirror
-        the GPU AR runner behavior for prefer_model_sampler models.
-        """
-        req_output_token_ids = getattr(self.input_batch, "req_output_token_ids", [])
-        req_ids = list(getattr(self.input_batch, "req_ids", []))
-        output_token_ids = [list(req_output_token_ids[idx] or []) for idx in range(len(req_ids))]
-
-        sampled_token_ids_cpu = getattr(self.input_batch, "sampled_token_ids_cpu", None)
-        async_copy_ready_event = getattr(self.input_batch, "async_copy_ready_event", None)
-        prev_req_id_to_index = getattr(self.input_batch, "prev_req_id_to_index", None)
-        if sampled_token_ids_cpu is None or not output_token_ids or prev_req_id_to_index is None:
-            return output_token_ids
-
-        sampled_token_ids: list[list[int]] | None = None
-        for index, req_id in enumerate(req_ids):
-            prev_index = prev_req_id_to_index.get(req_id)
-            if prev_index is None:
-                continue
-            req_history = output_token_ids[index]
-            if not req_history or req_history[-1] != -1:
-                continue
-            if sampled_token_ids is None:
-                assert async_copy_ready_event is not None
-                async_copy_ready_event.synchronize()
-                sampled_token_ids = sampled_token_ids_cpu.tolist()
-            new_ids = list(sampled_token_ids[prev_index])
-            if not new_ids:
-                continue
-            num_sampled_ids = len(new_ids) if new_ids[-1] != -1 else new_ids.index(-1)
-            first_placeholder = req_history.index(-1)
-            num_placeholders = len(req_history) - first_placeholder
-            num_to_replace = min(num_sampled_ids, num_placeholders)
-            req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace]
-
-        return output_token_ids
-
-    def _sampling_metadata_for_model_sampler(self, sampling_metadata):
-        output_token_ids = self._build_model_sampler_output_token_ids()
-        if output_token_ids == sampling_metadata.output_token_ids:
-            return sampling_metadata
-        return replace(sampling_metadata, output_token_ids=output_token_ids)
-
-    def _sample(
-        self,
-        logits: torch.Tensor | None,
-        spec_decode_metadata: Any,
-    ):
-        sampling_metadata = self.input_batch.sampling_metadata
-        if spec_decode_metadata is None:
-            model_sample = getattr(self.model, "sample", None)
-            if logits is not None and callable(model_sample) and getattr(self.model, "prefer_model_sampler", False):
-                if hasattr(self.sampler, "logit_bias_state"):
-                    self.sampler.logit_bias_state.apply_logit_bias(
-                        logits,
-                        self.input_batch.expanded_idx_mapping,
-                        self.input_batch.idx_mapping_np,
-                        self.input_batch.positions[self.input_batch.logits_indices],
-                    )
-                sampler_output = model_sample(
-                    logits,
-                    self._sampling_metadata_for_model_sampler(sampling_metadata),
-                )
-                if sampler_output is not None:
-                    return sampler_output
-            self.input_batch.update_async_output_token_ids()
-            return self.sampler(
-                logits=logits,
-                sampling_metadata=sampling_metadata,
-            )
-
-        return super()._sample(logits, spec_decode_metadata)
-
     #  -------------------------------------- Omni-new -------------------------------------------------
     def capture_model(self) -> int:
         npugraph_memory_bytes = super().capture_model()

From 6d9b2f91c82023dc218461c06e1315f42b4c50b5 Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Mon, 11 May 2026 18:49:37 +0800
Subject: [PATCH 39/40] Adjust HunyuanImage3 NPU stage 0 batching

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 vllm_omni/deploy/hunyuan_image3.yaml    | 8 ++++----
 vllm_omni/deploy/hunyuan_image3_ar.yaml | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index fce8d71dc58..bbcf78a16b4 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -74,13 +74,13 @@ platforms:
   npu:
     stages:
       - stage_id: 0
-        gpu_memory_utilization: 0.6
-        devices: "0,1,2,3,4,5,6,7"
-        tensor_parallel_size: 8
+        gpu_memory_utilization: 0.8
+        devices: "0,1,2,3"
+        tensor_parallel_size: 4
         max_num_batched_tokens: 8192
       - stage_id: 1
         gpu_memory_utilization: 0.65
-        devices: "8,9,10,11"
+        devices: "4,5,6,7"
         max_num_batched_tokens: 8192
         parallel_config:
           tensor_parallel_size: 4
diff --git a/vllm_omni/deploy/hunyuan_image3_ar.yaml b/vllm_omni/deploy/hunyuan_image3_ar.yaml
index 27cbf0f9a60..a59fbfcc95f 100644
--- a/vllm_omni/deploy/hunyuan_image3_ar.yaml
+++ b/vllm_omni/deploy/hunyuan_image3_ar.yaml
@@ -33,6 +33,7 @@ platforms:
         gpu_memory_utilization: 0.75
         devices: "0,1,2,3,4,5,6,7"
         tensor_parallel_size: 8
+        max_num_batched_tokens: 8192
 
   xpu:
     stages:

From 2b44288e7a0ed262d1a8876833f982c4284b0edd Mon Sep 17 00:00:00 2001
From: KexiongYu <yukexiong1@huawei.com>
Date: Mon, 11 May 2026 19:40:46 +0800
Subject: [PATCH 40/40] Remove legacy HunyuanImage3 stage config

Signed-off-by: KexiongYu <yukexiong1@huawei.com>
---
 vllm_omni/deploy/hunyuan_image3.yaml          |  5 +-
 .../hunyuan_image3_it2i_kv_reuse.yaml         | 89 -------------------
 2 files changed, 2 insertions(+), 92 deletions(-)
 delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i_kv_reuse.yaml

diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml
index bbcf78a16b4..634165cd33a 100644
--- a/vllm_omni/deploy/hunyuan_image3.yaml
+++ b/vllm_omni/deploy/hunyuan_image3.yaml
@@ -1,7 +1,6 @@
 # HunyuanImage-3.0-Instruct deploy: AR (stage 0) + DiT (stage 1)
-# with AR-to-DiT KV reuse. The base CUDA layout follows
-# model_executor/stage_configs/hunyuan_image3_it2i_kv_reuse.yaml from
-# PR #3346: 2 GPUs for AR and 2 GPUs for DiT.
+# with AR-to-DiT KV reuse. The base CUDA layout uses 2 GPUs for AR
+# and 2 GPUs for DiT.
 pipeline: hunyuan_image3
 async_chunk: false
 trust_remote_code: true
diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i_kv_reuse.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i_kv_reuse.yaml
deleted file mode 100644
index 23c3bbbb262..00000000000
--- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i_kv_reuse.yaml
+++ /dev/null
@@ -1,89 +0,0 @@
-stage_args:
-  # Stage 0: AR Model
-  - stage_id: 0
-    stage_type: llm
-    runtime:
-      process: true
-      devices: "0,1"
-      max_batch_size: 1
-      requires_multimodal_data: true  # AR needs the original image
-    engine_args:
-      model_stage: AR
-      model_arch: HunyuanImage3ForCausalMM
-      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
-      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.95
-      enforce_eager: true
-      trust_remote_code: true
-      engine_output_type: latent  # AR outputs latent for DiT
-      enable_prefix_caching: false
-      max_num_batched_tokens: 32768
-      tensor_parallel_size: 2
-      pipeline_parallel_size: 1
-      hf_overrides:
-        rope_parameters:
-          mrope_section: [0, 32, 32]
-          rope_type: default
-      omni_kv_config:
-        need_send_cache: true
-    is_comprehension: false  # Generation task, not comprehension
-    final_output: true
-    final_output_type: text
-    default_sampling_params:
-      temperature: 0.0
-      top_p: 1
-      top_k: -1
-      max_tokens: 8192
-      stop_token_ids: [128025]  # <answer>
-      detokenize: true  # DiT bridge consumes ar_generated_text; let the AR engine produce it
-      skip_special_tokens: False
-    output_connectors:
-      to_stage_1: rdma_connector
-
-  # Stage 1: Diffusion (DiT + VAE)
-  # Receives latents from AR stage, performs denoising + VAE decode
-  - stage_id: 1
-    stage_type: diffusion
-    runtime:
-      process: true
-      devices: "2,3"
-      max_batch_size: 1
-      requires_multimodal_data: true  # May need condition images
-    engine_args:
-      model_stage: dit
-      model_arch: HunyuanImage3ForCausalMM
-      enforce_eager: true
-      trust_remote_code: true
-      distributed_executor_backend: "mp"
-      omni_kv_config:
-        need_recv_cache: true  # Receive AR KV cache from stage 0
-      parallel_config:
-        tensor_parallel_size: 2
-        enable_expert_parallel: true
-    engine_input_source: [0]  # Input from AR stage
-    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.hunyuan_image3.ar2diffusion
-    final_output: true
-    final_output_type: image
-    default_sampling_params:
-      num_inference_steps: 50
-      guidance_scale: 0
-    input_connectors:
-      from_stage_0: rdma_connector
-
-
-# Top-level runtime config
-runtime:
-  enabled: true
-  connectors:
-    rdma_connector:
-      name: MooncakeTransferEngineConnector
-      extra:
-        host: "auto"
-        zmq_port: 50051
-        protocol: "rdma"
-        device_name: ""
-        memory_pool_size: 4294967296
-        memory_pool_device: "cpu"
-  edges:
-    - from: 0  # AR → Diffusion
-      to: 1