From 550f93880c7c436efa089622ee4827ffe4ae9995 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Wed, 6 May 2026 15:42:44 +0800 Subject: [PATCH 01/40] [Config] Add HunyuanImage3 deploy configs Signed-off-by: KexiongYu --- .../hunyuan_image3/README.md | 241 ++++++++---------- .../hunyuan_image3/end2end.py | 49 +++- .../offline_inference/test_hunyuanimage3.py | 5 +- tests/entrypoints/test_utils.py | 39 +++ tests/test_config_factory.py | 28 ++ vllm_omni/config/pipeline_registry.py | 12 + vllm_omni/config/stage_config.py | 10 + vllm_omni/deploy/hunyuan_image3.yaml | 100 ++++++++ vllm_omni/deploy/hunyuan_image3_ar.yaml | 47 ++++ vllm_omni/deploy/hunyuan_image3_dit.yaml | 64 +++++ vllm_omni/entrypoints/cli/serve.py | 1 + vllm_omni/entrypoints/utils.py | 1 + .../models/hunyuan_image3/pipeline.py | 87 +++++++ .../stage_configs/hunyuan_image3_i2t.yaml | 41 --- .../stage_configs/hunyuan_image3_it2i.yaml | 72 ------ .../stage_configs/hunyuan_image3_moe.yaml | 96 ------- .../hunyuan_image3_moe_dit_2gpu_fp8.yaml | 32 --- .../stage_configs/hunyuan_image3_t2i.yaml | 31 --- .../hunyuan_image3_t2i_2gpu.yaml | 41 --- .../stage_configs/hunyuan_image3_t2t.yaml | 42 --- .../npu/stage_configs/hunyuan_image3_t2i.yaml | 35 --- .../xpu/stage_configs/hunyuan_image3_t2i.yaml | 80 ------ 22 files changed, 539 insertions(+), 615 deletions(-) create mode 100644 vllm_omni/deploy/hunyuan_image3.yaml create mode 100644 vllm_omni/deploy/hunyuan_image3_ar.yaml create mode 100644 vllm_omni/deploy/hunyuan_image3_dit.yaml create mode 100644 vllm_omni/model_executor/models/hunyuan_image3/pipeline.py delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i.yaml delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit_2gpu_fp8.yaml delete mode 100644 
vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i.yaml delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i_2gpu.yaml delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml delete mode 100644 vllm_omni/platforms/npu/stage_configs/hunyuan_image3_t2i.yaml delete mode 100644 vllm_omni/platforms/xpu/stage_configs/hunyuan_image3_t2i.yaml diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md index c1c97bfe1fa..82cca4db6db 100644 --- a/examples/offline_inference/hunyuan_image3/README.md +++ b/examples/offline_inference/hunyuan_image3/README.md @@ -1,172 +1,153 @@ # HunyuanImage-3.0-Instruct -## Set up - -Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup. - -## Run examples - -**Note**: These examples work with the default configuration on **8x NVIDIA L40S (48GB)**. For different GPU setups, modify the stage configuration to adjust device allocation and memory utilization. - -Get into the hunyuan_image3 folder: +This example runs HunyuanImage-3.0-Instruct offline with the unified deploy +YAMLs under `vllm_omni/deploy/`. + +## Deploy Configs + +| File | Topology | Default use | +| :--- | :--- | :--- | +| `vllm_omni/deploy/hunyuan_image3.yaml` | AR + DiT | Default for `text2img` and `img2img`. | +| `vllm_omni/deploy/hunyuan_image3_ar.yaml` | AR only | Default for `img2text` and `text2text`. | +| `vllm_omni/deploy/hunyuan_image3_dit.yaml` | DiT only | Standalone diffusion stage. Pass it explicitly with `--deploy-config`. 
| + +The example chooses a deploy config automatically when `--deploy-config` and +`--stage-configs-path` are both omitted: + +| `--modality` | `mode` passed to Omni | Default deploy | +| :--- | :--- | :--- | +| `text2img` | `text-to-image` | `hunyuan_image3.yaml` | +| `img2img` | `image-editing` | `hunyuan_image3.yaml` | +| `img2text` | `image-to-text` | `hunyuan_image3_ar.yaml` | +| `text2text` | `text-to-text` | `hunyuan_image3_ar.yaml` | + +`--modality` is an offline example convenience flag. It maps to the internal +`mode` argument passed to `Omni(...)` by this script. HunyuanImage3 uses +separate deploy YAMLs for AR + DiT, AR-only, and DiT-only topologies, so the +stage topology is selected by the deploy file rather than by YAML mode +overrides. + +Online serving does not expose a `--modality` flag or accept `mode` as an API +request field. Choose the deploy topology when starting the server with +`--deploy-config`, then use the OpenAI-compatible endpoint and request shape for +the scenario. The `modalities` request field is used by the chat completions +path; the image endpoints infer the image task from the endpoint and payload. + +| Online scenario | Server deploy | Request | +| :--- | :--- | :--- | +| Text to image | `--deploy-config vllm_omni/deploy/hunyuan_image3.yaml` | `POST /v1/images/generations`, or `POST /v1/chat/completions` with `"modalities": ["image"]`. | +| Image editing | `--deploy-config vllm_omni/deploy/hunyuan_image3.yaml` | `POST /v1/images/edits`. | +| Image/text to text | `--deploy-config vllm_omni/deploy/hunyuan_image3_ar.yaml` | `POST /v1/chat/completions` for text output, for example with `"modalities": ["text"]`. | +| DiT-only image generation | `--deploy-config vllm_omni/deploy/hunyuan_image3_dit.yaml` | `POST /v1/images/generations`. 
| + +## Run Examples + +Text to image, using the default AR + DiT deploy: ```bash -cd examples/offline_inference/hunyuan_image3 +python examples/offline_inference/hunyuan_image3/end2end.py \ + --model tencent/HunyuanImage-3.0-Instruct \ + --modality text2img \ + --prompts "A cute cat sitting on a windowsill watching the sunset" ``` -### Modality Control - -HunyuanImage-3.0-Instruct supports multiple modality modes. You can control the mode using the `--modality` argument: - -#### Text to Image (text2img) - -- **Pipeline**: Text → AR (CoT + latent tokens) → DiT (denoise) → VAE Decode → Image -- **Stages Used**: Stage 0 (AR) + Stage 1 (DiT) -- **KV Transfer**: AR sends KV cache to DiT for conditioned generation -- **Default Config**: `hunyuan_image3_t2i.yaml` +Image editing, using the default AR + DiT deploy: ```bash -python end2end.py --model tencent/HunyuanImage-3.0-Instruct \ - --modality text2img \ - --prompts "A cute cat sitting on a windowsill watching the sunset" +python examples/offline_inference/hunyuan_image3/end2end.py \ + --model tencent/HunyuanImage-3.0-Instruct \ + --modality img2img \ + --image-path /path/to/image.png \ + --prompts "Make the petals neon pink" ``` -**With VAE tiling (required on A100 GPUs):** -```bash -python end2end.py --model tencent/HunyuanImage-3.0-Instruct \ - --modality text2img \ - --prompts "A cute cat sitting on a windowsill watching the sunset" \ - --vae-use-tiling -``` - -#### Image to Image (img2img) - -- **Pipeline**: Image + Text → AR (CoT + recaption + latent) → DiT → Edited Image -- **Stages Used**: Stage 0 (AR) + Stage 1 (DiT) -- **KV Transfer**: AR sends KV cache to DiT -- **Default Config**: `hunyuan_image3_it2i.yaml` +Image to text, using the AR-only deploy: ```bash -python end2end.py --model tencent/HunyuanImage-3.0-Instruct \ - --modality img2img \ - --image-path /path/to/image.png \ - --prompts "Make the petals neon pink" +python examples/offline_inference/hunyuan_image3/end2end.py \ + --model 
tencent/HunyuanImage-3.0-Instruct \ + --modality img2text \ + --image-path /path/to/image.jpg \ + --prompts "Describe the content of the picture." ``` -#### Image to Text (img2text) - -- **Pipeline**: Image + Question → AR → Text description -- **Stages Used**: Stage 0 (AR) only -- **Default Config**: `hunyuan_image3_i2t.yaml` +Text to text, using the AR-only deploy: ```bash -python end2end.py --model tencent/HunyuanImage-3.0-Instruct \ - --modality img2text \ - --image-path /path/to/image.jpg \ - --prompts "Describe the content of the picture." +python examples/offline_inference/hunyuan_image3/end2end.py \ + --model tencent/HunyuanImage-3.0-Instruct \ + --modality text2text \ + --prompts "What is the capital of France?" ``` -#### Text to Text (text2text) - -- **Pipeline**: Text → AR → Text -- **Stages Used**: Stage 0 (AR) only -- **Default Config**: `hunyuan_image3_t2t.yaml` +Standalone DiT, using the DiT-only deploy explicitly: ```bash -python end2end.py --model tencent/HunyuanImage-3.0-Instruct \ - --modality text2text \ - --prompts "What is the capital of France?" 
+python examples/offline_inference/hunyuan_image3/end2end.py \ + --model tencent/HunyuanImage-3.0-Instruct \ + --modality text2img \ + --deploy-config vllm_omni/deploy/hunyuan_image3_dit.yaml \ + --prompts "A cinematic portrait of an astronaut in a greenhouse" ``` -### Inference Steps & Guidance - -Control generation quality for image modalities: +Override the default full AR + DiT deploy explicitly: ```bash -python end2end.py --modality text2img \ - --steps 50 \ - --guidance-scale 5.0 \ - --height 1024 --width 1024 \ - --prompts "A photo-realistic sunset over the ocean" +python examples/offline_inference/hunyuan_image3/end2end.py \ + --model tencent/HunyuanImage-3.0-Instruct \ + --modality text2img \ + --deploy-config vllm_omni/deploy/hunyuan_image3.yaml \ + --prompts "A cute cat" ``` -### Key Arguments - -#### 📌 Command Line Arguments (end2end.py) - -| Argument | Type | Default | Description | -| :--------------------- | :----- | :----------------------------------- | :----------------------------------------------------------- | -| `--model` | string | `tencent/HunyuanImage-3.0-Instruct` | Model path or name | -| `--modality` | choice | `text2img` | Modality: `text2img`, `img2img`, `img2text`, `text2text` | -| `--prompts` | list | `None` | Input text prompts | -| `--image-path` | string | `None` | Input image path (for `img2img`/`img2text`) | -| `--output` | string | `.` | Output directory for saved images | -| `--steps` | int | `50` | Number of inference steps | -| `--guidance-scale` | float | `5.0` | Classifier-free guidance scale | -| `--seed` | int | `42` | Random seed | -| `--height` | int | `1024` | Output image height | -| `--width` | int | `1024` | Output image width | -| `--bot-task` | string | auto | Override prompt task (e.g. `it2i_think`, `t2i_recaption`) | -| `--sys-type` | string | auto | Override system prompt type (e.g. 
`en_unified`, `en_vanilla`) | -| `--stage-configs-path` | string | auto | Custom stage config YAML path | -| `--enforce-eager` | flag | `False` | Disable torch.compile | -| `--init-timeout` | int | `300` | Initialization timeout (seconds) | -| `--vae-use-tiling` | flag | `False` | Enable VAE tiling for memory optimization (required to avoid OOM on A100) | - ------- - -#### ⚙️ Stage Configurations - -| Config YAML | Modality | Stages | GPUs | Description | -| :---------------------------------- | :-------- | :----- | :----- | :------------------------------------ | -| `hunyuan_image3_t2i.yaml` | text2img | 2 | 8 | T2I with AR→DiT, 4 GPU each | -| `hunyuan_image3_it2i.yaml` | img2img | 2 | 8 | IT2I with AR→DiT, 4 GPU each | -| `hunyuan_image3_i2t.yaml` | img2text | 1 | 4 | I2T (AR only) | -| `hunyuan_image3_t2t.yaml` | text2text | 1 | 4 | T2T (AR only) | -| `hunyuan_image3_t2i_2gpu.yaml` | text2img | 2 | 2 | T2I for 2-GPU setups | -| `hunyuan_image3_moe.yaml` | text2img | 2 | 8 | T2I with MoE AR→DiT KV reuse | -| `hunyuan_image3_moe_dit_2gpu_fp8.yaml` | text2img | 2 | 2 | T2I with FP8 quantization | - ------- - -## Using MoE Config - -The `hunyuan_image3_moe.yaml` config enables AR→DiT KV cache reuse with 8 GPUs (4 for AR + 4 for DiT). +## Key Arguments -```bash -python end2end.py --model tencent/HunyuanImage-3.0-Instruct \ - --modality text2img \ - --stage-configs-path hunyuan_image3_moe.yaml \ - --prompts "A cute cat" -``` +| Argument | Description | +| :--- | :--- | +| `--deploy-config` | Preferred config path for unified deploy YAMLs. | +| `--stage-configs-path` | Legacy stage config path, kept only for compatibility. Prefer `--deploy-config`. | +| `--modality` | Offline-only convenience flag. One of `text2img`, `img2img`, `img2text`, `text2text`. It selects prompt formatting, internal `mode`, and default deploy config for this script. Online serving uses `--deploy-config` plus the endpoint and, for chat completions, request `modalities` instead. 
| +| `--steps` | Number of diffusion inference steps for image generation. | +| `--guidance-scale` | Classifier-free guidance scale for image generation. | +| `--height`, `--width` | Output image size for `text2img`. | +| `--bot-task` | Override the prompt task, for example `t2i_think` or `t2i_recaption`. | +| `--sys-type` | Override the system prompt type, for example `en_unified` or `en_vanilla`. | +| `--vae-use-tiling` | Enable VAE tiling for memory reduction. | ------- +## Notes + +- `hunyuan_image3_ar.yaml` is a 4-card AR-only text/comprehension deploy. It sets `engine_output_type: text`, `final_output_type: text`, and text sampling defaults. +- `hunyuan_image3_dit.yaml` is a single-stage DiT deploy with `stage_id: 0`; it does not require stage 1 or a running AR stage. +- The old HunyuanImage3 YAMLs under `model_executor/stage_configs/` and `platforms/*/stage_configs/` have been folded into the deploy YAMLs. +- This PR does not keep the HunyuanImage3 AR-to-DiT KV reuse wiring. The deploy YAMLs describe the topology and platform settings only. ## Prompt Format HunyuanImage-3.0-Instruct uses an instruct chat template: -``` -<|startoftext|>{system_prompt}\n\nUser: {<img>?}{user_prompt}\n\nAssistant: {trigger_tag?} +```text +<|startoftext|>{system_prompt} + +User: {<img>?}{user_prompt} + +Assistant: {trigger_tag?} ``` -- `<img>`: Placeholder for each input image (single token; expanded by the multimodal pipeline) -- Trigger tags: `<think>` (CoT), `<recaption>` (recaptioning) — placed AFTER `Assistant: ` -- System prompt: Auto-selected based on task -- `t2i_vanilla` is the only task that uses the bare pretrain template (no chat structure) +- `<img>`: Placeholder for each input image (single token; expanded by the multimodal pipeline). +- Trigger tags: `<think>` for CoT and `<recaption>` for recaptioning, placed after `Assistant: `. +- System prompt: Auto-selected based on task. +- `t2i_vanilla` is the only task that uses the bare pretrain template without chat structure.
The shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils.build_prompt_tokens()` -helper handles segment-by-segment tokenization (matches HF `apply_chat_template` byte-for-byte). - ------- +helper handles segment-by-segment tokenization and matches HF `apply_chat_template`. ## FAQ -- **OOM errors**: Decrease `gpu_memory_utilization` in the YAML stage config, use a smaller `max_num_batched_tokens`, or enable VAE tiling with `--vae-use-tiling` (required on A100 GPUs). +- **OOM errors**: Decrease `gpu_memory_utilization` in the deploy YAML, use a smaller `max_num_batched_tokens`, or enable VAE tiling with `--vae-use-tiling`. - **Custom image sizes**: Use `--height` and `--width` flags (multiples of 16 recommended). -| Stage | VRAM (approx) | -| :---------------- | :------------------- | -| Stage 0 (AR) | ~15 GiB + KV Cache | -| Stage 1 (DiT) | ~30 GiB | -| Total (8-GPU) | ~45 GiB + KV Cache | +| Stage | VRAM (approx) | +| :--- | :--- | +| Stage 0 (AR) | ~15 GiB + KV Cache | +| Stage 1 (DiT) | ~30 GiB | +| Total (8-GPU) | ~45 GiB + KV Cache | diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index f8f92944f71..1eaa669c53a 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -15,6 +15,7 @@ import argparse import os +from pathlib import Path from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( build_prompt_tokens, @@ -42,12 +43,23 @@ } -# Modality → default stage config -_MODALITY_DEFAULT_CONFIG = { - "text2img": "hunyuan_image3_t2i.yaml", - "img2img": "hunyuan_image3_it2i.yaml", - "img2text": "hunyuan_image3_i2t.yaml", - "text2text": "hunyuan_image3_t2t.yaml", +# Default deploy configs are absolute so this example works from any cwd. 
+_REPO_ROOT = Path(__file__).resolve().parents[3] +_DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml") +_DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml") + +_MODALITY_DEFAULT_DEPLOY_CONFIG = { + "text2img": _DEFAULT_DEPLOY_CONFIG, + "img2img": _DEFAULT_DEPLOY_CONFIG, + "img2text": _DEFAULT_AR_DEPLOY_CONFIG, + "text2text": _DEFAULT_AR_DEPLOY_CONFIG, +} + +_MODALITY_MODE = { + "text2img": "text-to-image", + "img2img": "image-editing", + "img2text": "image-to-text", + "text2text": "text-to-text", } @@ -105,7 +117,8 @@ def parse_args(): ) # Omni init args - parser.add_argument("--stage-configs-path", type=str, default=None, help="Custom stage config YAML path.") + parser.add_argument("--deploy-config", type=str, default=None, help="Custom deploy YAML path.") + parser.add_argument("--stage-configs-path", type=str, default=None, help="Custom legacy stage config YAML path.") parser.add_argument("--log-stats", action="store_true", default=False) parser.add_argument("--init-timeout", type=int, default=300, help="Initialization timeout in seconds.") parser.add_argument("--enforce-eager", action="store_true", help="Disable torch.compile.") @@ -123,20 +136,27 @@ def main(): # Determine task for prompt formatting task = args.bot_task or _MODALITY_TASK_MAP[args.modality] - # Determine stage config - stage_configs_path = args.stage_configs_path or _MODALITY_DEFAULT_CONFIG[args.modality] + if args.deploy_config is not None and args.stage_configs_path is not None: + raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.") + + deploy_config = args.deploy_config + stage_configs_path = args.stage_configs_path + if deploy_config is None and stage_configs_path is None: + deploy_config = _MODALITY_DEFAULT_DEPLOY_CONFIG[args.modality] # Build Omni omni_kwargs = { "model": args.model, "vae_use_tiling": args.vae_use_tiling, - "stage_configs_path": stage_configs_path, "log_stats": 
args.log_stats, "init_timeout": args.init_timeout, "enforce_eager": args.enforce_eager, } - if args.modality in ("text2img", "img2img"): - omni_kwargs["mode"] = "text-to-image" + if deploy_config is not None: + omni_kwargs["deploy_config"] = deploy_config + else: + omni_kwargs["stage_configs_path"] = stage_configs_path + omni_kwargs["mode"] = _MODALITY_MODE[args.modality] omni = Omni(**omni_kwargs) @@ -215,7 +235,10 @@ def main(): print("HunyuanImage-3.0 Generation Configuration:") print(f" Model: {args.model}") print(f" Modality: {args.modality}") - print(f" Stage config: {stage_configs_path}") + if deploy_config is not None: + print(f" Deploy config: {deploy_config}") + else: + print(f" Stage config: {stage_configs_path}") print(f" Num stages: {omni.num_stages}") if args.modality in ("text2img", "img2img"): print(f" Inference steps: {args.steps}") diff --git a/tests/e2e/offline_inference/test_hunyuanimage3.py b/tests/e2e/offline_inference/test_hunyuanimage3.py index 5b34faa988e..2a385f6a4c0 100644 --- a/tests/e2e/offline_inference/test_hunyuanimage3.py +++ b/tests/e2e/offline_inference/test_hunyuanimage3.py @@ -17,7 +17,7 @@ MODEL_NAME = "tencent/HunyuanImage-3.0" LOCAL_CLIP_PATH = "openai/clip-vit-base-patch32" REPO_ROOT = Path(__file__).resolve().parents[3] -STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image3_t2i.yaml" +DEPLOY_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml" pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion] @@ -274,7 +274,8 @@ def clip_bundle() -> tuple[CLIPModel, CLIPProcessor]: def omni() -> Generator[Omni, None, None]: with OmniRunner( MODEL_NAME, - stage_configs_path=str(STAGE_CONFIG_PATH), + deploy_config=str(DEPLOY_CONFIG_PATH), + mode="text-to-image", ) as runner: yield runner.omni diff --git a/tests/entrypoints/test_utils.py b/tests/entrypoints/test_utils.py index 98ecc8ae586..b52b49d68f5 100644 --- a/tests/entrypoints/test_utils.py +++ 
b/tests/entrypoints/test_utils.py @@ -18,6 +18,7 @@ _filter_dict_like_object, coerce_param_message_types, filter_dataclass_kwargs, + filter_stages, load_and_resolve_stage_configs, load_stage_configs_from_yaml, resolve_model_config_path, @@ -401,6 +402,44 @@ def test_stage_configs_path_promotes_new_deploy_yaml_without_expanding_replicas( assert stage_configs[1].runtime.num_replicas == 3 assert stage_configs[1].runtime.devices == "1,2,3" + def test_filter_stages_selects_mode_stages_without_mutating_stage_config(self, tmp_path): + config_path = tmp_path / "deploy.yaml" + config_path.write_text( + """modes: + - mode: text-to-text + stages: [0] + - mode: text-to-image + stages: [0, 1] +""", + encoding="utf-8", + ) + stages = [ + create_config( + { + "stage_id": 0, + "runtime": {"requires_multimodal_data": True}, + "final_output": False, + "final_output_type": None, + } + ), + create_config( + { + "stage_id": 1, + "runtime": {"requires_multimodal_data": True}, + "final_output": True, + "final_output_type": "image", + } + ), + ] + + filtered = filter_stages(str(config_path), stages, {"mode": "text-to-text"}) + + assert len(filtered) == 1 + assert filtered[0].stage_id == 0 + assert filtered[0].runtime.requires_multimodal_data is True + assert filtered[0].final_output is False + assert filtered[0].final_output_type is None + class TestLoadStageConfigsFromYaml: """Regression tests for stage-config loading and merging.""" diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py index 7b620ea6e80..57313fe3be2 100644 --- a/tests/test_config_factory.py +++ b/tests/test_config_factory.py @@ -941,6 +941,34 @@ def test_merge_pipeline_deploy_preserves_num_replicas(self, tmp_path): assert stages[1].yaml_runtime["devices"] == "1,2" assert stages[1].yaml_runtime["num_replicas"] == 2 + def test_merge_pipeline_deploy_preserves_requires_multimodal_data(self): + from vllm_omni.config.stage_config import ( + DeployConfig, + PipelineConfig, + StageDeployConfig, + 
StageExecutionType, + StagePipelineConfig, + merge_pipeline_deploy, + ) + + pipeline = PipelineConfig( + model_type="test_mm", + model_arch="TestModel", + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="ar", + execution_type=StageExecutionType.LLM_AR, + requires_multimodal_data=True, + ), + ), + ) + deploy = DeployConfig(async_chunk=False, stages=[StageDeployConfig(stage_id=0)]) + + stages = merge_pipeline_deploy(pipeline, deploy) + + assert stages[0].yaml_runtime["requires_multimodal_data"] is True + class TestQwen3OmniPipeline: def test_registered(self): diff --git a/vllm_omni/config/pipeline_registry.py b/vllm_omni/config/pipeline_registry.py index 1a129cad8c0..3d44d1bff93 100644 --- a/vllm_omni/config/pipeline_registry.py +++ b/vllm_omni/config/pipeline_registry.py @@ -65,6 +65,18 @@ "vllm_omni.model_executor.models.glm_image.pipeline", "GLM_IMAGE_PIPELINE", ), + "hunyuan_image3": ( + "vllm_omni.model_executor.models.hunyuan_image3.pipeline", + "HUNYUAN_IMAGE3_PIPELINE", + ), + "hunyuan_image3_ar": ( + "vllm_omni.model_executor.models.hunyuan_image3.pipeline", + "HUNYUAN_IMAGE3_AR_PIPELINE", + ), + "hunyuan_image3_dit": ( + "vllm_omni.model_executor.models.hunyuan_image3.pipeline", + "HUNYUAN_IMAGE3_DIT_PIPELINE", + ), "voxcpm2": ( "vllm_omni.model_executor.models.voxcpm2.pipeline", "VOXCPM2_PIPELINE", diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index 17c70302312..0bd1f2b7f8f 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -863,6 +863,7 @@ def merge_pipeline_deploy( if ds.devices is not None: runtime["devices"] = ds.devices runtime["num_replicas"] = ds.num_replicas + runtime["requires_multimodal_data"] = ps.requires_multimodal_data result.append( StageConfig( @@ -1078,6 +1079,15 @@ def create_from_model( if model_type and model_type in _PIPELINE_REGISTRY: return cls._create_from_registry(model_type, cli_overrides, deploy_config_path) + if deploy_config_path is not None: + 
deploy_cfg = load_deploy_config(deploy_config_path) + if deploy_cfg.pipeline and deploy_cfg.pipeline in _PIPELINE_REGISTRY: + return cls._create_from_registry( + deploy_cfg.pipeline, + cli_overrides, + deploy_config_path, + ) + # --- HF architecture fallback: some models report a generic # model_type that collides with another model. Match by the # hf_architectures declared on each registered PipelineConfig. diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml new file mode 100644 index 00000000000..dd176fe3d51 --- /dev/null +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -0,0 +1,100 @@ +# HunyuanImage-3.0-Instruct deploy: AR (stage 0) + DiT (stage 1). +# The base CUDA layout follows the existing 8-GPU AR->DiT config +# (4 GPUs for AR, 4 GPUs for DiT). Platform overrides below fold in the +# verified NPU/XPU stage configs that previously lived under stage_configs/. +pipeline: hunyuan_image3 + +stages: + - stage_id: 0 + max_num_seqs: 1 + gpu_memory_utilization: 0.9 + enforce_eager: true + max_num_batched_tokens: 32768 + devices: "0,1,2,3" + tensor_parallel_size: 4 + hf_overrides: + rope_parameters: + mrope_section: [0, 32, 32] + rope_type: default + default_sampling_params: + temperature: 0.6 + top_p: 0.95 + top_k: 1024 + max_tokens: 4096 + stop_token_ids: [127957] + detokenize: false + + - stage_id: 1 + max_num_seqs: 1 + gpu_memory_utilization: 0.9 + enforce_eager: true + devices: "4,5,6,7" + vae_use_slicing: false + vae_use_tiling: false + cache_backend: + cache_config: + enable_cache_dit_summary: false + parallel_config: + pipeline_parallel_size: 1 + data_parallel_size: 1 + tensor_parallel_size: 4 + enable_expert_parallel: true + sequence_parallel_size: 1 + ulysses_degree: 1 + ring_degree: 1 + cfg_parallel_size: 1 + vae_patch_parallel_size: 1 + use_hsdp: false + hsdp_shard_size: -1 + hsdp_replicate_size: 1 + default_sampling_params: + seed: 42 + +edges: + - from: 0 + to: 1 + window_size: -1 + max_inflight: 1 + +platforms: + npu: 
+ stages: + - stage_id: 0 + gpu_memory_utilization: 0.65 + devices: "0,1,2,3" + tensor_parallel_size: 4 + - stage_id: 1 + gpu_memory_utilization: 0.65 + devices: "4,5,6,7" + max_num_batched_tokens: 32768 + parallel_config: + tensor_parallel_size: 4 + enable_expert_parallel: false + + xpu: + stages: + - stage_id: 0 + gpu_memory_utilization: 0.95 + devices: "0,1,2,3,4,5,6,7" + tensor_parallel_size: 8 + max_num_batched_tokens: 32784 + quantization: fp8 + enable_expert_parallel: true + worker_cls: vllm_omni.platforms.xpu.worker.xpu_ar_worker.XPUARWorker + - stage_id: 1 + gpu_memory_utilization: 0.9 + devices: "0,1,2,3,4,5,6,7" + quantization: fp8 + parallel_config: + pipeline_parallel_size: 1 + data_parallel_size: 1 + tensor_parallel_size: 8 + enable_expert_parallel: true + sequence_parallel_size: 1 + ulysses_degree: 1 + ring_degree: 1 + cfg_parallel_size: 1 + vae_patch_parallel_size: 1 + use_hsdp: false + hsdp_shard_size: -1 + hsdp_replicate_size: 1 diff --git a/vllm_omni/deploy/hunyuan_image3_ar.yaml b/vllm_omni/deploy/hunyuan_image3_ar.yaml new file mode 100644 index 00000000000..44cd96b72ce --- /dev/null +++ b/vllm_omni/deploy/hunyuan_image3_ar.yaml @@ -0,0 +1,47 @@ +# HunyuanImage-3.0-Instruct AR-only deploy. +# +# Use this when AR and DiT are deployed as independent services. This file +# resolves to stage 0 only, avoiding the default two-stage topology. 
+pipeline: hunyuan_image3_ar +async_chunk: false + +stages: + - stage_id: 0 + max_num_seqs: 1 + gpu_memory_utilization: 0.75 + trust_remote_code: true + enforce_eager: true + enable_prefix_caching: false + max_num_batched_tokens: 32768 + devices: "0,1,2,3" + tensor_parallel_size: 4 + hf_overrides: + rope_parameters: + mrope_section: [0, 32, 32] + rope_type: default + default_sampling_params: + temperature: 0.0 + top_p: 0.95 + top_k: 1024 + max_tokens: 1024 + stop_token_ids: [127957, 128026] + detokenize: true + +platforms: + npu: + stages: + - stage_id: 0 + gpu_memory_utilization: 0.75 + devices: "0,1,2,3,4,5,6,7" + tensor_parallel_size: 8 + + xpu: + stages: + - stage_id: 0 + gpu_memory_utilization: 0.95 + devices: "0,1,2,3,4,5,6,7" + tensor_parallel_size: 8 + max_num_batched_tokens: 32784 + quantization: fp8 + enable_expert_parallel: true + worker_cls: vllm_omni.platforms.xpu.worker.xpu_ar_worker.XPUARWorker diff --git a/vllm_omni/deploy/hunyuan_image3_dit.yaml b/vllm_omni/deploy/hunyuan_image3_dit.yaml new file mode 100644 index 00000000000..3c0ba190101 --- /dev/null +++ b/vllm_omni/deploy/hunyuan_image3_dit.yaml @@ -0,0 +1,64 @@ +# HunyuanImage-3.0-Instruct DiT-only deploy. +# +# Use this for standalone DiT/offline execution. This file resolves to one +# diffusion stage with stage_id 0 and does not depend on the AR stage. 
+pipeline: hunyuan_image3_dit +async_chunk: false + +stages: + - stage_id: 0 + max_num_seqs: 1 + gpu_memory_utilization: 0.9 + enforce_eager: true + devices: "0,1,2,3" + vae_use_slicing: false + vae_use_tiling: false + cache_backend: + cache_config: + enable_cache_dit_summary: false + parallel_config: + pipeline_parallel_size: 1 + data_parallel_size: 1 + tensor_parallel_size: 4 + enable_expert_parallel: true + sequence_parallel_size: 1 + ulysses_degree: 1 + ring_degree: 1 + cfg_parallel_size: 1 + vae_patch_parallel_size: 1 + use_hsdp: false + hsdp_shard_size: -1 + hsdp_replicate_size: 1 + default_sampling_params: + seed: 42 + +platforms: + npu: + stages: + - stage_id: 0 + gpu_memory_utilization: 0.65 + devices: "0,1,2,3" + max_num_batched_tokens: 32768 + parallel_config: + tensor_parallel_size: 4 + enable_expert_parallel: true + + xpu: + stages: + - stage_id: 0 + gpu_memory_utilization: 0.9 + devices: "0,1,2,3,4,5,6,7" + quantization: fp8 + parallel_config: + pipeline_parallel_size: 1 + data_parallel_size: 1 + tensor_parallel_size: 8 + enable_expert_parallel: true + sequence_parallel_size: 1 + ulysses_degree: 1 + ring_degree: 1 + cfg_parallel_size: 1 + vae_patch_parallel_size: 1 + use_hsdp: false + hsdp_shard_size: -1 + hsdp_replicate_size: 1 diff --git a/vllm_omni/entrypoints/cli/serve.py b/vllm_omni/entrypoints/cli/serve.py index 540d5c0cfdf..b4293d59fd7 100644 --- a/vllm_omni/entrypoints/cli/serve.py +++ b/vllm_omni/entrypoints/cli/serve.py @@ -593,6 +593,7 @@ def run_headless(args: argparse.Namespace) -> None: model, args_dict.get("stage_configs_path"), args_dict, + deploy_config_path=args_dict.get("deploy_config"), ) # Locate the stage config that matches stage_id. 
diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index d728e76417c..460c6985b0c 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -340,6 +340,7 @@ def resolve_model_config_path(model: str) -> str: normalized_model_type = _DIFFUSERS_CLASS_TO_CONFIG[model_type] else: normalized_model_type = model_type.replace("-", "_") + model_type_str = f"{normalized_model_type}.yaml" complete_config_path = PROJECT_ROOT / default_config_path / model_type_str if os.path.exists(complete_config_path): diff --git a/vllm_omni/model_executor/models/hunyuan_image3/pipeline.py b/vllm_omni/model_executor/models/hunyuan_image3/pipeline.py new file mode 100644 index 00000000000..3ff53af6292 --- /dev/null +++ b/vllm_omni/model_executor/models/hunyuan_image3/pipeline.py @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""HunyuanImage3 pipeline topology.""" + +from vllm_omni.config.stage_config import ( + PipelineConfig, + StageExecutionType, + StagePipelineConfig, +) + +_HUNYUAN_IMAGE3_HF_ARCHS = ( + "HunyuanImage3ForConditionalGeneration", + "HunyuanImage3ForCausalMM", +) +_HUNYUAN_IMAGE3_MODEL_ARCH = "HunyuanImage3ForCausalMM" +_HUNYUAN_IMAGE3_INPUT_PROCESSOR = "vllm_omni.model_executor.stage_input_processors.hunyuan_image3" + + +HUNYUAN_IMAGE3_PIPELINE = PipelineConfig( + model_type="hunyuan_image3", + model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH, + hf_architectures=_HUNYUAN_IMAGE3_HF_ARCHS, + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="AR", + execution_type=StageExecutionType.LLM_AR, + input_sources=(), + final_output=False, + owns_tokenizer=False, + requires_multimodal_data=True, + model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH, + engine_output_type="latent", + ), + StagePipelineConfig( + stage_id=1, + model_stage="dit", + execution_type=StageExecutionType.DIFFUSION, + input_sources=(0,), + final_output=True, + final_output_type="image", + 
requires_multimodal_data=True, + model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH, + custom_process_input_func=f"{_HUNYUAN_IMAGE3_INPUT_PROCESSOR}.ar2diffusion", + ), + ), +) + + +HUNYUAN_IMAGE3_AR_PIPELINE = PipelineConfig( + model_type="hunyuan_image3_ar", + model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH, + hf_architectures=(), + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="AR", + execution_type=StageExecutionType.LLM_AR, + input_sources=(), + final_output=True, + final_output_type="text", + owns_tokenizer=False, + requires_multimodal_data=True, + model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH, + engine_output_type="latent", + ), + ), +) + + +HUNYUAN_IMAGE3_DIT_PIPELINE = PipelineConfig( + model_type="hunyuan_image3_dit", + model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH, + hf_architectures=(), + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="dit", + execution_type=StageExecutionType.DIFFUSION, + input_sources=(), + final_output=True, + final_output_type="image", + requires_multimodal_data=True, + model_arch=_HUNYUAN_IMAGE3_MODEL_ARCH, + ), + ), +) diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml deleted file mode 100644 index 0614a9f1179..00000000000 --- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Stage config for HunyuanImage-3.0 Image-to-Text (I2T / image understanding). -# Single LLM stage: AR model reads image + text prompt, generates text output. 
- -stage_args: - - stage_id: 0 - stage_type: llm - runtime: - process: true - devices: "0,1,2,3" - max_batch_size: 1 - requires_multimodal_data: true - engine_args: - model_stage: AR - max_num_seqs: 1 - model_arch: HunyuanImage3ForCausalMM - worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.95 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 4 - pipeline_parallel_size: 1 - hf_overrides: - rope_parameters: - mrope_section: [0, 32, 32] - rope_type: default - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 0.95 - top_k: 1024 - max_tokens: 2048 - stop_token_ids: [127957, 128024, 128026] # <|endoftext|>, , - detokenize: True - -runtime: - enabled: true diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i.yaml deleted file mode 100644 index 31511697371..00000000000 --- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i.yaml +++ /dev/null @@ -1,72 +0,0 @@ -# Stage config for HunyuanImage-3.0 Image+Text-to-Image (image editing). 
-# Stage 0: AR (HunyuanImage3ForConditionalGeneration) — reads (image, text), emits latent tokens -# Stage 1: Diffusion (HunyuanImage3Pipeline / DiT + VAE) — denoise + decode latents → image - -stage_args: - # Stage 0: AR Model - - stage_id: 0 - stage_type: llm - runtime: - process: true - devices: "0,1,2,3" - max_batch_size: 1 - requires_multimodal_data: true # AR needs the original image - engine_args: - model_stage: AR - model_arch: HunyuanImage3ForCausalMM - worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.95 - enforce_eager: true - trust_remote_code: true - engine_output_type: latent # AR outputs latent for DiT - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 4 - pipeline_parallel_size: 1 - hf_overrides: - rope_parameters: - mrope_section: [0, 32, 32] - rope_type: default - is_comprehension: false # Generation task, not comprehension - final_output: false # AR is not the final output - default_sampling_params: - temperature: 0.6 - top_p: 0.95 - top_k: 1024 - max_tokens: 4096 - stop_token_ids: [127957] # <|endoftext|> - detokenize: true # DiT bridge consumes ar_generated_text; let the AR engine produce it - - # Stage 1: Diffusion (DiT + VAE) - # Receives latents from AR stage, performs denoising + VAE decode - - stage_id: 1 - stage_type: diffusion - runtime: - process: true - devices: "4,5,6,7" - max_batch_size: 1 - requires_multimodal_data: true # May need condition images - engine_args: - model_stage: dit - model_arch: HunyuanImage3ForCausalMM - enforce_eager: true - trust_remote_code: true - distributed_executor_backend: "mp" - parallel_config: - tensor_parallel_size: 4 - enable_expert_parallel: true - engine_input_source: [0] # Input from AR stage - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.hunyuan_image3.ar2diffusion - final_output: true - final_output_type: image - 
default_sampling_params: - num_inference_steps: 50 - guidance_scale: 2.5 - -# Top-level runtime config -runtime: - enabled: true - edges: - - from: 0 # AR → Diffusion - to: 1 diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml deleted file mode 100644 index f0797c63270..00000000000 --- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# Stage config for running Hunyuan-Image3.0 with AR→DiT KV reuse. -# Stage 0: AR Model (vLLM implementation) -# Stage 1: DiT Model (diffusion) -# -# text-to-image flow: AR (stage 0) → KV transfer → DiT (stage 1) -# image-to-text flow: AR (stage 0) only -# -# Compared to hunyuan_image3_t2i.yaml, this config: -# 1. Enables both stages [0, 1] for text-to-image (AR prefill + DiT denoising) -# 2. Adds omni_kv_config to send/receive KV cache between stages - -# The following config has been verified on 8x L40S-48G GPU (4 for AR + 4 for DiT). 
-stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true # Run this stage in a separate process - devices: "0,1,2,3" # AR stage uses GPU 0-3 - engine_args: - model_stage: AR - max_num_seqs: 1 - model_arch: HunyuanImage3ForCausalMM - worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 4 - pipeline_parallel_size: 1 - hf_overrides: - rope_parameters: - mrope_section: [0, 32, 32] - rope_type: default - omni_kv_config: - need_send_cache: true - kv_transfer_criteria: - type: prefill_finished # Send KV cache after AR prefill completes - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - stage_type: diffusion - runtime: - process: true - devices: "4,5,6,7" # DiT stage uses GPU 4-7 - max_batch_size: 1 - engine_args: - model_stage: diffusion - enforce_eager: true - distributed_executor_backend: "mp" - vae_use_slicing: false - vae_use_tiling: false - cache_backend: null - cache_config: null - enable_cache_dit_summary: false - omni_kv_config: - need_recv_cache: true # Receive AR KV cache from stage 0 - parallel_config: - pipeline_parallel_size: 1 - data_parallel_size: 1 - tensor_parallel_size: 4 - enable_expert_parallel: false - sequence_parallel_size: 1 - ulysses_degree: 1 - ring_degree: 1 - cfg_parallel_size: 1 - vae_patch_parallel_size: 1 - use_hsdp: false - hsdp_shard_size: -1 - hsdp_replicate_size: 1 - engine_input_source: [0] # Receive input (including KV) from stage 0 - final_output: true - final_output_type: image - -# Top-level runtime 
config: windows, edges, and connectors -runtime: - enabled: true - defaults: - window_size: -1 # Trigger downstream only after full upstream completion - max_inflight: 1 # Process serially within each stage - - edges: - - from: 0 - to: 1 - window_size: -1 diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit_2gpu_fp8.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit_2gpu_fp8.yaml deleted file mode 100644 index 586b601bc5a..00000000000 --- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit_2gpu_fp8.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Stage config for running Hunyuan-Image3.0 DiT with FP8 online quantization. -# The following config is for 2x H200 GPU. - -# Stage 0: Diffusion (DiT + VAE) -# This stage receives noise and timesteps and performs denoising + VAE decode -stage_args: - - stage_id: 0 - stage_type: diffusion - runtime: - devices: "0,1" - max_batch_size: 1 - engine_args: - model_stage: dit - enforce_eager: true - trust_remote_code: true - distributed_executor_backend: "mp" - quantization: "fp8" - parallel_config: - tensor_parallel_size: 2 - enable_expert_parallel: true - omni_kv_config: - need_recv_cache: true - - final_output: true - final_output_type: image - is_comprehension: false - default_sampling_params: - seed: 42 - -# Runtime edges -runtime: - enabled: true diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i.yaml deleted file mode 100644 index 1d8c7f4812d..00000000000 --- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# Stage config for running Hunyuan-Image3.0 DiT. -# The following config has been verified on 4x H20 GPU. 
- -# Stage 0: Diffusion (DiT + VAE) -# This stage receives noise and timesteps and performs denoising + VAE decode -stage_args: - - stage_id: 0 - stage_type: diffusion - runtime: - devices: "0,1,2,3" - engine_args: - max_num_seqs: 1 - model_stage: dit - enforce_eager: true - trust_remote_code: true - distributed_executor_backend: "mp" - parallel_config: - tensor_parallel_size: 4 - enable_expert_parallel: true - omni_kv_config: - need_recv_cache: true - - final_output: true - final_output_type: image - is_comprehension: false - default_sampling_params: - seed: 42 - -# Runtime edges -runtime: - enabled: true diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i_2gpu.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i_2gpu.yaml deleted file mode 100644 index 41ed74ba62a..00000000000 --- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i_2gpu.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Stage config for running Hunyuan-Image3.0 on 2 GPUs with FP8. -# Stage 0: AR Model (vLLM implementation) - -stage_args: - - stage_id: 0 - stage_type: llm - runtime: - process: true - devices: "0,1" - engine_args: - model_stage: AR - max_num_seqs: 1 - model_arch: HunyuanImage3ForCausalMM - worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: true - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - hf_overrides: - rope_parameters: - mrope_section: [0, 32, 32] - rope_type: default - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -runtime: - enabled: true diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml 
b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml deleted file mode 100644 index c9daa5e5f39..00000000000 --- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Stage config for HunyuanImage-3.0 Text-to-Text (T2T / pure text generation). -# Single LLM stage: AR model reads text prompt only, generates text output. -# Sampling params aligned with official generation_config.json. - -stage_args: - - stage_id: 0 - stage_type: llm - runtime: - process: true - devices: "0,1,2,3" - max_batch_size: 1 - requires_multimodal_data: false - engine_args: - model_stage: AR - max_num_seqs: 1 - model_arch: HunyuanImage3ForCausalMM - worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.95 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 4 - pipeline_parallel_size: 1 - hf_overrides: - rope_parameters: - mrope_section: [0, 32, 32] - rope_type: default - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 0.95 - top_k: 1024 - max_tokens: 2048 - stop_token_ids: [127957, 128024, 128026] # <|endoftext|>, , - detokenize: True - -runtime: - enabled: true diff --git a/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_t2i.yaml b/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_t2i.yaml deleted file mode 100644 index 0fd03949d11..00000000000 --- a/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_t2i.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# Stage config for running Hunyuan-Image3.0 DiT on NPU. -# The following config has been verified on 8x A3-64G NPUs. - -# Stage 0: Diffusion (DiT + VAE) -# This stage receives noise and timesteps and performs denoising + VAE decode. 
-stage_args: - - stage_id: 0 - stage_type: diffusion - runtime: - devices: "0,1,2,3,4,5,6,7" - engine_args: - max_num_seqs: 1 - model_stage: dit - gpu_memory_utilization: 0.65 - enforce_eager: true - trust_remote_code: true - engine_output_type: image - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - parallel_config: - tensor_parallel_size: 8 - enable_expert_parallel: true - omni_kv_config: - need_recv_cache: true - - final_output: true - final_output_type: image - is_comprehension: false - default_sampling_params: - seed: 42 - -# Runtime defaults -runtime: - enabled: true diff --git a/vllm_omni/platforms/xpu/stage_configs/hunyuan_image3_t2i.yaml b/vllm_omni/platforms/xpu/stage_configs/hunyuan_image3_t2i.yaml deleted file mode 100644 index 4e0005f82a1..00000000000 --- a/vllm_omni/platforms/xpu/stage_configs/hunyuan_image3_t2i.yaml +++ /dev/null @@ -1,80 +0,0 @@ -# Stage config for running Hunyuan-Image3.0 with architecture of OmniLLM. -# Stage 0: AR Model (vLLM implementation) - -# The following config has been verified on 8x Max 1550 GPU. 
-modes: - - mode: text-to-image - stages: [1] - - mode: image-to-text - stages: [0] -stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type to launch OmniLLM - runtime: - process: true # Run this stage in a separate process - devices: "0,1,2,3,4,5,6,7" # Visible devices for this stage - max_batch_size: 1 - engine_args: - model_stage: AR - model_arch: HunyuanImage3ForCausalMM - worker_cls: vllm_omni.platforms.xpu.worker.xpu_ar_worker.XPUARWorker - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.95 - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - max_num_batched_tokens: 32784 - tensor_parallel_size: 8 - pipeline_parallel_size: 1 - enable_expert_parallel: true - quantization: "fp8" - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - stage_type: diffusion - runtime: - process: true - devices: "0,1,2,3,4,5,6,7" - max_batch_size: 1 - engine_args: - model_stage: diffusion - gpu_memory_utilization: 0.9 - enforce_eager: true - engine_output_type: image - distributed_executor_backend: "mp" - enable_prefix_caching: false - vae_use_slicing: false - vae_use_tiling: false - cache_backend: null - cache_config: null - enable_cache_dit_summary: false - quantization: "fp8" - parallel_config: - pipeline_parallel_size: 1 - data_parallel_size: 1 - tensor_parallel_size: 8 - enable_expert_parallel: true - sequence_parallel_size: 1 - ulysses_degree: 1 - ring_degree: 1 - cfg_parallel_size: 1 - vae_patch_parallel_size: 1 - use_hsdp: false - hsdp_shard_size: -1 - hsdp_replicate_size: 1 - final_output: true - final_output_type: image - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true From 
6fddd0e7925b28eb6593ee43c6fd49abcb0ffbc8 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Thu, 7 May 2026 10:52:03 +0800 Subject: [PATCH 02/40] Add request-level HunyuanImage3 bot task controls Signed-off-by: KexiongYu --- .../hunyuan_image3/end2end.py | 9 +- .../hunyuan_image3/test_prompt_utils.py | 75 ++++++++- vllm_omni/deploy/hunyuan_image3.yaml | 1 - vllm_omni/deploy/hunyuan_image3_ar.yaml | 1 - .../models/hunyuan_image3/prompt_utils.py | 117 ++++++++++++- vllm_omni/entrypoints/openai/api_server.py | 5 + .../entrypoints/openai/protocol/images.py | 4 + vllm_omni/entrypoints/openai/serving_chat.py | 155 ++++++++++++++++-- .../models/hunyuan_image3/hunyuan_image3.py | 11 +- 9 files changed, 353 insertions(+), 25 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 1eaa669c53a..b6ffa535463 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -18,7 +18,9 @@ from pathlib import Path from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + bot_task_for_task, build_prompt_tokens, + stop_token_ids_for_bot_task, ) from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniPromptType @@ -135,6 +137,7 @@ def main(): # Determine task for prompt formatting task = args.bot_task or _MODALITY_TASK_MAP[args.modality] + bot_task = bot_task_for_task(task) if args.deploy_config is not None and args.stage_configs_path is not None: raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.") @@ -219,7 +222,8 @@ def main(): # Override diffusion params if applicable from vllm_omni.inputs.data import OmniDiffusionSamplingParams - for i, sp in enumerate(params_list): + ar_stop_token_ids = stop_token_ids_for_bot_task(tokenizer, bot_task) + for sp in params_list: if isinstance(sp, OmniDiffusionSamplingParams): sp.num_inference_steps = args.steps sp.guidance_scale = 
args.guidance_scale @@ -229,12 +233,15 @@ def main(): if args.modality in ("text2img",): sp.height = args.height sp.width = args.width + elif hasattr(sp, "stop_token_ids"): + sp.stop_token_ids = ar_stop_token_ids # Print configuration print(f"\n{'=' * 60}") print("HunyuanImage-3.0 Generation Configuration:") print(f" Model: {args.model}") print(f" Modality: {args.modality}") + print(f" Bot task: {bot_task}") if deploy_config is not None: print(f" Deploy config: {deploy_config}") else: diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 501664fe688..62beb45a1f6 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -24,9 +24,12 @@ import pytest from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + apply_bot_task_to_sampling_params, available_tasks, + bot_task_for_task, build_prompt, build_prompt_tokens, + stop_token_ids_for_bot_task, ) pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -49,10 +52,16 @@ class FakeTokenizer: "": 2, "": 3, "": 4, + "<|endoftext|>": 5, + "": 6, + "": 7, + "": 8, + **{f"": 1000 + i for i in range(33)}, } def __init__(self) -> None: self.encode_calls: list[str] = [] + self.eos_token_id = self.SPECIAL["<|endoftext|>"] def convert_tokens_to_ids(self, tok: str) -> int: return self.SPECIAL.get(tok, 0) @@ -75,6 +84,60 @@ def test_available_tasks_covers_all_modalities(): } +@pytest.mark.parametrize( + "task,expected_bot_task", + [ + ("t2t", "auto"), + ("i2t", "auto"), + ("it2i_think", "think_recaption"), + ("it2i_recaption", "recaption"), + ("t2i_think", "think_recaption"), + ("t2i_recaption", "recaption"), + ("t2i_vanilla", "image"), + ], +) +def test_bot_task_for_task_matches_prompt_presets(task: str, expected_bot_task: str): + assert bot_task_for_task(task) == expected_bot_task + + +def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer(): 
+ tok = FakeTokenizer() + + assert stop_token_ids_for_bot_task(tok, "auto") == [5, 8] + assert stop_token_ids_for_bot_task(tok, "image") == [5] + assert stop_token_ids_for_bot_task(tok, "think_recaption") == [6, 7, 5] + assert stop_token_ids_for_bot_task(tok, "recaption") == [6, 7, 5] + assert stop_token_ids_for_bot_task(tok, "auto", image_size="auto") == [ + 5, + *range(1000, 1033), + ] + + +class FakeSamplingParams: + + def __init__(self, stop_token_ids: list[int] | None = None, max_tokens: int = 16) -> None: + self.stop_token_ids = stop_token_ids + self.max_tokens = max_tokens + + +def test_apply_bot_task_to_sampling_params_updates_only_target_stage(): + tok = FakeTokenizer() + stage0 = FakeSamplingParams(stop_token_ids=[999]) + stage1 = FakeSamplingParams(stop_token_ids=[888]) + + updated = apply_bot_task_to_sampling_params( + [stage0, stage1], + tok, + "think_recaption", + stage_index=0, + ) + + assert updated[0] is stage0 + assert updated[0].stop_token_ids == [6, 7, 5] + assert updated[1] is stage1 + assert stage0.stop_token_ids == [6, 7, 5] + + @pytest.mark.parametrize( "task", [ @@ -234,10 +297,16 @@ def test_end2end_routes_through_shared_prompt_utils(): for node in ast.walk(tree): if isinstance(node, ast.ImportFrom) and node.module and node.module.endswith("hunyuan_image3.prompt_utils"): imported_from_prompt_utils.update(alias.name for alias in node.names) - assert "build_prompt_tokens" in imported_from_prompt_utils, ( - "end2end.py must import build_prompt_tokens from " + expected_imports = { + "bot_task_for_task", + "build_prompt_tokens", + "stop_token_ids_for_bot_task", + } + assert expected_imports <= imported_from_prompt_utils, ( + "end2end.py must import the HunyuanImage3 prompt and stop-token helpers from " "vllm_omni.diffusion.models.hunyuan_image3.prompt_utils -- the shared " - "helper is the single source of truth for the AR-prefill template." 
+ "module is the single source of truth for the AR-prefill template and " + "bot_task-derived AR stop token ids." ) diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index dd176fe3d51..b5238169786 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -21,7 +21,6 @@ stages: top_p: 0.95 top_k: 1024 max_tokens: 4096 - stop_token_ids: [127957] detokenize: false - stage_id: 1 diff --git a/vllm_omni/deploy/hunyuan_image3_ar.yaml b/vllm_omni/deploy/hunyuan_image3_ar.yaml index 44cd96b72ce..27cbf0f9a60 100644 --- a/vllm_omni/deploy/hunyuan_image3_ar.yaml +++ b/vllm_omni/deploy/hunyuan_image3_ar.yaml @@ -24,7 +24,6 @@ stages: top_p: 0.95 top_k: 1024 max_tokens: 1024 - stop_token_ids: [127957, 128026] detokenize: true platforms: diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 6e8efac3133..a92b4a0848c 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -17,12 +17,24 @@ from __future__ import annotations +from typing import Any + from .system_prompt import get_system_prompt +BOT_TASKS = ("auto", "image", "recaption", "think_recaption") +_BOT_TASK_TO_TOKENIZER_TASK = { + "auto": "auto", + "image": "image", + "recaption": "recaption", + "think_recaption": "think", +} + # task -> (sys_type, bot_task, trigger_tag) _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = { "t2t": ("en_unified", None, None), + "t2t_think": ("en_unified", "think", ""), "i2t": ("en_unified", None, None), + "i2t_think": ("en_unified", "think", ""), "it2i_think": ("en_unified", "think", ""), "it2i_recaption": ("en_unified", "recaption", ""), "t2i_think": ("en_unified", "think", ""), @@ -36,6 +48,100 @@ def available_tasks() -> list[str]: return sorted(_TASK_PRESETS) +def bot_task_for_task(task: str) -> str: + """Return the HunyuanImage3 bot_task 
associated with a prompt task.""" + if task not in _TASK_PRESETS: + raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") + + _, preset_bot_task, _ = _TASK_PRESETS[task] + if preset_bot_task == "think": + return "think_recaption" + return preset_bot_task or "auto" + + +def tokenizer_bot_task_for_bot_task(bot_task: str) -> str: + """Map the public HunyuanImage3 bot_task to tokenizer-internal task.""" + if bot_task not in _BOT_TASK_TO_TOKENIZER_TASK: + raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}") + return _BOT_TASK_TO_TOKENIZER_TASK[bot_task] + + +def _token_id(tokenizer, token: str) -> int: + token_id = tokenizer.convert_tokens_to_ids(token) + if token_id is None: + raise ValueError(f"Tokenizer does not know special token {token!r}") + return int(token_id) + + +def _eos_token_id(tokenizer) -> int: + token_id = getattr(tokenizer, "eos_token_id", None) + if token_id is not None: + return int(token_id) + return _token_id(tokenizer, "<|endoftext|>") + + +def stop_token_ids_for_bot_task( + tokenizer, + bot_task: str, + image_size: int | str | None = None, +) -> list[int]: + """Return AR stop token ids for a HunyuanImage3 bot_task. + + Mirrors the official HunyuanImage-3.0 generation logic: `auto` + additionally stops on image-start markers, text/image tasks stop on + their structural end tokens, and all ids are resolved from the + tokenizer instead of being hard-coded in deploy YAML. 
+ """ + eos_id = _eos_token_id(tokenizer) + + if image_size == "auto": + extra_auto_stops = [ + _token_id(tokenizer, f"") for i in range(33) + ] + else: + extra_auto_stops = [_token_id(tokenizer, "")] + + tokenizer_bot_task = tokenizer_bot_task_for_bot_task(bot_task) + stop_token_id = { + "auto": [eos_id] + extra_auto_stops, + "image": [eos_id], + "recaption": [ + _token_id(tokenizer, ""), + _token_id(tokenizer, ""), + eos_id, + ], + "think": [ + _token_id(tokenizer, ""), + _token_id(tokenizer, ""), + eos_id, + ], + } + return stop_token_id[tokenizer_bot_task] + + +def apply_bot_task_to_sampling_params( + sampling_params_list: list[Any], + tokenizer: Any, + bot_task: str, + *, + stage_index: int = 0, + image_size: int | str | None = None, +) -> list[Any]: + """Apply a per-request HunyuanImage3 bot_task to one AR stage.""" + if stage_index < 0 or stage_index >= len(sampling_params_list): + raise IndexError( + f"stage_index {stage_index} is out of range for " + f"{len(sampling_params_list)} sampling params" + ) + + updated_params_list = list(sampling_params_list) + params = updated_params_list[stage_index] + params.stop_token_ids = stop_token_ids_for_bot_task(tokenizer, bot_task, image_size=image_size) + + updated_params_list[stage_index] = params + return updated_params_list + + def build_prompt( user_prompt: str, task: str = "it2i_think", @@ -149,4 +255,13 @@ def build_prompt_tokens( return ids -__all__ = ["build_prompt", "build_prompt_tokens", "available_tasks"] +__all__ = [ + "available_tasks", + "apply_bot_task_to_sampling_params", + "bot_task_for_task", + "BOT_TASKS", + "build_prompt", + "build_prompt_tokens", + "stop_token_ids_for_bot_task", + "tokenizer_bot_task_for_bot_task", +] diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 06fb0a7f4cb..9b3aec58f21 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1527,6 +1527,8 @@ async def 
generate_images(request: ImageGenerationRequest, raw_request: Request) extra_body["guidance_scale"] = request.guidance_scale if request.true_cfg_scale is not None: extra_body["true_cfg_scale"] = request.true_cfg_scale + if request.bot_task is not None: + extra_body["bot_task"] = request.bot_task if request.generator_device is not None: extra_body["generator_device"] = request.generator_device if request.lora is not None: @@ -1693,6 +1695,7 @@ async def edit_images( guidance_scale: float | None = Form(None), strength: float | None = Form(None), true_cfg_scale: float | None = Form(None), + bot_task: str | None = Form(None), seed: int | None = Form(None), generator_device: str | None = Form(None), # vllm-omni extension for per-request LoRA. @@ -1896,6 +1899,8 @@ async def edit_images( extra_body["strength"] = strength if true_cfg_scale is not None: extra_body["true_cfg_scale"] = true_cfg_scale + if bot_task is not None: + extra_body["bot_task"] = bot_task if layers is not None: extra_body["layers"] = layers if resolution is not None: diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index 0fb22a548cf..c78a95de058 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -117,6 +117,10 @@ def validate_use_system_prompt(cls, v): le=20.0, description="True CFG scale (model-specific parameter, may be ignored if not supported)", ) + bot_task: str | None = Field( + default=None, + description="HunyuanImage3 AR bot_task for this request: auto, image, recaption, or think_recaption.", + ) seed: int | None = Field(default=None, description="Random seed for reproducibility") generator_device: str | None = Field( default=None, diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 7558e85aaac..b2375fd38b4 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ 
b/vllm_omni/entrypoints/openai/serving_chat.py @@ -197,10 +197,25 @@ async def create_chat_completion( if tokenizer is None: tokenizer = await self.engine_client.get_tokenizer() + extra_body = self._get_extra_body_from_request(request) + bot_task = ( + extra_body.get("bot_task") + if self._get_hunyuan_image3_ar_stage_index(list(getattr(self.engine_client, "stage_configs", []) or [])) + is not None + else None + ) + request_chat_template_kwargs = request.chat_template_kwargs or {} + if bot_task is not None: + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task + + tokenizer_bot_task = tokenizer_bot_task_for_bot_task(bot_task) + request_chat_template_kwargs = dict(request_chat_template_kwargs) + request_chat_template_kwargs["bot_task"] = tokenizer_bot_task + reasoning_parser: ReasoningParser | None = None if self.reasoning_parser_cls: chat_template_kwargs = self._prepare_extra_chat_template_kwargs( - request.chat_template_kwargs, + request_chat_template_kwargs, self.default_chat_template_kwargs, ) reasoning_parser = self.reasoning_parser_cls( @@ -248,13 +263,13 @@ async def create_chat_completion( if not self.use_harmony: error_check_ret = self._validate_chat_template( request_chat_template=request.chat_template, - chat_template_kwargs=request.chat_template_kwargs, + chat_template_kwargs=request_chat_template_kwargs, trust_request_chat_template=self.trust_request_chat_template, ) if error_check_ret is not None: return error_check_ret - chat_template_kwargs = request.chat_template_kwargs or {} + chat_template_kwargs = dict(request_chat_template_kwargs) chat_template_kwargs.update(reasoning_effort=request.reasoning_effort) # Merge chat_template_kwargs with defaults @@ -321,9 +336,7 @@ async def create_chat_completion( # `extra_body` is flattented and merged into the payload's root. # These extra fields are accessible via `model_extra` property (from Pydantic base class). 
# When sending raw request with curl, no flattening happens. Directly read the `extra_body` dict. - extra_body = getattr(request, "extra_body", None) - if not extra_body: - extra_body = request.model_extra or {} + extra_body = self._get_extra_body_from_request(request) height, width = self._resolve_height_width_from_extra_body(extra_body) @@ -367,6 +380,9 @@ async def create_chat_completion( mm_processor_kwargs["target_h"] = height if width is not None: mm_processor_kwargs["target_w"] = width + if bot_task is not None: + mm_processor_kwargs["bot_task"] = tokenizer_bot_task + tprompt["bot_task"] = bot_task tprompt["mm_processor_kwargs"] = mm_processor_kwargs if engine_prompt_image is not None: tprompt["multi_modal_data"] = engine_prompt_image @@ -404,6 +420,12 @@ async def create_chat_completion( # to delta to ensure emitted outputs are correctly drained. Otherwise # convert cumulative to Final Only to ensure the output is correct. sampling_params_list = coerce_param_message_types(sampling_params_list, request.stream) + sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params( + engine=self.engine_client, + sampling_params_list=sampling_params_list, + bot_task=bot_task, + tokenizer=tokenizer, + ) # Apply user-specified overrides to diffusion stage(s) for image generation for idx, sp in enumerate(sampling_params_list): @@ -685,6 +707,89 @@ def _to_sampling_params_list(self, sampling_params_list: list[dict]) -> list[Sam raise ValueError(f"Invalid sampling params: {sampling_params}") return final_sampling_params_list + @staticmethod + def _get_extra_body_from_request(request: Any) -> dict[str, Any]: + body: dict[str, Any] = {} + model_extra = getattr(request, "model_extra", None) + if isinstance(model_extra, dict): + body.update(model_extra) + extra_body = getattr(request, "extra_body", None) + if isinstance(extra_body, dict): + body.update(extra_body) + return body + + @staticmethod + def _stage_config_get(stage_config: Any, key: str) -> Any: + 
if isinstance(stage_config, dict): + return stage_config.get(key) + if hasattr(stage_config, "get"): + try: + return stage_config.get(key) + except Exception: + pass + return getattr(stage_config, key, None) + + @classmethod + def _is_hunyuan_image3_stage(cls, stage_config: Any) -> bool: + model_arch = cls._stage_config_get(stage_config, "model_arch") + if model_arch == "HunyuanImage3ForCausalMM": + return True + + engine_args = cls._stage_config_get(stage_config, "engine_args") + if isinstance(engine_args, dict): + return engine_args.get("model_arch") == "HunyuanImage3ForCausalMM" + if engine_args is not None and hasattr(engine_args, "get"): + try: + return engine_args.get("model_arch") == "HunyuanImage3ForCausalMM" + except Exception: + pass + return getattr(engine_args, "model_arch", None) == "HunyuanImage3ForCausalMM" + + @classmethod + def _get_hunyuan_image3_ar_stage_index(cls, stage_configs: list[Any]) -> int | None: + for idx, stage_config in enumerate(stage_configs): + if cls._is_hunyuan_image3_stage(stage_config) and get_stage_type(stage_config) != "diffusion": + return idx + return None + + async def _apply_hunyuan_image3_bot_task_sampling_params( + self, + *, + engine: Any, + sampling_params_list: list[Any], + bot_task: Any, + tokenizer: Any | None = None, + ) -> list[Any]: + if bot_task is None: + return sampling_params_list + + stage_configs = list(getattr(engine, "stage_configs", []) or []) + stage_index = self._get_hunyuan_image3_ar_stage_index(stage_configs) + if stage_index is None: + return sampling_params_list + + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + BOT_TASKS, + apply_bot_task_to_sampling_params, + tokenizer_bot_task_for_bot_task, + ) + + if bot_task not in BOT_TASKS: + raise ValueError(f"Unknown HunyuanImage3 bot_task {bot_task!r}. 
Choose from: {list(BOT_TASKS)}") + tokenizer_bot_task_for_bot_task(bot_task) + + if tokenizer is None and hasattr(engine, "get_tokenizer"): + tokenizer = await engine.get_tokenizer() + if tokenizer is None: + raise ValueError("Cannot resolve tokenizer to apply HunyuanImage3 bot_task stop tokens.") + + return apply_bot_task_to_sampling_params( + sampling_params_list, + tokenizer, + bot_task, + stage_index=stage_index, + ) + def _get_comprehension_stage_index(self) -> int: for idx, stage in enumerate(self.engine_client.stage_configs): if stage.is_comprehension: @@ -2149,7 +2254,11 @@ def _build_multistage_generation_inputs( lora_body = extra_body.get("lora") layers = extra_body.get("layers") resolution = extra_body.get("resolution") - bot_task = extra_body.get("bot_task") + bot_task = ( + extra_body.get("bot_task") + if self._get_hunyuan_image3_ar_stage_index(list(stage_configs)) is not None + else None + ) engine_prompt_data: dict[str, Any] | None = None modalities = ["image"] @@ -2188,6 +2297,11 @@ def _build_multistage_generation_inputs( mm_processor_kwargs["target_h"] = height if width is not None: mm_processor_kwargs["target_w"] = width + if bot_task is not None: + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task + + mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_bot_task(bot_task) + engine_prompt["bot_task"] = bot_task if mm_processor_kwargs: engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs if engine_prompt_data is not None: @@ -2284,6 +2398,7 @@ async def generate_diffusion_images( negative_prompt = extra_body.get("negative_prompt") num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1) lora_body = extra_body.get("lora") + bot_task = extra_body.get("bot_task") pil_images: list[Image.Image] = [] for img_b64 in reference_images: @@ -2367,6 +2482,12 @@ async def generate_diffusion_images( engine_prompt = gen_prompt sampling_params_list = [gen_params] + sampling_params_list = await 
self._apply_hunyuan_image3_bot_task_sampling_params( + engine=diffusion_engine, + sampling_params_list=sampling_params_list, + bot_task=bot_task, + ) + result = None async for output in diffusion_engine.generate( prompt=engine_prompt, @@ -2435,9 +2556,7 @@ async def _create_diffusion_chat_completion( # `extra_body` is flattented and merged into the payload's root. # These extra fields are accessible via `model_extra` property (from Pydantic base class). # When sending raw request with curl, no flattening happens. Directly read the `extra_body` dict. - extra_body = getattr(request, "extra_body", None) - if not extra_body: - extra_body = request.model_extra or {} + extra_body = self._get_extra_body_from_request(request) # Parse size if provided (supports "1024x1024" format) height, width = self._resolve_height_width_from_extra_body(extra_body) @@ -2456,6 +2575,7 @@ async def _create_diffusion_chat_completion( seed = getattr(request, "seed", None) negative_prompt = extra_body.get("negative_prompt") num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1) + bot_task = extra_body.get("bot_task") # Text-to-video parameters (ref: text_to_video.py) num_frames = extra_body.get("num_frames") @@ -2569,6 +2689,15 @@ async def _create_diffusion_chat_completion( # Generate image or audio (e.g. 
AudioX) via AsyncOmni diffusion_engine = cast(AsyncOmni, self._diffusion_engine) stage_configs = list(getattr(diffusion_engine, "stage_configs", []) or []) + if self._get_hunyuan_image3_ar_stage_index(stage_configs) is None: + bot_task = None + elif bot_task is not None: + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task + + gen_prompt["bot_task"] = bot_task + gen_prompt["mm_processor_kwargs"] = { + "bot_task": tokenizer_bot_task_for_bot_task(bot_task), + } sampling_params_list = build_stage_sampling_params_list( stage_configs, get_default_sampling_params_list(diffusion_engine), @@ -2579,6 +2708,12 @@ async def _create_diffusion_chat_completion( if not sampling_params_list: sampling_params_list = [gen_params] + sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params( + engine=diffusion_engine, + sampling_params_list=sampling_params_list, + bot_task=bot_task, + ) + result = None async for output in diffusion_engine.generate( prompt=gen_prompt, diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index 88494eda456..1e057a71efa 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -1517,14 +1517,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # For comprehension mode, block image generation tokens but allow # text structure tokens (, , etc.) so the model can - # follow its natural generation pattern. The yaml stop_token_ids - # for i2t/t2t now includes (128024) so the AR-only output - # terminates after the analysis section, matching HF's - # `bot_task="think"` behavior. 
Without that stop, the model - # continues into a recaption section even in comprehension mode - # (the stage-transition processor only fires in generation mode, - # but the instruct-tuned model writes recaption on its own from - # internal habit). + # follow its natural generation pattern. Runtime sampling params + # decide stop tokens from the active bot_task, matching the official + # HunyuanImage3 generation path without hard-coded YAML token ids. self._blocked_token_ids: set[int] = set() if self._is_comprehension: self._blocked_token_ids.update( From f032d5f12a7e7de012651d7d72a26ae950a04c6e Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Thu, 7 May 2026 15:53:37 +0800 Subject: [PATCH 03/40] Apply ruff format for HunyuanImage3 files Signed-off-by: KexiongYu --- .../diffusion/models/hunyuan_image3/test_prompt_utils.py | 1 - .../diffusion/models/hunyuan_image3/prompt_utils.py | 9 ++------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 62beb45a1f6..50bbf9b704c 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -114,7 +114,6 @@ def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer(): class FakeSamplingParams: - def __init__(self, stop_token_ids: list[int] | None = None, max_tokens: int = 16) -> None: self.stop_token_ids = stop_token_ids self.max_tokens = max_tokens diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index a92b4a0848c..079f14b9fda 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -95,9 +95,7 @@ def stop_token_ids_for_bot_task( eos_id = _eos_token_id(tokenizer) if image_size == "auto": - extra_auto_stops = [ - _token_id(tokenizer, f"") for i in 
range(33) - ] + extra_auto_stops = [_token_id(tokenizer, f"") for i in range(33)] else: extra_auto_stops = [_token_id(tokenizer, "")] @@ -129,10 +127,7 @@ def apply_bot_task_to_sampling_params( ) -> list[Any]: """Apply a per-request HunyuanImage3 bot_task to one AR stage.""" if stage_index < 0 or stage_index >= len(sampling_params_list): - raise IndexError( - f"stage_index {stage_index} is out of range for " - f"{len(sampling_params_list)} sampling params" - ) + raise IndexError(f"stage_index {stage_index} is out of range for {len(sampling_params_list)} sampling params") updated_params_list = list(sampling_params_list) params = updated_params_list[stage_index] From 851baf60694bb133966e860cd81d12509d275620 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Thu, 7 May 2026 16:22:00 +0800 Subject: [PATCH 04/40] Refine HunyuanImage3 prompt task composition Signed-off-by: KexiongYu --- .../hunyuan_image3/README.md | 5 +- .../hunyuan_image3/end2end.py | 44 ++---- .../hunyuan_image3/test_prompt_utils.py | 70 ++++++++- .../models/hunyuan_image3/prompt_utils.py | 144 ++++++++++++++++-- 4 files changed, 218 insertions(+), 45 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md index 82cca4db6db..98908ace0d7 100644 --- a/examples/offline_inference/hunyuan_image3/README.md +++ b/examples/offline_inference/hunyuan_image3/README.md @@ -110,7 +110,7 @@ python examples/offline_inference/hunyuan_image3/end2end.py \ | `--steps` | Number of diffusion inference steps for image generation. | | `--guidance-scale` | Classifier-free guidance scale for image generation. | | `--height`, `--width` | Output image size for `text2img`. | -| `--bot-task` | Override the prompt task, for example `t2i_think` or `t2i_recaption`. | +| `--bot-task` | Prompt behavior. `auto` selects the default from `--modality`; `think` adds ``; `recaption` adds ``; `vanilla` uses the text-to-image pretrain template. 
| | `--sys-type` | Override the system prompt type, for example `en_unified` or `en_vanilla`. | | `--vae-use-tiling` | Enable VAE tiling for memory reduction. | @@ -137,6 +137,9 @@ Assistant: {trigger_tag?} - Trigger tags: `` for CoT and `` for recaptioning, placed after `Assistant: `. - System prompt: Auto-selected based on task. - `t2i_vanilla` is the only task that uses the bare pretrain template without chat structure. +- The example composes the internal prompt task from `--modality` and `--bot-task` + before calling `prompt_utils`; for example, `img2text + think` becomes + `i2t_think` for prompt and stop-token lookup. The shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils.build_prompt_tokens()` helper handles segment-by-segment tokenization and matches HF `apply_chat_template`. diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index b6ffa535463..8233e2bf820 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -18,33 +18,16 @@ from pathlib import Path from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + available_prompt_bot_tasks, bot_task_for_task, build_prompt_tokens, - stop_token_ids_for_bot_task, + stop_token_ids_for_task, + sys_type_for_task, + task_for_modality_and_bot_task, ) from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniPromptType -# task -> (sys_type, bot_task, trigger_tag) -_TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = { - "t2t": ("en_unified", None, None), - "i2t": ("en_unified", None, None), - "it2i_think": ("en_unified", "think", ""), - "it2i_recaption": ("en_unified", "recaption", ""), - "t2i_think": ("en_unified", "think", ""), - "t2i_recaption": ("en_unified", "recaption", ""), - "t2i_vanilla": ("en_vanilla", "image", None), -} - -# Modality → prompt_utils task mapping -_MODALITY_TASK_MAP = { - "text2img": "t2i_think", 
- "img2img": "it2i_think", - "img2text": "i2t", - "text2text": "t2t", -} - - # Default deploy configs are absolute so this example works from any cwd. _REPO_ROOT = Path(__file__).resolve().parents[3] _DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml") @@ -108,8 +91,13 @@ def parse_args(): parser.add_argument( "--bot-task", type=str, - default=None, - help="Override prompt task (e.g. it2i_think, t2i_recaption). Default: auto from modality.", + default="auto", + choices=available_prompt_bot_tasks(), + help=( + "Prompt behavior. 'auto' selects the default for the modality; " + "'think' adds ; 'recaption' adds ; " + "'vanilla' uses the t2i pretrain template." + ), ) parser.add_argument( "--sys-type", @@ -135,8 +123,8 @@ def main(): args = parse_args() os.makedirs(args.output, exist_ok=True) - # Determine task for prompt formatting - task = args.bot_task or _MODALITY_TASK_MAP[args.modality] + # Determine task for prompt formatting from modality + bot behavior. + task = task_for_modality_and_bot_task(args.modality, args.bot_task) bot_task = bot_task_for_task(task) if args.deploy_config is not None and args.stage_configs_path is not None: @@ -188,8 +176,7 @@ def main(): formatted_prompts: list[OmniPromptType] = [] for p in prompts: token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type) - preset_sys_type, _, _ = _TASK_PRESETS[task] - effective_sys_type = args.sys_type or preset_sys_type + effective_sys_type = args.sys_type or sys_type_for_task(task) # `prompt_token_ids` drives the AR stage (matches HF byte-for-byte). 
# `prompt` and `use_system_prompt` are forwarded by ar2diffusion to @@ -222,7 +209,7 @@ def main(): # Override diffusion params if applicable from vllm_omni.inputs.data import OmniDiffusionSamplingParams - ar_stop_token_ids = stop_token_ids_for_bot_task(tokenizer, bot_task) + ar_stop_token_ids = stop_token_ids_for_task(tokenizer, task) for sp in params_list: if isinstance(sp, OmniDiffusionSamplingParams): sp.num_inference_steps = args.steps @@ -241,6 +228,7 @@ def main(): print("HunyuanImage-3.0 Generation Configuration:") print(f" Model: {args.model}") print(f" Modality: {args.modality}") + print(f" Prompt task: {task}") print(f" Bot task: {bot_task}") if deploy_config is not None: print(f" Deploy config: {deploy_config}") diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 50bbf9b704c..e634fdb09aa 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -25,11 +25,15 @@ from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( apply_bot_task_to_sampling_params, + available_prompt_bot_tasks, available_tasks, bot_task_for_task, build_prompt, build_prompt_tokens, stop_token_ids_for_bot_task, + stop_token_ids_for_task, + sys_type_for_task, + task_for_modality_and_bot_task, ) pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -75,7 +79,9 @@ def test_available_tasks_covers_all_modalities(): tasks = set(available_tasks()) assert tasks >= { "t2t", + "t2t_think", "i2t", + "i2t_think", "it2i_think", "it2i_recaption", "t2i_think", @@ -88,7 +94,9 @@ def test_available_tasks_covers_all_modalities(): "task,expected_bot_task", [ ("t2t", "auto"), + ("t2t_think", "think_recaption"), ("i2t", "auto"), + ("i2t_think", "think_recaption"), ("it2i_think", "think_recaption"), ("it2i_recaption", "recaption"), ("t2i_think", "think_recaption"), @@ -100,6 +108,37 @@ def 
test_bot_task_for_task_matches_prompt_presets(task: str, expected_bot_task: assert bot_task_for_task(task) == expected_bot_task +@pytest.mark.parametrize( + "modality,bot_task,expected_task", + [ + ("text2text", "auto", "t2t"), + ("img2text", "auto", "i2t"), + ("text2img", "auto", "t2i_think"), + ("img2img", "auto", "it2i_think"), + ("i2t", "think", "i2t_think"), + ("ti2i", "recaption", "it2i_recaption"), + ("t2i", "vanilla", "t2i_vanilla"), + ("text2text", "none", "t2t"), + ], +) +def test_task_for_modality_and_bot_task_composes_prompt_task( + modality: str, + bot_task: str, + expected_task: str, +): + assert task_for_modality_and_bot_task(modality, bot_task) == expected_task + + +def test_task_for_modality_and_bot_task_rejects_invalid_combinations(): + assert available_prompt_bot_tasks() == ["auto", "none", "recaption", "think", "vanilla"] + + with pytest.raises(ValueError, match="not supported"): + task_for_modality_and_bot_task("img2text", "recaption") + + with pytest.raises(ValueError, match="not supported"): + task_for_modality_and_bot_task("img2img", "vanilla") + + def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer(): tok = FakeTokenizer() @@ -113,6 +152,19 @@ def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer(): ] +def test_stop_token_ids_for_task_are_resolved_from_prompt_task(): + tok = FakeTokenizer() + + assert stop_token_ids_for_task(tok, "i2t") == [5, 8] + assert stop_token_ids_for_task(tok, "i2t_think") == [6, 7, 5] + assert stop_token_ids_for_task(tok, "t2i_vanilla") == [5] + + +def test_sys_type_for_task_returns_prompt_preset_default(): + assert sys_type_for_task("i2t_think") == "en_unified" + assert sys_type_for_task("t2i_vanilla") == "en_vanilla" + + class FakeSamplingParams: def __init__(self, stop_token_ids: list[int] | None = None, max_tokens: int = 16) -> None: self.stop_token_ids = stop_token_ids @@ -141,7 +193,9 @@ def test_apply_bot_task_to_sampling_params_updates_only_target_stage(): "task", [ "t2t", + 
"t2t_think", "i2t", + "i2t_think", "it2i_think", "it2i_recaption", "t2i_think", @@ -170,7 +224,7 @@ def test_build_prompt_string_structure_chat_template(task: str): # documentation, so substring index() catches the wrong occurrence -- use # endswith() which directly captures "trigger is at the tail" (the Part A # fix: trigger goes AFTER `Assistant: `, not before user_prompt). - if task in ("it2i_think", "t2i_think"): + if task in ("t2t_think", "i2t_think", "it2i_think", "t2i_think"): assert s.endswith("Assistant: "), ( f"Trigger must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}" ) @@ -238,7 +292,14 @@ def test_build_prompt_tokens_no_image_for_text_only_tasks(): @pytest.mark.parametrize( "task,trigger_id", - [("it2i_think", 3), ("t2i_think", 3), ("it2i_recaption", 4), ("t2i_recaption", 4)], + [ + ("t2t_think", 3), + ("i2t_think", 3), + ("it2i_think", 3), + ("t2i_think", 3), + ("it2i_recaption", 4), + ("t2i_recaption", 4), + ], ) def test_build_prompt_tokens_trigger_is_last_token(task: str, trigger_id: int): """Trigger tag id must be the LAST token (after `Assistant: ` segment).""" @@ -297,9 +358,12 @@ def test_end2end_routes_through_shared_prompt_utils(): if isinstance(node, ast.ImportFrom) and node.module and node.module.endswith("hunyuan_image3.prompt_utils"): imported_from_prompt_utils.update(alias.name for alias in node.names) expected_imports = { + "available_prompt_bot_tasks", "bot_task_for_task", "build_prompt_tokens", - "stop_token_ids_for_bot_task", + "stop_token_ids_for_task", + "sys_type_for_task", + "task_for_modality_and_bot_task", } assert expected_imports <= imported_from_prompt_utils, ( "end2end.py must import the HunyuanImage3 prompt and stop-token helpers from " diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 079f14b9fda..b22acbdaf7a 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ 
b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -22,6 +22,7 @@ from .system_prompt import get_system_prompt BOT_TASKS = ("auto", "image", "recaption", "think_recaption") +PROMPT_BOT_TASKS = ("auto", "none", "think", "recaption", "vanilla") _BOT_TASK_TO_TOKENIZER_TASK = { "auto": "auto", "image": "image", @@ -42,18 +43,123 @@ "t2i_vanilla": ("en_vanilla", "image", None), } +_MODALITY_TO_TASK_PREFIX = { + "text2text": "t2t", + "t2t": "t2t", + "img2text": "i2t", + "image2text": "i2t", + "i2t": "i2t", + "text2img": "t2i", + "text2image": "t2i", + "t2i": "t2i", + "img2img": "it2i", + "image2image": "it2i", + "it2i": "it2i", + "ti2i": "it2i", +} + +_DEFAULT_BOT_TASK_BY_PREFIX: dict[str, str | None] = { + "t2t": None, + "i2t": None, + "t2i": "think", + "it2i": "think", +} + +_TASK_BY_PREFIX_AND_BOT_TASK: dict[tuple[str, str | None], str] = { + ("t2t", None): "t2t", + ("t2t", "think"): "t2t_think", + ("i2t", None): "i2t", + ("i2t", "think"): "i2t_think", + ("t2i", "think"): "t2i_think", + ("t2i", "recaption"): "t2i_recaption", + ("t2i", "vanilla"): "t2i_vanilla", + ("it2i", "think"): "it2i_think", + ("it2i", "recaption"): "it2i_recaption", +} + +_PROMPT_BOT_TASK_ALIASES: dict[str, str | None] = { + "auto": "auto", + "default": "auto", + "none": None, + "no": None, + "false": None, + "think": "think", + "think_recaption": "think", + "recaption": "recaption", + "image": "vanilla", + "vanilla": "vanilla", +} + def available_tasks() -> list[str]: """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`.""" return sorted(_TASK_PRESETS) -def bot_task_for_task(task: str) -> str: - """Return the HunyuanImage3 bot_task associated with a prompt task.""" +def available_prompt_bot_tasks() -> list[str]: + """Sorted public bot_task values accepted by `task_for_modality_and_bot_task`.""" + return sorted(PROMPT_BOT_TASKS) + + +def _task_preset(task: str) -> tuple[str, str | None, str | None]: if task not in _TASK_PRESETS: raise ValueError(f"Unknown 
task {task!r}. Choose from: {available_tasks()}") + return _TASK_PRESETS[task] + + +def _task_has_image_input(task: str) -> bool: + return task.startswith(("i2t", "it2i")) + + +def _normalize_prompt_bot_task(bot_task: str | None) -> str | None: + if bot_task is None: + return "auto" + + normalized = bot_task.strip().lower() + if normalized not in _PROMPT_BOT_TASK_ALIASES: + raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_prompt_bot_tasks()}") + return _PROMPT_BOT_TASK_ALIASES[normalized] + + +def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") -> str: + """Return the canonical prompt task for an input/output modality. - _, preset_bot_task, _ = _TASK_PRESETS[task] + `modality` chooses the base route (t2t, t2i, i2t, or it2i/ti2i), while + `bot_task` chooses the prompt behavior such as thinking, recaptioning, + or the vanilla text-to-image template. + """ + modality_key = modality.strip().lower() + if modality_key not in _MODALITY_TO_TASK_PREFIX: + raise ValueError(f"Unknown modality {modality!r}. Choose from: {sorted(_MODALITY_TO_TASK_PREFIX)}") + + task_prefix = _MODALITY_TO_TASK_PREFIX[modality_key] + normalized_bot_task = _normalize_prompt_bot_task(bot_task) + if normalized_bot_task == "auto": + normalized_bot_task = _DEFAULT_BOT_TASK_BY_PREFIX[task_prefix] + + task_key = (task_prefix, normalized_bot_task) + if task_key not in _TASK_BY_PREFIX_AND_BOT_TASK: + valid_bot_tasks = sorted( + "none" if candidate is None else candidate + for prefix, candidate in _TASK_BY_PREFIX_AND_BOT_TASK + if prefix == task_prefix + ) + raise ValueError( + f"bot_task {bot_task!r} is not supported for modality {modality!r}. 
Choose from: {valid_bot_tasks}" + ) + + return _TASK_BY_PREFIX_AND_BOT_TASK[task_key] + + +def sys_type_for_task(task: str) -> str: + """Return the default system prompt type for a canonical prompt task.""" + preset_sys_type, _, _ = _task_preset(task) + return preset_sys_type + + +def bot_task_for_task(task: str) -> str: + """Return the HunyuanImage3 bot_task associated with a prompt task.""" + _, preset_bot_task, _ = _task_preset(task) if preset_bot_task == "think": return "think_recaption" return preset_bot_task or "auto" @@ -117,6 +223,19 @@ def stop_token_ids_for_bot_task( return stop_token_id[tokenizer_bot_task] +def stop_token_ids_for_task( + tokenizer, + task: str, + image_size: int | str | None = None, +) -> list[int]: + """Return AR stop token ids for a canonical prompt task.""" + return stop_token_ids_for_bot_task( + tokenizer, + bot_task_for_task(task), + image_size=image_size, + ) + + def apply_bot_task_to_sampling_params( sampling_params_list: list[Any], tokenizer: Any, @@ -151,16 +270,13 @@ def build_prompt( inputs that need to match HF baseline byte-for-byte, use `build_prompt_tokens` instead and feed the result via prompt_token_ids. """ - if task not in _TASK_PRESETS: - raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") - - preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] + preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task) effective_sys_type = sys_type or preset_sys_type system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt) sys_text = system_prompt.strip() if system_prompt else "" - has_image_input = task.startswith("i2t") or task.startswith("it2i") + has_image_input = _task_has_image_input(task) # t2i_vanilla: pretrain mode for direct text->image generation. The # vanilla system prompt drives the model with no chat structure. @@ -212,17 +328,14 @@ def build_prompt_tokens( boundary merge happens. 
We replicate that here and feed the result to Omni via OmniTokensPrompt (prompt_token_ids). """ - if task not in _TASK_PRESETS: - raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") - - preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] + preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task) effective_sys_type = sys_type or preset_sys_type bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>") img_id = tokenizer.convert_tokens_to_ids("") trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None - has_image_input = task.startswith("i2t") or task.startswith("it2i") + has_image_input = _task_has_image_input(task) # t2i_vanilla uses pretrain template with no chat structure; the vanilla # system prompt drives the model directly. No segment boundaries to @@ -252,11 +365,16 @@ def build_prompt_tokens( __all__ = [ "available_tasks", + "available_prompt_bot_tasks", "apply_bot_task_to_sampling_params", "bot_task_for_task", "BOT_TASKS", "build_prompt", "build_prompt_tokens", + "PROMPT_BOT_TASKS", "stop_token_ids_for_bot_task", + "stop_token_ids_for_task", + "sys_type_for_task", + "task_for_modality_and_bot_task", "tokenizer_bot_task_for_bot_task", ] From d6ed92fa2a21c40ef528879d9fed43b1aef0e189 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Thu, 7 May 2026 16:51:34 +0800 Subject: [PATCH 05/40] Unify online HunyuanImage3 bot task handling Signed-off-by: KexiongYu --- .../hunyuan_image3/test_prompt_utils.py | 43 +++++ .../openai_api/test_image_server.py | 102 +++++++++++ ...test_serving_chat_multistage_generation.py | 69 ++++++++ .../models/hunyuan_image3/prompt_utils.py | 56 ++++++ .../entrypoints/openai/protocol/images.py | 28 ++- vllm_omni/entrypoints/openai/serving_chat.py | 163 +++++++++++++----- 6 files changed, 413 insertions(+), 48 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 
e634fdb09aa..6a69888684f 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -25,6 +25,7 @@ from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( apply_bot_task_to_sampling_params, + apply_task_to_sampling_params, available_prompt_bot_tasks, available_tasks, bot_task_for_task, @@ -34,6 +35,8 @@ stop_token_ids_for_task, sys_type_for_task, task_for_modality_and_bot_task, + task_for_modality_and_request_bot_task, + tokenizer_bot_task_for_task, ) pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -139,6 +142,24 @@ def test_task_for_modality_and_bot_task_rejects_invalid_combinations(): task_for_modality_and_bot_task("img2img", "vanilla") +@pytest.mark.parametrize( + "modality,bot_task,expected_task", + [ + ("text2img", "think", "t2i_think"), + ("text2img", "think_recaption", "t2i_think"), + ("text2img", "image", "t2i_vanilla"), + ("img2img", "recaption", "it2i_recaption"), + ("img2text", "auto", "i2t"), + ], +) +def test_task_for_modality_and_request_bot_task_accepts_legacy_and_unified_values( + modality: str, + bot_task: str, + expected_task: str, +): + assert task_for_modality_and_request_bot_task(modality, bot_task) == expected_task + + def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer(): tok = FakeTokenizer() @@ -165,6 +186,11 @@ def test_sys_type_for_task_returns_prompt_preset_default(): assert sys_type_for_task("t2i_vanilla") == "en_vanilla" +def test_tokenizer_bot_task_for_task_returns_internal_task_name(): + assert tokenizer_bot_task_for_task("t2i_think") == "think" + assert tokenizer_bot_task_for_task("t2i_vanilla") == "image" + + class FakeSamplingParams: def __init__(self, stop_token_ids: list[int] | None = None, max_tokens: int = 16) -> None: self.stop_token_ids = stop_token_ids @@ -189,6 +215,23 @@ def test_apply_bot_task_to_sampling_params_updates_only_target_stage(): assert stage0.stop_token_ids == [6, 7, 5] +def 
test_apply_task_to_sampling_params_updates_only_target_stage(): + tok = FakeTokenizer() + stage0 = FakeSamplingParams(stop_token_ids=[999]) + stage1 = FakeSamplingParams(stop_token_ids=[888]) + + updated = apply_task_to_sampling_params( + [stage0, stage1], + tok, + "i2t_think", + stage_index=0, + ) + + assert updated[0] is stage0 + assert updated[0].stop_token_ids == [6, 7, 5] + assert updated[1] is stage1 + + @pytest.mark.parametrize( "task", [ diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py index b5ff891f8f6..81d4aa0ad19 100644 --- a/tests/entrypoints/openai_api/test_image_server.py +++ b/tests/entrypoints/openai_api/test_image_server.py @@ -578,6 +578,103 @@ def test_multistage_images_async_omni_construction(async_omni_test_client): assert captured[1].guidance_scale == 6.5 +def test_multistage_hunyuan_images_accept_unified_bot_task(): + """Regression: /v1/images/generations maps unified bot_task values for HunyuanImage3.""" + + class FakeTokenizer: + eos_token_id = 5 + + def convert_tokens_to_ids(self, token): + mapping = { + "": 6, + "": 7, + "": 8, + } + mapping.update({f"": 1000 + i for i in range(33)}) + return mapping[token] + + class FakeAsyncOmniClass(AsyncOmni): + def __init__(self): + stage_configs = [ + SimpleNamespace( + stage_type="llm", + is_comprehension=True, + model_arch="HunyuanImage3ForCausalMM", + ), + SimpleNamespace( + stage_type="diffusion", + is_comprehension=False, + model_arch="HunyuanImage3Pipeline", + ), + ] + default_sampling_params_list = [ + SamplingParams(temperature=0.1), + OmniDiffusionSamplingParams(), + ] + self.engine = SimpleNamespace( + stage_configs=stage_configs, + default_sampling_params_list=default_sampling_params_list, + ) + self.default_sampling_params_list = default_sampling_params_list + self.captured_sampling_params_list = None + self.captured_prompt = None + self._images = [Image.new("RGB", (64, 64), color="green")] + self.od_config = 
SimpleNamespace(supports_multimodal_inputs=True) + + async def generate(self, prompt, request_id, sampling_params=None, sampling_params_list=None): + self.captured_sampling_params_list = ( + sampling_params_list if sampling_params_list is not None else [sampling_params] + ) + self.captured_prompt = prompt + yield MockGenerationResult([img.copy() for img in self._images]) + + async def get_tokenizer(self): + return FakeTokenizer() + + def __class_getitem__(cls, item): + return cls + + def get_diffusion_od_config(self): + return self.od_config + + app = FastAPI() + app.include_router(router) + + engine = FakeAsyncOmniClass() + chat_handler = object.__new__(OmniOpenAIServingChat) + chat_handler.engine_client = engine + chat_handler._diffusion_engine = None + app.state.openai_serving_chat = chat_handler + app.state.engine_client = engine + app.state.stage_configs = engine.engine.stage_configs + app.state.args = Namespace( + default_sampling_params='{"1": {"num_inference_steps":4, "guidance_scale":7.5, "generator_device":"cpu"}}', + max_generated_image_size=1048576, + ) + app.state.openai_serving_models = _DiffusionServingModels( + [BaseModelPath(name="tencent/HunyuanImage-3.0-Instruct", model_path="tencent/HunyuanImage-3.0-Instruct")] + ) + client = TestClient(app) + + response = client.post( + "/v1/images/generations", + json={ + "prompt": "a cat", + "bot_task": "think", + "size": "128x256", + }, + ) + assert response.status_code == 200 + + captured_prompt = engine.captured_prompt + assert captured_prompt["bot_task"] == "think_recaption" + assert captured_prompt["mm_processor_kwargs"]["bot_task"] == "think" + + captured = engine.captured_sampling_params_list + assert captured is not None + assert captured[0].stop_token_ids == [6, 7, 5] + + def test_generate_images_async_omni_glm_image_sets_stage0_max_tokens(): """GLM-Image multistage: stage-0 gets target_h/w from requested size. 
@@ -906,6 +1003,8 @@ def test_parameter_validation(): assert req.size is None # Engine will use model defaults assert req.num_inference_steps is None # Engine will use model defaults assert req.true_cfg_scale is None # Engine will use model defaults + assert ImageGenerationRequest(prompt="test", bot_task="think").bot_task == "think" + assert ImageGenerationRequest(prompt="test", bot_task="think_recaption").bot_task == "think_recaption" # Invalid num_inference_steps (out of range) with pytest.raises(ValueError): @@ -928,6 +1027,9 @@ def test_parameter_validation(): with pytest.raises(ValueError): ImageGenerationRequest(prompt="test", layers=11) + with pytest.raises(ValueError): + ImageGenerationRequest(prompt="test", bot_task="bogus") + # Pass-Through Tests diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py index 144a0e97a6c..45eee6eb04a 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py +++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py @@ -91,3 +91,72 @@ def test_build_multistage_generation_inputs_applies_stage_specific_overrides(ser assert engine.default_sampling_params_list[1].lora_request is None assert engine.default_sampling_params_list[2].resolution == 640 assert engine.default_sampling_params_list[2].lora_request is None + + +@pytest.mark.parametrize( + "output_modalities,messages,bot_task,expected_task", + [ + (["image"], [{"role": "user", "content": "draw a cat"}], "think", "t2i_think"), + ( + ["image"], + [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}]}], + "recaption", + "it2i_recaption", + ), + ( + ["text"], + [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}]}], + "think_recaption", + "i2t_think", + ), + (["text"], [{"role": "user", "content": "describe"}], "none", "t2t"), + ], 
+) +def test_resolve_hunyuan_image3_request_task(serving_chat, output_modalities, messages, bot_task, expected_task): + from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat + + stage_configs = [SimpleNamespace(stage_type="llm", model_arch="HunyuanImage3ForCausalMM", is_comprehension=True)] + task = OmniOpenAIServingChat._resolve_hunyuan_image3_request_task( + stage_configs=stage_configs, + output_modalities=output_modalities, + messages=messages, + bot_task=bot_task, + ) + + assert task == expected_task + + +def test_build_multistage_generation_inputs_maps_unified_bot_task_for_hunyuan(serving_chat): + from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat + + engine = SimpleNamespace( + stage_configs=[ + SimpleNamespace( + stage_type="llm", + is_comprehension=True, + model_arch="HunyuanImage3ForCausalMM", + ), + SimpleNamespace( + stage_type="diffusion", + is_comprehension=False, + model_arch="HunyuanImage3Pipeline", + ), + ], + default_sampling_params_list=[ + SamplingParams(temperature=0.2, seed=11), + OmniDiffusionSamplingParams(), + ], + ) + + engine_prompt, _sampling_params_list = OmniOpenAIServingChat._build_multistage_generation_inputs( + serving_chat, + engine=engine, + prompt="draw a robot", + extra_body={"bot_task": "think"}, + reference_images=[], + gen_params=OmniDiffusionSamplingParams(height=768, width=1024, seed=0, num_outputs_per_prompt=1), + ) + + assert engine_prompt["modalities"] == ["image"] + assert engine_prompt["bot_task"] == "think_recaption" + assert engine_prompt["mm_processor_kwargs"]["bot_task"] == "think" diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index b22acbdaf7a..d9d0d508288 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -90,6 +90,11 @@ "vanilla": "vanilla", } +_REQUEST_BOT_TASK_ALIASES: dict[str, str | None] = { + 
**_PROMPT_BOT_TASK_ALIASES, + "image": "vanilla", +} + def available_tasks() -> list[str]: """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`.""" @@ -121,6 +126,17 @@ def _normalize_prompt_bot_task(bot_task: str | None) -> str | None: return _PROMPT_BOT_TASK_ALIASES[normalized] +def _normalize_request_bot_task(bot_task: str | None) -> str | None: + if bot_task is None: + return "auto" + + normalized = bot_task.strip().lower() + if normalized not in _REQUEST_BOT_TASK_ALIASES: + valid_bot_tasks = sorted(set(PROMPT_BOT_TASKS) | set(BOT_TASKS)) + raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {valid_bot_tasks}") + return _REQUEST_BOT_TASK_ALIASES[normalized] + + def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") -> str: """Return the canonical prompt task for an input/output modality. @@ -151,6 +167,20 @@ def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") return _TASK_BY_PREFIX_AND_BOT_TASK[task_key] +def task_for_modality_and_request_bot_task(modality: str, bot_task: str | None = "auto") -> str: + """Resolve a request bot_task into a canonical prompt task. + + Request values accept both the unified public bot_task vocabulary + (`think`, `recaption`, `vanilla`, `none`, `auto`) and the legacy + HunyuanImage3 values (`auto`, `image`, `recaption`, + `think_recaption`). 
+ """ + return task_for_modality_and_bot_task( + modality, + _normalize_request_bot_task(bot_task), + ) + + def sys_type_for_task(task: str) -> str: """Return the default system prompt type for a canonical prompt task.""" preset_sys_type, _, _ = _task_preset(task) @@ -236,6 +266,11 @@ def stop_token_ids_for_task( ) +def tokenizer_bot_task_for_task(task: str) -> str: + """Return the tokenizer-internal bot_task for a canonical prompt task.""" + return tokenizer_bot_task_for_bot_task(bot_task_for_task(task)) + + def apply_bot_task_to_sampling_params( sampling_params_list: list[Any], tokenizer: Any, @@ -256,6 +291,24 @@ def apply_bot_task_to_sampling_params( return updated_params_list +def apply_task_to_sampling_params( + sampling_params_list: list[Any], + tokenizer: Any, + task: str, + *, + stage_index: int = 0, + image_size: int | str | None = None, +) -> list[Any]: + """Apply a canonical prompt task to one AR stage's stop tokens.""" + return apply_bot_task_to_sampling_params( + sampling_params_list, + tokenizer, + bot_task_for_task(task), + stage_index=stage_index, + image_size=image_size, + ) + + def build_prompt( user_prompt: str, task: str = "it2i_think", @@ -367,6 +420,7 @@ def build_prompt_tokens( "available_tasks", "available_prompt_bot_tasks", "apply_bot_task_to_sampling_params", + "apply_task_to_sampling_params", "bot_task_for_task", "BOT_TASKS", "build_prompt", @@ -376,5 +430,7 @@ def build_prompt_tokens( "stop_token_ids_for_task", "sys_type_for_task", "task_for_modality_and_bot_task", + "task_for_modality_and_request_bot_task", + "tokenizer_bot_task_for_task", "tokenizer_bot_task_for_bot_task", ] diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index c78a95de058..548fe55fe30 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -119,8 +119,34 @@ def validate_use_system_prompt(cls, v): ) bot_task: str | None = Field( default=None, - 
description="HunyuanImage3 AR bot_task for this request: auto, image, recaption, or think_recaption.", + description=( + "HunyuanImage3 prompt behavior for this request. Preferred values: " + "auto, none, think, recaption, vanilla. Legacy values auto, image, " + "recaption, and think_recaption are also accepted." + ), ) + + @field_validator("bot_task") + @classmethod + def validate_bot_task(cls, v): + """Validate HunyuanImage3 bot_task / prompt behavior.""" + if v is None: + return None + + normalized = v.strip().lower() + valid_values = { + "auto", + "none", + "think", + "recaption", + "vanilla", + "image", + "think_recaption", + } + if normalized not in valid_values: + raise ValueError(f"Invalid bot_task: {v}. Must be one of: {sorted(valid_values)}") + return normalized + seed: int | None = Field(default=None, description="Random seed for reproducibility") generator_device: str | None = Field( default=None, diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index b2375fd38b4..cef07b0ac18 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -198,19 +198,24 @@ async def create_chat_completion( tokenizer = await self.engine_client.get_tokenizer() extra_body = self._get_extra_body_from_request(request) - bot_task = ( - extra_body.get("bot_task") - if self._get_hunyuan_image3_ar_stage_index(list(getattr(self.engine_client, "stage_configs", []) or [])) - is not None - else None + output_modalities = getattr(request, "modalities", self.engine_client.output_modalities) + hunyuan_task = self._resolve_hunyuan_image3_request_task( + stage_configs=list(getattr(self.engine_client, "stage_configs", []) or []), + output_modalities=output_modalities, + messages=request.messages, + bot_task=extra_body.get("bot_task"), ) + hunyuan_bot_task = None request_chat_template_kwargs = request.chat_template_kwargs or {} - if bot_task is not None: - from 
vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task + if hunyuan_task is not None: + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + bot_task_for_task, + tokenizer_bot_task_for_task, + ) - tokenizer_bot_task = tokenizer_bot_task_for_bot_task(bot_task) + hunyuan_bot_task = bot_task_for_task(hunyuan_task) request_chat_template_kwargs = dict(request_chat_template_kwargs) - request_chat_template_kwargs["bot_task"] = tokenizer_bot_task + request_chat_template_kwargs["bot_task"] = tokenizer_bot_task_for_task(hunyuan_task) reasoning_parser: ReasoningParser | None = None if self.reasoning_parser_cls: @@ -311,7 +316,6 @@ async def create_chat_completion( if raw_request: raw_request.state.request_metadata = request_metadata - output_modalities = getattr(request, "modalities", self.engine_client.output_modalities) request.modalities = ( output_modalities if output_modalities is not None else self.engine_client.output_modalities ) @@ -380,9 +384,11 @@ async def create_chat_completion( mm_processor_kwargs["target_h"] = height if width is not None: mm_processor_kwargs["target_w"] = width - if bot_task is not None: - mm_processor_kwargs["bot_task"] = tokenizer_bot_task - tprompt["bot_task"] = bot_task + if hunyuan_task is not None: + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_task + + mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_task(hunyuan_task) + tprompt["bot_task"] = hunyuan_bot_task tprompt["mm_processor_kwargs"] = mm_processor_kwargs if engine_prompt_image is not None: tprompt["multi_modal_data"] = engine_prompt_image @@ -420,10 +426,10 @@ async def create_chat_completion( # to delta to ensure emitted outputs are correctly drained. Otherwise # convert cumulative to Final Only to ensure the output is correct. 
sampling_params_list = coerce_param_message_types(sampling_params_list, request.stream) - sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params( + sampling_params_list = await self._apply_hunyuan_image3_task_sampling_params( engine=self.engine_client, sampling_params_list=sampling_params_list, - bot_task=bot_task, + task=hunyuan_task, tokenizer=tokenizer, ) @@ -752,15 +758,69 @@ def _get_hunyuan_image3_ar_stage_index(cls, stage_configs: list[Any]) -> int | N return idx return None - async def _apply_hunyuan_image3_bot_task_sampling_params( + @staticmethod + def _infer_hunyuan_image3_request_modality( + output_modalities: list[str] | None, + has_reference_images: bool, + ) -> str: + image_output_requested = bool(output_modalities) and "image" in output_modalities + if image_output_requested: + return "img2img" if has_reference_images else "text2img" + return "img2text" if has_reference_images else "text2text" + + @classmethod + def _resolve_hunyuan_image3_request_task( + cls, + *, + stage_configs: list[Any], + output_modalities: list[str] | None, + bot_task: str | None, + messages: list[Any] | None = None, + reference_images: list[Any] | None = None, + ) -> str | None: + if bot_task is None: + return None + + if cls._get_hunyuan_image3_ar_stage_index(stage_configs) is None: + return None + + has_reference_images = False + if reference_images is not None: + has_reference_images = len(reference_images) > 0 + elif messages is not None: + normalized_messages = cls._messages_to_dicts(messages) + for message in normalized_messages: + if message.get("role", "") != "user": + continue + content = message.get("content", "") + if isinstance(content, list): + if any( + (isinstance(item, dict) and (item.get("type") == "image_url" or "image" in item)) + for item in content + ): + has_reference_images = True + break + elif isinstance(content, dict) and (content.get("type") == "image_url" or "image" in content): + has_reference_images = True + break + + 
modality = cls._infer_hunyuan_image3_request_modality(output_modalities, has_reference_images) + + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + task_for_modality_and_request_bot_task, + ) + + return task_for_modality_and_request_bot_task(modality, bot_task) + + async def _apply_hunyuan_image3_task_sampling_params( self, *, engine: Any, sampling_params_list: list[Any], - bot_task: Any, + task: str | None, tokenizer: Any | None = None, ) -> list[Any]: - if bot_task is None: + if task is None: return sampling_params_list stage_configs = list(getattr(engine, "stage_configs", []) or []) @@ -769,24 +829,18 @@ async def _apply_hunyuan_image3_bot_task_sampling_params( return sampling_params_list from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - BOT_TASKS, - apply_bot_task_to_sampling_params, - tokenizer_bot_task_for_bot_task, + apply_task_to_sampling_params, ) - if bot_task not in BOT_TASKS: - raise ValueError(f"Unknown HunyuanImage3 bot_task {bot_task!r}. 
Choose from: {list(BOT_TASKS)}") - tokenizer_bot_task_for_bot_task(bot_task) - if tokenizer is None and hasattr(engine, "get_tokenizer"): tokenizer = await engine.get_tokenizer() if tokenizer is None: raise ValueError("Cannot resolve tokenizer to apply HunyuanImage3 bot_task stop tokens.") - return apply_bot_task_to_sampling_params( + return apply_task_to_sampling_params( sampling_params_list, tokenizer, - bot_task, + task, stage_index=stage_index, ) @@ -2254,10 +2308,11 @@ def _build_multistage_generation_inputs( lora_body = extra_body.get("lora") layers = extra_body.get("layers") resolution = extra_body.get("resolution") - bot_task = ( - extra_body.get("bot_task") - if self._get_hunyuan_image3_ar_stage_index(list(stage_configs)) is not None - else None + hunyuan_task = self._resolve_hunyuan_image3_request_task( + stage_configs=list(stage_configs), + output_modalities=["image"], + reference_images=reference_images, + bot_task=extra_body.get("bot_task"), ) engine_prompt_data: dict[str, Any] | None = None @@ -2297,11 +2352,14 @@ def _build_multistage_generation_inputs( mm_processor_kwargs["target_h"] = height if width is not None: mm_processor_kwargs["target_w"] = width - if bot_task is not None: - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task + if hunyuan_task is not None: + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + bot_task_for_task, + tokenizer_bot_task_for_task, + ) - mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_bot_task(bot_task) - engine_prompt["bot_task"] = bot_task + mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_task(hunyuan_task) + engine_prompt["bot_task"] = bot_task_for_task(hunyuan_task) if mm_processor_kwargs: engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs if engine_prompt_data is not None: @@ -2398,7 +2456,12 @@ async def generate_diffusion_images( negative_prompt = extra_body.get("negative_prompt") num_outputs_per_prompt = 
extra_body.get("num_outputs_per_prompt", 1) lora_body = extra_body.get("lora") - bot_task = extra_body.get("bot_task") + hunyuan_task = self._resolve_hunyuan_image3_request_task( + stage_configs=list(getattr(engine, "stage_configs", None) or []), + output_modalities=["image"], + reference_images=reference_images, + bot_task=extra_body.get("bot_task"), + ) pil_images: list[Image.Image] = [] for img_b64 in reference_images: @@ -2482,10 +2545,10 @@ async def generate_diffusion_images( engine_prompt = gen_prompt sampling_params_list = [gen_params] - sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params( + sampling_params_list = await self._apply_hunyuan_image3_task_sampling_params( engine=diffusion_engine, sampling_params_list=sampling_params_list, - bot_task=bot_task, + task=hunyuan_task, ) result = None @@ -2575,7 +2638,12 @@ async def _create_diffusion_chat_completion( seed = getattr(request, "seed", None) negative_prompt = extra_body.get("negative_prompt") num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1) - bot_task = extra_body.get("bot_task") + hunyuan_task = self._resolve_hunyuan_image3_request_task( + stage_configs=list(getattr(self._diffusion_engine, "stage_configs", []) or []), + output_modalities=["image"], + reference_images=reference_images, + bot_task=extra_body.get("bot_task"), + ) # Text-to-video parameters (ref: text_to_video.py) num_frames = extra_body.get("num_frames") @@ -2689,14 +2757,15 @@ async def _create_diffusion_chat_completion( # Generate image or audio (e.g. 
AudioX) via AsyncOmni diffusion_engine = cast(AsyncOmni, self._diffusion_engine) stage_configs = list(getattr(diffusion_engine, "stage_configs", []) or []) - if self._get_hunyuan_image3_ar_stage_index(stage_configs) is None: - bot_task = None - elif bot_task is not None: - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task + if hunyuan_task is not None: + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + bot_task_for_task, + tokenizer_bot_task_for_task, + ) - gen_prompt["bot_task"] = bot_task + gen_prompt["bot_task"] = bot_task_for_task(hunyuan_task) gen_prompt["mm_processor_kwargs"] = { - "bot_task": tokenizer_bot_task_for_bot_task(bot_task), + "bot_task": tokenizer_bot_task_for_task(hunyuan_task), } sampling_params_list = build_stage_sampling_params_list( stage_configs, @@ -2708,10 +2777,10 @@ async def _create_diffusion_chat_completion( if not sampling_params_list: sampling_params_list = [gen_params] - sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params( + sampling_params_list = await self._apply_hunyuan_image3_task_sampling_params( engine=diffusion_engine, sampling_params_list=sampling_params_list, - bot_task=bot_task, + task=hunyuan_task, ) result = None From a10219d615927c000964621296ca06a5dde4fca6 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Thu, 7 May 2026 17:50:31 +0800 Subject: [PATCH 06/40] Revert "Unify online HunyuanImage3 bot task handling" This reverts commit 6b67a5f426a3d02e263d0014cff917b171f39943. 
Signed-off-by: KexiongYu --- .../hunyuan_image3/test_prompt_utils.py | 43 ----- .../openai_api/test_image_server.py | 102 ----------- ...test_serving_chat_multistage_generation.py | 69 -------- .../models/hunyuan_image3/prompt_utils.py | 56 ------ .../entrypoints/openai/protocol/images.py | 28 +-- vllm_omni/entrypoints/openai/serving_chat.py | 163 +++++------------- 6 files changed, 48 insertions(+), 413 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 6a69888684f..e634fdb09aa 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -25,7 +25,6 @@ from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( apply_bot_task_to_sampling_params, - apply_task_to_sampling_params, available_prompt_bot_tasks, available_tasks, bot_task_for_task, @@ -35,8 +34,6 @@ stop_token_ids_for_task, sys_type_for_task, task_for_modality_and_bot_task, - task_for_modality_and_request_bot_task, - tokenizer_bot_task_for_task, ) pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -142,24 +139,6 @@ def test_task_for_modality_and_bot_task_rejects_invalid_combinations(): task_for_modality_and_bot_task("img2img", "vanilla") -@pytest.mark.parametrize( - "modality,bot_task,expected_task", - [ - ("text2img", "think", "t2i_think"), - ("text2img", "think_recaption", "t2i_think"), - ("text2img", "image", "t2i_vanilla"), - ("img2img", "recaption", "it2i_recaption"), - ("img2text", "auto", "i2t"), - ], -) -def test_task_for_modality_and_request_bot_task_accepts_legacy_and_unified_values( - modality: str, - bot_task: str, - expected_task: str, -): - assert task_for_modality_and_request_bot_task(modality, bot_task) == expected_task - - def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer(): tok = FakeTokenizer() @@ -186,11 +165,6 @@ def test_sys_type_for_task_returns_prompt_preset_default(): 
assert sys_type_for_task("t2i_vanilla") == "en_vanilla" -def test_tokenizer_bot_task_for_task_returns_internal_task_name(): - assert tokenizer_bot_task_for_task("t2i_think") == "think" - assert tokenizer_bot_task_for_task("t2i_vanilla") == "image" - - class FakeSamplingParams: def __init__(self, stop_token_ids: list[int] | None = None, max_tokens: int = 16) -> None: self.stop_token_ids = stop_token_ids @@ -215,23 +189,6 @@ def test_apply_bot_task_to_sampling_params_updates_only_target_stage(): assert stage0.stop_token_ids == [6, 7, 5] -def test_apply_task_to_sampling_params_updates_only_target_stage(): - tok = FakeTokenizer() - stage0 = FakeSamplingParams(stop_token_ids=[999]) - stage1 = FakeSamplingParams(stop_token_ids=[888]) - - updated = apply_task_to_sampling_params( - [stage0, stage1], - tok, - "i2t_think", - stage_index=0, - ) - - assert updated[0] is stage0 - assert updated[0].stop_token_ids == [6, 7, 5] - assert updated[1] is stage1 - - @pytest.mark.parametrize( "task", [ diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py index 81d4aa0ad19..b5ff891f8f6 100644 --- a/tests/entrypoints/openai_api/test_image_server.py +++ b/tests/entrypoints/openai_api/test_image_server.py @@ -578,103 +578,6 @@ def test_multistage_images_async_omni_construction(async_omni_test_client): assert captured[1].guidance_scale == 6.5 -def test_multistage_hunyuan_images_accept_unified_bot_task(): - """Regression: /v1/images/generations maps unified bot_task values for HunyuanImage3.""" - - class FakeTokenizer: - eos_token_id = 5 - - def convert_tokens_to_ids(self, token): - mapping = { - "": 6, - "": 7, - "": 8, - } - mapping.update({f"": 1000 + i for i in range(33)}) - return mapping[token] - - class FakeAsyncOmniClass(AsyncOmni): - def __init__(self): - stage_configs = [ - SimpleNamespace( - stage_type="llm", - is_comprehension=True, - model_arch="HunyuanImage3ForCausalMM", - ), - SimpleNamespace( - 
stage_type="diffusion", - is_comprehension=False, - model_arch="HunyuanImage3Pipeline", - ), - ] - default_sampling_params_list = [ - SamplingParams(temperature=0.1), - OmniDiffusionSamplingParams(), - ] - self.engine = SimpleNamespace( - stage_configs=stage_configs, - default_sampling_params_list=default_sampling_params_list, - ) - self.default_sampling_params_list = default_sampling_params_list - self.captured_sampling_params_list = None - self.captured_prompt = None - self._images = [Image.new("RGB", (64, 64), color="green")] - self.od_config = SimpleNamespace(supports_multimodal_inputs=True) - - async def generate(self, prompt, request_id, sampling_params=None, sampling_params_list=None): - self.captured_sampling_params_list = ( - sampling_params_list if sampling_params_list is not None else [sampling_params] - ) - self.captured_prompt = prompt - yield MockGenerationResult([img.copy() for img in self._images]) - - async def get_tokenizer(self): - return FakeTokenizer() - - def __class_getitem__(cls, item): - return cls - - def get_diffusion_od_config(self): - return self.od_config - - app = FastAPI() - app.include_router(router) - - engine = FakeAsyncOmniClass() - chat_handler = object.__new__(OmniOpenAIServingChat) - chat_handler.engine_client = engine - chat_handler._diffusion_engine = None - app.state.openai_serving_chat = chat_handler - app.state.engine_client = engine - app.state.stage_configs = engine.engine.stage_configs - app.state.args = Namespace( - default_sampling_params='{"1": {"num_inference_steps":4, "guidance_scale":7.5, "generator_device":"cpu"}}', - max_generated_image_size=1048576, - ) - app.state.openai_serving_models = _DiffusionServingModels( - [BaseModelPath(name="tencent/HunyuanImage-3.0-Instruct", model_path="tencent/HunyuanImage-3.0-Instruct")] - ) - client = TestClient(app) - - response = client.post( - "/v1/images/generations", - json={ - "prompt": "a cat", - "bot_task": "think", - "size": "128x256", - }, - ) - assert 
response.status_code == 200 - - captured_prompt = engine.captured_prompt - assert captured_prompt["bot_task"] == "think_recaption" - assert captured_prompt["mm_processor_kwargs"]["bot_task"] == "think" - - captured = engine.captured_sampling_params_list - assert captured is not None - assert captured[0].stop_token_ids == [6, 7, 5] - - def test_generate_images_async_omni_glm_image_sets_stage0_max_tokens(): """GLM-Image multistage: stage-0 gets target_h/w from requested size. @@ -1003,8 +906,6 @@ def test_parameter_validation(): assert req.size is None # Engine will use model defaults assert req.num_inference_steps is None # Engine will use model defaults assert req.true_cfg_scale is None # Engine will use model defaults - assert ImageGenerationRequest(prompt="test", bot_task="think").bot_task == "think" - assert ImageGenerationRequest(prompt="test", bot_task="think_recaption").bot_task == "think_recaption" # Invalid num_inference_steps (out of range) with pytest.raises(ValueError): @@ -1027,9 +928,6 @@ def test_parameter_validation(): with pytest.raises(ValueError): ImageGenerationRequest(prompt="test", layers=11) - with pytest.raises(ValueError): - ImageGenerationRequest(prompt="test", bot_task="bogus") - # Pass-Through Tests diff --git a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py index 45eee6eb04a..144a0e97a6c 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py +++ b/tests/entrypoints/openai_api/test_serving_chat_multistage_generation.py @@ -91,72 +91,3 @@ def test_build_multistage_generation_inputs_applies_stage_specific_overrides(ser assert engine.default_sampling_params_list[1].lora_request is None assert engine.default_sampling_params_list[2].resolution == 640 assert engine.default_sampling_params_list[2].lora_request is None - - -@pytest.mark.parametrize( - "output_modalities,messages,bot_task,expected_task", - [ - 
(["image"], [{"role": "user", "content": "draw a cat"}], "think", "t2i_think"), - ( - ["image"], - [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}]}], - "recaption", - "it2i_recaption", - ), - ( - ["text"], - [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}]}], - "think_recaption", - "i2t_think", - ), - (["text"], [{"role": "user", "content": "describe"}], "none", "t2t"), - ], -) -def test_resolve_hunyuan_image3_request_task(serving_chat, output_modalities, messages, bot_task, expected_task): - from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat - - stage_configs = [SimpleNamespace(stage_type="llm", model_arch="HunyuanImage3ForCausalMM", is_comprehension=True)] - task = OmniOpenAIServingChat._resolve_hunyuan_image3_request_task( - stage_configs=stage_configs, - output_modalities=output_modalities, - messages=messages, - bot_task=bot_task, - ) - - assert task == expected_task - - -def test_build_multistage_generation_inputs_maps_unified_bot_task_for_hunyuan(serving_chat): - from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat - - engine = SimpleNamespace( - stage_configs=[ - SimpleNamespace( - stage_type="llm", - is_comprehension=True, - model_arch="HunyuanImage3ForCausalMM", - ), - SimpleNamespace( - stage_type="diffusion", - is_comprehension=False, - model_arch="HunyuanImage3Pipeline", - ), - ], - default_sampling_params_list=[ - SamplingParams(temperature=0.2, seed=11), - OmniDiffusionSamplingParams(), - ], - ) - - engine_prompt, _sampling_params_list = OmniOpenAIServingChat._build_multistage_generation_inputs( - serving_chat, - engine=engine, - prompt="draw a robot", - extra_body={"bot_task": "think"}, - reference_images=[], - gen_params=OmniDiffusionSamplingParams(height=768, width=1024, seed=0, num_outputs_per_prompt=1), - ) - - assert engine_prompt["modalities"] == ["image"] - assert 
engine_prompt["bot_task"] == "think_recaption" - assert engine_prompt["mm_processor_kwargs"]["bot_task"] == "think" diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index d9d0d508288..b22acbdaf7a 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -90,11 +90,6 @@ "vanilla": "vanilla", } -_REQUEST_BOT_TASK_ALIASES: dict[str, str | None] = { - **_PROMPT_BOT_TASK_ALIASES, - "image": "vanilla", -} - def available_tasks() -> list[str]: """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`.""" @@ -126,17 +121,6 @@ def _normalize_prompt_bot_task(bot_task: str | None) -> str | None: return _PROMPT_BOT_TASK_ALIASES[normalized] -def _normalize_request_bot_task(bot_task: str | None) -> str | None: - if bot_task is None: - return "auto" - - normalized = bot_task.strip().lower() - if normalized not in _REQUEST_BOT_TASK_ALIASES: - valid_bot_tasks = sorted(set(PROMPT_BOT_TASKS) | set(BOT_TASKS)) - raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {valid_bot_tasks}") - return _REQUEST_BOT_TASK_ALIASES[normalized] - - def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") -> str: """Return the canonical prompt task for an input/output modality. @@ -167,20 +151,6 @@ def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") return _TASK_BY_PREFIX_AND_BOT_TASK[task_key] -def task_for_modality_and_request_bot_task(modality: str, bot_task: str | None = "auto") -> str: - """Resolve a request bot_task into a canonical prompt task. - - Request values accept both the unified public bot_task vocabulary - (`think`, `recaption`, `vanilla`, `none`, `auto`) and the legacy - HunyuanImage3 values (`auto`, `image`, `recaption`, - `think_recaption`). 
- """ - return task_for_modality_and_bot_task( - modality, - _normalize_request_bot_task(bot_task), - ) - - def sys_type_for_task(task: str) -> str: """Return the default system prompt type for a canonical prompt task.""" preset_sys_type, _, _ = _task_preset(task) @@ -266,11 +236,6 @@ def stop_token_ids_for_task( ) -def tokenizer_bot_task_for_task(task: str) -> str: - """Return the tokenizer-internal bot_task for a canonical prompt task.""" - return tokenizer_bot_task_for_bot_task(bot_task_for_task(task)) - - def apply_bot_task_to_sampling_params( sampling_params_list: list[Any], tokenizer: Any, @@ -291,24 +256,6 @@ def apply_bot_task_to_sampling_params( return updated_params_list -def apply_task_to_sampling_params( - sampling_params_list: list[Any], - tokenizer: Any, - task: str, - *, - stage_index: int = 0, - image_size: int | str | None = None, -) -> list[Any]: - """Apply a canonical prompt task to one AR stage's stop tokens.""" - return apply_bot_task_to_sampling_params( - sampling_params_list, - tokenizer, - bot_task_for_task(task), - stage_index=stage_index, - image_size=image_size, - ) - - def build_prompt( user_prompt: str, task: str = "it2i_think", @@ -420,7 +367,6 @@ def build_prompt_tokens( "available_tasks", "available_prompt_bot_tasks", "apply_bot_task_to_sampling_params", - "apply_task_to_sampling_params", "bot_task_for_task", "BOT_TASKS", "build_prompt", @@ -430,7 +376,5 @@ def build_prompt_tokens( "stop_token_ids_for_task", "sys_type_for_task", "task_for_modality_and_bot_task", - "task_for_modality_and_request_bot_task", - "tokenizer_bot_task_for_task", "tokenizer_bot_task_for_bot_task", ] diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index 548fe55fe30..c78a95de058 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -119,34 +119,8 @@ def validate_use_system_prompt(cls, v): ) bot_task: str | None = Field( default=None, - 
description=( - "HunyuanImage3 prompt behavior for this request. Preferred values: " - "auto, none, think, recaption, vanilla. Legacy values auto, image, " - "recaption, and think_recaption are also accepted." - ), + description="HunyuanImage3 AR bot_task for this request: auto, image, recaption, or think_recaption.", ) - - @field_validator("bot_task") - @classmethod - def validate_bot_task(cls, v): - """Validate HunyuanImage3 bot_task / prompt behavior.""" - if v is None: - return None - - normalized = v.strip().lower() - valid_values = { - "auto", - "none", - "think", - "recaption", - "vanilla", - "image", - "think_recaption", - } - if normalized not in valid_values: - raise ValueError(f"Invalid bot_task: {v}. Must be one of: {sorted(valid_values)}") - return normalized - seed: int | None = Field(default=None, description="Random seed for reproducibility") generator_device: str | None = Field( default=None, diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index cef07b0ac18..b2375fd38b4 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -198,24 +198,19 @@ async def create_chat_completion( tokenizer = await self.engine_client.get_tokenizer() extra_body = self._get_extra_body_from_request(request) - output_modalities = getattr(request, "modalities", self.engine_client.output_modalities) - hunyuan_task = self._resolve_hunyuan_image3_request_task( - stage_configs=list(getattr(self.engine_client, "stage_configs", []) or []), - output_modalities=output_modalities, - messages=request.messages, - bot_task=extra_body.get("bot_task"), + bot_task = ( + extra_body.get("bot_task") + if self._get_hunyuan_image3_ar_stage_index(list(getattr(self.engine_client, "stage_configs", []) or [])) + is not None + else None ) - hunyuan_bot_task = None request_chat_template_kwargs = request.chat_template_kwargs or {} - if hunyuan_task is not None: - from 
vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - bot_task_for_task, - tokenizer_bot_task_for_task, - ) + if bot_task is not None: + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task - hunyuan_bot_task = bot_task_for_task(hunyuan_task) + tokenizer_bot_task = tokenizer_bot_task_for_bot_task(bot_task) request_chat_template_kwargs = dict(request_chat_template_kwargs) - request_chat_template_kwargs["bot_task"] = tokenizer_bot_task_for_task(hunyuan_task) + request_chat_template_kwargs["bot_task"] = tokenizer_bot_task reasoning_parser: ReasoningParser | None = None if self.reasoning_parser_cls: @@ -316,6 +311,7 @@ async def create_chat_completion( if raw_request: raw_request.state.request_metadata = request_metadata + output_modalities = getattr(request, "modalities", self.engine_client.output_modalities) request.modalities = ( output_modalities if output_modalities is not None else self.engine_client.output_modalities ) @@ -384,11 +380,9 @@ async def create_chat_completion( mm_processor_kwargs["target_h"] = height if width is not None: mm_processor_kwargs["target_w"] = width - if hunyuan_task is not None: - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_task - - mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_task(hunyuan_task) - tprompt["bot_task"] = hunyuan_bot_task + if bot_task is not None: + mm_processor_kwargs["bot_task"] = tokenizer_bot_task + tprompt["bot_task"] = bot_task tprompt["mm_processor_kwargs"] = mm_processor_kwargs if engine_prompt_image is not None: tprompt["multi_modal_data"] = engine_prompt_image @@ -426,10 +420,10 @@ async def create_chat_completion( # to delta to ensure emitted outputs are correctly drained. Otherwise # convert cumulative to Final Only to ensure the output is correct. 
sampling_params_list = coerce_param_message_types(sampling_params_list, request.stream) - sampling_params_list = await self._apply_hunyuan_image3_task_sampling_params( + sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params( engine=self.engine_client, sampling_params_list=sampling_params_list, - task=hunyuan_task, + bot_task=bot_task, tokenizer=tokenizer, ) @@ -758,69 +752,15 @@ def _get_hunyuan_image3_ar_stage_index(cls, stage_configs: list[Any]) -> int | N return idx return None - @staticmethod - def _infer_hunyuan_image3_request_modality( - output_modalities: list[str] | None, - has_reference_images: bool, - ) -> str: - image_output_requested = bool(output_modalities) and "image" in output_modalities - if image_output_requested: - return "img2img" if has_reference_images else "text2img" - return "img2text" if has_reference_images else "text2text" - - @classmethod - def _resolve_hunyuan_image3_request_task( - cls, - *, - stage_configs: list[Any], - output_modalities: list[str] | None, - bot_task: str | None, - messages: list[Any] | None = None, - reference_images: list[Any] | None = None, - ) -> str | None: - if bot_task is None: - return None - - if cls._get_hunyuan_image3_ar_stage_index(stage_configs) is None: - return None - - has_reference_images = False - if reference_images is not None: - has_reference_images = len(reference_images) > 0 - elif messages is not None: - normalized_messages = cls._messages_to_dicts(messages) - for message in normalized_messages: - if message.get("role", "") != "user": - continue - content = message.get("content", "") - if isinstance(content, list): - if any( - (isinstance(item, dict) and (item.get("type") == "image_url" or "image" in item)) - for item in content - ): - has_reference_images = True - break - elif isinstance(content, dict) and (content.get("type") == "image_url" or "image" in content): - has_reference_images = True - break - - modality = 
cls._infer_hunyuan_image3_request_modality(output_modalities, has_reference_images) - - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - task_for_modality_and_request_bot_task, - ) - - return task_for_modality_and_request_bot_task(modality, bot_task) - - async def _apply_hunyuan_image3_task_sampling_params( + async def _apply_hunyuan_image3_bot_task_sampling_params( self, *, engine: Any, sampling_params_list: list[Any], - task: str | None, + bot_task: Any, tokenizer: Any | None = None, ) -> list[Any]: - if task is None: + if bot_task is None: return sampling_params_list stage_configs = list(getattr(engine, "stage_configs", []) or []) @@ -829,18 +769,24 @@ async def _apply_hunyuan_image3_task_sampling_params( return sampling_params_list from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - apply_task_to_sampling_params, + BOT_TASKS, + apply_bot_task_to_sampling_params, + tokenizer_bot_task_for_bot_task, ) + if bot_task not in BOT_TASKS: + raise ValueError(f"Unknown HunyuanImage3 bot_task {bot_task!r}. 
Choose from: {list(BOT_TASKS)}") + tokenizer_bot_task_for_bot_task(bot_task) + if tokenizer is None and hasattr(engine, "get_tokenizer"): tokenizer = await engine.get_tokenizer() if tokenizer is None: raise ValueError("Cannot resolve tokenizer to apply HunyuanImage3 bot_task stop tokens.") - return apply_task_to_sampling_params( + return apply_bot_task_to_sampling_params( sampling_params_list, tokenizer, - task, + bot_task, stage_index=stage_index, ) @@ -2308,11 +2254,10 @@ def _build_multistage_generation_inputs( lora_body = extra_body.get("lora") layers = extra_body.get("layers") resolution = extra_body.get("resolution") - hunyuan_task = self._resolve_hunyuan_image3_request_task( - stage_configs=list(stage_configs), - output_modalities=["image"], - reference_images=reference_images, - bot_task=extra_body.get("bot_task"), + bot_task = ( + extra_body.get("bot_task") + if self._get_hunyuan_image3_ar_stage_index(list(stage_configs)) is not None + else None ) engine_prompt_data: dict[str, Any] | None = None @@ -2352,14 +2297,11 @@ def _build_multistage_generation_inputs( mm_processor_kwargs["target_h"] = height if width is not None: mm_processor_kwargs["target_w"] = width - if hunyuan_task is not None: - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - bot_task_for_task, - tokenizer_bot_task_for_task, - ) + if bot_task is not None: + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task - mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_task(hunyuan_task) - engine_prompt["bot_task"] = bot_task_for_task(hunyuan_task) + mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_bot_task(bot_task) + engine_prompt["bot_task"] = bot_task if mm_processor_kwargs: engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs if engine_prompt_data is not None: @@ -2456,12 +2398,7 @@ async def generate_diffusion_images( negative_prompt = extra_body.get("negative_prompt") num_outputs_per_prompt = 
extra_body.get("num_outputs_per_prompt", 1) lora_body = extra_body.get("lora") - hunyuan_task = self._resolve_hunyuan_image3_request_task( - stage_configs=list(getattr(engine, "stage_configs", None) or []), - output_modalities=["image"], - reference_images=reference_images, - bot_task=extra_body.get("bot_task"), - ) + bot_task = extra_body.get("bot_task") pil_images: list[Image.Image] = [] for img_b64 in reference_images: @@ -2545,10 +2482,10 @@ async def generate_diffusion_images( engine_prompt = gen_prompt sampling_params_list = [gen_params] - sampling_params_list = await self._apply_hunyuan_image3_task_sampling_params( + sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params( engine=diffusion_engine, sampling_params_list=sampling_params_list, - task=hunyuan_task, + bot_task=bot_task, ) result = None @@ -2638,12 +2575,7 @@ async def _create_diffusion_chat_completion( seed = getattr(request, "seed", None) negative_prompt = extra_body.get("negative_prompt") num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1) - hunyuan_task = self._resolve_hunyuan_image3_request_task( - stage_configs=list(getattr(self._diffusion_engine, "stage_configs", []) or []), - output_modalities=["image"], - reference_images=reference_images, - bot_task=extra_body.get("bot_task"), - ) + bot_task = extra_body.get("bot_task") # Text-to-video parameters (ref: text_to_video.py) num_frames = extra_body.get("num_frames") @@ -2757,15 +2689,14 @@ async def _create_diffusion_chat_completion( # Generate image or audio (e.g. 
AudioX) via AsyncOmni diffusion_engine = cast(AsyncOmni, self._diffusion_engine) stage_configs = list(getattr(diffusion_engine, "stage_configs", []) or []) - if hunyuan_task is not None: - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - bot_task_for_task, - tokenizer_bot_task_for_task, - ) + if self._get_hunyuan_image3_ar_stage_index(stage_configs) is None: + bot_task = None + elif bot_task is not None: + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task - gen_prompt["bot_task"] = bot_task_for_task(hunyuan_task) + gen_prompt["bot_task"] = bot_task gen_prompt["mm_processor_kwargs"] = { - "bot_task": tokenizer_bot_task_for_task(hunyuan_task), + "bot_task": tokenizer_bot_task_for_bot_task(bot_task), } sampling_params_list = build_stage_sampling_params_list( stage_configs, @@ -2777,10 +2708,10 @@ async def _create_diffusion_chat_completion( if not sampling_params_list: sampling_params_list = [gen_params] - sampling_params_list = await self._apply_hunyuan_image3_task_sampling_params( + sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params( engine=diffusion_engine, sampling_params_list=sampling_params_list, - task=hunyuan_task, + bot_task=bot_task, ) result = None From 441145c1de3983ca79e45ea8acec21a0d126b340 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Thu, 7 May 2026 19:02:29 +0800 Subject: [PATCH 07/40] Consolidate HunyuanImage3 bot task resolution Signed-off-by: KexiongYu --- .../hunyuan_image3/end2end.py | 13 +- .../hunyuan_image3/test_prompt_utils.py | 32 ++-- .../models/hunyuan_image3/prompt_utils.py | 174 +++++++++++++----- 3 files changed, 156 insertions(+), 63 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 8233e2bf820..9b717e198b8 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -19,11 +19,9 
@@ from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( available_prompt_bot_tasks, - bot_task_for_task, build_prompt_tokens, - stop_token_ids_for_task, + resolve_bot_task, sys_type_for_task, - task_for_modality_and_bot_task, ) from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniPromptType @@ -124,8 +122,10 @@ def main(): os.makedirs(args.output, exist_ok=True) # Determine task for prompt formatting from modality + bot behavior. - task = task_for_modality_and_bot_task(args.modality, args.bot_task) - bot_task = bot_task_for_task(task) + bot_task_resolution = resolve_bot_task(args.bot_task, modality=args.modality) + task = bot_task_resolution.task + assert task is not None + bot_task = bot_task_resolution.bot_task if args.deploy_config is not None and args.stage_configs_path is not None: raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.") @@ -209,7 +209,8 @@ def main(): # Override diffusion params if applicable from vllm_omni.inputs.data import OmniDiffusionSamplingParams - ar_stop_token_ids = stop_token_ids_for_task(tokenizer, task) + ar_stop_token_ids = resolve_bot_task(task=task, tokenizer=tokenizer).stop_token_ids + assert ar_stop_token_ids is not None for sp in params_list: if isinstance(sp, OmniDiffusionSamplingParams): sp.num_inference_steps = args.steps diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index e634fdb09aa..6c1f277b366 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -27,13 +27,12 @@ apply_bot_task_to_sampling_params, available_prompt_bot_tasks, available_tasks, - bot_task_for_task, build_prompt, build_prompt_tokens, + resolve_bot_task, stop_token_ids_for_bot_task, stop_token_ids_for_task, sys_type_for_task, - task_for_modality_and_bot_task, ) pytestmark = [pytest.mark.core_model, pytest.mark.cpu] 
@@ -104,8 +103,10 @@ def test_available_tasks_covers_all_modalities(): ("t2i_vanilla", "image"), ], ) -def test_bot_task_for_task_matches_prompt_presets(task: str, expected_bot_task: str): - assert bot_task_for_task(task) == expected_bot_task +def test_resolve_bot_task_matches_prompt_presets(task: str, expected_bot_task: str): + resolution = resolve_bot_task(task=task) + assert resolution.task == task + assert resolution.bot_task == expected_bot_task @pytest.mark.parametrize( @@ -126,17 +127,28 @@ def test_task_for_modality_and_bot_task_composes_prompt_task( bot_task: str, expected_task: str, ): - assert task_for_modality_and_bot_task(modality, bot_task) == expected_task + assert resolve_bot_task(bot_task, modality=modality).task == expected_task -def test_task_for_modality_and_bot_task_rejects_invalid_combinations(): +def test_resolve_bot_task_rejects_invalid_combinations(): assert available_prompt_bot_tasks() == ["auto", "none", "recaption", "think", "vanilla"] with pytest.raises(ValueError, match="not supported"): - task_for_modality_and_bot_task("img2text", "recaption") + resolve_bot_task("recaption", modality="img2text") with pytest.raises(ValueError, match="not supported"): - task_for_modality_and_bot_task("img2img", "vanilla") + resolve_bot_task("vanilla", modality="img2img") + + +def test_resolve_bot_task_maps_tokenizer_task_and_stop_ids(): + tok = FakeTokenizer() + + resolution = resolve_bot_task("think_recaption", tokenizer=tok) + + assert resolution.task is None + assert resolution.bot_task == "think_recaption" + assert resolution.tokenizer_bot_task == "think" + assert resolution.stop_token_ids == [6, 7, 5] def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer(): @@ -359,11 +371,9 @@ def test_end2end_routes_through_shared_prompt_utils(): imported_from_prompt_utils.update(alias.name for alias in node.names) expected_imports = { "available_prompt_bot_tasks", - "bot_task_for_task", "build_prompt_tokens", - "stop_token_ids_for_task", + 
"resolve_bot_task", "sys_type_for_task", - "task_for_modality_and_bot_task", } assert expected_imports <= imported_from_prompt_utils, ( "end2end.py must import the HunyuanImage3 prompt and stop-token helpers from " diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index b22acbdaf7a..a3c19c7c28d 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -17,6 +17,7 @@ from __future__ import annotations +from dataclasses import dataclass from typing import Any from .system_prompt import get_system_prompt @@ -91,13 +92,26 @@ } +@dataclass(frozen=True) +class BotTaskResolution: + """Resolved HunyuanImage3 prompt/bot-task settings.""" + + task: str | None + sys_type: str | None + prompt_bot_task: str | None + bot_task: str + tokenizer_bot_task: str + trigger_tag: str | None + stop_token_ids: list[int] | None = None + + def available_tasks() -> list[str]: """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`.""" return sorted(_TASK_PRESETS) def available_prompt_bot_tasks() -> list[str]: - """Sorted public bot_task values accepted by `task_for_modality_and_bot_task`.""" + """Sorted public bot_task values accepted by `resolve_bot_task` with modality.""" return sorted(PROMPT_BOT_TASKS) @@ -121,13 +135,7 @@ def _normalize_prompt_bot_task(bot_task: str | None) -> str | None: return _PROMPT_BOT_TASK_ALIASES[normalized] -def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") -> str: - """Return the canonical prompt task for an input/output modality. - - `modality` chooses the base route (t2t, t2i, i2t, or it2i/ti2i), while - `bot_task` chooses the prompt behavior such as thinking, recaptioning, - or the vanilla text-to-image template. 
- """ +def _task_for_modality_and_prompt_bot_task(modality: str, bot_task: str | None = "auto") -> str: modality_key = modality.strip().lower() if modality_key not in _MODALITY_TO_TASK_PREFIX: raise ValueError(f"Unknown modality {modality!r}. Choose from: {sorted(_MODALITY_TO_TASK_PREFIX)}") @@ -151,25 +159,117 @@ def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") return _TASK_BY_PREFIX_AND_BOT_TASK[task_key] +def _bot_task_for_preset_bot_task(preset_bot_task: str | None) -> str: + if preset_bot_task == "think": + return "think_recaption" + return preset_bot_task or "auto" + + +def _tokenizer_bot_task_for_bot_task(bot_task: str) -> str: + if bot_task not in _BOT_TASK_TO_TOKENIZER_TASK: + raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}") + return _BOT_TASK_TO_TOKENIZER_TASK[bot_task] + + +def _stop_token_ids_for_tokenizer_bot_task( + tokenizer, + tokenizer_bot_task: str, + image_size: int | str | None = None, +) -> list[int]: + eos_id = _eos_token_id(tokenizer) + + if image_size == "auto": + extra_auto_stops = [_token_id(tokenizer, f"") for i in range(33)] + else: + extra_auto_stops = [_token_id(tokenizer, "")] + + stop_token_id = { + "auto": [eos_id] + extra_auto_stops, + "image": [eos_id], + "recaption": [ + _token_id(tokenizer, ""), + _token_id(tokenizer, ""), + eos_id, + ], + "think": [ + _token_id(tokenizer, ""), + _token_id(tokenizer, ""), + eos_id, + ], + } + return stop_token_id[tokenizer_bot_task] + + +def resolve_bot_task( + bot_task: str | None = "auto", + *, + modality: str | None = None, + task: str | None = None, + tokenizer: Any | None = None, + image_size: int | str | None = None, +) -> BotTaskResolution: + """Resolve HunyuanImage3 bot-task related prompt settings. + + Pass `modality + bot_task` for CLI/request-level behavior, `task` for a + canonical prompt task, or only `bot_task` to validate/map a pipeline + HunyuanImage3 bot_task. 
+ """ + if task is not None and modality is not None: + raise ValueError("Pass either task or modality, not both.") + + if task is None and modality is not None: + task = _task_for_modality_and_prompt_bot_task(modality, bot_task) + + if task is not None: + sys_type, preset_bot_task, trigger_tag = _task_preset(task) + resolved_bot_task = _bot_task_for_preset_bot_task(preset_bot_task) + prompt_bot_task = preset_bot_task + else: + sys_type = None + trigger_tag = None + prompt_bot_task = None + resolved_bot_task = "auto" if bot_task is None else bot_task.strip().lower() + + tokenizer_bot_task = _tokenizer_bot_task_for_bot_task(resolved_bot_task) + stop_token_ids = ( + _stop_token_ids_for_tokenizer_bot_task(tokenizer, tokenizer_bot_task, image_size=image_size) + if tokenizer is not None + else None + ) + + return BotTaskResolution( + task=task, + sys_type=sys_type, + prompt_bot_task=prompt_bot_task, + bot_task=resolved_bot_task, + tokenizer_bot_task=tokenizer_bot_task, + trigger_tag=trigger_tag, + stop_token_ids=stop_token_ids, + ) + + +def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") -> str: + """Return the canonical prompt task for an input/output modality.""" + task = resolve_bot_task(bot_task, modality=modality).task + assert task is not None + return task + + def sys_type_for_task(task: str) -> str: """Return the default system prompt type for a canonical prompt task.""" - preset_sys_type, _, _ = _task_preset(task) - return preset_sys_type + sys_type = resolve_bot_task(task=task).sys_type + assert sys_type is not None + return sys_type def bot_task_for_task(task: str) -> str: """Return the HunyuanImage3 bot_task associated with a prompt task.""" - _, preset_bot_task, _ = _task_preset(task) - if preset_bot_task == "think": - return "think_recaption" - return preset_bot_task or "auto" + return resolve_bot_task(task=task).bot_task def tokenizer_bot_task_for_bot_task(bot_task: str) -> str: """Map the public HunyuanImage3 bot_task to 
tokenizer-internal task.""" - if bot_task not in _BOT_TASK_TO_TOKENIZER_TASK: - raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}") - return _BOT_TASK_TO_TOKENIZER_TASK[bot_task] + return resolve_bot_task(bot_task).tokenizer_bot_task def _token_id(tokenizer, token: str) -> int: @@ -198,29 +298,9 @@ def stop_token_ids_for_bot_task( their structural end tokens, and all ids are resolved from the tokenizer instead of being hard-coded in deploy YAML. """ - eos_id = _eos_token_id(tokenizer) - - if image_size == "auto": - extra_auto_stops = [_token_id(tokenizer, f"") for i in range(33)] - else: - extra_auto_stops = [_token_id(tokenizer, "")] - - tokenizer_bot_task = tokenizer_bot_task_for_bot_task(bot_task) - stop_token_id = { - "auto": [eos_id] + extra_auto_stops, - "image": [eos_id], - "recaption": [ - _token_id(tokenizer, ""), - _token_id(tokenizer, ""), - eos_id, - ], - "think": [ - _token_id(tokenizer, ""), - _token_id(tokenizer, ""), - eos_id, - ], - } - return stop_token_id[tokenizer_bot_task] + stop_token_ids = resolve_bot_task(bot_task, tokenizer=tokenizer, image_size=image_size).stop_token_ids + assert stop_token_ids is not None + return stop_token_ids def stop_token_ids_for_task( @@ -229,11 +309,9 @@ def stop_token_ids_for_task( image_size: int | str | None = None, ) -> list[int]: """Return AR stop token ids for a canonical prompt task.""" - return stop_token_ids_for_bot_task( - tokenizer, - bot_task_for_task(task), - image_size=image_size, - ) + stop_token_ids = resolve_bot_task(task=task, tokenizer=tokenizer, image_size=image_size).stop_token_ids + assert stop_token_ids is not None + return stop_token_ids def apply_bot_task_to_sampling_params( @@ -250,7 +328,9 @@ def apply_bot_task_to_sampling_params( updated_params_list = list(sampling_params_list) params = updated_params_list[stage_index] - params.stop_token_ids = stop_token_ids_for_bot_task(tokenizer, bot_task, image_size=image_size) + stop_token_ids = 
resolve_bot_task(bot_task, tokenizer=tokenizer, image_size=image_size).stop_token_ids + assert stop_token_ids is not None + params.stop_token_ids = stop_token_ids updated_params_list[stage_index] = params return updated_params_list @@ -364,6 +444,7 @@ def build_prompt_tokens( __all__ = [ + "BotTaskResolution", "available_tasks", "available_prompt_bot_tasks", "apply_bot_task_to_sampling_params", @@ -372,6 +453,7 @@ def build_prompt_tokens( "build_prompt", "build_prompt_tokens", "PROMPT_BOT_TASKS", + "resolve_bot_task", "stop_token_ids_for_bot_task", "stop_token_ids_for_task", "sys_type_for_task", From 5d88d160f7ac8720d5253b6e4037fc1c1fee558c Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Thu, 7 May 2026 19:07:02 +0800 Subject: [PATCH 08/40] Remove legacy HunyuanImage3 bot task helpers Signed-off-by: KexiongYu --- .../hunyuan_image3/test_prompt_utils.py | 24 +++---- .../models/hunyuan_image3/prompt_utils.py | 68 ++----------------- vllm_omni/entrypoints/openai/serving_chat.py | 16 ++--- 3 files changed, 23 insertions(+), 85 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 6c1f277b366..4298a37870e 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -30,8 +30,6 @@ build_prompt, build_prompt_tokens, resolve_bot_task, - stop_token_ids_for_bot_task, - stop_token_ids_for_task, sys_type_for_task, ) @@ -122,7 +120,7 @@ def test_resolve_bot_task_matches_prompt_presets(task: str, expected_bot_task: s ("text2text", "none", "t2t"), ], ) -def test_task_for_modality_and_bot_task_composes_prompt_task( +def test_resolve_bot_task_composes_prompt_task( modality: str, bot_task: str, expected_task: str, @@ -151,25 +149,25 @@ def test_resolve_bot_task_maps_tokenizer_task_and_stop_ids(): assert resolution.stop_token_ids == [6, 7, 5] -def test_stop_token_ids_for_bot_task_are_resolved_from_tokenizer(): 
+def test_resolve_bot_task_resolves_stop_ids_from_bot_task(): tok = FakeTokenizer() - assert stop_token_ids_for_bot_task(tok, "auto") == [5, 8] - assert stop_token_ids_for_bot_task(tok, "image") == [5] - assert stop_token_ids_for_bot_task(tok, "think_recaption") == [6, 7, 5] - assert stop_token_ids_for_bot_task(tok, "recaption") == [6, 7, 5] - assert stop_token_ids_for_bot_task(tok, "auto", image_size="auto") == [ + assert resolve_bot_task("auto", tokenizer=tok).stop_token_ids == [5, 8] + assert resolve_bot_task("image", tokenizer=tok).stop_token_ids == [5] + assert resolve_bot_task("think_recaption", tokenizer=tok).stop_token_ids == [6, 7, 5] + assert resolve_bot_task("recaption", tokenizer=tok).stop_token_ids == [6, 7, 5] + assert resolve_bot_task("auto", tokenizer=tok, image_size="auto").stop_token_ids == [ 5, *range(1000, 1033), ] -def test_stop_token_ids_for_task_are_resolved_from_prompt_task(): +def test_resolve_bot_task_resolves_stop_ids_from_prompt_task(): tok = FakeTokenizer() - assert stop_token_ids_for_task(tok, "i2t") == [5, 8] - assert stop_token_ids_for_task(tok, "i2t_think") == [6, 7, 5] - assert stop_token_ids_for_task(tok, "t2i_vanilla") == [5] + assert resolve_bot_task(task="i2t", tokenizer=tok).stop_token_ids == [5, 8] + assert resolve_bot_task(task="i2t_think", tokenizer=tok).stop_token_ids == [6, 7, 5] + assert resolve_bot_task(task="t2i_vanilla", tokenizer=tok).stop_token_ids == [5] def test_sys_type_for_task_returns_prompt_preset_default(): diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index a3c19c7c28d..c62e8a39437 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -159,18 +159,6 @@ def _task_for_modality_and_prompt_bot_task(modality: str, bot_task: str | None = return _TASK_BY_PREFIX_AND_BOT_TASK[task_key] -def _bot_task_for_preset_bot_task(preset_bot_task: str | None) -> 
str: - if preset_bot_task == "think": - return "think_recaption" - return preset_bot_task or "auto" - - -def _tokenizer_bot_task_for_bot_task(bot_task: str) -> str: - if bot_task not in _BOT_TASK_TO_TOKENIZER_TASK: - raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}") - return _BOT_TASK_TO_TOKENIZER_TASK[bot_task] - - def _stop_token_ids_for_tokenizer_bot_task( tokenizer, tokenizer_bot_task: str, @@ -222,7 +210,7 @@ def resolve_bot_task( if task is not None: sys_type, preset_bot_task, trigger_tag = _task_preset(task) - resolved_bot_task = _bot_task_for_preset_bot_task(preset_bot_task) + resolved_bot_task = "think_recaption" if preset_bot_task == "think" else preset_bot_task or "auto" prompt_bot_task = preset_bot_task else: sys_type = None @@ -230,7 +218,9 @@ def resolve_bot_task( prompt_bot_task = None resolved_bot_task = "auto" if bot_task is None else bot_task.strip().lower() - tokenizer_bot_task = _tokenizer_bot_task_for_bot_task(resolved_bot_task) + if resolved_bot_task not in _BOT_TASK_TO_TOKENIZER_TASK: + raise ValueError(f"Unknown bot_task {resolved_bot_task!r}. 
Choose from: {list(BOT_TASKS)}") + tokenizer_bot_task = _BOT_TASK_TO_TOKENIZER_TASK[resolved_bot_task] stop_token_ids = ( _stop_token_ids_for_tokenizer_bot_task(tokenizer, tokenizer_bot_task, image_size=image_size) if tokenizer is not None @@ -248,13 +238,6 @@ def resolve_bot_task( ) -def task_for_modality_and_bot_task(modality: str, bot_task: str | None = "auto") -> str: - """Return the canonical prompt task for an input/output modality.""" - task = resolve_bot_task(bot_task, modality=modality).task - assert task is not None - return task - - def sys_type_for_task(task: str) -> str: """Return the default system prompt type for a canonical prompt task.""" sys_type = resolve_bot_task(task=task).sys_type @@ -262,16 +245,6 @@ def sys_type_for_task(task: str) -> str: return sys_type -def bot_task_for_task(task: str) -> str: - """Return the HunyuanImage3 bot_task associated with a prompt task.""" - return resolve_bot_task(task=task).bot_task - - -def tokenizer_bot_task_for_bot_task(bot_task: str) -> str: - """Map the public HunyuanImage3 bot_task to tokenizer-internal task.""" - return resolve_bot_task(bot_task).tokenizer_bot_task - - def _token_id(tokenizer, token: str) -> int: token_id = tokenizer.convert_tokens_to_ids(token) if token_id is None: @@ -286,34 +259,6 @@ def _eos_token_id(tokenizer) -> int: return _token_id(tokenizer, "<|endoftext|>") -def stop_token_ids_for_bot_task( - tokenizer, - bot_task: str, - image_size: int | str | None = None, -) -> list[int]: - """Return AR stop token ids for a HunyuanImage3 bot_task. - - Mirrors the official HunyuanImage-3.0 generation logic: `auto` - additionally stops on image-start markers, text/image tasks stop on - their structural end tokens, and all ids are resolved from the - tokenizer instead of being hard-coded in deploy YAML. 
- """ - stop_token_ids = resolve_bot_task(bot_task, tokenizer=tokenizer, image_size=image_size).stop_token_ids - assert stop_token_ids is not None - return stop_token_ids - - -def stop_token_ids_for_task( - tokenizer, - task: str, - image_size: int | str | None = None, -) -> list[int]: - """Return AR stop token ids for a canonical prompt task.""" - stop_token_ids = resolve_bot_task(task=task, tokenizer=tokenizer, image_size=image_size).stop_token_ids - assert stop_token_ids is not None - return stop_token_ids - - def apply_bot_task_to_sampling_params( sampling_params_list: list[Any], tokenizer: Any, @@ -448,15 +393,10 @@ def build_prompt_tokens( "available_tasks", "available_prompt_bot_tasks", "apply_bot_task_to_sampling_params", - "bot_task_for_task", "BOT_TASKS", "build_prompt", "build_prompt_tokens", "PROMPT_BOT_TASKS", "resolve_bot_task", - "stop_token_ids_for_bot_task", - "stop_token_ids_for_task", "sys_type_for_task", - "task_for_modality_and_bot_task", - "tokenizer_bot_task_for_bot_task", ] diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index b2375fd38b4..03214d21612 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -206,9 +206,9 @@ async def create_chat_completion( ) request_chat_template_kwargs = request.chat_template_kwargs or {} if bot_task is not None: - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_bot_task - tokenizer_bot_task = tokenizer_bot_task_for_bot_task(bot_task) + tokenizer_bot_task = resolve_bot_task(bot_task).tokenizer_bot_task request_chat_template_kwargs = dict(request_chat_template_kwargs) request_chat_template_kwargs["bot_task"] = tokenizer_bot_task @@ -771,12 +771,12 @@ async def _apply_hunyuan_image3_bot_task_sampling_params( from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils 
import ( BOT_TASKS, apply_bot_task_to_sampling_params, - tokenizer_bot_task_for_bot_task, + resolve_bot_task, ) if bot_task not in BOT_TASKS: raise ValueError(f"Unknown HunyuanImage3 bot_task {bot_task!r}. Choose from: {list(BOT_TASKS)}") - tokenizer_bot_task_for_bot_task(bot_task) + resolve_bot_task(bot_task) if tokenizer is None and hasattr(engine, "get_tokenizer"): tokenizer = await engine.get_tokenizer() @@ -2298,9 +2298,9 @@ def _build_multistage_generation_inputs( if width is not None: mm_processor_kwargs["target_w"] = width if bot_task is not None: - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_bot_task - mm_processor_kwargs["bot_task"] = tokenizer_bot_task_for_bot_task(bot_task) + mm_processor_kwargs["bot_task"] = resolve_bot_task(bot_task).tokenizer_bot_task engine_prompt["bot_task"] = bot_task if mm_processor_kwargs: engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs @@ -2692,11 +2692,11 @@ async def _create_diffusion_chat_completion( if self._get_hunyuan_image3_ar_stage_index(stage_configs) is None: bot_task = None elif bot_task is not None: - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import tokenizer_bot_task_for_bot_task + from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_bot_task gen_prompt["bot_task"] = bot_task gen_prompt["mm_processor_kwargs"] = { - "bot_task": tokenizer_bot_task_for_bot_task(bot_task), + "bot_task": resolve_bot_task(bot_task).tokenizer_bot_task, } sampling_params_list = build_stage_sampling_params_list( stage_configs, From 7d70ae5723a2cfcb64b3d333ac62298a8dbc4c99 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Thu, 7 May 2026 19:08:49 +0800 Subject: [PATCH 09/40] Remove online HunyuanImage3 bot task changes Signed-off-by: KexiongYu --- .../hunyuan_image3/test_prompt_utils.py | 25 --- .../models/hunyuan_image3/prompt_utils.py | 23 --- 
vllm_omni/entrypoints/openai/api_server.py | 5 - .../entrypoints/openai/protocol/images.py | 4 - vllm_omni/entrypoints/openai/serving_chat.py | 154 +----------------- 5 files changed, 9 insertions(+), 202 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 4298a37870e..8d9448ea6b9 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -24,7 +24,6 @@ import pytest from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - apply_bot_task_to_sampling_params, available_prompt_bot_tasks, available_tasks, build_prompt, @@ -175,30 +174,6 @@ def test_sys_type_for_task_returns_prompt_preset_default(): assert sys_type_for_task("t2i_vanilla") == "en_vanilla" -class FakeSamplingParams: - def __init__(self, stop_token_ids: list[int] | None = None, max_tokens: int = 16) -> None: - self.stop_token_ids = stop_token_ids - self.max_tokens = max_tokens - - -def test_apply_bot_task_to_sampling_params_updates_only_target_stage(): - tok = FakeTokenizer() - stage0 = FakeSamplingParams(stop_token_ids=[999]) - stage1 = FakeSamplingParams(stop_token_ids=[888]) - - updated = apply_bot_task_to_sampling_params( - [stage0, stage1], - tok, - "think_recaption", - stage_index=0, - ) - - assert updated[0] is stage0 - assert updated[0].stop_token_ids == [6, 7, 5] - assert updated[1] is stage1 - assert stage0.stop_token_ids == [6, 7, 5] - - @pytest.mark.parametrize( "task", [ diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index c62e8a39437..f6a622180a0 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -259,28 +259,6 @@ def _eos_token_id(tokenizer) -> int: return _token_id(tokenizer, "<|endoftext|>") -def apply_bot_task_to_sampling_params( 
- sampling_params_list: list[Any], - tokenizer: Any, - bot_task: str, - *, - stage_index: int = 0, - image_size: int | str | None = None, -) -> list[Any]: - """Apply a per-request HunyuanImage3 bot_task to one AR stage.""" - if stage_index < 0 or stage_index >= len(sampling_params_list): - raise IndexError(f"stage_index {stage_index} is out of range for {len(sampling_params_list)} sampling params") - - updated_params_list = list(sampling_params_list) - params = updated_params_list[stage_index] - stop_token_ids = resolve_bot_task(bot_task, tokenizer=tokenizer, image_size=image_size).stop_token_ids - assert stop_token_ids is not None - params.stop_token_ids = stop_token_ids - - updated_params_list[stage_index] = params - return updated_params_list - - def build_prompt( user_prompt: str, task: str = "it2i_think", @@ -392,7 +370,6 @@ def build_prompt_tokens( "BotTaskResolution", "available_tasks", "available_prompt_bot_tasks", - "apply_bot_task_to_sampling_params", "BOT_TASKS", "build_prompt", "build_prompt_tokens", diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 9b3aec58f21..06fb0a7f4cb 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1527,8 +1527,6 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) extra_body["guidance_scale"] = request.guidance_scale if request.true_cfg_scale is not None: extra_body["true_cfg_scale"] = request.true_cfg_scale - if request.bot_task is not None: - extra_body["bot_task"] = request.bot_task if request.generator_device is not None: extra_body["generator_device"] = request.generator_device if request.lora is not None: @@ -1695,7 +1693,6 @@ async def edit_images( guidance_scale: float | None = Form(None), strength: float | None = Form(None), true_cfg_scale: float | None = Form(None), - bot_task: str | None = Form(None), seed: int | None = Form(None), generator_device: str | None = 
Form(None), # vllm-omni extension for per-request LoRA. @@ -1899,8 +1896,6 @@ async def edit_images( extra_body["strength"] = strength if true_cfg_scale is not None: extra_body["true_cfg_scale"] = true_cfg_scale - if bot_task is not None: - extra_body["bot_task"] = bot_task if layers is not None: extra_body["layers"] = layers if resolution is not None: diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index c78a95de058..0fb22a548cf 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -117,10 +117,6 @@ def validate_use_system_prompt(cls, v): le=20.0, description="True CFG scale (model-specific parameter, may be ignored if not supported)", ) - bot_task: str | None = Field( - default=None, - description="HunyuanImage3 AR bot_task for this request: auto, image, recaption, or think_recaption.", - ) seed: int | None = Field(default=None, description="Random seed for reproducibility") generator_device: str | None = Field( default=None, diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 03214d21612..09b62bf8972 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -197,25 +197,10 @@ async def create_chat_completion( if tokenizer is None: tokenizer = await self.engine_client.get_tokenizer() - extra_body = self._get_extra_body_from_request(request) - bot_task = ( - extra_body.get("bot_task") - if self._get_hunyuan_image3_ar_stage_index(list(getattr(self.engine_client, "stage_configs", []) or [])) - is not None - else None - ) - request_chat_template_kwargs = request.chat_template_kwargs or {} - if bot_task is not None: - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_bot_task - - tokenizer_bot_task = resolve_bot_task(bot_task).tokenizer_bot_task - request_chat_template_kwargs = 
dict(request_chat_template_kwargs) - request_chat_template_kwargs["bot_task"] = tokenizer_bot_task - reasoning_parser: ReasoningParser | None = None if self.reasoning_parser_cls: chat_template_kwargs = self._prepare_extra_chat_template_kwargs( - request_chat_template_kwargs, + request.chat_template_kwargs, self.default_chat_template_kwargs, ) reasoning_parser = self.reasoning_parser_cls( @@ -263,13 +248,13 @@ async def create_chat_completion( if not self.use_harmony: error_check_ret = self._validate_chat_template( request_chat_template=request.chat_template, - chat_template_kwargs=request_chat_template_kwargs, + chat_template_kwargs=request.chat_template_kwargs, trust_request_chat_template=self.trust_request_chat_template, ) if error_check_ret is not None: return error_check_ret - chat_template_kwargs = dict(request_chat_template_kwargs) + chat_template_kwargs = request.chat_template_kwargs or {} chat_template_kwargs.update(reasoning_effort=request.reasoning_effort) # Merge chat_template_kwargs with defaults @@ -336,7 +321,9 @@ async def create_chat_completion( # `extra_body` is flattented and merged into the payload's root. # These extra fields are accessible via `model_extra` property (from Pydantic base class). # When sending raw request with curl, no flattening happens. Directly read the `extra_body` dict. 
- extra_body = self._get_extra_body_from_request(request) + extra_body = getattr(request, "extra_body", None) + if not extra_body: + extra_body = request.model_extra or {} height, width = self._resolve_height_width_from_extra_body(extra_body) @@ -380,9 +367,6 @@ async def create_chat_completion( mm_processor_kwargs["target_h"] = height if width is not None: mm_processor_kwargs["target_w"] = width - if bot_task is not None: - mm_processor_kwargs["bot_task"] = tokenizer_bot_task - tprompt["bot_task"] = bot_task tprompt["mm_processor_kwargs"] = mm_processor_kwargs if engine_prompt_image is not None: tprompt["multi_modal_data"] = engine_prompt_image @@ -420,12 +404,6 @@ async def create_chat_completion( # to delta to ensure emitted outputs are correctly drained. Otherwise # convert cumulative to Final Only to ensure the output is correct. sampling_params_list = coerce_param_message_types(sampling_params_list, request.stream) - sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params( - engine=self.engine_client, - sampling_params_list=sampling_params_list, - bot_task=bot_task, - tokenizer=tokenizer, - ) # Apply user-specified overrides to diffusion stage(s) for image generation for idx, sp in enumerate(sampling_params_list): @@ -707,89 +685,6 @@ def _to_sampling_params_list(self, sampling_params_list: list[dict]) -> list[Sam raise ValueError(f"Invalid sampling params: {sampling_params}") return final_sampling_params_list - @staticmethod - def _get_extra_body_from_request(request: Any) -> dict[str, Any]: - body: dict[str, Any] = {} - model_extra = getattr(request, "model_extra", None) - if isinstance(model_extra, dict): - body.update(model_extra) - extra_body = getattr(request, "extra_body", None) - if isinstance(extra_body, dict): - body.update(extra_body) - return body - - @staticmethod - def _stage_config_get(stage_config: Any, key: str) -> Any: - if isinstance(stage_config, dict): - return stage_config.get(key) - if hasattr(stage_config, 
"get"): - try: - return stage_config.get(key) - except Exception: - pass - return getattr(stage_config, key, None) - - @classmethod - def _is_hunyuan_image3_stage(cls, stage_config: Any) -> bool: - model_arch = cls._stage_config_get(stage_config, "model_arch") - if model_arch == "HunyuanImage3ForCausalMM": - return True - - engine_args = cls._stage_config_get(stage_config, "engine_args") - if isinstance(engine_args, dict): - return engine_args.get("model_arch") == "HunyuanImage3ForCausalMM" - if engine_args is not None and hasattr(engine_args, "get"): - try: - return engine_args.get("model_arch") == "HunyuanImage3ForCausalMM" - except Exception: - pass - return getattr(engine_args, "model_arch", None) == "HunyuanImage3ForCausalMM" - - @classmethod - def _get_hunyuan_image3_ar_stage_index(cls, stage_configs: list[Any]) -> int | None: - for idx, stage_config in enumerate(stage_configs): - if cls._is_hunyuan_image3_stage(stage_config) and get_stage_type(stage_config) != "diffusion": - return idx - return None - - async def _apply_hunyuan_image3_bot_task_sampling_params( - self, - *, - engine: Any, - sampling_params_list: list[Any], - bot_task: Any, - tokenizer: Any | None = None, - ) -> list[Any]: - if bot_task is None: - return sampling_params_list - - stage_configs = list(getattr(engine, "stage_configs", []) or []) - stage_index = self._get_hunyuan_image3_ar_stage_index(stage_configs) - if stage_index is None: - return sampling_params_list - - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - BOT_TASKS, - apply_bot_task_to_sampling_params, - resolve_bot_task, - ) - - if bot_task not in BOT_TASKS: - raise ValueError(f"Unknown HunyuanImage3 bot_task {bot_task!r}. 
Choose from: {list(BOT_TASKS)}") - resolve_bot_task(bot_task) - - if tokenizer is None and hasattr(engine, "get_tokenizer"): - tokenizer = await engine.get_tokenizer() - if tokenizer is None: - raise ValueError("Cannot resolve tokenizer to apply HunyuanImage3 bot_task stop tokens.") - - return apply_bot_task_to_sampling_params( - sampling_params_list, - tokenizer, - bot_task, - stage_index=stage_index, - ) - def _get_comprehension_stage_index(self) -> int: for idx, stage in enumerate(self.engine_client.stage_configs): if stage.is_comprehension: @@ -2254,11 +2149,6 @@ def _build_multistage_generation_inputs( lora_body = extra_body.get("lora") layers = extra_body.get("layers") resolution = extra_body.get("resolution") - bot_task = ( - extra_body.get("bot_task") - if self._get_hunyuan_image3_ar_stage_index(list(stage_configs)) is not None - else None - ) engine_prompt_data: dict[str, Any] | None = None modalities = ["image"] @@ -2297,11 +2187,6 @@ def _build_multistage_generation_inputs( mm_processor_kwargs["target_h"] = height if width is not None: mm_processor_kwargs["target_w"] = width - if bot_task is not None: - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_bot_task - - mm_processor_kwargs["bot_task"] = resolve_bot_task(bot_task).tokenizer_bot_task - engine_prompt["bot_task"] = bot_task if mm_processor_kwargs: engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs if engine_prompt_data is not None: @@ -2398,7 +2283,6 @@ async def generate_diffusion_images( negative_prompt = extra_body.get("negative_prompt") num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1) lora_body = extra_body.get("lora") - bot_task = extra_body.get("bot_task") pil_images: list[Image.Image] = [] for img_b64 in reference_images: @@ -2482,12 +2366,6 @@ async def generate_diffusion_images( engine_prompt = gen_prompt sampling_params_list = [gen_params] - sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params( - 
engine=diffusion_engine, - sampling_params_list=sampling_params_list, - bot_task=bot_task, - ) - result = None async for output in diffusion_engine.generate( prompt=engine_prompt, @@ -2556,7 +2434,9 @@ async def _create_diffusion_chat_completion( # `extra_body` is flattented and merged into the payload's root. # These extra fields are accessible via `model_extra` property (from Pydantic base class). # When sending raw request with curl, no flattening happens. Directly read the `extra_body` dict. - extra_body = self._get_extra_body_from_request(request) + extra_body = getattr(request, "extra_body", None) + if not extra_body: + extra_body = request.model_extra or {} # Parse size if provided (supports "1024x1024" format) height, width = self._resolve_height_width_from_extra_body(extra_body) @@ -2575,7 +2455,6 @@ async def _create_diffusion_chat_completion( seed = getattr(request, "seed", None) negative_prompt = extra_body.get("negative_prompt") num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1) - bot_task = extra_body.get("bot_task") # Text-to-video parameters (ref: text_to_video.py) num_frames = extra_body.get("num_frames") @@ -2689,15 +2568,6 @@ async def _create_diffusion_chat_completion( # Generate image or audio (e.g. 
AudioX) via AsyncOmni diffusion_engine = cast(AsyncOmni, self._diffusion_engine) stage_configs = list(getattr(diffusion_engine, "stage_configs", []) or []) - if self._get_hunyuan_image3_ar_stage_index(stage_configs) is None: - bot_task = None - elif bot_task is not None: - from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import resolve_bot_task - - gen_prompt["bot_task"] = bot_task - gen_prompt["mm_processor_kwargs"] = { - "bot_task": resolve_bot_task(bot_task).tokenizer_bot_task, - } sampling_params_list = build_stage_sampling_params_list( stage_configs, get_default_sampling_params_list(diffusion_engine), @@ -2708,12 +2578,6 @@ async def _create_diffusion_chat_completion( if not sampling_params_list: sampling_params_list = [gen_params] - sampling_params_list = await self._apply_hunyuan_image3_bot_task_sampling_params( - engine=diffusion_engine, - sampling_params_list=sampling_params_list, - bot_task=bot_task, - ) - result = None async for output in diffusion_engine.generate( prompt=gen_prompt, From 09a025993982b49c86331a523cbf110a4624bcf9 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Fri, 8 May 2026 09:33:57 +0800 Subject: [PATCH 10/40] Hardcode HunyuanImage3 offline control token ids Signed-off-by: KexiongYu --- .../models/hunyuan_image3/prompt_utils.py | 42 +++++++++++++++---- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index f6a622180a0..248a13943fe 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -24,6 +24,29 @@ BOT_TASKS = ("auto", "image", "recaption", "think_recaption") PROMPT_BOT_TASKS = ("auto", "none", "think", "recaption", "vanilla") + +# HunyuanImage-3.0-Instruct special token ids from tokenizer.json. +# Keep offline AR prompt/stop-token behavior independent of runtime +# tokenizer lookup for these fixed control tokens. 
+HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS: dict[str, int] = { + "<|endoftext|>": 127957, + "<|startoftext|>": 127958, + "": 128000, + "": 128001, + "": 128006, + "": 128010, + "": 128018, + "": 128019, + "": 128023, + "": 128024, + "": 128025, + "": 128026, + "": 128037, + "": 128044, + "": 128076, + "": 130103, + "": 130106, +} _BOT_TASK_TO_TOKENIZER_TASK = { "auto": "auto", "image": "image", @@ -167,7 +190,9 @@ def _stop_token_ids_for_tokenizer_bot_task( eos_id = _eos_token_id(tokenizer) if image_size == "auto": - extra_auto_stops = [_token_id(tokenizer, f"") for i in range(33)] + start_ratio_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + end_ratio_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + extra_auto_stops = list(range(start_ratio_id, end_ratio_id + 1)) else: extra_auto_stops = [_token_id(tokenizer, "")] @@ -246,6 +271,9 @@ def sys_type_for_task(task: str) -> str: def _token_id(tokenizer, token: str) -> int: + if token in HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS: + return HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[token] + token_id = tokenizer.convert_tokens_to_ids(token) if token_id is None: raise ValueError(f"Tokenizer does not know special token {token!r}") @@ -253,10 +281,7 @@ def _token_id(tokenizer, token: str) -> int: def _eos_token_id(tokenizer) -> int: - token_id = getattr(tokenizer, "eos_token_id", None) - if token_id is not None: - return int(token_id) - return _token_id(tokenizer, "<|endoftext|>") + return HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"] def build_prompt( @@ -334,9 +359,9 @@ def build_prompt_tokens( preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task) effective_sys_type = sys_type or preset_sys_type - bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>") - img_id = tokenizer.convert_tokens_to_ids("") - trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None + bos_id = _token_id(tokenizer, "<|startoftext|>") + img_id = _token_id(tokenizer, "") + trig_id = _token_id(tokenizer, trigger_tag) if trigger_tag else None 
has_image_input = _task_has_image_input(task) @@ -373,6 +398,7 @@ def build_prompt_tokens( "BOT_TASKS", "build_prompt", "build_prompt_tokens", + "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS", "PROMPT_BOT_TASKS", "resolve_bot_task", "sys_type_for_task", From 2cc6ad75f92b90b97c60a6c004e69949e21a1ac1 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Fri, 8 May 2026 09:33:57 +0800 Subject: [PATCH 11/40] Hardcode HunyuanImage3 offline control token ids Signed-off-by: KexiongYu --- .../hunyuan_image3/test_prompt_utils.py | 80 ++++++++++++++----- 1 file changed, 59 insertions(+), 21 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 8d9448ea6b9..e858944c0a4 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -24,6 +24,7 @@ import pytest from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS, available_prompt_bot_tasks, available_tasks, build_prompt, @@ -41,7 +42,7 @@ class FakeTokenizer: """Minimal tokenizer stub that records every encode() call. - Returns deterministic ids: special tokens map to small ints (1-4), + Returns deterministic ids from convert_tokens_to_ids while encode() returns one id per character starting at 100. This lets tests both verify segmentation (by inspecting `encode_calls`) and locate substrings inside the returned id list. 
@@ -145,28 +146,56 @@ def test_resolve_bot_task_maps_tokenizer_task_and_stop_ids(): assert resolution.task is None assert resolution.bot_task == "think_recaption" assert resolution.tokenizer_bot_task == "think" - assert resolution.stop_token_ids == [6, 7, 5] + assert resolution.stop_token_ids == [ + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"], + ] def test_resolve_bot_task_resolves_stop_ids_from_bot_task(): tok = FakeTokenizer() - assert resolve_bot_task("auto", tokenizer=tok).stop_token_ids == [5, 8] - assert resolve_bot_task("image", tokenizer=tok).stop_token_ids == [5] - assert resolve_bot_task("think_recaption", tokenizer=tok).stop_token_ids == [6, 7, 5] - assert resolve_bot_task("recaption", tokenizer=tok).stop_token_ids == [6, 7, 5] + eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"] + boi_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + end_recaption_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + end_answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + + assert resolve_bot_task("auto", tokenizer=tok).stop_token_ids == [eos_id, boi_id] + assert resolve_bot_task("image", tokenizer=tok).stop_token_ids == [eos_id] + assert resolve_bot_task("think_recaption", tokenizer=tok).stop_token_ids == [ + end_recaption_id, + end_answer_id, + eos_id, + ] + assert resolve_bot_task("recaption", tokenizer=tok).stop_token_ids == [ + end_recaption_id, + end_answer_id, + eos_id, + ] assert resolve_bot_task("auto", tokenizer=tok, image_size="auto").stop_token_ids == [ - 5, - *range(1000, 1033), + eos_id, + *range( + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + 1, + ), ] def test_resolve_bot_task_resolves_stop_ids_from_prompt_task(): tok = FakeTokenizer() - assert resolve_bot_task(task="i2t", tokenizer=tok).stop_token_ids == [5, 8] - assert resolve_bot_task(task="i2t_think", tokenizer=tok).stop_token_ids == [6, 7, 5] - assert resolve_bot_task(task="t2i_vanilla", 
tokenizer=tok).stop_token_ids == [5] + eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"] + assert resolve_bot_task(task="i2t", tokenizer=tok).stop_token_ids == [ + eos_id, + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + ] + assert resolve_bot_task(task="i2t_think", tokenizer=tok).stop_token_ids == [ + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + eos_id, + ] + assert resolve_bot_task(task="t2i_vanilla", tokenizer=tok).stop_token_ids == [eos_id] def test_sys_type_for_task_returns_prompt_preset_default(): @@ -265,25 +294,31 @@ def test_build_prompt_tokens_segments_each_boundary(): def test_build_prompt_tokens_image_placeholder_present_for_image_tasks(): tok = FakeTokenizer() ids = build_prompt_tokens("hi", tok, task="i2t") - assert ids[0] == 1, "BOS (<|startoftext|>) must be the first token" - assert 2 in ids, " placeholder must be present for i2t/it2i tasks" + assert ids[0] == HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|startoftext|>"], ( + "BOS (<|startoftext|>) must be the first token" + ) + assert HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] in ids, ( + " placeholder must be present for i2t/it2i tasks" + ) def test_build_prompt_tokens_no_image_for_text_only_tasks(): tok = FakeTokenizer() ids = build_prompt_tokens("hi", tok, task="t2t") - assert 2 not in ids, " must NOT appear for text-only tasks" + assert HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] not in ids, ( + " must NOT appear for text-only tasks" + ) @pytest.mark.parametrize( "task,trigger_id", [ - ("t2t_think", 3), - ("i2t_think", 3), - ("it2i_think", 3), - ("t2i_think", 3), - ("it2i_recaption", 4), - ("t2i_recaption", 4), + ("t2t_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]), + ("i2t_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]), + ("it2i_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]), + ("t2i_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]), + ("it2i_recaption", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]), + ("t2i_recaption", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]), ], ) def 
test_build_prompt_tokens_trigger_is_last_token(task: str, trigger_id: int): @@ -297,7 +332,10 @@ def test_build_prompt_tokens_no_trigger_for_plain_tasks(): """Tasks without trigger_tag (t2t / i2t) must NOT append a trigger id.""" tok = FakeTokenizer() ids = build_prompt_tokens("hi", tok, task="t2t") - assert ids[-1] not in {3, 4} # neither nor + assert ids[-1] not in { + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + } # -------------------- end2end.py wiring guard -------------------- From 12a77da318d0c78be64877689c77642e45187d41 Mon Sep 17 00:00:00 2001 From: "Y. Fisher" Date: Fri, 8 May 2026 11:37:22 +0800 Subject: [PATCH 12/40] Refactor prompt_utils.py Signed-off-by: Y. Fisher Signed-off-by: KexiongYu --- .../models/hunyuan_image3/prompt_utils.py | 261 ++---------------- 1 file changed, 24 insertions(+), 237 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 248a13943fe..9754cf2c82f 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -22,8 +22,6 @@ from .system_prompt import get_system_prompt -BOT_TASKS = ("auto", "image", "recaption", "think_recaption") -PROMPT_BOT_TASKS = ("auto", "none", "think", "recaption", "vanilla") # HunyuanImage-3.0-Instruct special token ids from tokenizer.json. 
# Keep offline AR prompt/stop-token behavior independent of runtime @@ -47,17 +45,10 @@ "": 130103, "": 130106, } -_BOT_TASK_TO_TOKENIZER_TASK = { - "auto": "auto", - "image": "image", - "recaption": "recaption", - "think_recaption": "think", -} # task -> (sys_type, bot_task, trigger_tag) _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = { "t2t": ("en_unified", None, None), - "t2t_think": ("en_unified", "think", ""), "i2t": ("en_unified", None, None), "i2t_think": ("en_unified", "think", ""), "it2i_think": ("en_unified", "think", ""), @@ -67,221 +58,22 @@ "t2i_vanilla": ("en_vanilla", "image", None), } -_MODALITY_TO_TASK_PREFIX = { - "text2text": "t2t", - "t2t": "t2t", - "img2text": "i2t", - "image2text": "i2t", - "i2t": "i2t", - "text2img": "t2i", - "text2image": "t2i", - "t2i": "t2i", - "img2img": "it2i", - "image2image": "it2i", - "it2i": "it2i", - "ti2i": "it2i", -} - -_DEFAULT_BOT_TASK_BY_PREFIX: dict[str, str | None] = { - "t2t": None, - "i2t": None, - "t2i": "think", - "it2i": "think", -} - -_TASK_BY_PREFIX_AND_BOT_TASK: dict[tuple[str, str | None], str] = { - ("t2t", None): "t2t", - ("t2t", "think"): "t2t_think", - ("i2t", None): "i2t", - ("i2t", "think"): "i2t_think", - ("t2i", "think"): "t2i_think", - ("t2i", "recaption"): "t2i_recaption", - ("t2i", "vanilla"): "t2i_vanilla", - ("it2i", "think"): "it2i_think", - ("it2i", "recaption"): "it2i_recaption", -} - -_PROMPT_BOT_TASK_ALIASES: dict[str, str | None] = { - "auto": "auto", - "default": "auto", - "none": None, - "no": None, - "false": None, - "think": "think", - "think_recaption": "think", - "recaption": "recaption", - "image": "vanilla", - "vanilla": "vanilla", -} - - -@dataclass(frozen=True) -class BotTaskResolution: - """Resolved HunyuanImage3 prompt/bot-task settings.""" - - task: str | None - sys_type: str | None - prompt_bot_task: str | None - bot_task: str - tokenizer_bot_task: str - trigger_tag: str | None - stop_token_ids: list[int] | None = None - def available_tasks() -> 
list[str]: """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`.""" return sorted(_TASK_PRESETS) +def resolve_stop_token_ids( + task: str = "it2i_think", + bot_task: str = "think", + tokenizer: Any | None = None): + tkw = tokenizer + preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] + stop_token_ids = [127957] + if trigger_tag: + stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag)) + return stop_token_ids -def available_prompt_bot_tasks() -> list[str]: - """Sorted public bot_task values accepted by `resolve_bot_task` with modality.""" - return sorted(PROMPT_BOT_TASKS) - - -def _task_preset(task: str) -> tuple[str, str | None, str | None]: - if task not in _TASK_PRESETS: - raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") - return _TASK_PRESETS[task] - - -def _task_has_image_input(task: str) -> bool: - return task.startswith(("i2t", "it2i")) - - -def _normalize_prompt_bot_task(bot_task: str | None) -> str | None: - if bot_task is None: - return "auto" - - normalized = bot_task.strip().lower() - if normalized not in _PROMPT_BOT_TASK_ALIASES: - raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_prompt_bot_tasks()}") - return _PROMPT_BOT_TASK_ALIASES[normalized] - - -def _task_for_modality_and_prompt_bot_task(modality: str, bot_task: str | None = "auto") -> str: - modality_key = modality.strip().lower() - if modality_key not in _MODALITY_TO_TASK_PREFIX: - raise ValueError(f"Unknown modality {modality!r}. 
Choose from: {sorted(_MODALITY_TO_TASK_PREFIX)}") - - task_prefix = _MODALITY_TO_TASK_PREFIX[modality_key] - normalized_bot_task = _normalize_prompt_bot_task(bot_task) - if normalized_bot_task == "auto": - normalized_bot_task = _DEFAULT_BOT_TASK_BY_PREFIX[task_prefix] - - task_key = (task_prefix, normalized_bot_task) - if task_key not in _TASK_BY_PREFIX_AND_BOT_TASK: - valid_bot_tasks = sorted( - "none" if candidate is None else candidate - for prefix, candidate in _TASK_BY_PREFIX_AND_BOT_TASK - if prefix == task_prefix - ) - raise ValueError( - f"bot_task {bot_task!r} is not supported for modality {modality!r}. Choose from: {valid_bot_tasks}" - ) - - return _TASK_BY_PREFIX_AND_BOT_TASK[task_key] - - -def _stop_token_ids_for_tokenizer_bot_task( - tokenizer, - tokenizer_bot_task: str, - image_size: int | str | None = None, -) -> list[int]: - eos_id = _eos_token_id(tokenizer) - - if image_size == "auto": - start_ratio_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] - end_ratio_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] - extra_auto_stops = list(range(start_ratio_id, end_ratio_id + 1)) - else: - extra_auto_stops = [_token_id(tokenizer, "")] - - stop_token_id = { - "auto": [eos_id] + extra_auto_stops, - "image": [eos_id], - "recaption": [ - _token_id(tokenizer, ""), - _token_id(tokenizer, ""), - eos_id, - ], - "think": [ - _token_id(tokenizer, ""), - _token_id(tokenizer, ""), - eos_id, - ], - } - return stop_token_id[tokenizer_bot_task] - - -def resolve_bot_task( - bot_task: str | None = "auto", - *, - modality: str | None = None, - task: str | None = None, - tokenizer: Any | None = None, - image_size: int | str | None = None, -) -> BotTaskResolution: - """Resolve HunyuanImage3 bot-task related prompt settings. - - Pass `modality + bot_task` for CLI/request-level behavior, `task` for a - canonical prompt task, or only `bot_task` to validate/map a pipeline - HunyuanImage3 bot_task. 
- """ - if task is not None and modality is not None: - raise ValueError("Pass either task or modality, not both.") - - if task is None and modality is not None: - task = _task_for_modality_and_prompt_bot_task(modality, bot_task) - - if task is not None: - sys_type, preset_bot_task, trigger_tag = _task_preset(task) - resolved_bot_task = "think_recaption" if preset_bot_task == "think" else preset_bot_task or "auto" - prompt_bot_task = preset_bot_task - else: - sys_type = None - trigger_tag = None - prompt_bot_task = None - resolved_bot_task = "auto" if bot_task is None else bot_task.strip().lower() - - if resolved_bot_task not in _BOT_TASK_TO_TOKENIZER_TASK: - raise ValueError(f"Unknown bot_task {resolved_bot_task!r}. Choose from: {list(BOT_TASKS)}") - tokenizer_bot_task = _BOT_TASK_TO_TOKENIZER_TASK[resolved_bot_task] - stop_token_ids = ( - _stop_token_ids_for_tokenizer_bot_task(tokenizer, tokenizer_bot_task, image_size=image_size) - if tokenizer is not None - else None - ) - - return BotTaskResolution( - task=task, - sys_type=sys_type, - prompt_bot_task=prompt_bot_task, - bot_task=resolved_bot_task, - tokenizer_bot_task=tokenizer_bot_task, - trigger_tag=trigger_tag, - stop_token_ids=stop_token_ids, - ) - - -def sys_type_for_task(task: str) -> str: - """Return the default system prompt type for a canonical prompt task.""" - sys_type = resolve_bot_task(task=task).sys_type - assert sys_type is not None - return sys_type - - -def _token_id(tokenizer, token: str) -> int: - if token in HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS: - return HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[token] - - token_id = tokenizer.convert_tokens_to_ids(token) - if token_id is None: - raise ValueError(f"Tokenizer does not know special token {token!r}") - return int(token_id) - - -def _eos_token_id(tokenizer) -> int: - return HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"] def build_prompt( @@ -298,13 +90,16 @@ def build_prompt( inputs that need to match HF baseline byte-for-byte, use `build_prompt_tokens` 
instead and feed the result via prompt_token_ids. """ - preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task) + if task not in _TASK_PRESETS: + raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") + + preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] effective_sys_type = sys_type or preset_sys_type system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt) sys_text = system_prompt.strip() if system_prompt else "" - has_image_input = _task_has_image_input(task) + has_image_input = task.startswith("i2t") or task.startswith("it2i") # t2i_vanilla: pretrain mode for direct text->image generation. The # vanilla system prompt drives the model with no chat structure. @@ -356,14 +151,17 @@ def build_prompt_tokens( boundary merge happens. We replicate that here and feed the result to Omni via OmniTokensPrompt (prompt_token_ids). """ - preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task) + if task not in _TASK_PRESETS: + raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") + + preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] effective_sys_type = sys_type or preset_sys_type - bos_id = _token_id(tokenizer, "<|startoftext|>") - img_id = _token_id(tokenizer, "") - trig_id = _token_id(tokenizer, trigger_tag) if trigger_tag else None + bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>") + img_id = tokenizer.convert_tokens_to_ids("") + trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None - has_image_input = _task_has_image_input(task) + has_image_input = task.startswith("i2t") or task.startswith("it2i") # t2i_vanilla uses pretrain template with no chat structure; the vanilla # system prompt drives the model directly. 
No segment boundaries to @@ -391,15 +189,4 @@ def build_prompt_tokens( return ids -__all__ = [ - "BotTaskResolution", - "available_tasks", - "available_prompt_bot_tasks", - "BOT_TASKS", - "build_prompt", - "build_prompt_tokens", - "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS", - "PROMPT_BOT_TASKS", - "resolve_bot_task", - "sys_type_for_task", -] +__all__ = ["build_prompt", "build_prompt_tokens", "resolve_stop_token_ids", _TASK_PRESETS] From 2612670ae02358816ce25a8d929a941468edfdb1 Mon Sep 17 00:00:00 2001 From: "Y. Fisher" Date: Fri, 8 May 2026 11:38:18 +0800 Subject: [PATCH 13/40] adjust end2end according to prompt utils Signed-off-by: Y. Fisher Signed-off-by: KexiongYu --- .../hunyuan_image3/end2end.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 9b717e198b8..ceebc2d3f39 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -18,10 +18,9 @@ from pathlib import Path from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - available_prompt_bot_tasks, build_prompt_tokens, - resolve_bot_task, - sys_type_for_task, + resolve_stop_token_ids, + _TASK_PRESETS ) from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniPromptType @@ -45,6 +44,12 @@ "text2text": "text-to-text", } +_MODALITY_TASK_MAP = { + "text2img": "t2i", + "img2img": "it2i", + "img2text": "i2t", + "text2text": "t2t", +} def parse_args(): parser = argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.") @@ -90,7 +95,7 @@ def parse_args(): "--bot-task", type=str, default="auto", - choices=available_prompt_bot_tasks(), + choices=["auto", "think", "recaption", "vanilla"], help=( "Prompt behavior. 
'auto' selects the default for the modality; " "'think' adds ; 'recaption' adds ; " @@ -122,10 +127,11 @@ def main(): os.makedirs(args.output, exist_ok=True) # Determine task for prompt formatting from modality + bot behavior. - bot_task_resolution = resolve_bot_task(args.bot_task, modality=args.modality) - task = bot_task_resolution.task + task = _MODALITY_TASK_MAP[args.modality] assert task is not None - bot_task = bot_task_resolution.bot_task + bot_task = args.bot_task + if bot_task != "auto": + task = task + "_" + bot_task if args.deploy_config is not None and args.stage_configs_path is not None: raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.") @@ -176,7 +182,8 @@ def main(): formatted_prompts: list[OmniPromptType] = [] for p in prompts: token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type) - effective_sys_type = args.sys_type or sys_type_for_task(task) + preset_sys_type, _, _ = _TASK_PRESETS[task] + effective_sys_type = args.sys_type or preset_sys_type # `prompt_token_ids` drives the AR stage (matches HF byte-for-byte). 
# `prompt` and `use_system_prompt` are forwarded by ar2diffusion to @@ -209,7 +216,7 @@ def main(): # Override diffusion params if applicable from vllm_omni.inputs.data import OmniDiffusionSamplingParams - ar_stop_token_ids = resolve_bot_task(task=task, tokenizer=tokenizer).stop_token_ids + ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer) assert ar_stop_token_ids is not None for sp in params_list: if isinstance(sp, OmniDiffusionSamplingParams): From 1dab1f0bf21304394cc6ceb7370715a2ae91edab Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Fri, 8 May 2026 16:11:02 +0800 Subject: [PATCH 14/40] Fix HunyuanImage3 i2t think stop tokens Signed-off-by: KexiongYu --- .../hunyuan_image3/end2end.py | 25 +- .../hunyuan_image3/test_prompt_utils.py | 3 +- .../models/hunyuan_image3/prompt_utils.py | 285 +++++++++++++++--- 3 files changed, 255 insertions(+), 58 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index ceebc2d3f39..9b717e198b8 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -18,9 +18,10 @@ from pathlib import Path from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + available_prompt_bot_tasks, build_prompt_tokens, - resolve_stop_token_ids, - _TASK_PRESETS + resolve_bot_task, + sys_type_for_task, ) from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniPromptType @@ -44,12 +45,6 @@ "text2text": "text-to-text", } -_MODALITY_TASK_MAP = { - "text2img": "t2i", - "img2img": "it2i", - "img2text": "i2t", - "text2text": "t2t", -} def parse_args(): parser = argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.") @@ -95,7 +90,7 @@ def parse_args(): "--bot-task", type=str, default="auto", - choices=["auto", "think", "recaption", "vanilla"], + choices=available_prompt_bot_tasks(), help=( "Prompt 
behavior. 'auto' selects the default for the modality; " "'think' adds ; 'recaption' adds ; " @@ -127,11 +122,10 @@ def main(): os.makedirs(args.output, exist_ok=True) # Determine task for prompt formatting from modality + bot behavior. - task = _MODALITY_TASK_MAP[args.modality] + bot_task_resolution = resolve_bot_task(args.bot_task, modality=args.modality) + task = bot_task_resolution.task assert task is not None - bot_task = args.bot_task - if bot_task != "auto": - task = task + "_" + bot_task + bot_task = bot_task_resolution.bot_task if args.deploy_config is not None and args.stage_configs_path is not None: raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.") @@ -182,8 +176,7 @@ def main(): formatted_prompts: list[OmniPromptType] = [] for p in prompts: token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type) - preset_sys_type, _, _ = _TASK_PRESETS[task] - effective_sys_type = args.sys_type or preset_sys_type + effective_sys_type = args.sys_type or sys_type_for_task(task) # `prompt_token_ids` drives the AR stage (matches HF byte-for-byte). 
# `prompt` and `use_system_prompt` are forwarded by ar2diffusion to @@ -216,7 +209,7 @@ def main(): # Override diffusion params if applicable from vllm_omni.inputs.data import OmniDiffusionSamplingParams - ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer) + ar_stop_token_ids = resolve_bot_task(task=task, tokenizer=tokenizer).stop_token_ids assert ar_stop_token_ids is not None for sp in params_list: if isinstance(sp, OmniDiffusionSamplingParams): diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index e858944c0a4..664975d87fd 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -191,9 +191,10 @@ def test_resolve_bot_task_resolves_stop_ids_from_prompt_task(): HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], ] assert resolve_bot_task(task="i2t_think", tokenizer=tok).stop_token_ids == [ - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], eos_id, + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], ] assert resolve_bot_task(task="t2i_vanilla", tokenizer=tok).stop_token_ids == [eos_id] diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 9754cf2c82f..231c965c3a7 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -22,6 +22,8 @@ from .system_prompt import get_system_prompt +BOT_TASKS = ("auto", "image", "recaption", "think_recaption") +PROMPT_BOT_TASKS = ("auto", "none", "think", "recaption", "vanilla") # HunyuanImage-3.0-Instruct special token ids from tokenizer.json. 
# Keep offline AR prompt/stop-token behavior independent of runtime @@ -46,9 +48,17 @@ "": 130106, } +_BOT_TASK_TO_TOKENIZER_TASK = { + "auto": "auto", + "image": "image", + "recaption": "recaption", + "think_recaption": "think", +} + # task -> (sys_type, bot_task, trigger_tag) _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = { "t2t": ("en_unified", None, None), + "t2t_think": ("en_unified", "think", ""), "i2t": ("en_unified", None, None), "i2t_think": ("en_unified", "think", ""), "it2i_think": ("en_unified", "think", ""), @@ -58,23 +68,228 @@ "t2i_vanilla": ("en_vanilla", "image", None), } +_MODALITY_TO_TASK_PREFIX = { + "text2text": "t2t", + "t2t": "t2t", + "img2text": "i2t", + "image2text": "i2t", + "i2t": "i2t", + "text2img": "t2i", + "text2image": "t2i", + "t2i": "t2i", + "img2img": "it2i", + "image2image": "it2i", + "it2i": "it2i", + "ti2i": "it2i", +} + +_DEFAULT_BOT_TASK_BY_PREFIX: dict[str, str | None] = { + "t2t": None, + "i2t": None, + "t2i": "think", + "it2i": "think", +} + +_TASK_BY_PREFIX_AND_BOT_TASK: dict[tuple[str, str | None], str] = { + ("t2t", None): "t2t", + ("t2t", "think"): "t2t_think", + ("i2t", None): "i2t", + ("i2t", "think"): "i2t_think", + ("t2i", "think"): "t2i_think", + ("t2i", "recaption"): "t2i_recaption", + ("t2i", "vanilla"): "t2i_vanilla", + ("it2i", "think"): "it2i_think", + ("it2i", "recaption"): "it2i_recaption", +} + +_PROMPT_BOT_TASK_ALIASES: dict[str, str | None] = { + "auto": "auto", + "default": "auto", + "none": None, + "no": None, + "false": None, + "think": "think", + "think_recaption": "think", + "recaption": "recaption", + "image": "vanilla", + "vanilla": "vanilla", +} + + +@dataclass(frozen=True) +class BotTaskResolution: + """Resolved HunyuanImage3 prompt/bot-task settings.""" + + task: str | None + sys_type: str | None + prompt_bot_task: str | None + bot_task: str + tokenizer_bot_task: str + trigger_tag: str | None + stop_token_ids: list[int] | None = None + def available_tasks() -> list[str]: 
"""Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`.""" return sorted(_TASK_PRESETS) -def resolve_stop_token_ids( - task: str = "it2i_think", - bot_task: str = "think", - tokenizer: Any | None = None): - tkw = tokenizer - preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] - stop_token_ids = [127957] - if trigger_tag: - stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag)) - return stop_token_ids + +def available_prompt_bot_tasks() -> list[str]: + """Sorted public bot_task values accepted by `resolve_bot_task` with modality.""" + return sorted(PROMPT_BOT_TASKS) + + +def _task_preset(task: str) -> tuple[str, str | None, str | None]: + if task not in _TASK_PRESETS: + raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") + return _TASK_PRESETS[task] + + +def _task_has_image_input(task: str) -> bool: + return task.startswith(("i2t", "it2i")) +def _normalize_prompt_bot_task(bot_task: str | None) -> str | None: + if bot_task is None: + return "auto" + + normalized = bot_task.strip().lower() + if normalized not in _PROMPT_BOT_TASK_ALIASES: + raise ValueError(f"Unknown bot_task {bot_task!r}. Choose from: {available_prompt_bot_tasks()}") + return _PROMPT_BOT_TASK_ALIASES[normalized] + + +def _task_for_modality_and_prompt_bot_task(modality: str, bot_task: str | None = "auto") -> str: + modality_key = modality.strip().lower() + if modality_key not in _MODALITY_TO_TASK_PREFIX: + raise ValueError(f"Unknown modality {modality!r}. 
Choose from: {sorted(_MODALITY_TO_TASK_PREFIX)}") + + task_prefix = _MODALITY_TO_TASK_PREFIX[modality_key] + normalized_bot_task = _normalize_prompt_bot_task(bot_task) + if normalized_bot_task == "auto": + normalized_bot_task = _DEFAULT_BOT_TASK_BY_PREFIX[task_prefix] + + task_key = (task_prefix, normalized_bot_task) + if task_key not in _TASK_BY_PREFIX_AND_BOT_TASK: + valid_bot_tasks = sorted( + "none" if candidate is None else candidate + for prefix, candidate in _TASK_BY_PREFIX_AND_BOT_TASK + if prefix == task_prefix + ) + raise ValueError( + f"bot_task {bot_task!r} is not supported for modality {modality!r}. Choose from: {valid_bot_tasks}" + ) + + return _TASK_BY_PREFIX_AND_BOT_TASK[task_key] + + +def _special_token_id(token: str) -> int: + return HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[token] + + +def _generic_stop_token_ids_for_tokenizer_bot_task( + tokenizer_bot_task: str, + image_size: int | str | None = None, +) -> list[int]: + eos_id = _special_token_id("<|endoftext|>") + + if image_size == "auto": + start_ratio_id = _special_token_id("") + end_ratio_id = _special_token_id("") + extra_auto_stops = list(range(start_ratio_id, end_ratio_id + 1)) + else: + extra_auto_stops = [_special_token_id("")] + + stop_token_ids = { + "auto": [eos_id] + extra_auto_stops, + "image": [eos_id], + "recaption": [ + _special_token_id(""), + _special_token_id(""), + eos_id, + ], + "think": [ + _special_token_id(""), + _special_token_id(""), + eos_id, + ], + } + return stop_token_ids[tokenizer_bot_task] + + +def _stop_token_ids_for_task( + task: str, + tokenizer_bot_task: str, + image_size: int | str | None = None, +) -> list[int]: + if task in ("t2t_think", "i2t_think"): + return [ + _special_token_id(""), + _special_token_id(""), + _special_token_id("<|endoftext|>"), + _special_token_id(""), + ] + return _generic_stop_token_ids_for_tokenizer_bot_task(tokenizer_bot_task, image_size=image_size) + + +def resolve_bot_task( + bot_task: str | None = "auto", + *, + modality: str | None = 
None, + task: str | None = None, + tokenizer: Any | None = None, + image_size: int | str | None = None, +) -> BotTaskResolution: + """Resolve HunyuanImage3 bot-task related prompt settings. + + Pass `modality + bot_task` for CLI/request-level behavior, `task` for a + canonical prompt task, or only `bot_task` to validate/map a pipeline + HunyuanImage3 bot_task. + """ + del tokenizer # Stop tokens are fixed HunyuanImage3 control ids. + + if task is not None and modality is not None: + raise ValueError("Pass either task or modality, not both.") + + if task is None and modality is not None: + task = _task_for_modality_and_prompt_bot_task(modality, bot_task) + + if task is not None: + sys_type, preset_bot_task, trigger_tag = _task_preset(task) + resolved_bot_task = "think_recaption" if preset_bot_task == "think" else preset_bot_task or "auto" + prompt_bot_task = preset_bot_task + else: + sys_type = None + trigger_tag = None + prompt_bot_task = None + resolved_bot_task = "auto" if bot_task is None else bot_task.strip().lower() + + if resolved_bot_task not in _BOT_TASK_TO_TOKENIZER_TASK: + raise ValueError(f"Unknown bot_task {resolved_bot_task!r}. 
Choose from: {list(BOT_TASKS)}") + tokenizer_bot_task = _BOT_TASK_TO_TOKENIZER_TASK[resolved_bot_task] + stop_token_ids = ( + _stop_token_ids_for_task(task, tokenizer_bot_task, image_size=image_size) + if task is not None + else _generic_stop_token_ids_for_tokenizer_bot_task(tokenizer_bot_task, image_size=image_size) + ) + + return BotTaskResolution( + task=task, + sys_type=sys_type, + prompt_bot_task=prompt_bot_task, + bot_task=resolved_bot_task, + tokenizer_bot_task=tokenizer_bot_task, + trigger_tag=trigger_tag, + stop_token_ids=stop_token_ids, + ) + + +def sys_type_for_task(task: str) -> str: + """Return the default system prompt type for a canonical prompt task.""" + sys_type = resolve_bot_task(task=task).sys_type + assert sys_type is not None + return sys_type + def build_prompt( user_prompt: str, @@ -86,20 +301,17 @@ def build_prompt( NOTE: when this string is passed to the engine, the engine's tokenizer will run a single BPE pass over the whole string, which can merge - tokens across segment boundaries (e.g. `。\\n\\n` -> id 3490). For + tokens across segment boundaries (e.g. `X\n\n` into one token). For inputs that need to match HF baseline byte-for-byte, use `build_prompt_tokens` instead and feed the result via prompt_token_ids. """ - if task not in _TASK_PRESETS: - raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") - - preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] + preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task) effective_sys_type = sys_type or preset_sys_type system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt) sys_text = system_prompt.strip() if system_prompt else "" - has_image_input = task.startswith("i2t") or task.startswith("it2i") + has_image_input = _task_has_image_input(task) # t2i_vanilla: pretrain mode for direct text->image generation. The # vanilla system prompt drives the model with no chat structure. 
@@ -110,16 +322,8 @@ def build_prompt( parts.append(user_prompt) return "".join(parts) - # All other tasks (t2t / i2t / t2i_think / t2i_recaption / - # it2i_think / it2i_recaption) use HunyuanImage3 Instruct chat template: + # All other tasks use HunyuanImage3 Instruct chat template: # <|startoftext|>{system?}\n\nUser: {?}{user_prompt}\n\nAssistant: {trigger?} - # generation_config.json declares sequence_template="instruct", so the - # AR prefill MUST use this template -- verified to match HF's - # apply_chat_template output token-for-token (modulo BPE boundary merges). - # The trigger_tag (e.g. ) MUST come AFTER the `Assistant: ` prefix: - # if it goes BEFORE user_prompt (the old pretrain layout) the model puts - # the user's instructions inside the "thinking section" and collapses - # into repetition garbage under greedy decoding. parts = ["<|startoftext|>"] if sys_text: parts.append(f"{sys_text}\n\n") @@ -141,27 +345,15 @@ def build_prompt_tokens( sys_type: str | None = None, custom_system_prompt: str | None = None, ) -> list[int]: - """Segment-by-segment tokenization that matches HF apply_chat_template. - - Calling tokenizer.encode(build_prompt(...)) on the full string lets BPE - merge tokens across segment boundaries (e.g. user_prompt ends with `。` - and the next segment is `\\n\\n` -> they merge into a single token id - 3490 instead of HF's [1811, 271]). HF's apply_chat_template tokenizes - each segment independently and concatenates token_ids, so no cross- - boundary merge happens. We replicate that here and feed the result to - Omni via OmniTokensPrompt (prompt_token_ids). - """ - if task not in _TASK_PRESETS: - raise ValueError(f"Unknown task {task!r}. 
Choose from: {available_tasks()}") - - preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] + """Segment-by-segment tokenization that matches HF apply_chat_template.""" + preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task) effective_sys_type = sys_type or preset_sys_type bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>") img_id = tokenizer.convert_tokens_to_ids("") trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None - has_image_input = task.startswith("i2t") or task.startswith("it2i") + has_image_input = _task_has_image_input(task) # t2i_vanilla uses pretrain template with no chat structure; the vanilla # system prompt drives the model directly. No segment boundaries to @@ -189,4 +381,15 @@ def build_prompt_tokens( return ids -__all__ = ["build_prompt", "build_prompt_tokens", "resolve_stop_token_ids", _TASK_PRESETS] +__all__ = [ + "BotTaskResolution", + "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS", + "available_prompt_bot_tasks", + "available_tasks", + "BOT_TASKS", + "build_prompt", + "build_prompt_tokens", + "PROMPT_BOT_TASKS", + "resolve_bot_task", + "sys_type_for_task", +] From 5c3eda0c26db2c48b6fab8e646bb1d24a424c796 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Fri, 8 May 2026 16:12:49 +0800 Subject: [PATCH 15/40] Revert "Fix HunyuanImage3 i2t think stop tokens" This reverts commit e527e7bdd59d5d67064c3e74823d38f574f08f71. 
Signed-off-by: KexiongYu --- .../hunyuan_image3/end2end.py | 25 +- .../hunyuan_image3/test_prompt_utils.py | 3 +- .../models/hunyuan_image3/prompt_utils.py | 285 +++--------------- 3 files changed, 58 insertions(+), 255 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 9b717e198b8..ceebc2d3f39 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -18,10 +18,9 @@ from pathlib import Path from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( - available_prompt_bot_tasks, build_prompt_tokens, - resolve_bot_task, - sys_type_for_task, + resolve_stop_token_ids, + _TASK_PRESETS ) from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniPromptType @@ -45,6 +44,12 @@ "text2text": "text-to-text", } +_MODALITY_TASK_MAP = { + "text2img": "t2i", + "img2img": "it2i", + "img2text": "i2t", + "text2text": "t2t", +} def parse_args(): parser = argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.") @@ -90,7 +95,7 @@ def parse_args(): "--bot-task", type=str, default="auto", - choices=available_prompt_bot_tasks(), + choices=["auto", "think", "recaption", "vanilla"], help=( "Prompt behavior. 'auto' selects the default for the modality; " "'think' adds ; 'recaption' adds ; " @@ -122,10 +127,11 @@ def main(): os.makedirs(args.output, exist_ok=True) # Determine task for prompt formatting from modality + bot behavior. 
- bot_task_resolution = resolve_bot_task(args.bot_task, modality=args.modality) - task = bot_task_resolution.task + task = _MODALITY_TASK_MAP[args.modality] assert task is not None - bot_task = bot_task_resolution.bot_task + bot_task = args.bot_task + if bot_task != "auto": + task = task + "_" + bot_task if args.deploy_config is not None and args.stage_configs_path is not None: raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.") @@ -176,7 +182,8 @@ def main(): formatted_prompts: list[OmniPromptType] = [] for p in prompts: token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type) - effective_sys_type = args.sys_type or sys_type_for_task(task) + preset_sys_type, _, _ = _TASK_PRESETS[task] + effective_sys_type = args.sys_type or preset_sys_type # `prompt_token_ids` drives the AR stage (matches HF byte-for-byte). # `prompt` and `use_system_prompt` are forwarded by ar2diffusion to @@ -209,7 +216,7 @@ def main(): # Override diffusion params if applicable from vllm_omni.inputs.data import OmniDiffusionSamplingParams - ar_stop_token_ids = resolve_bot_task(task=task, tokenizer=tokenizer).stop_token_ids + ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer) assert ar_stop_token_ids is not None for sp in params_list: if isinstance(sp, OmniDiffusionSamplingParams): diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 664975d87fd..e858944c0a4 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -191,10 +191,9 @@ def test_resolve_bot_task_resolves_stop_ids_from_prompt_task(): HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], ] assert resolve_bot_task(task="i2t_think", tokenizer=tok).stop_token_ids == [ - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], eos_id, 
- HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], ] assert resolve_bot_task(task="t2i_vanilla", tokenizer=tok).stop_token_ids == [eos_id] diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 231c965c3a7..9754cf2c82f 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -22,8 +22,6 @@ from .system_prompt import get_system_prompt -BOT_TASKS = ("auto", "image", "recaption", "think_recaption") -PROMPT_BOT_TASKS = ("auto", "none", "think", "recaption", "vanilla") # HunyuanImage-3.0-Instruct special token ids from tokenizer.json. # Keep offline AR prompt/stop-token behavior independent of runtime @@ -48,17 +46,9 @@ "": 130106, } -_BOT_TASK_TO_TOKENIZER_TASK = { - "auto": "auto", - "image": "image", - "recaption": "recaption", - "think_recaption": "think", -} - # task -> (sys_type, bot_task, trigger_tag) _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = { "t2t": ("en_unified", None, None), - "t2t_think": ("en_unified", "think", ""), "i2t": ("en_unified", None, None), "i2t_think": ("en_unified", "think", ""), "it2i_think": ("en_unified", "think", ""), @@ -68,227 +58,22 @@ "t2i_vanilla": ("en_vanilla", "image", None), } -_MODALITY_TO_TASK_PREFIX = { - "text2text": "t2t", - "t2t": "t2t", - "img2text": "i2t", - "image2text": "i2t", - "i2t": "i2t", - "text2img": "t2i", - "text2image": "t2i", - "t2i": "t2i", - "img2img": "it2i", - "image2image": "it2i", - "it2i": "it2i", - "ti2i": "it2i", -} - -_DEFAULT_BOT_TASK_BY_PREFIX: dict[str, str | None] = { - "t2t": None, - "i2t": None, - "t2i": "think", - "it2i": "think", -} - -_TASK_BY_PREFIX_AND_BOT_TASK: dict[tuple[str, str | None], str] = { - ("t2t", None): "t2t", - ("t2t", "think"): "t2t_think", - ("i2t", None): "i2t", - ("i2t", "think"): "i2t_think", - ("t2i", "think"): "t2i_think", - ("t2i", "recaption"): "t2i_recaption", - ("t2i", "vanilla"): "t2i_vanilla", - 
("it2i", "think"): "it2i_think", - ("it2i", "recaption"): "it2i_recaption", -} - -_PROMPT_BOT_TASK_ALIASES: dict[str, str | None] = { - "auto": "auto", - "default": "auto", - "none": None, - "no": None, - "false": None, - "think": "think", - "think_recaption": "think", - "recaption": "recaption", - "image": "vanilla", - "vanilla": "vanilla", -} - - -@dataclass(frozen=True) -class BotTaskResolution: - """Resolved HunyuanImage3 prompt/bot-task settings.""" - - task: str | None - sys_type: str | None - prompt_bot_task: str | None - bot_task: str - tokenizer_bot_task: str - trigger_tag: str | None - stop_token_ids: list[int] | None = None - def available_tasks() -> list[str]: """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`.""" return sorted(_TASK_PRESETS) +def resolve_stop_token_ids( + task: str = "it2i_think", + bot_task: str = "think", + tokenizer: Any | None = None): + tkw = tokenizer + preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] + stop_token_ids = [127957] + if trigger_tag: + stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag)) + return stop_token_ids -def available_prompt_bot_tasks() -> list[str]: - """Sorted public bot_task values accepted by `resolve_bot_task` with modality.""" - return sorted(PROMPT_BOT_TASKS) - - -def _task_preset(task: str) -> tuple[str, str | None, str | None]: - if task not in _TASK_PRESETS: - raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") - return _TASK_PRESETS[task] - - -def _task_has_image_input(task: str) -> bool: - return task.startswith(("i2t", "it2i")) - - -def _normalize_prompt_bot_task(bot_task: str | None) -> str | None: - if bot_task is None: - return "auto" - - normalized = bot_task.strip().lower() - if normalized not in _PROMPT_BOT_TASK_ALIASES: - raise ValueError(f"Unknown bot_task {bot_task!r}. 
Choose from: {available_prompt_bot_tasks()}") - return _PROMPT_BOT_TASK_ALIASES[normalized] - - -def _task_for_modality_and_prompt_bot_task(modality: str, bot_task: str | None = "auto") -> str: - modality_key = modality.strip().lower() - if modality_key not in _MODALITY_TO_TASK_PREFIX: - raise ValueError(f"Unknown modality {modality!r}. Choose from: {sorted(_MODALITY_TO_TASK_PREFIX)}") - - task_prefix = _MODALITY_TO_TASK_PREFIX[modality_key] - normalized_bot_task = _normalize_prompt_bot_task(bot_task) - if normalized_bot_task == "auto": - normalized_bot_task = _DEFAULT_BOT_TASK_BY_PREFIX[task_prefix] - - task_key = (task_prefix, normalized_bot_task) - if task_key not in _TASK_BY_PREFIX_AND_BOT_TASK: - valid_bot_tasks = sorted( - "none" if candidate is None else candidate - for prefix, candidate in _TASK_BY_PREFIX_AND_BOT_TASK - if prefix == task_prefix - ) - raise ValueError( - f"bot_task {bot_task!r} is not supported for modality {modality!r}. Choose from: {valid_bot_tasks}" - ) - - return _TASK_BY_PREFIX_AND_BOT_TASK[task_key] - - -def _special_token_id(token: str) -> int: - return HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[token] - - -def _generic_stop_token_ids_for_tokenizer_bot_task( - tokenizer_bot_task: str, - image_size: int | str | None = None, -) -> list[int]: - eos_id = _special_token_id("<|endoftext|>") - - if image_size == "auto": - start_ratio_id = _special_token_id("") - end_ratio_id = _special_token_id("") - extra_auto_stops = list(range(start_ratio_id, end_ratio_id + 1)) - else: - extra_auto_stops = [_special_token_id("")] - - stop_token_ids = { - "auto": [eos_id] + extra_auto_stops, - "image": [eos_id], - "recaption": [ - _special_token_id(""), - _special_token_id(""), - eos_id, - ], - "think": [ - _special_token_id(""), - _special_token_id(""), - eos_id, - ], - } - return stop_token_ids[tokenizer_bot_task] - - -def _stop_token_ids_for_task( - task: str, - tokenizer_bot_task: str, - image_size: int | str | None = None, -) -> list[int]: - if task in 
("t2t_think", "i2t_think"): - return [ - _special_token_id(""), - _special_token_id(""), - _special_token_id("<|endoftext|>"), - _special_token_id(""), - ] - return _generic_stop_token_ids_for_tokenizer_bot_task(tokenizer_bot_task, image_size=image_size) - - -def resolve_bot_task( - bot_task: str | None = "auto", - *, - modality: str | None = None, - task: str | None = None, - tokenizer: Any | None = None, - image_size: int | str | None = None, -) -> BotTaskResolution: - """Resolve HunyuanImage3 bot-task related prompt settings. - - Pass `modality + bot_task` for CLI/request-level behavior, `task` for a - canonical prompt task, or only `bot_task` to validate/map a pipeline - HunyuanImage3 bot_task. - """ - del tokenizer # Stop tokens are fixed HunyuanImage3 control ids. - - if task is not None and modality is not None: - raise ValueError("Pass either task or modality, not both.") - - if task is None and modality is not None: - task = _task_for_modality_and_prompt_bot_task(modality, bot_task) - - if task is not None: - sys_type, preset_bot_task, trigger_tag = _task_preset(task) - resolved_bot_task = "think_recaption" if preset_bot_task == "think" else preset_bot_task or "auto" - prompt_bot_task = preset_bot_task - else: - sys_type = None - trigger_tag = None - prompt_bot_task = None - resolved_bot_task = "auto" if bot_task is None else bot_task.strip().lower() - - if resolved_bot_task not in _BOT_TASK_TO_TOKENIZER_TASK: - raise ValueError(f"Unknown bot_task {resolved_bot_task!r}. 
Choose from: {list(BOT_TASKS)}") - tokenizer_bot_task = _BOT_TASK_TO_TOKENIZER_TASK[resolved_bot_task] - stop_token_ids = ( - _stop_token_ids_for_task(task, tokenizer_bot_task, image_size=image_size) - if task is not None - else _generic_stop_token_ids_for_tokenizer_bot_task(tokenizer_bot_task, image_size=image_size) - ) - - return BotTaskResolution( - task=task, - sys_type=sys_type, - prompt_bot_task=prompt_bot_task, - bot_task=resolved_bot_task, - tokenizer_bot_task=tokenizer_bot_task, - trigger_tag=trigger_tag, - stop_token_ids=stop_token_ids, - ) - - -def sys_type_for_task(task: str) -> str: - """Return the default system prompt type for a canonical prompt task.""" - sys_type = resolve_bot_task(task=task).sys_type - assert sys_type is not None - return sys_type def build_prompt( @@ -301,17 +86,20 @@ def build_prompt( NOTE: when this string is passed to the engine, the engine's tokenizer will run a single BPE pass over the whole string, which can merge - tokens across segment boundaries (e.g. `X\n\n` into one token). For + tokens across segment boundaries (e.g. `。\\n\\n` -> id 3490). For inputs that need to match HF baseline byte-for-byte, use `build_prompt_tokens` instead and feed the result via prompt_token_ids. """ - preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task) + if task not in _TASK_PRESETS: + raise ValueError(f"Unknown task {task!r}. Choose from: {available_tasks()}") + + preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] effective_sys_type = sys_type or preset_sys_type system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt) sys_text = system_prompt.strip() if system_prompt else "" - has_image_input = _task_has_image_input(task) + has_image_input = task.startswith("i2t") or task.startswith("it2i") # t2i_vanilla: pretrain mode for direct text->image generation. The # vanilla system prompt drives the model with no chat structure. 
@@ -322,8 +110,16 @@ def build_prompt( parts.append(user_prompt) return "".join(parts) - # All other tasks use HunyuanImage3 Instruct chat template: + # All other tasks (t2t / i2t / t2i_think / t2i_recaption / + # it2i_think / it2i_recaption) use HunyuanImage3 Instruct chat template: # <|startoftext|>{system?}\n\nUser: {?}{user_prompt}\n\nAssistant: {trigger?} + # generation_config.json declares sequence_template="instruct", so the + # AR prefill MUST use this template -- verified to match HF's + # apply_chat_template output token-for-token (modulo BPE boundary merges). + # The trigger_tag (e.g. ) MUST come AFTER the `Assistant: ` prefix: + # if it goes BEFORE user_prompt (the old pretrain layout) the model puts + # the user's instructions inside the "thinking section" and collapses + # into repetition garbage under greedy decoding. parts = ["<|startoftext|>"] if sys_text: parts.append(f"{sys_text}\n\n") @@ -345,15 +141,27 @@ def build_prompt_tokens( sys_type: str | None = None, custom_system_prompt: str | None = None, ) -> list[int]: - """Segment-by-segment tokenization that matches HF apply_chat_template.""" - preset_sys_type, preset_bot_task, trigger_tag = _task_preset(task) + """Segment-by-segment tokenization that matches HF apply_chat_template. + + Calling tokenizer.encode(build_prompt(...)) on the full string lets BPE + merge tokens across segment boundaries (e.g. user_prompt ends with `。` + and the next segment is `\\n\\n` -> they merge into a single token id + 3490 instead of HF's [1811, 271]). HF's apply_chat_template tokenizes + each segment independently and concatenates token_ids, so no cross- + boundary merge happens. We replicate that here and feed the result to + Omni via OmniTokensPrompt (prompt_token_ids). + """ + if task not in _TASK_PRESETS: + raise ValueError(f"Unknown task {task!r}. 
Choose from: {available_tasks()}") + + preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] effective_sys_type = sys_type or preset_sys_type bos_id = tokenizer.convert_tokens_to_ids("<|startoftext|>") img_id = tokenizer.convert_tokens_to_ids("") trig_id = tokenizer.convert_tokens_to_ids(trigger_tag) if trigger_tag else None - has_image_input = _task_has_image_input(task) + has_image_input = task.startswith("i2t") or task.startswith("it2i") # t2i_vanilla uses pretrain template with no chat structure; the vanilla # system prompt drives the model directly. No segment boundaries to @@ -381,15 +189,4 @@ def build_prompt_tokens( return ids -__all__ = [ - "BotTaskResolution", - "HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS", - "available_prompt_bot_tasks", - "available_tasks", - "BOT_TASKS", - "build_prompt", - "build_prompt_tokens", - "PROMPT_BOT_TASKS", - "resolve_bot_task", - "sys_type_for_task", -] +__all__ = ["build_prompt", "build_prompt_tokens", "resolve_stop_token_ids", _TASK_PRESETS] From 8d2970b4bcd997029546b46dce9e291e24bb226d Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Fri, 8 May 2026 16:13:34 +0800 Subject: [PATCH 16/40] Fix HunyuanImage3 i2t think stop token Signed-off-by: KexiongYu --- .../diffusion/models/hunyuan_image3/prompt_utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 9754cf2c82f..577f8de196e 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -65,12 +65,14 @@ def available_tasks() -> list[str]: def resolve_stop_token_ids( task: str = "it2i_think", - bot_task: str = "think", - tokenizer: Any | None = None): - tkw = tokenizer - preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] + bot_task: str = "think", + tokenizer: Any | None = None, +): + _, _, trigger_tag = _TASK_PRESETS[task] 
stop_token_ids = [127957] - if trigger_tag: + if task in ("t2t_think", "i2t_think"): + stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]) + elif trigger_tag: stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag)) return stop_token_ids From 85881e8a5b47d4479d58695d55033e624dba4358 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Fri, 8 May 2026 16:23:22 +0800 Subject: [PATCH 17/40] Align HunyuanImage3 prompt utils tests Signed-off-by: KexiongYu --- .../hunyuan_image3/test_prompt_utils.py | 149 +++--------------- 1 file changed, 21 insertions(+), 128 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index e858944c0a4..984377f802f 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -25,12 +25,10 @@ from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS, - available_prompt_bot_tasks, available_tasks, build_prompt, build_prompt_tokens, - resolve_bot_task, - sys_type_for_task, + resolve_stop_token_ids, ) pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -76,7 +74,6 @@ def test_available_tasks_covers_all_modalities(): tasks = set(available_tasks()) assert tasks >= { "t2t", - "t2t_think", "i2t", "i2t_think", "it2i_think", @@ -87,127 +84,31 @@ def test_available_tasks_covers_all_modalities(): } -@pytest.mark.parametrize( - "task,expected_bot_task", - [ - ("t2t", "auto"), - ("t2t_think", "think_recaption"), - ("i2t", "auto"), - ("i2t_think", "think_recaption"), - ("it2i_think", "think_recaption"), - ("it2i_recaption", "recaption"), - ("t2i_think", "think_recaption"), - ("t2i_recaption", "recaption"), - ("t2i_vanilla", "image"), - ], -) -def test_resolve_bot_task_matches_prompt_presets(task: str, expected_bot_task: str): - resolution = resolve_bot_task(task=task) - assert resolution.task == task - assert 
resolution.bot_task == expected_bot_task - - -@pytest.mark.parametrize( - "modality,bot_task,expected_task", - [ - ("text2text", "auto", "t2t"), - ("img2text", "auto", "i2t"), - ("text2img", "auto", "t2i_think"), - ("img2img", "auto", "it2i_think"), - ("i2t", "think", "i2t_think"), - ("ti2i", "recaption", "it2i_recaption"), - ("t2i", "vanilla", "t2i_vanilla"), - ("text2text", "none", "t2t"), - ], -) -def test_resolve_bot_task_composes_prompt_task( - modality: str, - bot_task: str, - expected_task: str, -): - assert resolve_bot_task(bot_task, modality=modality).task == expected_task - - -def test_resolve_bot_task_rejects_invalid_combinations(): - assert available_prompt_bot_tasks() == ["auto", "none", "recaption", "think", "vanilla"] - - with pytest.raises(ValueError, match="not supported"): - resolve_bot_task("recaption", modality="img2text") - - with pytest.raises(ValueError, match="not supported"): - resolve_bot_task("vanilla", modality="img2img") - - -def test_resolve_bot_task_maps_tokenizer_task_and_stop_ids(): - tok = FakeTokenizer() - - resolution = resolve_bot_task("think_recaption", tokenizer=tok) - - assert resolution.task is None - assert resolution.bot_task == "think_recaption" - assert resolution.tokenizer_bot_task == "think" - assert resolution.stop_token_ids == [ - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"], - ] - - -def test_resolve_bot_task_resolves_stop_ids_from_bot_task(): +def test_resolve_stop_token_ids_uses_end_think_for_i2t_think(): tok = FakeTokenizer() eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"] - boi_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] - end_recaption_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] - end_answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] - - assert resolve_bot_task("auto", tokenizer=tok).stop_token_ids == [eos_id, boi_id] - assert resolve_bot_task("image", tokenizer=tok).stop_token_ids == [eos_id] - assert 
resolve_bot_task("think_recaption", tokenizer=tok).stop_token_ids == [ - end_recaption_id, - end_answer_id, - eos_id, - ] - assert resolve_bot_task("recaption", tokenizer=tok).stop_token_ids == [ - end_recaption_id, - end_answer_id, - eos_id, - ] - assert resolve_bot_task("auto", tokenizer=tok, image_size="auto").stop_token_ids == [ + assert resolve_stop_token_ids(task="i2t_think", tokenizer=tok) == [ eos_id, - *range( - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + 1, - ), + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], ] -def test_resolve_bot_task_resolves_stop_ids_from_prompt_task(): +def test_resolve_stop_token_ids_uses_trigger_for_generation_tasks(): tok = FakeTokenizer() eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"] - assert resolve_bot_task(task="i2t", tokenizer=tok).stop_token_ids == [ - eos_id, - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], - ] - assert resolve_bot_task(task="i2t_think", tokenizer=tok).stop_token_ids == [ - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [eos_id, FakeTokenizer.SPECIAL[""]] + assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [ eos_id, + FakeTokenizer.SPECIAL[""], ] - assert resolve_bot_task(task="t2i_vanilla", tokenizer=tok).stop_token_ids == [eos_id] - - -def test_sys_type_for_task_returns_prompt_preset_default(): - assert sys_type_for_task("i2t_think") == "en_unified" - assert sys_type_for_task("t2i_vanilla") == "en_vanilla" @pytest.mark.parametrize( "task", [ "t2t", - "t2t_think", "i2t", "i2t_think", "it2i_think", @@ -238,7 +139,7 @@ def test_build_prompt_string_structure_chat_template(task: str): # documentation, so substring index() catches the wrong occurrence -- use # endswith() which directly captures "trigger is at the tail" (the Part A # fix: trigger goes AFTER `Assistant: `, not before user_prompt). 
- if task in ("t2t_think", "i2t_think", "it2i_think", "t2i_think"): + if task in ("i2t_think", "it2i_think", "t2i_think"): assert s.endswith("Assistant: "), ( f"Trigger must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}" ) @@ -294,31 +195,24 @@ def test_build_prompt_tokens_segments_each_boundary(): def test_build_prompt_tokens_image_placeholder_present_for_image_tasks(): tok = FakeTokenizer() ids = build_prompt_tokens("hi", tok, task="i2t") - assert ids[0] == HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|startoftext|>"], ( - "BOS (<|startoftext|>) must be the first token" - ) - assert HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] in ids, ( - " placeholder must be present for i2t/it2i tasks" - ) + assert ids[0] == FakeTokenizer.SPECIAL["<|startoftext|>"], "BOS (<|startoftext|>) must be the first token" + assert FakeTokenizer.SPECIAL[""] in ids, " placeholder must be present for i2t/it2i tasks" def test_build_prompt_tokens_no_image_for_text_only_tasks(): tok = FakeTokenizer() ids = build_prompt_tokens("hi", tok, task="t2t") - assert HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] not in ids, ( - " must NOT appear for text-only tasks" - ) + assert FakeTokenizer.SPECIAL[""] not in ids, " must NOT appear for text-only tasks" @pytest.mark.parametrize( "task,trigger_id", [ - ("t2t_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]), - ("i2t_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]), - ("it2i_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]), - ("t2i_think", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]), - ("it2i_recaption", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]), - ("t2i_recaption", HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]), + ("i2t_think", FakeTokenizer.SPECIAL[""]), + ("it2i_think", FakeTokenizer.SPECIAL[""]), + ("t2i_think", FakeTokenizer.SPECIAL[""]), + ("it2i_recaption", FakeTokenizer.SPECIAL[""]), + ("t2i_recaption", FakeTokenizer.SPECIAL[""]), ], ) def test_build_prompt_tokens_trigger_is_last_token(task: str, trigger_id: int): @@ -333,8 +227,8 @@ def 
test_build_prompt_tokens_no_trigger_for_plain_tasks(): tok = FakeTokenizer() ids = build_prompt_tokens("hi", tok, task="t2t") assert ids[-1] not in { - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + FakeTokenizer.SPECIAL[""], + FakeTokenizer.SPECIAL[""], } @@ -381,10 +275,9 @@ def test_end2end_routes_through_shared_prompt_utils(): if isinstance(node, ast.ImportFrom) and node.module and node.module.endswith("hunyuan_image3.prompt_utils"): imported_from_prompt_utils.update(alias.name for alias in node.names) expected_imports = { - "available_prompt_bot_tasks", + "_TASK_PRESETS", "build_prompt_tokens", - "resolve_bot_task", - "sys_type_for_task", + "resolve_stop_token_ids", } assert expected_imports <= imported_from_prompt_utils, ( "end2end.py must import the HunyuanImage3 prompt and stop-token helpers from " From a72f4578ddd2b9ba190a4b1baa6ac20337ddc7e6 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Fri, 8 May 2026 17:26:36 +0800 Subject: [PATCH 18/40] Remove unsupported HunyuanImage3 comprehension think tasks Signed-off-by: KexiongYu --- .../offline_inference/hunyuan_image3/end2end.py | 13 ++++++++++++- .../models/hunyuan_image3/test_prompt_utils.py | 15 +-------------- .../models/hunyuan_image3/prompt_utils.py | 7 ++----- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index ceebc2d3f39..09533a67ff0 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -18,9 +18,9 @@ from pathlib import Path from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import ( + _TASK_PRESETS, build_prompt_tokens, resolve_stop_token_ids, - _TASK_PRESETS ) from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniPromptType @@ -51,6 +51,7 @@ "text2text": "t2t", } + def parse_args(): parser = 
argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.") parser.add_argument( @@ -132,6 +133,16 @@ def main(): bot_task = args.bot_task if bot_task != "auto": task = task + "_" + bot_task + if task not in _TASK_PRESETS: + valid_bot_tasks = { + "text2img": ["think", "recaption", "vanilla"], + "img2img": ["think", "recaption"], + "img2text": ["auto"], + "text2text": ["auto"], + }[args.modality] + raise ValueError( + f"--bot-task {bot_task!r} is not supported for {args.modality}. Choose from: {valid_bot_tasks}" + ) if args.deploy_config is not None and args.stage_configs_path is not None: raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.") diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 984377f802f..bb24797f44c 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -75,7 +75,6 @@ def test_available_tasks_covers_all_modalities(): assert tasks >= { "t2t", "i2t", - "i2t_think", "it2i_think", "it2i_recaption", "t2i_think", @@ -84,16 +83,6 @@ def test_available_tasks_covers_all_modalities(): } -def test_resolve_stop_token_ids_uses_end_think_for_i2t_think(): - tok = FakeTokenizer() - - eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"] - assert resolve_stop_token_ids(task="i2t_think", tokenizer=tok) == [ - eos_id, - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], - ] - - def test_resolve_stop_token_ids_uses_trigger_for_generation_tasks(): tok = FakeTokenizer() @@ -110,7 +99,6 @@ def test_resolve_stop_token_ids_uses_trigger_for_generation_tasks(): [ "t2t", "i2t", - "i2t_think", "it2i_think", "it2i_recaption", "t2i_think", @@ -139,7 +127,7 @@ def test_build_prompt_string_structure_chat_template(task: str): # documentation, so substring index() catches the wrong occurrence -- use # endswith() which directly captures "trigger is at the tail" 
(the Part A # fix: trigger goes AFTER `Assistant: `, not before user_prompt). - if task in ("i2t_think", "it2i_think", "t2i_think"): + if task in ("it2i_think", "t2i_think"): assert s.endswith("Assistant: "), ( f"Trigger must be appended right after `Assistant: ` (Part A fix). Got tail: ...{s[-40:]!r}" ) @@ -208,7 +196,6 @@ def test_build_prompt_tokens_no_image_for_text_only_tasks(): @pytest.mark.parametrize( "task,trigger_id", [ - ("i2t_think", FakeTokenizer.SPECIAL[""]), ("it2i_think", FakeTokenizer.SPECIAL[""]), ("t2i_think", FakeTokenizer.SPECIAL[""]), ("it2i_recaption", FakeTokenizer.SPECIAL[""]), diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 577f8de196e..2ca7f4c77cd 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -50,7 +50,6 @@ _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = { "t2t": ("en_unified", None, None), "i2t": ("en_unified", None, None), - "i2t_think": ("en_unified", "think", ""), "it2i_think": ("en_unified", "think", ""), "it2i_recaption": ("en_unified", "recaption", ""), "t2i_think": ("en_unified", "think", ""), @@ -63,6 +62,7 @@ def available_tasks() -> list[str]: """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`.""" return sorted(_TASK_PRESETS) + def resolve_stop_token_ids( task: str = "it2i_think", bot_task: str = "think", @@ -70,14 +70,11 @@ def resolve_stop_token_ids( ): _, _, trigger_tag = _TASK_PRESETS[task] stop_token_ids = [127957] - if task in ("t2t_think", "i2t_think"): - stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]) - elif trigger_tag: + if trigger_tag: stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag)) return stop_token_ids - def build_prompt( user_prompt: str, task: str = "it2i_think", From 596148bf012113b76c460000ffa2140d11677bd0 Mon Sep 17 00:00:00 2001 From: "Y. 
Fisher" Date: Fri, 8 May 2026 18:41:06 +0800 Subject: [PATCH 19/40] update Signed-off-by: Y. Fisher --- vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index 2ca7f4c77cd..d0137001034 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -50,8 +50,10 @@ _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = { "t2t": ("en_unified", None, None), "i2t": ("en_unified", None, None), + "i2t_think": ("en_unified", "think", ""), "it2i_think": ("en_unified", "think", ""), "it2i_recaption": ("en_unified", "recaption", ""), + "t2i": ("en_unified", "image", None), "t2i_think": ("en_unified", "think", ""), "t2i_recaption": ("en_unified", "recaption", ""), "t2i_vanilla": ("en_vanilla", "image", None), @@ -62,7 +64,6 @@ def available_tasks() -> list[str]: """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`.""" return sorted(_TASK_PRESETS) - def resolve_stop_token_ids( task: str = "it2i_think", bot_task: str = "think", @@ -70,11 +71,14 @@ def resolve_stop_token_ids( ): _, _, trigger_tag = _TASK_PRESETS[task] stop_token_ids = [127957] - if trigger_tag: + if task in ("t2t_think", "i2t_think"): + stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]) + elif trigger_tag: stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag)) return stop_token_ids + def build_prompt( user_prompt: str, task: str = "it2i_think", From 29e9f945e9287e42546104ce038497c9d75d579a Mon Sep 17 00:00:00 2001 From: "Y. Fisher" Date: Fri, 8 May 2026 18:43:04 +0800 Subject: [PATCH 20/40] update Signed-off-by: Y. 
Fisher --- vllm_omni/deploy/hunyuan_image3.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index b5238169786..505f4ed5919 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -3,11 +3,13 @@ # (4 GPUs for AR, 4 GPUs for DiT). Platform overrides below fold in the # verified NPU/XPU stage configs that previously lived under stage_configs/. pipeline: hunyuan_image3 +async_chunk: false stages: - stage_id: 0 max_num_seqs: 1 gpu_memory_utilization: 0.9 + trust_remote_code: true enforce_eager: true max_num_batched_tokens: 32768 devices: "0,1,2,3" @@ -34,18 +36,12 @@ stages: cache_config: enable_cache_dit_summary: false parallel_config: - pipeline_parallel_size: 1 - data_parallel_size: 1 tensor_parallel_size: 4 enable_expert_parallel: true sequence_parallel_size: 1 ulysses_degree: 1 - ring_degree: 1 cfg_parallel_size: 1 vae_patch_parallel_size: 1 - use_hsdp: false - hsdp_shard_size: -1 - hsdp_replicate_size: 1 default_sampling_params: seed: 42 From 1ccedc6e26f0535b09c3768f694001c3a20b5e04 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Sat, 9 May 2026 10:52:03 +0800 Subject: [PATCH 21/40] Update HunyuanImage3 stop token handling Signed-off-by: KexiongYu --- examples/offline_inference/hunyuan_image3/end2end.py | 2 +- vllm_omni/config/stage_config.py | 8 -------- .../diffusion/models/hunyuan_image3/prompt_utils.py | 10 ++++------ vllm_omni/entrypoints/openai/serving_chat.py | 1 + vllm_omni/entrypoints/utils.py | 1 - 5 files changed, 6 insertions(+), 16 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 09533a67ff0..7fb267ab6cc 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -96,7 +96,7 @@ def parse_args(): "--bot-task", type=str, default="auto", - choices=["auto", 
"think", "recaption", "vanilla"], + choices=["auto", "think", "recaption", "think_recaption", "vanilla"], help=( "Prompt behavior. 'auto' selects the default for the modality; " "'think' adds ; 'recaption' adds ; " diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index 0bd1f2b7f8f..a879a9a0cda 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -1079,14 +1079,6 @@ def create_from_model( if model_type and model_type in _PIPELINE_REGISTRY: return cls._create_from_registry(model_type, cli_overrides, deploy_config_path) - if deploy_config_path is not None: - deploy_cfg = load_deploy_config(deploy_config_path) - if deploy_cfg.pipeline and deploy_cfg.pipeline in _PIPELINE_REGISTRY: - return cls._create_from_registry( - deploy_cfg.pipeline, - cli_overrides, - deploy_config_path, - ) # --- HF architecture fallback: some models report a generic # model_type that collides with another model. Match by the diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index d0137001034..c14ae8ced23 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -50,13 +50,12 @@ _TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = { "t2t": ("en_unified", None, None), "i2t": ("en_unified", None, None), - "i2t_think": ("en_unified", "think", ""), "it2i_think": ("en_unified", "think", ""), "it2i_recaption": ("en_unified", "recaption", ""), "t2i": ("en_unified", "image", None), + "t2i_vanilla": ("en_vanilla", "image", None), "t2i_think": ("en_unified", "think", ""), "t2i_recaption": ("en_unified", "recaption", ""), - "t2i_vanilla": ("en_vanilla", "image", None), } @@ -69,12 +68,11 @@ def resolve_stop_token_ids( bot_task: str = "think", tokenizer: Any | None = None, ): - _, _, trigger_tag = _TASK_PRESETS[task] stop_token_ids = [127957] - if task in ("t2t_think", 
"i2t_think"): + if "recaption" in task: stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]) - elif trigger_tag: - stop_token_ids.append(tokenizer.convert_tokens_to_ids(trigger_tag)) + if "think" in task: + stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]) return stop_token_ids diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 09b62bf8972..7558e85aaac 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -2149,6 +2149,7 @@ def _build_multistage_generation_inputs( lora_body = extra_body.get("lora") layers = extra_body.get("layers") resolution = extra_body.get("resolution") + bot_task = extra_body.get("bot_task") engine_prompt_data: dict[str, Any] | None = None modalities = ["image"] diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index 460c6985b0c..d728e76417c 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -340,7 +340,6 @@ def resolve_model_config_path(model: str) -> str: normalized_model_type = _DIFFUSERS_CLASS_TO_CONFIG[model_type] else: normalized_model_type = model_type.replace("-", "_") - model_type_str = f"{normalized_model_type}.yaml" complete_config_path = PROJECT_ROOT / default_config_path / model_type_str if os.path.exists(complete_config_path): From a63b9ffcdcea2bd13dcbaf3928853e66c984c301 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Sat, 9 May 2026 11:03:57 +0800 Subject: [PATCH 22/40] Fix HunyuanImage3 pre-commit formatting Signed-off-by: KexiongYu --- vllm_omni/config/stage_config.py | 1 - vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py index a879a9a0cda..dcc4d5ec9d6 100644 --- a/vllm_omni/config/stage_config.py +++ b/vllm_omni/config/stage_config.py @@ -1079,7 +1079,6 @@ def create_from_model( if 
model_type and model_type in _PIPELINE_REGISTRY: return cls._create_from_registry(model_type, cli_overrides, deploy_config_path) - # --- HF architecture fallback: some models report a generic # model_type that collides with another model. Match by the # hf_architectures declared on each registered PipelineConfig. diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index c14ae8ced23..d0b9f2ca40f 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -17,12 +17,10 @@ from __future__ import annotations -from dataclasses import dataclass from typing import Any from .system_prompt import get_system_prompt - # HunyuanImage-3.0-Instruct special token ids from tokenizer.json. # Keep offline AR prompt/stop-token behavior independent of runtime # tokenizer lookup for these fixed control tokens. @@ -63,6 +61,7 @@ def available_tasks() -> list[str]: """Sorted list of task keys accepted by `build_prompt` / `build_prompt_tokens`.""" return sorted(_TASK_PRESETS) + def resolve_stop_token_ids( task: str = "it2i_think", bot_task: str = "think", @@ -76,7 +75,6 @@ def resolve_stop_token_ids( return stop_token_ids - def build_prompt( user_prompt: str, task: str = "it2i_think", From 21e16afa2310825a4fd9d28001a85faca25cb529 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Sat, 9 May 2026 11:07:48 +0800 Subject: [PATCH 23/40] Add HunyuanImage3 KV reuse deploy config Signed-off-by: KexiongYu --- vllm_omni/deploy/hunyuan_image3.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index 505f4ed5919..2a331af5186 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -5,6 +5,17 @@ pipeline: hunyuan_image3 async_chunk: false +connectors: + rdma_connector: + name: MooncakeTransferEngineConnector + 
extra: + host: "auto" + zmq_port: 50051 + protocol: "rdma" + device_name: "" + memory_pool_size: 4294967296 + memory_pool_device: "cpu" + stages: - stage_id: 0 max_num_seqs: 1 @@ -18,6 +29,10 @@ stages: rope_parameters: mrope_section: [0, 32, 32] rope_type: default + omni_kv_config: + need_send_cache: true + output_connectors: + to_stage_1: rdma_connector default_sampling_params: temperature: 0.6 top_p: 0.95 @@ -35,6 +50,8 @@ stages: cache_backend: cache_config: enable_cache_dit_summary: false + omni_kv_config: + need_recv_cache: true parallel_config: tensor_parallel_size: 4 enable_expert_parallel: true @@ -44,6 +61,8 @@ stages: vae_patch_parallel_size: 1 default_sampling_params: seed: 42 + input_connectors: + from_stage_0: rdma_connector edges: - from: 0 From 6ae5389a89d353c7dca01ef5a035ff8e80b7ba11 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Sat, 9 May 2026 11:14:07 +0800 Subject: [PATCH 24/40] Address HunyuanImage3 deploy path review Signed-off-by: KexiongYu --- tests/e2e/offline_inference/test_hunyuanimage3.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/e2e/offline_inference/test_hunyuanimage3.py b/tests/e2e/offline_inference/test_hunyuanimage3.py index 2a385f6a4c0..ac1cb13cba7 100644 --- a/tests/e2e/offline_inference/test_hunyuanimage3.py +++ b/tests/e2e/offline_inference/test_hunyuanimage3.py @@ -1,6 +1,5 @@ # ruff: noqa: E501 from collections.abc import Generator -from pathlib import Path import pytest import torch @@ -9,6 +8,7 @@ from transformers import CLIPModel, CLIPProcessor from tests.helpers.runtime import OmniRunner +from tests.helpers.stage_config import get_deploy_config_path from vllm_omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform @@ -16,8 +16,7 @@ PROMPT = "A brown and white dog is running on the grass" MODEL_NAME = "tencent/HunyuanImage-3.0" LOCAL_CLIP_PATH = "openai/clip-vit-base-patch32" -REPO_ROOT = 
Path(__file__).resolve().parents[3] -DEPLOY_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml" +DEPLOY_CONFIG_PATH = get_deploy_config_path("hunyuan_image3.yaml") pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion] From 02a83784af53577e5d618ec53a67c406518ca3d5 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Sat, 9 May 2026 14:41:14 +0800 Subject: [PATCH 25/40] Limit HunyuanImage3 images per prompt Signed-off-by: KexiongYu --- examples/offline_inference/hunyuan_image3/README.md | 1 + examples/offline_inference/hunyuan_image3/end2end.py | 8 ++++++++ vllm_omni/deploy/hunyuan_image3.yaml | 1 + vllm_omni/deploy/hunyuan_image3_dit.yaml | 1 + 4 files changed, 11 insertions(+) diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md index 98908ace0d7..431f081300d 100644 --- a/examples/offline_inference/hunyuan_image3/README.md +++ b/examples/offline_inference/hunyuan_image3/README.md @@ -108,6 +108,7 @@ python examples/offline_inference/hunyuan_image3/end2end.py \ | `--stage-configs-path` | Legacy stage config path, kept only for compatibility. Prefer `--deploy-config`. | | `--modality` | Offline-only convenience flag. One of `text2img`, `img2img`, `img2text`, `text2text`. It selects prompt formatting, internal `mode`, and default deploy config for this script. Online serving uses `--deploy-config` plus the endpoint and, for chat completions, request `modalities` instead. | | `--steps` | Number of diffusion inference steps for image generation. | +| `--num-outputs-per-prompt` | Number of images to generate for each prompt. Defaults to `1`. | | `--guidance-scale` | Classifier-free guidance scale for image generation. | | `--height`, `--width` | Output image size for `text2img`. | | `--bot-task` | Prompt behavior. `auto` selects the default from `--modality`; `think` adds ``; `recaption` adds ``; `vanilla` uses the text-to-image pretrain template. 
| diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 7fb267ab6cc..87f5c62a2a0 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -85,6 +85,12 @@ def parse_args(): parser.add_argument("--seed", type=int, default=42, help="Random seed.") parser.add_argument("--height", type=int, default=1024, help="Output image height.") parser.add_argument("--width", type=int, default=1024, help="Output image width.") + parser.add_argument( + "--num-outputs-per-prompt", + type=int, + default=1, + help="Number of images to generate for each prompt.", + ) parser.add_argument( "--vae-use-tiling", action="store_true", @@ -232,6 +238,7 @@ def main(): for sp in params_list: if isinstance(sp, OmniDiffusionSamplingParams): sp.num_inference_steps = args.steps + sp.num_outputs_per_prompt = args.num_outputs_per_prompt sp.guidance_scale = args.guidance_scale sp.guidance_scale_provided = True if args.seed is not None: @@ -256,6 +263,7 @@ def main(): print(f" Num stages: {omni.num_stages}") if args.modality in ("text2img", "img2img"): print(f" Inference steps: {args.steps}") + print(f" Outputs per prompt: {args.num_outputs_per_prompt}") print(f" Guidance scale: {args.guidance_scale}") print(f" Seed: {args.seed}") if args.modality == "text2img": diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index 2a331af5186..d49800f72c8 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -61,6 +61,7 @@ stages: vae_patch_parallel_size: 1 default_sampling_params: seed: 42 + num_outputs_per_prompt: 1 input_connectors: from_stage_0: rdma_connector diff --git a/vllm_omni/deploy/hunyuan_image3_dit.yaml b/vllm_omni/deploy/hunyuan_image3_dit.yaml index 3c0ba190101..3b922df20ea 100644 --- a/vllm_omni/deploy/hunyuan_image3_dit.yaml +++ b/vllm_omni/deploy/hunyuan_image3_dit.yaml @@ -31,6 
+31,7 @@ stages: hsdp_replicate_size: 1 default_sampling_params: seed: 42 + num_outputs_per_prompt: 1 platforms: npu: From 476a7f03e3191a465ff52fb959a3b90159c14784 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Sat, 9 May 2026 15:43:14 +0800 Subject: [PATCH 26/40] Revert "Limit HunyuanImage3 images per prompt" This reverts commit dac00c4b7a7b24dd5e2fbfa987062a0bb9dcc3be. Signed-off-by: KexiongYu --- examples/offline_inference/hunyuan_image3/README.md | 1 - examples/offline_inference/hunyuan_image3/end2end.py | 8 -------- vllm_omni/deploy/hunyuan_image3.yaml | 1 - vllm_omni/deploy/hunyuan_image3_dit.yaml | 1 - 4 files changed, 11 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/README.md b/examples/offline_inference/hunyuan_image3/README.md index 431f081300d..98908ace0d7 100644 --- a/examples/offline_inference/hunyuan_image3/README.md +++ b/examples/offline_inference/hunyuan_image3/README.md @@ -108,7 +108,6 @@ python examples/offline_inference/hunyuan_image3/end2end.py \ | `--stage-configs-path` | Legacy stage config path, kept only for compatibility. Prefer `--deploy-config`. | | `--modality` | Offline-only convenience flag. One of `text2img`, `img2img`, `img2text`, `text2text`. It selects prompt formatting, internal `mode`, and default deploy config for this script. Online serving uses `--deploy-config` plus the endpoint and, for chat completions, request `modalities` instead. | | `--steps` | Number of diffusion inference steps for image generation. | -| `--num-outputs-per-prompt` | Number of images to generate for each prompt. Defaults to `1`. | | `--guidance-scale` | Classifier-free guidance scale for image generation. | | `--height`, `--width` | Output image size for `text2img`. | | `--bot-task` | Prompt behavior. `auto` selects the default from `--modality`; `think` adds ``; `recaption` adds ``; `vanilla` uses the text-to-image pretrain template. 
| diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 87f5c62a2a0..7fb267ab6cc 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -85,12 +85,6 @@ def parse_args(): parser.add_argument("--seed", type=int, default=42, help="Random seed.") parser.add_argument("--height", type=int, default=1024, help="Output image height.") parser.add_argument("--width", type=int, default=1024, help="Output image width.") - parser.add_argument( - "--num-outputs-per-prompt", - type=int, - default=1, - help="Number of images to generate for each prompt.", - ) parser.add_argument( "--vae-use-tiling", action="store_true", @@ -238,7 +232,6 @@ def main(): for sp in params_list: if isinstance(sp, OmniDiffusionSamplingParams): sp.num_inference_steps = args.steps - sp.num_outputs_per_prompt = args.num_outputs_per_prompt sp.guidance_scale = args.guidance_scale sp.guidance_scale_provided = True if args.seed is not None: @@ -263,7 +256,6 @@ def main(): print(f" Num stages: {omni.num_stages}") if args.modality in ("text2img", "img2img"): print(f" Inference steps: {args.steps}") - print(f" Outputs per prompt: {args.num_outputs_per_prompt}") print(f" Guidance scale: {args.guidance_scale}") print(f" Seed: {args.seed}") if args.modality == "text2img": diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index d49800f72c8..2a331af5186 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -61,7 +61,6 @@ stages: vae_patch_parallel_size: 1 default_sampling_params: seed: 42 - num_outputs_per_prompt: 1 input_connectors: from_stage_0: rdma_connector diff --git a/vllm_omni/deploy/hunyuan_image3_dit.yaml b/vllm_omni/deploy/hunyuan_image3_dit.yaml index 3b922df20ea..3c0ba190101 100644 --- a/vllm_omni/deploy/hunyuan_image3_dit.yaml +++ b/vllm_omni/deploy/hunyuan_image3_dit.yaml @@ -31,7 
+31,6 @@ stages: hsdp_replicate_size: 1 default_sampling_params: seed: 42 - num_outputs_per_prompt: 1 platforms: npu: From 8f594ee92243e9baa01f96549b638335048511b1 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Sat, 9 May 2026 16:21:33 +0800 Subject: [PATCH 27/40] Fix HunyuanImage3 stop token mapping Signed-off-by: KexiongYu --- vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index d0b9f2ca40f..c975bc3ab61 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -69,9 +69,9 @@ def resolve_stop_token_ids( ): stop_token_ids = [127957] if "recaption" in task: - stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]) - if "think" in task: stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]) + if "think" in task: + stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]) return stop_token_ids From 5c03b7ca807c7df0c708dc1dd1f45619f8668104 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Sun, 10 May 2026 14:19:46 +0800 Subject: [PATCH 28/40] Enable model sampler for NPU AR runner Signed-off-by: KexiongYu --- .../npu/worker/npu_ar_model_runner.py | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py index 8cff1849aa5..a4acd421cf8 100644 --- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py +++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py @@ -5,6 +5,7 @@ import time from copy import copy, deepcopy +from dataclasses import replace from typing import Any, NamedTuple import numpy as np @@ -92,6 +93,83 @@ def _make_buffer(self, *size, dtype, numpy=True): with maybe_disable_pin_memory_for_ray(self, total_bytes): return super()._make_buffer(*size, 
dtype=dtype, numpy=numpy) + def _build_model_sampler_output_token_ids(self) -> list[list[int]]: + """Build decoded-token history for custom model samplers. + + vLLM only populates sampling_metadata.output_token_ids when penalties or + logits processors require it. HunyuanImage3's custom sampler needs this + history to force transitions such as -> , so mirror + the GPU AR runner behavior for prefer_model_sampler models. + """ + req_output_token_ids = getattr(self.input_batch, "req_output_token_ids", []) + req_ids = list(getattr(self.input_batch, "req_ids", [])) + output_token_ids = [list(req_output_token_ids[idx] or []) for idx in range(len(req_ids))] + + sampled_token_ids_cpu = getattr(self.input_batch, "sampled_token_ids_cpu", None) + async_copy_ready_event = getattr(self.input_batch, "async_copy_ready_event", None) + prev_req_id_to_index = getattr(self.input_batch, "prev_req_id_to_index", None) + if sampled_token_ids_cpu is None or not output_token_ids or prev_req_id_to_index is None: + return output_token_ids + + sampled_token_ids: list[list[int]] | None = None + for index, req_id in enumerate(req_ids): + prev_index = prev_req_id_to_index.get(req_id) + if prev_index is None: + continue + req_history = output_token_ids[index] + if not req_history or req_history[-1] != -1: + continue + if sampled_token_ids is None: + assert async_copy_ready_event is not None + async_copy_ready_event.synchronize() + sampled_token_ids = sampled_token_ids_cpu.tolist() + new_ids = list(sampled_token_ids[prev_index]) + if not new_ids: + continue + num_sampled_ids = len(new_ids) if new_ids[-1] != -1 else new_ids.index(-1) + first_placeholder = req_history.index(-1) + num_placeholders = len(req_history) - first_placeholder + num_to_replace = min(num_sampled_ids, num_placeholders) + req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace] + + return output_token_ids + + def _sampling_metadata_for_model_sampler(self, sampling_metadata): + 
output_token_ids = self._build_model_sampler_output_token_ids() + if output_token_ids == sampling_metadata.output_token_ids: + return sampling_metadata + return replace(sampling_metadata, output_token_ids=output_token_ids) + + def _sample( + self, + logits: torch.Tensor | None, + spec_decode_metadata: Any, + ): + sampling_metadata = self.input_batch.sampling_metadata + if spec_decode_metadata is None: + model_sample = getattr(self.model, "sample", None) + if logits is not None and callable(model_sample) and getattr(self.model, "prefer_model_sampler", False): + if hasattr(self.sampler, "logit_bias_state"): + self.sampler.logit_bias_state.apply_logit_bias( + logits, + self.input_batch.expanded_idx_mapping, + self.input_batch.idx_mapping_np, + self.input_batch.positions[self.input_batch.logits_indices], + ) + sampler_output = model_sample( + logits, + self._sampling_metadata_for_model_sampler(sampling_metadata), + ) + if sampler_output is not None: + return sampler_output + self.input_batch.update_async_output_token_ids() + return self.sampler( + logits=logits, + sampling_metadata=sampling_metadata, + ) + + return super()._sample(logits, spec_decode_metadata) + # -------------------------------------- Omni-new ------------------------------------------------- def capture_model(self) -> int: npugraph_memory_bytes = super().capture_model() From 32ea60f470cbcf08c96ce27ae7a26f445f5174ee Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Sun, 10 May 2026 14:31:25 +0800 Subject: [PATCH 29/40] Update HunyuanImage3 KV reuse deploy config Signed-off-by: KexiongYu --- vllm_omni/deploy/hunyuan_image3.yaml | 59 +++++++++++++--------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index 2a331af5186..775b0c0f34a 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -1,7 +1,7 @@ -# HunyuanImage-3.0-Instruct deploy: AR (stage 0) + DiT (stage 1). 
-# The base CUDA layout follows the existing 8-GPU AR->DiT config -# (4 GPUs for AR, 4 GPUs for DiT). Platform overrides below fold in the -# verified NPU/XPU stage configs that previously lived under stage_configs/. +# HunyuanImage-3.0-Instruct deploy: AR (stage 0) + DiT (stage 1) +# with AR-to-DiT KV reuse. The base CUDA layout follows +# model_executor/stage_configs/hunyuan_image3_it2i_kv_reuse.yaml from +# PR #3346: 2 GPUs for AR and 2 GPUs for DiT. pipeline: hunyuan_image3 async_chunk: false @@ -19,12 +19,11 @@ connectors: stages: - stage_id: 0 max_num_seqs: 1 - gpu_memory_utilization: 0.9 - trust_remote_code: true + gpu_memory_utilization: 0.95 enforce_eager: true max_num_batched_tokens: 32768 - devices: "0,1,2,3" - tensor_parallel_size: 4 + devices: "0,1" + tensor_parallel_size: 2 hf_overrides: rope_parameters: mrope_section: [0, 32, 32] @@ -34,35 +33,29 @@ stages: output_connectors: to_stage_1: rdma_connector default_sampling_params: - temperature: 0.6 - top_p: 0.95 - top_k: 1024 - max_tokens: 4096 - detokenize: false + temperature: 0.0 + top_p: 1 + top_k: -1 + max_tokens: 8192 + stop_token_ids: [128025] + detokenize: true + skip_special_tokens: false - stage_id: 1 max_num_seqs: 1 - gpu_memory_utilization: 0.9 enforce_eager: true - devices: "4,5,6,7" - vae_use_slicing: false - vae_use_tiling: false - cache_backend: - cache_config: - enable_cache_dit_summary: false + devices: "2,3" + distributed_executor_backend: "mp" omni_kv_config: need_recv_cache: true parallel_config: - tensor_parallel_size: 4 + tensor_parallel_size: 2 enable_expert_parallel: true - sequence_parallel_size: 1 - ulysses_degree: 1 - cfg_parallel_size: 1 - vae_patch_parallel_size: 1 - default_sampling_params: - seed: 42 input_connectors: from_stage_0: rdma_connector + default_sampling_params: + num_inference_steps: 50 + guidance_scale: 0 edges: - from: 0 @@ -75,15 +68,15 @@ platforms: stages: - stage_id: 0 gpu_memory_utilization: 0.65 - devices: "0,1,2,3" - tensor_parallel_size: 4 + devices: 
"0,1,2,3,4,5,6,7" + tensor_parallel_size: 8 - stage_id: 1 gpu_memory_utilization: 0.65 - devices: "4,5,6,7" + devices: "8,9,10,11" max_num_batched_tokens: 32768 parallel_config: tensor_parallel_size: 4 - enable_expert_parallel: false + enable_expert_parallel: true xpu: stages: @@ -95,6 +88,10 @@ platforms: quantization: fp8 enable_expert_parallel: true worker_cls: vllm_omni.platforms.xpu.worker.xpu_ar_worker.XPUARWorker + default_sampling_params: + max_tokens: 2048 + seed: 42 + repetition_penalty: 1.1 - stage_id: 1 gpu_memory_utilization: 0.9 devices: "0,1,2,3,4,5,6,7" From c7643df0891b08d406e3b5e826bda1a2c5f09318 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Sun, 10 May 2026 15:27:44 +0800 Subject: [PATCH 30/40] Fix HunyuanImage3 stop token unit test Signed-off-by: KexiongYu --- tests/diffusion/models/hunyuan_image3/test_prompt_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index bb24797f44c..a9d570936bf 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -55,6 +55,7 @@ class FakeTokenizer: "": 6, "": 7, "": 8, + "": 9, **{f"": 1000 + i for i in range(33)}, } @@ -83,14 +84,14 @@ def test_available_tasks_covers_all_modalities(): } -def test_resolve_stop_token_ids_uses_trigger_for_generation_tasks(): +def test_resolve_stop_token_ids_uses_end_tags_for_generation_tasks(): tok = FakeTokenizer() eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"] - assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [eos_id, FakeTokenizer.SPECIAL[""]] + assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [eos_id, FakeTokenizer.SPECIAL[""]] assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [ eos_id, - FakeTokenizer.SPECIAL[""], + FakeTokenizer.SPECIAL[""], ] From 
553bd8b9623394bcb847bb3e3ef8970b9608beeb Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Sun, 10 May 2026 16:26:26 +0800 Subject: [PATCH 31/40] Update HunyuanImage3 deploy config Signed-off-by: KexiongYu --- vllm_omni/deploy/hunyuan_image3.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index 775b0c0f34a..b414f6eb78a 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -4,6 +4,7 @@ # PR #3346: 2 GPUs for AR and 2 GPUs for DiT. pipeline: hunyuan_image3 async_chunk: false +trust_remote_code: true connectors: rdma_connector: @@ -67,7 +68,7 @@ platforms: npu: stages: - stage_id: 0 - gpu_memory_utilization: 0.65 + gpu_memory_utilization: 0.75 devices: "0,1,2,3,4,5,6,7" tensor_parallel_size: 8 - stage_id: 1 From badd206bac720377378039704a34edbea5f209d5 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Sun, 10 May 2026 17:22:00 +0800 Subject: [PATCH 32/40] Fix HunyuanImage3 stop token test ids Signed-off-by: KexiongYu --- tests/diffusion/models/hunyuan_image3/test_prompt_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index a9d570936bf..83280ebefd5 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -88,10 +88,13 @@ def test_resolve_stop_token_ids_uses_end_tags_for_generation_tasks(): tok = FakeTokenizer() eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"] - assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [eos_id, FakeTokenizer.SPECIAL[""]] + assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [ + eos_id, + HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], + ] assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [ eos_id, - FakeTokenizer.SPECIAL[""], + 
HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], ] From 3975e50c552bfbf26dc7c838fc4df86d344812a4 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Mon, 11 May 2026 15:05:47 +0800 Subject: [PATCH 33/40] Print HunyuanImage3 AR generated text Signed-off-by: KexiongYu --- examples/offline_inference/hunyuan_image3/end2end.py | 11 +++++++++-- .../models/hunyuan_image3/pipeline_hunyuan_image3.py | 4 ++++ .../diffusion/models/hunyuan_image3/prompt_utils.py | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference/hunyuan_image3/end2end.py b/examples/offline_inference/hunyuan_image3/end2end.py index 7fb267ab6cc..0f41150b65d 100644 --- a/examples/offline_inference/hunyuan_image3/end2end.py +++ b/examples/offline_inference/hunyuan_image3/end2end.py @@ -273,10 +273,17 @@ def main(): for req_output in omni_outputs: # Text output (AR stage or text-only) ro = getattr(req_output, "request_output", None) + txt = "" if ro and getattr(ro, "outputs", None): txt = "".join(getattr(o, "text", "") or "" for o in ro.outputs) - if txt: - print(f"[Output] Text:\n{txt}") + if not txt: + ar_text = getattr(req_output, "custom_output", {}).get("ar_generated_text") + if isinstance(ar_text, list): + txt = "\n".join(text for text in ar_text if text) + else: + txt = ar_text or "" + if txt: + print(f"[Output] Text:\n{txt}") # Image output (DiT stage) images = getattr(req_output, "images", None) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 7a8be07456d..5c0ef163506 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -1424,7 +1424,11 @@ def forward( model_inputs.update(ar_kv_kwargs) outputs = self._generate(**model_inputs, **kwargs) + custom_output = {} + if any(t is not None for t in cot_text_list): + custom_output["ar_generated_text"] = cot_text_list[0] if 
len(cot_text_list) == 1 else cot_text_list return DiffusionOutput( output=outputs[0], + custom_output=custom_output, stage_durations=self.stage_durations if hasattr(self, "stage_durations") else None, ) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index c975bc3ab61..a8b2c743a82 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -67,7 +67,7 @@ def resolve_stop_token_ids( bot_task: str = "think", tokenizer: Any | None = None, ): - stop_token_ids = [127957] + stop_token_ids = [128025] if "recaption" in task: stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]) if "think" in task: From 015b34ffb82ec33c16a4aa9ed745fa4c124867c3 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Mon, 11 May 2026 15:20:37 +0800 Subject: [PATCH 34/40] Preserve HunyuanImage3 AR tag output Signed-off-by: KexiongYu --- .../models/hunyuan_image3/test_prompt_utils.py | 14 ++++---------- .../models/hunyuan_image3/prompt_utils.py | 7 +------ 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py index 83280ebefd5..f899e5a65d1 100644 --- a/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py +++ b/tests/diffusion/models/hunyuan_image3/test_prompt_utils.py @@ -84,18 +84,12 @@ def test_available_tasks_covers_all_modalities(): } -def test_resolve_stop_token_ids_uses_end_tags_for_generation_tasks(): +def test_resolve_stop_token_ids_uses_answer_for_generation_tasks(): tok = FakeTokenizer() - eos_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS["<|endoftext|>"] - assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [ - eos_id, - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], - ] - assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [ - eos_id, - HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""], - ] + 
answer_id = HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""] + assert resolve_stop_token_ids(task="t2i_think", tokenizer=tok) == [answer_id] + assert resolve_stop_token_ids(task="t2i_recaption", tokenizer=tok) == [answer_id] @pytest.mark.parametrize( diff --git a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py index a8b2c743a82..bfc0146d8e8 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/prompt_utils.py @@ -67,12 +67,7 @@ def resolve_stop_token_ids( bot_task: str = "think", tokenizer: Any | None = None, ): - stop_token_ids = [128025] - if "recaption" in task: - stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]) - if "think" in task: - stop_token_ids.append(HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]) - return stop_token_ids + return [HUNYUAN_IMAGE3_SPECIAL_TOKEN_IDS[""]] def build_prompt( From 4f6b5732fb5fa7705c28866fe9f87f5d11dceba6 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Mon, 11 May 2026 15:38:42 +0800 Subject: [PATCH 35/40] Fix HunyuanImage3 NPU AR output flow Signed-off-by: KexiongYu --- vllm_omni/deploy/hunyuan_image3.yaml | 17 ++++++++++++----- .../platforms/npu/worker/npu_ar_model_runner.py | 16 ++++++++++------ 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index b414f6eb78a..fce8d71dc58 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -16,9 +16,16 @@ connectors: device_name: "" memory_pool_size: 4294967296 memory_pool_device: "cpu" + shared_memory_connector: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 stages: - stage_id: 0 + is_comprehension: false + final_output: true + final_output_type: text max_num_seqs: 1 gpu_memory_utilization: 0.95 enforce_eager: true @@ -32,13 +39,12 @@ stages: omni_kv_config: need_send_cache: true output_connectors: - to_stage_1: 
rdma_connector + to_stage_1: shared_memory_connector default_sampling_params: temperature: 0.0 top_p: 1 top_k: -1 max_tokens: 8192 - stop_token_ids: [128025] detokenize: true skip_special_tokens: false @@ -53,7 +59,7 @@ stages: tensor_parallel_size: 2 enable_expert_parallel: true input_connectors: - from_stage_0: rdma_connector + from_stage_0: shared_memory_connector default_sampling_params: num_inference_steps: 50 guidance_scale: 0 @@ -68,13 +74,14 @@ platforms: npu: stages: - stage_id: 0 - gpu_memory_utilization: 0.75 + gpu_memory_utilization: 0.6 devices: "0,1,2,3,4,5,6,7" tensor_parallel_size: 8 + max_num_batched_tokens: 8192 - stage_id: 1 gpu_memory_utilization: 0.65 devices: "8,9,10,11" - max_num_batched_tokens: 32768 + max_num_batched_tokens: 8192 parallel_config: tensor_parallel_size: 4 enable_expert_parallel: true diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py index a4acd421cf8..e78e57101f5 100644 --- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py +++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py @@ -117,8 +117,6 @@ def _build_model_sampler_output_token_ids(self) -> list[list[int]]: if prev_index is None: continue req_history = output_token_ids[index] - if not req_history or req_history[-1] != -1: - continue if sampled_token_ids is None: assert async_copy_ready_event is not None async_copy_ready_event.synchronize() @@ -127,10 +125,16 @@ def _build_model_sampler_output_token_ids(self) -> list[list[int]]: if not new_ids: continue num_sampled_ids = len(new_ids) if new_ids[-1] != -1 else new_ids.index(-1) - first_placeholder = req_history.index(-1) - num_placeholders = len(req_history) - first_placeholder - num_to_replace = min(num_sampled_ids, num_placeholders) - req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace] + new_ids = new_ids[:num_sampled_ids] + if not new_ids: + continue + if req_history and req_history[-1] == -1: 
+ first_placeholder = req_history.index(-1) + num_placeholders = len(req_history) - first_placeholder + num_to_replace = min(len(new_ids), num_placeholders) + req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace] + elif req_history[-len(new_ids) :] != new_ids: + req_history.extend(new_ids) return output_token_ids From 4807452a3794416821226c99459c7dffd0860c7d Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Mon, 11 May 2026 16:04:30 +0800 Subject: [PATCH 36/40] Fix NPU AR sampler history fallback Signed-off-by: KexiongYu --- vllm_omni/platforms/npu/worker/npu_ar_model_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py index e78e57101f5..4962fe728fa 100644 --- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py +++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py @@ -108,12 +108,12 @@ def _build_model_sampler_output_token_ids(self) -> list[list[int]]: sampled_token_ids_cpu = getattr(self.input_batch, "sampled_token_ids_cpu", None) async_copy_ready_event = getattr(self.input_batch, "async_copy_ready_event", None) prev_req_id_to_index = getattr(self.input_batch, "prev_req_id_to_index", None) - if sampled_token_ids_cpu is None or not output_token_ids or prev_req_id_to_index is None: + if sampled_token_ids_cpu is None or not output_token_ids: return output_token_ids sampled_token_ids: list[list[int]] | None = None for index, req_id in enumerate(req_ids): - prev_index = prev_req_id_to_index.get(req_id) + prev_index = prev_req_id_to_index.get(req_id) if prev_req_id_to_index is not None else index if prev_index is None: continue req_history = output_token_ids[index] @@ -121,6 +121,8 @@ def _build_model_sampler_output_token_ids(self) -> list[list[int]]: assert async_copy_ready_event is not None async_copy_ready_event.synchronize() sampled_token_ids = sampled_token_ids_cpu.tolist() + if 
prev_index >= len(sampled_token_ids): + continue new_ids = list(sampled_token_ids[prev_index]) if not new_ids: continue From a0dd770b796e491d9f9ddbeb887cc9bd20a2b14a Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Mon, 11 May 2026 16:36:15 +0800 Subject: [PATCH 37/40] Revert NPU AR sampler history fallback Signed-off-by: KexiongYu --- .../npu/worker/npu_ar_model_runner.py | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py index 4962fe728fa..a4acd421cf8 100644 --- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py +++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py @@ -108,35 +108,29 @@ def _build_model_sampler_output_token_ids(self) -> list[list[int]]: sampled_token_ids_cpu = getattr(self.input_batch, "sampled_token_ids_cpu", None) async_copy_ready_event = getattr(self.input_batch, "async_copy_ready_event", None) prev_req_id_to_index = getattr(self.input_batch, "prev_req_id_to_index", None) - if sampled_token_ids_cpu is None or not output_token_ids: + if sampled_token_ids_cpu is None or not output_token_ids or prev_req_id_to_index is None: return output_token_ids sampled_token_ids: list[list[int]] | None = None for index, req_id in enumerate(req_ids): - prev_index = prev_req_id_to_index.get(req_id) if prev_req_id_to_index is not None else index + prev_index = prev_req_id_to_index.get(req_id) if prev_index is None: continue req_history = output_token_ids[index] + if not req_history or req_history[-1] != -1: + continue if sampled_token_ids is None: assert async_copy_ready_event is not None async_copy_ready_event.synchronize() sampled_token_ids = sampled_token_ids_cpu.tolist() - if prev_index >= len(sampled_token_ids): - continue new_ids = list(sampled_token_ids[prev_index]) if not new_ids: continue num_sampled_ids = len(new_ids) if new_ids[-1] != -1 else new_ids.index(-1) - new_ids = new_ids[:num_sampled_ids] - 
if not new_ids: - continue - if req_history and req_history[-1] == -1: - first_placeholder = req_history.index(-1) - num_placeholders = len(req_history) - first_placeholder - num_to_replace = min(len(new_ids), num_placeholders) - req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace] - elif req_history[-len(new_ids) :] != new_ids: - req_history.extend(new_ids) + first_placeholder = req_history.index(-1) + num_placeholders = len(req_history) - first_placeholder + num_to_replace = min(num_sampled_ids, num_placeholders) + req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace] return output_token_ids From 64a65c7c9b960b52b61e360df069cd6d7cd7583d Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Mon, 11 May 2026 18:44:33 +0800 Subject: [PATCH 38/40] Revert NPU AR model sampler override Signed-off-by: KexiongYu --- .../npu/worker/npu_ar_model_runner.py | 78 ------------------- 1 file changed, 78 deletions(-) diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py index a4acd421cf8..8cff1849aa5 100644 --- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py +++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py @@ -5,7 +5,6 @@ import time from copy import copy, deepcopy -from dataclasses import replace from typing import Any, NamedTuple import numpy as np @@ -93,83 +92,6 @@ def _make_buffer(self, *size, dtype, numpy=True): with maybe_disable_pin_memory_for_ray(self, total_bytes): return super()._make_buffer(*size, dtype=dtype, numpy=numpy) - def _build_model_sampler_output_token_ids(self) -> list[list[int]]: - """Build decoded-token history for custom model samplers. - - vLLM only populates sampling_metadata.output_token_ids when penalties or - logits processors require it. 
HunyuanImage3's custom sampler needs this - history to force transitions such as -> , so mirror - the GPU AR runner behavior for prefer_model_sampler models. - """ - req_output_token_ids = getattr(self.input_batch, "req_output_token_ids", []) - req_ids = list(getattr(self.input_batch, "req_ids", [])) - output_token_ids = [list(req_output_token_ids[idx] or []) for idx in range(len(req_ids))] - - sampled_token_ids_cpu = getattr(self.input_batch, "sampled_token_ids_cpu", None) - async_copy_ready_event = getattr(self.input_batch, "async_copy_ready_event", None) - prev_req_id_to_index = getattr(self.input_batch, "prev_req_id_to_index", None) - if sampled_token_ids_cpu is None or not output_token_ids or prev_req_id_to_index is None: - return output_token_ids - - sampled_token_ids: list[list[int]] | None = None - for index, req_id in enumerate(req_ids): - prev_index = prev_req_id_to_index.get(req_id) - if prev_index is None: - continue - req_history = output_token_ids[index] - if not req_history or req_history[-1] != -1: - continue - if sampled_token_ids is None: - assert async_copy_ready_event is not None - async_copy_ready_event.synchronize() - sampled_token_ids = sampled_token_ids_cpu.tolist() - new_ids = list(sampled_token_ids[prev_index]) - if not new_ids: - continue - num_sampled_ids = len(new_ids) if new_ids[-1] != -1 else new_ids.index(-1) - first_placeholder = req_history.index(-1) - num_placeholders = len(req_history) - first_placeholder - num_to_replace = min(num_sampled_ids, num_placeholders) - req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace] - - return output_token_ids - - def _sampling_metadata_for_model_sampler(self, sampling_metadata): - output_token_ids = self._build_model_sampler_output_token_ids() - if output_token_ids == sampling_metadata.output_token_ids: - return sampling_metadata - return replace(sampling_metadata, output_token_ids=output_token_ids) - - def _sample( - self, - logits: torch.Tensor | None, 
- spec_decode_metadata: Any, - ): - sampling_metadata = self.input_batch.sampling_metadata - if spec_decode_metadata is None: - model_sample = getattr(self.model, "sample", None) - if logits is not None and callable(model_sample) and getattr(self.model, "prefer_model_sampler", False): - if hasattr(self.sampler, "logit_bias_state"): - self.sampler.logit_bias_state.apply_logit_bias( - logits, - self.input_batch.expanded_idx_mapping, - self.input_batch.idx_mapping_np, - self.input_batch.positions[self.input_batch.logits_indices], - ) - sampler_output = model_sample( - logits, - self._sampling_metadata_for_model_sampler(sampling_metadata), - ) - if sampler_output is not None: - return sampler_output - self.input_batch.update_async_output_token_ids() - return self.sampler( - logits=logits, - sampling_metadata=sampling_metadata, - ) - - return super()._sample(logits, spec_decode_metadata) - # -------------------------------------- Omni-new ------------------------------------------------- def capture_model(self) -> int: npugraph_memory_bytes = super().capture_model() From 6d9b2f91c82023dc218461c06e1315f42b4c50b5 Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Mon, 11 May 2026 18:49:37 +0800 Subject: [PATCH 39/40] Adjust HunyuanImage3 NPU stage 0 batching Signed-off-by: KexiongYu --- vllm_omni/deploy/hunyuan_image3.yaml | 8 ++++---- vllm_omni/deploy/hunyuan_image3_ar.yaml | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index fce8d71dc58..bbcf78a16b4 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -74,13 +74,13 @@ platforms: npu: stages: - stage_id: 0 - gpu_memory_utilization: 0.6 - devices: "0,1,2,3,4,5,6,7" - tensor_parallel_size: 8 + gpu_memory_utilization: 0.8 + devices: "0,1,2,3" + tensor_parallel_size: 4 max_num_batched_tokens: 8192 - stage_id: 1 gpu_memory_utilization: 0.65 - devices: "8,9,10,11" + devices: "4,5,6,7" 
max_num_batched_tokens: 8192 parallel_config: tensor_parallel_size: 4 diff --git a/vllm_omni/deploy/hunyuan_image3_ar.yaml b/vllm_omni/deploy/hunyuan_image3_ar.yaml index 27cbf0f9a60..a59fbfcc95f 100644 --- a/vllm_omni/deploy/hunyuan_image3_ar.yaml +++ b/vllm_omni/deploy/hunyuan_image3_ar.yaml @@ -33,6 +33,7 @@ platforms: gpu_memory_utilization: 0.75 devices: "0,1,2,3,4,5,6,7" tensor_parallel_size: 8 + max_num_batched_tokens: 8192 xpu: stages: From 2b44288e7a0ed262d1a8876833f982c4284b0edd Mon Sep 17 00:00:00 2001 From: KexiongYu Date: Mon, 11 May 2026 19:40:46 +0800 Subject: [PATCH 40/40] Remove legacy HunyuanImage3 stage config Signed-off-by: KexiongYu --- vllm_omni/deploy/hunyuan_image3.yaml | 5 +- .../hunyuan_image3_it2i_kv_reuse.yaml | 89 ------------------- 2 files changed, 2 insertions(+), 92 deletions(-) delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i_kv_reuse.yaml diff --git a/vllm_omni/deploy/hunyuan_image3.yaml b/vllm_omni/deploy/hunyuan_image3.yaml index bbcf78a16b4..634165cd33a 100644 --- a/vllm_omni/deploy/hunyuan_image3.yaml +++ b/vllm_omni/deploy/hunyuan_image3.yaml @@ -1,7 +1,6 @@ # HunyuanImage-3.0-Instruct deploy: AR (stage 0) + DiT (stage 1) -# with AR-to-DiT KV reuse. The base CUDA layout follows -# model_executor/stage_configs/hunyuan_image3_it2i_kv_reuse.yaml from -# PR #3346: 2 GPUs for AR and 2 GPUs for DiT. +# with AR-to-DiT KV reuse. The base CUDA layout uses 2 GPUs for AR +# and 2 GPUs for DiT. 
pipeline: hunyuan_image3 async_chunk: false trust_remote_code: true diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i_kv_reuse.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i_kv_reuse.yaml deleted file mode 100644 index 23c3bbbb262..00000000000 --- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i_kv_reuse.yaml +++ /dev/null @@ -1,89 +0,0 @@ -stage_args: - # Stage 0: AR Model - - stage_id: 0 - stage_type: llm - runtime: - process: true - devices: "0,1" - max_batch_size: 1 - requires_multimodal_data: true # AR needs the original image - engine_args: - model_stage: AR - model_arch: HunyuanImage3ForCausalMM - worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.95 - enforce_eager: true - trust_remote_code: true - engine_output_type: latent # AR outputs latent for DiT - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 2 - pipeline_parallel_size: 1 - hf_overrides: - rope_parameters: - mrope_section: [0, 32, 32] - rope_type: default - omni_kv_config: - need_send_cache: true - is_comprehension: false # Generation task, not comprehension - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1 - top_k: -1 - max_tokens: 8192 - stop_token_ids: [128025] # - detokenize: true # DiT bridge consumes ar_generated_text; let the AR engine produce it - skip_special_tokens: False - output_connectors: - to_stage_1: rdma_connector - - # Stage 1: Diffusion (DiT + VAE) - # Receives latents from AR stage, performs denoising + VAE decode - - stage_id: 1 - stage_type: diffusion - runtime: - process: true - devices: "2,3" - max_batch_size: 1 - requires_multimodal_data: true # May need condition images - engine_args: - model_stage: dit - model_arch: HunyuanImage3ForCausalMM - enforce_eager: true - trust_remote_code: true - distributed_executor_backend: "mp" - 
omni_kv_config: - need_recv_cache: true # Receive AR KV cache from stage 0 - parallel_config: - tensor_parallel_size: 2 - enable_expert_parallel: true - engine_input_source: [0] # Input from AR stage - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.hunyuan_image3.ar2diffusion - final_output: true - final_output_type: image - default_sampling_params: - num_inference_steps: 50 - guidance_scale: 0 - input_connectors: - from_stage_0: rdma_connector - - -# Top-level runtime config -runtime: - enabled: true - connectors: - rdma_connector: - name: MooncakeTransferEngineConnector - extra: - host: "auto" - zmq_port: 50051 - protocol: "rdma" - device_name: "" - memory_pool_size: 4294967296 - memory_pool_device: "cpu" - edges: - - from: 0 # AR → Diffusion - to: 1