vllm-project · Gaohan123 · May 14, 2026 · May 8, 2026 · May 8, 2026 · May 9, 2026
@@ -112,6 +112,7 @@ python end2end.py --modality text2img \
                   --additional-config '{"torchair_graph_config":{"enabled":true}}'
 ```
 
+
 ## Key Arguments
 
 | Argument | Description |
@@ -123,16 +124,15 @@ python end2end.py --modality text2img \
 | `--steps` | Number of diffusion inference steps for image generation. |
 | `--guidance-scale` | Classifier-free guidance scale for image generation. |
 | `--height`, `--width` | Output image size for `text2img`. |
-| `--bot-task` | Prompt behavior. `auto` selects the default from `--modality`; `think` adds `<think>`; `recaption` adds `<recaption>`; `vanilla` uses the text-to-image pretrain template. |
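+| `--bot-task` | Override the prompt mode: one of `none`, `think`, `recaption`, `think_recaption`, or `vanilla`. When omitted, the mode is selected automatically from `--modality`; `none` disables that modality default. |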
 | `--sys-type` | Override the system prompt type, for example `en_unified` or `en_vanilla`. |
 | `--vae-use-tiling` | Enable VAE tiling for memory reduction. |
 
 ## Notes
 
-- `hunyuan_image3_ar.yaml` is a 4-card AR-only text/comprehension deploy. It sets `engine_output_type: text`, `final_output_type: text`, and text sampling defaults.
-- `hunyuan_image3_dit.yaml` is a single-stage DiT deploy with `stage_id: 0`; it does not require stage 1 or a running AR stage.
+- `hunyuan_image3_ar.yaml` is a 4-card AR-only text/comprehension deploy.
+- `hunyuan_image3_dit.yaml` is a single-stage DiT deploy with `stage_id: 0`.
 - The old HunyuanImage3 YAMLs under `model_executor/stage_configs/` and `platforms/*/stage_configs/` have been folded into the deploy YAMLs.
-- This PR does not keep the HunyuanImage3 AR-to-DiT KV reuse wiring. The deploy YAMLs describe the topology and platform settings only.
 
 ## Prompt Format
 
@@ -148,22 +148,8 @@ Assistant: {trigger_tag?}
 
 - `<img>`: Placeholder for each input image (single token; expanded by the multimodal pipeline).
 - Trigger tags: `<think>` for CoT and `<recaption>` for recaptioning, placed after `Assistant: `.
-- System prompt: Auto-selected based on task.
-- `t2i_vanilla` is the only task that uses the bare pretrain template without chat structure.
-- The example composes the internal prompt task from `--modality` and `--bot-task`
-  before calling `prompt_utils`; for example, `img2text + think` becomes
-  `i2t_think` for prompt and stop-token lookup.
+- System prompt: Auto-selected from `task` and `bot_task`.
+- `bot_task='vanilla'` with `task='t2i'` uses the bare pretrain template.
 
 The shared `vllm_omni.diffusion.models.hunyuan_image3.prompt_utils.build_prompt_tokens()`
 helper handles segment-by-segment tokenization and matches HF `apply_chat_template`.
-
-## FAQ
-
-- **OOM errors**: Decrease `gpu_memory_utilization` in the deploy YAML, use a smaller `max_num_batched_tokens`, or enable VAE tiling with `--vae-use-tiling`.
-- **Custom image sizes**: Use `--height` and `--width` flags (multiples of 16 recommended).
-
-| Stage | VRAM (approx) |
-| :--- | :--- |
-| Stage 0 (AR) | ~15 GiB + KV Cache |
-| Stage 1 (DiT) | ~30 GiB |
-| Total (8-GPU) | ~45 GiB + KV Cache |
@@ -1,16 +1,5 @@
 """
 HunyuanImage-3.0-Instruct unified end-to-end inference script.
-
-Supports all modalities through a single entry point:
-  - text2img:  Text → AR → DiT → Image
-  - img2img:   Text+Image → AR → DiT → Edited Image (IT2I)
-  - img2text:  Image+Text → AR → Text description (I2T)
-  - text2text: Text → AR → Text (comprehension, no image)
-
-Usage:
-    python end2end.py --modality text2img --prompts "A cute cat"
-    python end2end.py --modality img2img --image-path input.png --prompts "Make it snowy"
-    python end2end.py --modality img2text --image-path input.png --prompts "Describe this image"
 """
 
 import argparse
@@ -19,18 +8,25 @@
 from pathlib import Path
 
 from vllm_omni.diffusion.models.hunyuan_image3.prompt_utils import (
-    _TASK_PRESETS,
+    MAX_IMAGES_PER_REQUEST,
     build_prompt_tokens,
     resolve_stop_token_ids,
+    resolve_sys_type,
 )
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniPromptType
 
-# Default deploy configs are absolute so this example works from any cwd.
 _REPO_ROOT = Path(__file__).resolve().parents[3]
 _DEFAULT_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3.yaml")
 _DEFAULT_AR_DEPLOY_CONFIG = str(_REPO_ROOT / "vllm_omni" / "deploy" / "hunyuan_image3_ar.yaml")
 
+_MODALITY_TASK_MAP: dict[str, tuple[str, str | None]] = {
+    "text2img": ("t2i", "think"),
+    "img2img": ("it2i", "think"),
+    "img2text": ("i2t", None),
+    "text2text": ("t2t", None),
+}
+
 _MODALITY_DEFAULT_DEPLOY_CONFIG = {
     "text2img": _DEFAULT_DEPLOY_CONFIG,
     "img2img": _DEFAULT_DEPLOY_CONFIG,
@@ -45,73 +41,37 @@
     "text2text": "text-to-text",
 }
 
-_MODALITY_TASK_MAP = {
-    "text2img": "t2i",
-    "img2img": "it2i",
-    "img2text": "i2t",
-    "text2text": "t2t",
-}
-
 
 def parse_args():
     parser = argparse.ArgumentParser(description="HunyuanImage-3.0-Instruct end-to-end inference.")
-    parser.add_argument(
-        "--model",
-        default="tencent/HunyuanImage-3.0-Instruct",
-        help="Model name or local path.",
-    )
+    parser.add_argument("--model", default="tencent/HunyuanImage-3.0-Instruct", help="Model name or local path.")
     parser.add_argument(
         "--modality",
         default="text2img",
-        choices=["text2img", "img2img", "img2text", "text2text"],
-        help="Modality mode to control stage execution.",
+        choices=list(_MODALITY_TASK_MAP),
     )
     parser.add_argument("--prompts", nargs="+", default=None, help="Input text prompts.")
     parser.add_argument(
         "--image-path",
         type=str,
         default=None,
-        help="Path to input image (for img2img/img2text).",
-    )
-    parser.add_argument(
-        "--output",
-        type=str,
-        default=".",
-        help="Output directory to save results.",
+        help="Input image path(s) for img2img/img2text. Comma-separated for multi-image (up to 3).",
     )
-
-    # Generation parameters
+    parser.add_argument("--output", type=str, default=".", help="Output directory to save results.")
     parser.add_argument("--steps", type=int, default=50, help="Number of inference steps.")
     parser.add_argument("--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale.")
     parser.add_argument("--seed", type=int, default=42, help="Random seed.")
     parser.add_argument("--height", type=int, default=1024, help="Output image height.")
     parser.add_argument("--width", type=int, default=1024, help="Output image width.")
-    parser.add_argument(
-        "--vae-use-tiling",
-        action="store_true",
-        help="Enable VAE tiling for memory optimization.",
-    )
-
-    # Prompt configuration
+    parser.add_argument("--vae-use-tiling", action="store_true", help="Enable VAE tiling.")
     parser.add_argument(
         "--bot-task",
         type=str,
-        default="auto",
-        choices=["auto", "think", "recaption", "think_recaption", "vanilla"],
-        help=(
-            "Prompt behavior. 'auto' selects the default for the modality; "
-            "'think' adds <think>; 'recaption' adds <recaption>; "
-            "'vanilla' uses the t2i pretrain template."
-        ),
-    )
-    parser.add_argument(
-        "--sys-type",
-        type=str,
         default=None,
-        help="Override system prompt type (e.g. en_unified, en_vanilla).",
+        choices=["none", "think", "recaption", "think_recaption", "vanilla"],
+        help="Override prompt mode. Default: auto from --modality.",
     )
-
-    # Omni init args
+    parser.add_argument("--sys-type", type=str, default=None, help="Override system prompt type.")
     parser.add_argument("--deploy-config", type=str, default=None, help="Custom deploy YAML path.")
     parser.add_argument("--stage-configs-path", type=str, default=None, help="Custom legacy stage config YAML path.")
     parser.add_argument("--log-stats", action="store_true", default=False)
@@ -157,22 +117,13 @@ def main():
     os.makedirs(args.output, exist_ok=True)
     additional_config = parse_additional_config(args.additional_config)
 
-    # Determine task for prompt formatting from modality + bot behavior.
-    task = _MODALITY_TASK_MAP[args.modality]
-    assert task is not None
-    bot_task = args.bot_task
-    if bot_task != "auto":
-        task = task + "_" + bot_task
-    if task not in _TASK_PRESETS:
-        valid_bot_tasks = {
-            "text2img": ["think", "recaption", "vanilla"],
-            "img2img": ["think", "recaption", "think_recaption"],
-            "img2text": ["auto"],
-            "text2text": ["auto"],
-        }[args.modality]
-        raise ValueError(
-            f"--bot-task {bot_task!r} is not supported for {args.modality}. Choose from: {valid_bot_tasks}"
-        )
+    task, default_bot_task = _MODALITY_TASK_MAP[args.modality]
+    if args.bot_task is None:
+        bot_task: str | None = default_bot_task
+    elif args.bot_task == "none":
+        bot_task = None
+    else:
+        bot_task = args.bot_task
 
     if args.deploy_config is not None and args.stage_configs_path is not None:
         raise ValueError("--deploy-config and --stage-configs-path are mutually exclusive.")
@@ -182,13 +133,13 @@ def main():
     if deploy_config is None and stage_configs_path is None:
         deploy_config = _MODALITY_DEFAULT_DEPLOY_CONFIG[args.modality]
 
-    # Build Omni
     omni_kwargs = {
         "model": args.model,
         "vae_use_tiling": args.vae_use_tiling,
         "log_stats": args.log_stats,
         "init_timeout": args.init_timeout,
         "enforce_eager": args.enforce_eager,
+        "mode": _MODALITY_MODE[args.modality],
     }
 
     if additional_config is not None:
@@ -197,85 +148,80 @@ def main():
         omni_kwargs["deploy_config"] = deploy_config
     else:
         omni_kwargs["stage_configs_path"] = stage_configs_path
-    omni_kwargs["mode"] = _MODALITY_MODE[args.modality]
 
     omni = Omni(**omni_kwargs)
 
-    # Prepare prompts
     prompts = args.prompts or ["A cute cat"]
-    if not prompts:
-        print("[Info] No prompts provided, using default.")
-        prompts = ["A cute cat"]
-
-    # Load image if needed
-    input_image = None
+    input_images: list = []
     if args.modality in ("img2img", "img2text"):
-        if not args.image_path or not os.path.exists(args.image_path):
+        if not args.image_path:
             raise ValueError(f"--image-path required for {args.modality}, got: {args.image_path}")
         from PIL import Image
 
-        input_image = Image.open(args.image_path).convert("RGB")
+        image_paths = [p.strip() for p in args.image_path.split(",") if p.strip()]
+        if len(image_paths) > MAX_IMAGES_PER_REQUEST:
+            raise ValueError(
+                f"--image-path accepts at most {MAX_IMAGES_PER_REQUEST} images for "
+                f"HunyuanImage-3.0 IT2I, got {len(image_paths)}: {args.image_path}"
+            )
+        for image_path in image_paths:
+            if not os.path.exists(image_path):
+                raise ValueError(f"Image path does not exist: {image_path}")
+            input_images.append(Image.open(image_path).convert("RGB"))
+        if not input_images:
+            raise ValueError(f"--image-path produced no usable paths: {args.image_path!r}")
 
-    # Load tokenizer for segment-wise prompt tokenization (matches HF
-    # apply_chat_template byte-for-byte; see build_prompt_tokens docstring).
     from transformers import AutoTokenizer
 
     tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+    mm_image_payload = (input_images[0] if len(input_images) == 1 else input_images) if input_images else None
 
-    # Format prompts
     formatted_prompts: list[OmniPromptType] = []
-    for p in prompts:
-        result = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type)
+    for prompt in prompts:
+        build_kwargs: dict = {"task": task, "bot_task": bot_task, "sys_type": args.sys_type}
+        if input_images:
+            build_kwargs["num_images"] = len(input_images)
+        result = build_prompt_tokens(prompt, tokenizer, **build_kwargs)
         token_ids = result.token_ids
-        effective_sys_type = result.system_prompt_type
+        effective_sys_type = args.sys_type or resolve_sys_type(bot_task)
 
-        # `prompt_token_ids` drives the AR stage (matches HF byte-for-byte).
-        # `prompt` and `use_system_prompt` are forwarded by ar2diffusion to
-        # the DiT stage so the diffusion pipeline can rebuild the same
-        # system prefix when constructing its model inputs.
         prompt_dict: dict = {
             "prompt_token_ids": token_ids,
-            "prompt": p,
+            "prompt": prompt,
             "use_system_prompt": effective_sys_type,
         }
-
         if args.modality == "text2img":
             prompt_dict["modalities"] = ["image"]
         elif args.modality == "img2img":
             prompt_dict["modalities"] = ["image"]
-            prompt_dict["multi_modal_data"] = {"image": input_image}
-            prompt_dict["height"] = input_image.height
-            prompt_dict["width"] = input_image.width
+            prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
+            prompt_dict["height"] = input_images[0].height
+            prompt_dict["width"] = input_images[0].width
         elif args.modality == "img2text":
             prompt_dict["modalities"] = ["text"]
-            prompt_dict["multi_modal_data"] = {"image": input_image}
-        elif args.modality == "text2text":
+            prompt_dict["multi_modal_data"] = {"image": mm_image_payload}
+        else:
             prompt_dict["modalities"] = ["text"]
-
         formatted_prompts.append(prompt_dict)
 
-    # Build sampling params from defaults
     params_list = list(omni.default_sampling_params_list)
 
-    # Override diffusion params if applicable
     from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
     ar_stop_token_ids = resolve_stop_token_ids(task=task, bot_task=bot_task, tokenizer=tokenizer)
-    assert ar_stop_token_ids is not None
     for sp in params_list:
         if isinstance(sp, OmniDiffusionSamplingParams):
             sp.num_inference_steps = args.steps
             sp.guidance_scale = args.guidance_scale
             sp.guidance_scale_provided = True
             if args.seed is not None:
                 sp.seed = args.seed
-            if args.modality in ("text2img",):
+            if args.modality == "text2img":
                 sp.height = args.height
                 sp.width = args.width
         elif hasattr(sp, "stop_token_ids"):
             sp.stop_token_ids = ar_stop_token_ids
 
-    # Print configuration
     print(f"\n{'=' * 60}")
     print("HunyuanImage-3.0 Generation Configuration:")
     print(f"  Model: {args.model}")
@@ -300,13 +246,9 @@ def main():
     print(f"  Prompts: {prompts}")
     print(f"{'=' * 60}\n")
 
-    # Generate
     omni_outputs = list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list))
-
-    # Process outputs
     img_idx = 0
     for req_output in omni_outputs:
-        # Text output (AR stage or text-only)
         ro = getattr(req_output, "request_output", None)
         txt = ""
         if ro and getattr(ro, "outputs", None):
@@ -320,11 +262,9 @@ def main():
         if txt:
             print(f"[Output] Text:\n{txt}")
 
-        # Image output (DiT stage)
         images = getattr(req_output, "images", None)
         if not images and ro and hasattr(ro, "images"):
             images = ro.images
-
         if images:
             for j, img in enumerate(images):
                 save_path = os.path.join(args.output, f"output_{img_idx}_{j}.png")