vllm-project · lishunyang12 · Apr 21, 2026 · Apr 21, 2026 · Apr 22, 2026 · Apr 22, 2026
@@ -23,7 +23,7 @@ HunyuanImage-3.0-Instruct supports multiple modality modes. You can control the
 - **Pipeline**: Text → AR (CoT + latent tokens) → DiT (denoise) → VAE Decode → Image
 - **Stages Used**: Stage 0 (AR) + Stage 1 (DiT)
 - **KV Transfer**: AR sends KV cache to DiT for conditioned generation
-- **Default Config**: `hunyuan_image3_t2i.yaml`
+- **Default Config**: `vllm_omni/deploy/hunyuan_image3_t2i.yaml`
 
 ```bash
 python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
@@ -36,7 +36,7 @@ python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
 - **Pipeline**: Image + Text → AR (CoT + recaption + latent) → DiT → Edited Image
 - **Stages Used**: Stage 0 (AR) + Stage 1 (DiT)
 - **KV Transfer**: AR sends KV cache to DiT
-- **Default Config**: `hunyuan_image3_it2i.yaml`
+- **Default Config**: `vllm_omni/deploy/hunyuan_image3_it2i.yaml`
 
 ```bash
 python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
@@ -45,31 +45,6 @@ python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
                   --prompts "Make the petals neon pink"
 ```
 
-#### Image to Text (img2text)
-
-- **Pipeline**: Image + Question → AR → Text description
-- **Stages Used**: Stage 0 (AR) only
-- **Default Config**: `hunyuan_image3_i2t.yaml`
-
-```bash
-python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
-                  --modality img2text \
-                  --image-path /path/to/image.jpg \
-                  --prompts "Describe the content of the picture."
-```
-
-#### Text to Text (text2text)
-
-- **Pipeline**: Text → AR → Text
-- **Stages Used**: Stage 0 (AR) only
-- **Default Config**: `hunyuan_image3_t2t.yaml`
-
-```bash
-python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
-                  --modality text2text \
-                  --prompts "What is the capital of France?"
-```
-
 ### Inference Steps & Guidance
 
 Control generation quality for image modalities:
@@ -89,7 +64,7 @@ python end2end.py --modality text2img \
 | Argument               | Type   | Default                              | Description                                                  |
 | :--------------------- | :----- | :----------------------------------- | :----------------------------------------------------------- |
 | `--model`              | string | `tencent/HunyuanImage-3.0-Instruct` | Model path or name                                           |
-| `--modality`           | choice | `text2img`                           | Modality: `text2img`, `img2img`, `img2text`, `text2text`     |
+| `--modality`           | choice | `text2img`                           | Modality: `text2img`, `img2img`                              |
 | `--prompts`            | list   | `None`                               | Input text prompts                                           |
 | `--image-path`         | string | `None`                               | Input image path (for `img2img`)                             |
 | `--output`             | string | `.`                                  | Output directory for saved images                            |
@@ -108,28 +83,20 @@ python end2end.py --modality text2img \
 
 #### ⚙️ Stage Configurations
 
-| Config YAML                         | Modality  | Stages | GPUs   | Description                           |
-| :---------------------------------- | :-------- | :----- | :----- | :------------------------------------ |
-| `hunyuan_image3_t2i.yaml`           | text2img  | 2      | 8      | T2I with AR→DiT, 4 GPU each          |
-| `hunyuan_image3_it2i.yaml`          | img2img   | 2      | 8      | IT2I with AR→DiT, 4 GPU each         |
-| `hunyuan_image3_i2t.yaml`           | img2text  | 1      | 4      | I2T (AR only)                         |
-| `hunyuan_image3_t2t.yaml`           | text2text | 1      | 4      | T2T (AR only)                         |
-| `hunyuan_image3_t2i_2gpu.yaml`      | text2img  | 2      | 2      | T2I for 2-GPU setups                  |
-| `hunyuan_image3_moe.yaml`           | text2img  | 2      | 8      | T2I with MoE AR→DiT KV reuse          |
-| `hunyuan_image3_moe_dit_2gpu_fp8.yaml` | text2img | 2   | 2      | T2I with FP8 quantization             |
+All deploy YAMLs live under `vllm_omni/deploy/` in the new schema (PR #2383).
 
-------
+| Deploy YAML                              | Modality   | Stages | GPUs | Description                       |
+| :--------------------------------------- | :--------- | :----- | :--- | :-------------------------------- |
+| `hunyuan_image3_t2i.yaml`                | text2img   | 2      | 8    | AR + DiT with KV transfer         |
+| `hunyuan_image3_it2i.yaml`               | img2img    | 2      | 8    | AR + DiT (image-edit)             |
 
-## Using MoE Config
+The `hunyuan_image3_dit_only` pipeline is also registered, with a shipped `hunyuan_image3_dit_only.yaml` deploy, for users who want to skip the AR stage.
 
-The `hunyuan_image3_moe.yaml` config enables AR→DiT KV cache reuse with 8 GPUs (4 for AR + 4 for DiT).
+------
 
-```bash
-python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
-                  --modality text2img \
-                  --stage-configs-path hunyuan_image3_moe.yaml \
-                  --prompts "A cute cat"
-```
+## AR→DiT KV cache reuse
+
+The default `hunyuan_image3_t2i.yaml` deploy already enables AR→DiT KV cache reuse on 8 GPUs (4 for AR + 4 for DiT) — the wiring lives on the pipeline (`omni_kv_config` for both stages).
 
 ------
 

@@ -72,12 +72,10 @@ def build_prompt(
     return "".join(parts)
 
 
-# Modality → default stage config
+# Modality → default deploy config (under vllm_omni/deploy/).
 _MODALITY_DEFAULT_CONFIG = {
-    "text2img": "hunyuan_image3_t2i.yaml",
-    "img2img": "hunyuan_image3_it2i.yaml",
-    "img2text": "hunyuan_image3_i2t.yaml",
-    "text2text": "hunyuan_image3_t2t.yaml",
+    "text2img": "vllm_omni/deploy/hunyuan_image3_t2i.yaml",
+    "img2img": "vllm_omni/deploy/hunyuan_image3_it2i.yaml",
 }
 
 
@@ -91,7 +89,7 @@ def parse_args():
     parser.add_argument(
         "--modality",
         default="text2img",
-        choices=["text2img", "img2img", "img2text", "text2text"],
+        choices=["text2img", "img2img"],
         help="Modality mode to control stage execution.",
     )
     parser.add_argument("--prompts", nargs="+", default=None, help="Input text prompts.")
@@ -148,21 +146,15 @@ def main():
     # Determine task for prompt formatting
     task = args.bot_task or _MODALITY_TASK_MAP[args.modality]
 
-    # Determine stage config
-    stage_configs_path = args.stage_configs_path or _MODALITY_DEFAULT_CONFIG[args.modality]
-
-    # Build Omni
-    omni_kwargs = {
-        "model": args.model,
-        "stage_configs_path": stage_configs_path,
-        "log_stats": args.log_stats,
-        "init_timeout": args.init_timeout,
-        "enforce_eager": args.enforce_eager,
+    # Modality-derived overrides (deploy config fallback, ``mode``) are computed here,
+    # not read as-is from CLI flags, so forward them to ``from_cli_args`` via ``**overrides``.
+    overrides: dict[str, object] = {
+        "stage_configs_path": args.stage_configs_path or _MODALITY_DEFAULT_CONFIG[args.modality],
     }
     if args.modality in ("text2img", "img2img"):
-        omni_kwargs["mode"] = "text-to-image"
+        overrides["mode"] = "text-to-image"
 
-    omni = Omni(**omni_kwargs)
+    omni = Omni.from_cli_args(args, **overrides)
 
     # Prepare prompts
     prompts = args.prompts or ["A cute cat"]
@@ -222,7 +214,7 @@ def main():
     print("HunyuanImage-3.0 Generation Configuration:")
     print(f"  Model: {args.model}")
     print(f"  Modality: {args.modality}")
-    print(f"  Stage config: {stage_configs_path}")
+    print(f"  Stage config: {overrides['stage_configs_path']}")
     print(f"  Num stages: {omni.num_stages}")
     if args.modality in ("text2img", "img2img"):
         print(f"  Inference steps: {args.steps}")

@@ -0,0 +1,27 @@
+# HunyuanImage-3.0 DiT-only (no AR). CUDA verified on 4x H20.
+pipeline: hunyuan_image3_dit_only
+trust_remote_code: true
+distributed_executor_backend: mp
+
+stages:
+  - stage_id: 0
+    max_num_seqs: 1
+    enforce_eager: true
+    devices: "0,1,2,3"
+    parallel_config:
+      tensor_parallel_size: 4
+      enable_expert_parallel: true
+    default_sampling_params:
+      seed: 42
+
+platforms:
+  npu:
+    # Verified on 8x A3-64G NPUs.
+    stages:
+      - stage_id: 0
+        gpu_memory_utilization: 0.65
+        devices: "0,1,2,3,4,5,6,7"
+        max_num_batched_tokens: 32768
+        parallel_config:
+          tensor_parallel_size: 8
+          enable_expert_parallel: true
@@ -17,7 +17,7 @@
 MODEL_NAME = "tencent/HunyuanImage-3.0"
 LOCAL_CLIP_PATH = "openai/clip-vit-base-patch32"
 REPO_ROOT = Path(__file__).resolve().parents[3]
-STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image3_t2i.yaml"
+STAGE_CONFIG_PATH = REPO_ROOT / "tests" / "e2e" / "offline_inference" / "deploy" / "hunyuan_image3_dit_only_ci.yaml"
 
 pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion]
 

@@ -1334,3 +1334,61 @@ def test_constraints_win(self):
         assert stages[1].yaml_extras["default_sampling_params"]["stop_token_ids"] == [2150]
         # Deploy temperature still flows through
         assert stages[0].yaml_extras["default_sampling_params"]["temperature"] == 0.4
+
+
+class TestHunyuanImage3ShippedDeploys:
+    """Structural smoke tests for shipped Hunyuan-Image3 deploy yamls.
+
+    The GPU-gated e2e test (``test_hunyuanimage3_text2img.py``) runs against
+    the DiT-only CI fixture; these cheap tests need no GPU and catch schema
+    regressions in the shipped deploys: t2i / it2i (AR→DiT) and dit_only
+    (DiT alone).
+    """
+
+    @pytest.mark.parametrize(
+        "yaml_name,expected_pipeline,expected_stage_count,expected_stages",
+        [
+            ("hunyuan_image3_t2i.yaml", "hunyuan_image3_t2i", 2, ("AR", "dit")),
+            ("hunyuan_image3_it2i.yaml", "hunyuan_image3_it2i", 2, ("AR", "dit")),
+            ("hunyuan_image3_dit_only.yaml", "hunyuan_image3_dit_only", 1, ("dit",)),
+        ],
+    )
+    def test_shipped_deploys_parse_and_resolve(
+        self, yaml_name, expected_pipeline, expected_stage_count, expected_stages
+    ):
+        import vllm_omni.model_executor.models.hunyuan_image3.pipeline  # noqa: F401
+        from vllm_omni.config.stage_config import load_deploy_config, merge_pipeline_deploy
+
+        deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / yaml_name
+        assert deploy_path.exists(), f"Shipped deploy missing: {yaml_name}"
+
+        deploy = load_deploy_config(deploy_path)
+        assert deploy.pipeline == expected_pipeline
+        assert len(deploy.stages) == expected_stage_count
+
+        pipeline = _PIPELINE_REGISTRY[expected_pipeline]
+        assert tuple(s.model_stage for s in pipeline.stages) == expected_stages
+
+        stages = merge_pipeline_deploy(pipeline, deploy)
+        assert len(stages) == expected_stage_count
+
+    def test_t2i_ar_dit_topology(self):
+        """The AR→DiT t2i default wires stage 1 to consume stage 0's KV output."""
+        import vllm_omni.model_executor.models.hunyuan_image3.pipeline  # noqa: F401
+        from vllm_omni.config.stage_config import load_deploy_config, merge_pipeline_deploy
+
+        deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "hunyuan_image3_t2i.yaml"
+        assert deploy_path.exists(), "Shipped deploy missing: hunyuan_image3_t2i.yaml"
+
+        pipeline = _PIPELINE_REGISTRY["hunyuan_image3_t2i"]
+        deploy = load_deploy_config(deploy_path)
+        stages = merge_pipeline_deploy(pipeline, deploy)
+
+        # Pipeline-level invariants for the KV-transfer path.
+        assert pipeline.stages[0].omni_kv_config is not None
+        assert pipeline.stages[1].input_sources == (0,)
+        assert pipeline.stages[1].omni_kv_config is not None
+
+        # Deploy-level placement: 4 AR + 4 DiT across 8 devices.
+        assert stages[0].yaml_runtime.get("devices") == "0,1,2,3"
+        assert stages[1].yaml_runtime.get("devices") == "4,5,6,7"
@@ -33,6 +33,28 @@
 # --- Multi-stage omni pipelines (LLM-centric; audio / video I/O) ---
 _OMNI_PIPELINES: dict[str, tuple[str, str]] = {
     # model_type -> (module_path, variable_name)
+    "hunyuan_image3_t2i": (
+        "vllm_omni.model_executor.models.hunyuan_image3.pipeline",
+        "HUNYUAN_IMAGE3_T2I_PIPELINE",
+    ),
+    "hunyuan_image3_it2i": (
+        "vllm_omni.model_executor.models.hunyuan_image3.pipeline",
+        "HUNYUAN_IMAGE3_IT2I_PIPELINE",
+    ),
+    # ``dit_only`` ships a default deploy yaml (DiT-only path + NPU section);
+    # ``i2t`` / ``t2t`` ship none — users bring their own deploy yaml.
+    "hunyuan_image3_dit_only": (
+        "vllm_omni.model_executor.models.hunyuan_image3.pipeline",
+        "HUNYUAN_IMAGE3_DIT_ONLY_PIPELINE",
+    ),
+    "hunyuan_image3_i2t": (
+        "vllm_omni.model_executor.models.hunyuan_image3.pipeline",
+        "HUNYUAN_IMAGE3_I2T_PIPELINE",
+    ),
+    "hunyuan_image3_t2t": (
+        "vllm_omni.model_executor.models.hunyuan_image3.pipeline",
+        "HUNYUAN_IMAGE3_T2T_PIPELINE",
+    ),
     "qwen2_5_omni": (
         "vllm_omni.model_executor.models.qwen2_5_omni.pipeline",
         "QWEN2_5_OMNI_PIPELINE",

@@ -463,6 +463,22 @@ class DeployConfig:
 _STAGE_DEPLOY_FIELDS = {f.name: f for f in fields(StageDeployConfig) if f.name not in _STAGE_NON_ENGINE_KEYS}
 
 
+_DIT_PARALLEL_FIELDS_AT_TOP_LEVEL = frozenset(
+    {
+        "enable_expert_parallel",
+        "sequence_parallel_size",
+        "ulysses_degree",
+        "ring_degree",
+        "ulysses_mode",
+        "cfg_parallel_size",
+        "vae_patch_parallel_size",
+        "use_hsdp",
+        "hsdp_shard_size",
+        "hsdp_replicate_size",
+    }
+)
+
+
 def _parse_stage_deploy(stage_data: dict[str, Any]) -> StageDeployConfig:
     """Parse a single stage entry from deploy YAML into StageDeployConfig."""
     if "engine_args" in stage_data:
@@ -477,6 +493,22 @@ def _parse_stage_deploy(stage_data: dict[str, Any]) -> StageDeployConfig:
         if name in engine_args:
             kwargs[name] = engine_args.pop(name)
 
+    # Flat schema support: move DiT-only parallel fields from the top level
+    # into a parallel_config block, and mirror tensor_parallel_size there when
+    # it is > 1. Lets authors write the same flat shape for AR and DiT stages —
+    # DiT engines read parallel_config.*, AR engines read top-level fields.
+    # Explicit nested parallel_config values win (we only setdefault).
+    pc = engine_args.get("parallel_config")
+    if not isinstance(pc, dict):
+        pc = {}
+    for name in _DIT_PARALLEL_FIELDS_AT_TOP_LEVEL:
+        if name in engine_args:
+            pc.setdefault(name, engine_args.pop(name))
+    if (tps := kwargs.get("tensor_parallel_size")) and tps > 1:
+        pc.setdefault("tensor_parallel_size", tps)
+    if pc:
+        engine_args["parallel_config"] = pc
+
     kwargs["output_connectors"] = stage_data.get("output_connectors")
     kwargs["input_connectors"] = stage_data.get("input_connectors")
     kwargs["default_sampling_params"] = stage_data.get("default_sampling_params")
@@ -714,6 +746,8 @@ def _build_engine_args(
         engine_args["model_subdir"] = ps.model_subdir
     if ps.tokenizer_subdir:
         engine_args["tokenizer_subdir"] = ps.tokenizer_subdir
+    if ps.omni_kv_config is not None:
+        engine_args["omni_kv_config"] = dict(ps.omni_kv_config)
 
     # Pipeline-wide top-level DeployConfig settings, applied to every stage.
     for name in _PIPELINE_WIDE_ENGINE_FIELDS:
@@ -800,6 +834,8 @@ def merge_pipeline_deploy(
         runtime: dict[str, Any] = {"process": True}
         if ds is not None:
             runtime["devices"] = ds.devices
+        if ps.requires_multimodal_data:
+            runtime["requires_multimodal_data"] = True
 
         result.append(
             StageConfig(

@@ -0,0 +1,29 @@
+# HunyuanImage-3.0 DiT-only (no AR). Matches the Tencent reference
+# `generate_image(prompt, bot_task="image")` path. CUDA verified on 4x H20;
+# NPU verified on 8x A3-64G.
+pipeline: hunyuan_image3_dit_only
+trust_remote_code: true
+distributed_executor_backend: mp
+async_chunk: false
+
+stages:
+  - stage_id: 0
+    max_num_seqs: 1
+    enforce_eager: true
+    tensor_parallel_size: 4
+    enable_expert_parallel: true
+    devices: "0,1,2,3"
+    default_sampling_params:
+      num_inference_steps: 50
+      guidance_scale: 2.5
+      seed: 42
+
+platforms:
+  npu:
+    stages:
+      - stage_id: 0
+        gpu_memory_utilization: 0.65
+        tensor_parallel_size: 8
+        enable_expert_parallel: true
+        devices: "0,1,2,3,4,5,6,7"
+        max_num_batched_tokens: 32768