Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion examples/offline_inference/hunyuan_image3/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
--prompts "A cute cat sitting on a windowsill watching the sunset"
```

**With VAE tiling (required on A100 GPUs):**
```bash
python end2end.py --model tencent/HunyuanImage-3.0-Instruct \
--modality text2img \
--prompts "A cute cat sitting on a windowsill watching the sunset" \
--vae-use-tiling
```

#### Image to Image (img2img)

- **Pipeline**: Image + Text → AR (CoT + recaption + latent) → DiT → Edited Image
Expand Down Expand Up @@ -103,6 +111,7 @@ python end2end.py --modality text2img \
| `--stage-configs-path` | string | auto | Custom stage config YAML path |
| `--enforce-eager` | flag | `False` | Disable torch.compile |
| `--init-timeout` | int | `300` | Initialization timeout (seconds) |
| `--vae-use-tiling` | flag | `False` | Enable VAE tiling for memory optimization (required to avoid OOM on A100) |

------

Expand Down Expand Up @@ -153,7 +162,7 @@ helper handles segment-by-segment tokenization (matches HF `apply_chat_template`

## FAQ

- **OOM errors**: Decrease `gpu_memory_utilization` in the YAML stage config, or use a smaller `max_num_batched_tokens`.
- **OOM errors**: Decrease `gpu_memory_utilization` in the YAML stage config, use a smaller `max_num_batched_tokens`, or enable VAE tiling with `--vae-use-tiling` (required on A100 GPUs).
- **Custom image sizes**: Use `--height` and `--width` flags (multiples of 16 recommended).

| Stage | VRAM (approx) |
Expand Down
31 changes: 29 additions & 2 deletions examples/offline_inference/hunyuan_image3/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,17 @@
from vllm_omni.entrypoints.omni import Omni
from vllm_omni.inputs.data import OmniPromptType

# Explicit per-task presets, listed here rather than taken from a private table.
# task -> (sys_type, bot_task, trigger_tag)
# - sys_type: system-prompt type; the prompt-building loop in this script uses
#   it as the fallback when `--sys-type` is not supplied.
# - bot_task / trigger_tag: None for tasks that define neither; this script
#   only unpacks sys_type from the tuple.
# NOTE(review): keys are presumably the task names accepted by
# build_prompt_tokens(task=...) — confirm against prompt_utils.
_TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = {
"t2t": ("en_unified", None, None),
"i2t": ("en_unified", None, None),
"it2i_think": ("en_unified", "think", "<think>"),
"it2i_recaption": ("en_unified", "recaption", "<recaption>"),
"t2i_think": ("en_unified", "think", "<think>"),
"t2i_recaption": ("en_unified", "recaption", "<recaption>"),
"t2i_vanilla": ("en_vanilla", "image", None),
}

# Modality → prompt_utils task mapping
_MODALITY_TASK_MAP = {
"text2img": "t2i_think",
Expand Down Expand Up @@ -73,6 +84,11 @@ def parse_args():
parser.add_argument("--seed", type=int, default=42, help="Random seed.")
parser.add_argument("--height", type=int, default=1024, help="Output image height.")
parser.add_argument("--width", type=int, default=1024, help="Output image width.")
parser.add_argument(
"--vae-use-tiling",
action="store_true",
help="Enable VAE tiling for memory optimization.",
)

# Prompt configuration
parser.add_argument(
Expand Down Expand Up @@ -113,6 +129,7 @@ def main():
# Build Omni
omni_kwargs = {
"model": args.model,
"vae_use_tiling": args.vae_use_tiling,
"stage_configs_path": stage_configs_path,
"log_stats": args.log_stats,
"init_timeout": args.init_timeout,
Expand Down Expand Up @@ -148,8 +165,18 @@ def main():
formatted_prompts: list[OmniPromptType] = []
for p in prompts:
token_ids = build_prompt_tokens(p, tokenizer, task=task, sys_type=args.sys_type)

prompt_dict: dict = {"prompt_token_ids": token_ids, "prompt": p}
preset_sys_type, _, _ = _TASK_PRESETS[task]
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest we list tasks explicitly in end2end.py rather than private enumerate

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion adopted.

effective_sys_type = args.sys_type or preset_sys_type

# `prompt_token_ids` drives the AR stage (matches HF byte-for-byte).
# `prompt` and `use_system_prompt` are forwarded by ar2diffusion to
# the DiT stage so the diffusion pipeline can rebuild the same
# system prefix when constructing its model inputs.
prompt_dict: dict = {
"prompt_token_ids": token_ids,
"prompt": p,
"use_system_prompt": effective_sys_type,
}

if args.modality == "text2img":
prompt_dict["modalities"] = ["image"]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Verify the IT2I AR-prefill prompt matches the official HF chat-template output.

PR #3107 builds the AR prefill via
:func:`vllm_omni.diffusion.models.hunyuan_image3.prompt_utils.build_prompt_tokens`,
which segment-tokenizes the canonical Instruct chat template (`<|startoftext|>`
+ `{system}\\n\\n` + `User: [<img>]{user_prompt}\\n\\nAssistant: {trigger?}`).

The official HunyuanImage-3.0-Instruct repo ships a Jinja `chat_template` in
its tokenizer config and an `image_processor.py` whose `process_image`
defines the same VAE/VIT preprocessing the diffusion pipeline uses on the
condition image. To prevent silent drift between the AR's input distribution
and what the model was actually trained on, two contracts matter:

1. ``build_prompt_tokens`` token-id sequence equals the HF reference produced
by ``tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)``
for the same `(system, user_prompt, image)` triple. This file no longer
asserts it: the CPU-only prefill comparison was removed, and AR-side parity
is covered end-to-end by
tests/e2e/accuracy/test_hunyuan_image3_it2i_ar_output.py.
2. The image produced by the diffusion-side ``_resize_and_crop_center`` is
byte-identical to the official HF ``image_processor.resize_and_crop`` output
at ``crop_type='center'`` (the AR-side processor and the DiT helper both
claim to mirror that reference). This is the check asserted in this file.

The check needs the official image-processor module from the model snapshot;
we gate on ``HF_HOME`` cache availability so the suite stays runnable on
machines without the model weights.
"""

from __future__ import annotations

import os
import pathlib

import pytest

pytestmark = [pytest.mark.core_model, pytest.mark.cpu]


_HUNYUAN_MODEL_ID = "tencent/HunyuanImage-3.0-Instruct"


def _hf_cached(model_id: str) -> bool:
    """Return whether the local Hugging Face cache holds a snapshot of ``model_id``.

    The cache root is ``$HF_HOME`` when that variable is set and non-empty,
    otherwise ``~/.cache/huggingface``. The model counts as cached when
    ``<root>/hub/models--<org>--<name>/snapshots`` is a directory containing
    at least one entry.

    Args:
        model_id: Hub repository id such as ``"org/name"``.

    Returns:
        ``True`` if the snapshots directory exists and is non-empty.
    """
    hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
    snap_dir = os.path.join(hf_home, "hub", f"models--{model_id.replace('/', '--')}", "snapshots")
    if not os.path.isdir(snap_dir):
        return False
    # ``any`` stops at the first entry, so the scandir iterator is never
    # exhausted; the context manager closes its directory handle right away
    # instead of leaving that to the garbage collector (ResourceWarning).
    with os.scandir(snap_dir) as entries:
        return any(entries)


def _snapshot_dir(model_id: str) -> pathlib.Path:
    """Return the most recently modified cached snapshot directory of ``model_id``.

    The cache root is ``$HF_HOME`` when that variable is set and non-empty,
    otherwise ``~/.cache/huggingface``; snapshots are looked up under
    ``<root>/hub/models--<org>--<name>/snapshots``.

    Args:
        model_id: Hub repository id such as ``"org/name"``.

    Returns:
        Path of the snapshot sub-directory with the newest modification time
        (ties broken by name).

    Raises:
        FileNotFoundError: If the snapshots directory does not exist or
            contains no sub-directory.
    """
    hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
    snap_root = pathlib.Path(hf_home) / "hub" / f"models--{model_id.replace('/', '--')}" / "snapshots"
    # Keep directories only so a stray file is never mistaken for a snapshot.
    # ``iterdir`` itself raises FileNotFoundError when ``snap_root`` is absent.
    candidates = [entry for entry in snap_root.iterdir() if entry.is_dir()]
    if not candidates:
        raise FileNotFoundError(f"no cached snapshot for {model_id} under {snap_root}")
    # Directory listing order is arbitrary; pick deterministically so repeated
    # runs resolve the same snapshot when several revisions are cached.
    return max(candidates, key=lambda entry: (entry.stat().st_mtime, entry.name))


# --- Real AR-output comparison lives in
# tests/e2e/accuracy/test_hunyuan_image3_it2i_ar_output.py ---
#
# Earlier revisions of this file shipped a CPU-only "compare prefill
# token sequences" check that called the official tokenizer's
# `apply_chat_template`. That comparison was misleading: it only verified
# the *input* prompt template, not the AR-stage *generated output*; and
# it kept skipping because instantiating
# `HunyuanImage3TokenizerFast.from_pretrained(snap)` returns a
# byte-fallback (char-level) tokenizer that is not the same encoding the
# vllm-omni production path actually uses (which goes through the
# standard `AutoTokenizer.from_pretrained`).
#
# The "AR output matches official" contract is genuinely a GPU-required
# end-to-end test: it must drive `model.prepare_model_inputs` +
# `model.generate(do_sample=False)` on the HF side and the IT2I `i2t`
# stage on the omni side, then compare AR-generated token sequences.
# That is now the responsibility of the e2e test in
# tests/e2e/accuracy/test_hunyuan_image3_it2i_ar_output.py.


_OFFICIAL_PKG = "_hunyuan_image_3_official_snapshot"


def _import_official_snapshot_modules():
    """Load the official snapshot's tokenizer and image-processor modules.

    Registers the HunyuanImage-3.0-Instruct snapshot as a fake package in
    ``sys.modules`` so its ``image_processor.py`` (which does
    ``from .tokenization_hunyuan_image_3 import ...``) can be loaded with
    relative imports intact. The package is registered at most once; later
    calls return whatever the first successful registration left in
    ``sys.modules`` without retrying failed loads.

    Returns:
        ``(tokenization_module, image_processor_module)``. Both are ``None``
        when the snapshot cannot be located, has no ``image_processor.py``,
        or the tokenization module fails to load (e.g. an optional dependency
        such as diffusers is absent). Only the second element is ``None``
        when just ``image_processor.py`` fails to load.
    """
    import importlib.util
    import sys
    import types

    tok_name = f"{_OFFICIAL_PKG}.tokenization_hunyuan_image_3"
    img_name = f"{_OFFICIAL_PKG}.image_processor"

    if _OFFICIAL_PKG in sys.modules:
        return sys.modules.get(tok_name), sys.modules.get(img_name)

    try:
        snap = _snapshot_dir(_HUNYUAN_MODEL_ID)
    except (OSError, StopIteration):
        # Missing or empty snapshots directory: report "not loadable" rather
        # than letting the lookup error escape into the calling test.
        return None, None
    if not (snap / "image_processor.py").is_file():
        return None, None

    pkg = types.ModuleType(_OFFICIAL_PKG)
    pkg.__path__ = [str(snap)]
    sys.modules[_OFFICIAL_PKG] = pkg

    def _load(name: str):
        """Execute ``<snap>/<name>.py`` as a submodule; return ``None`` on failure."""
        full = f"{_OFFICIAL_PKG}.{name}"
        spec = importlib.util.spec_from_file_location(full, snap / f"{name}.py")
        if spec is None or spec.loader is None:
            return None
        mod = importlib.util.module_from_spec(spec)
        # Register before executing so relative imports between the snapshot
        # modules can resolve through sys.modules.
        sys.modules[full] = mod
        try:
            spec.loader.exec_module(mod)
        except Exception:
            # Deliberately best-effort: the snapshot code is third-party and
            # may fail for any reason; callers skip when a module is None.
            sys.modules.pop(full, None)
            return None
        return mod

    tok_mod = _load("tokenization_hunyuan_image_3")
    if tok_mod is None:
        return None, None
    img_mod = _load("image_processor")
    return tok_mod, img_mod


@pytest.mark.skipif(
    not _hf_cached(_HUNYUAN_MODEL_ID),
    reason=f"{_HUNYUAN_MODEL_ID} not in HF cache",
)
def test_dit_condition_image_preprocessing_byte_matches_official_hf():
    """Check the DiT condition-image resize/crop against the official HF code.

    ``_resize_and_crop_center`` (the diffusion pipeline helper that feeds the
    VAE encoder for IT2I conditioning) must return pixels byte-identical to
    the official ``image_processor.resize_and_crop`` at
    ``crop_type='center'``, loaded straight from the ``image_processor.py``
    bundled with the HunyuanImage-3.0-Instruct snapshot.

    Background: a review of PR #3107 flagged that the DiT-side helper had
    drifted from the AR-side processor on rounding boundaries, and commit
    ``0a7e0e6f`` of that PR aligned it to the AR-side algorithm. Since AR and
    DiT both claim to mirror the HF reference, the contract enforced here is
    a direct comparison with the HF function, not with a sibling vllm-omni
    copy.
    """
    import numpy as np
    from PIL import Image

    from vllm_omni.diffusion.models.hunyuan_image3.pipeline_hunyuan_image3 import (
        _resize_and_crop_center,
    )

    loaded_modules = _import_official_snapshot_modules()
    hf_image_processor = loaded_modules[1]
    reference_available = hf_image_processor is not None and hasattr(hf_image_processor, "resize_and_crop")
    if not reference_available:
        pytest.skip("Official HunyuanImage3 image_processor.py not loadable")
    hf_resize_and_crop = hf_image_processor.resize_and_crop

    # Fixed seed: every run compares the same random source images.
    noise_source = np.random.default_rng(seed=42)
    source_dims = [(640, 1024), (1024, 1024), (1280, 720), (480, 800)]
    target_dims = [(1024, 1024), (1024, 768), (768, 1024)]

    for source_dim in source_dims:
        source_w, source_h = source_dim
        # One random RGB image per source size, reused for every target size.
        noise = noise_source.integers(0, 256, size=(source_h, source_w, 3), dtype=np.uint8)
        source_image = Image.fromarray(noise, mode="RGB")
        for target_dim in target_dims:
            target_w, target_h = target_dim
            expected = hf_resize_and_crop(
                source_image,
                target_size=(target_w, target_h),
                resample=Image.Resampling.LANCZOS,
                crop_type="center",
            )
            actual = _resize_and_crop_center(source_image, target_w, target_h)
            assert expected.size == actual.size == (target_w, target_h), (
                f"size mismatch for src={source_dim} target={target_dim}: "
                f"hf_official={expected.size} dit={actual.size}"
            )
            expected_px = np.asarray(expected)
            actual_px = np.asarray(actual)
            assert np.array_equal(expected_px, actual_px), (
                f"DiT condition-image preprocessing diverged from HF "
                f"image_processor.resize_and_crop at src={source_dim} "
                f"target={target_dim}: max abs diff = "
                f"{int(np.abs(expected_px.astype(int) - actual_px.astype(int)).max())}"
            )
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,9 @@ def __init__(self, config):

self.layers = modules

def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Pass ``x`` through ``self.layers`` and return its output."""
    output = self.layers(x)
    return output


class HunYuanRotary2DEmbedder:
r"""
Expand Down
Loading
Loading