diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index c33d7b4d10d..f2fa7ae1143 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -446,6 +446,50 @@ steps:
                   path: /mnt/hf-cache
                   type: DirectoryOrCreate
 
+  - label: ":full_moon: Diffusion X2I(&A&T) · HunyuanImage-3.0-Instruct e2e"
+    timeout_in_minutes: 60
+    commands:
+      - |
+        timeout 55m bash -c '
+          set -e
+          export VLLM_TEST_CLEAN_GPU_MEMORY=1
+          pytest -s -v tests/e2e/offline_inference/test_hunyuanimage3_i2t.py -m "advanced_model" --run-level "advanced_model"
+          pytest -s -v tests/e2e/offline_inference/test_hunyuanimage3_t2i.py -m "advanced_model" --run-level "advanced_model"
+        '
+    agents:
+      queue: "mithril-h100-pool"
+    plugins:
+      - kubernetes:
+          podSpec:
+            containers:
+              - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                resources:
+                  limits:
+                    nvidia.com/gpu: 4
+                volumeMounts:
+                  - name: devshm
+                    mountPath: /dev/shm
+                  - name: hf-cache
+                    mountPath: /root/.cache/huggingface
+                env:
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+            nodeSelector:
+              node.kubernetes.io/instance-type: gpu-h100-sxm
+            volumes:
+              - name: devshm
+                emptyDir:
+                  medium: Memory
+              - name: hf-cache
+                hostPath:
+                  path: /mnt/hf-cache
+                  type: DirectoryOrCreate
+
   - label: ":full_moon: Diffusion X2I(&A&T) · Perf Test"
     key: nightly-diffusion-x2iat-performance
     timeout_in_minutes: 180
diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_i2t.py b/tests/e2e/offline_inference/test_hunyuanimage3_i2t.py
new file mode 100644
index 00000000000..43d9fc9b593
--- /dev/null
+++ b/tests/e2e/offline_inference/test_hunyuanimage3_i2t.py
@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E402
+"""Smoke test for HunyuanImage-3.0 Image-to-Text (I2T) pipeline."""
+
+import sys
+from collections.abc import Generator
+from pathlib import Path
+
+import pytest
+import torch
+
+from vllm_omni import Omni
+
+MODEL_NAME = "tencent/HunyuanImage-3.0-Instruct"
+REPO_ROOT = Path(__file__).resolve().parents[3]
+STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image3_i2t.yaml"
+
+# Longest stable prefix shared by HF greedy reference and vllm-omni AR output on
+# this input (verified 2026-05-04 via scripts/bench/hf_i2t_pr2986_baseline.py +
+# vllm_omni_i2t_pr2986_check.py). vllm-omni vs HF is not bitwise-alignable past
+# this point — see memory/hf/hf_omni_alignment_method.md.
+EXPECTED_PREFIX = "The image is a solid"
+
+# Allow importing end2end from examples
+sys.path.insert(0, str(REPO_ROOT / "examples" / "offline_inference" / "hunyuan_image3"))
+from end2end import build_prompt
+
+pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion]
+
+
+@pytest.fixture(scope="module")
+def omni() -> Generator[Omni, None, None]:
+    """Yield a module-scoped Omni engine and close it once the module's tests finish."""
+    engine = Omni(
+        model=MODEL_NAME,
+        stage_configs_path=str(STAGE_CONFIG_PATH),
+        stage_init_timeout=600,
+        init_timeout=900,
+    )
+    try:
+        yield engine
+    finally:
+        engine.close()
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 CUDA GPUs.")
+def test_i2t_generates_text(omni: Omni) -> None:
+    """Verify I2T output starts with the HF-aligned prefix `EXPECTED_PREFIX`."""
+    # Solid-color image keeps the input self-contained and reproducible.
+    from PIL import Image
+
+    input_image = Image.new("RGB", (256, 256), color=(128, 200, 100))
+
+    prompt = build_prompt("Describe the content of the picture.", task="i2t")
+    prompt_dict = {
+        "prompt": prompt,
+        "modalities": ["text"],
+        "multi_modal_data": {"image": input_image},
+    }
+
+    outputs = omni.generate(prompts=[prompt_dict])
+    assert outputs, "No outputs returned from Omni.generate()"
+
+    first_output = outputs[0]
+    # Use the wrapped `request_output` when present, otherwise treat the output itself as the request output.
+    request_output = getattr(first_output, "request_output", first_output)
+    assert request_output.outputs, "No completion outputs"
+
+    generated_text = request_output.outputs[0].text
+    assert isinstance(generated_text, str), f"Expected str, got {type(generated_text)}"
+    n = len(EXPECTED_PREFIX)
+    assert len(generated_text) >= n, f"AR output shorter than {n} chars (got {len(generated_text)}): {generated_text!r}"
+    assert generated_text.startswith(EXPECTED_PREFIX), (
+        f"AR prefix drift vs HF reference\n"
+        f" expected: {EXPECTED_PREFIX!r}\n"
+        f" actual : {generated_text[:n]!r}\n"
+        f" full : {generated_text!r}"
+    )
diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_t2i.py b/tests/e2e/offline_inference/test_hunyuanimage3_t2i.py
new file mode 100644
index 00000000000..6b15fd5f07e
--- /dev/null
+++ b/tests/e2e/offline_inference/test_hunyuanimage3_t2i.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E402
+"""Smoke test for HunyuanImage-3.0 Text-to-Image (T2I) pipeline."""
+
+import sys
+from collections.abc import Generator
+from pathlib import Path
+
+import pytest
+import torch
+from PIL import Image
+
+from vllm_omni import Omni
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+
+MODEL_NAME = "tencent/HunyuanImage-3.0-Instruct"
+REPO_ROOT = Path(__file__).resolve().parents[3]
+STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image3_t2i.yaml"
+
+sys.path.insert(0, str(REPO_ROOT / "examples" / "offline_inference" / "hunyuan_image3"))
+from end2end import build_prompt
+
+pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion]
+
+
+@pytest.fixture(scope="module")
+def omni() -> Generator[Omni, None, None]:
+    """Yield a module-scoped Omni engine and close it once the module's tests finish."""
+    engine = Omni(
+        model=MODEL_NAME,
+        stage_configs_path=str(STAGE_CONFIG_PATH),
+        stage_init_timeout=600,
+        init_timeout=900,
+    )
+    try:
+        yield engine
+    finally:
+        engine.close()
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 CUDA GPUs.")
+def test_t2i_generates_image(omni: Omni) -> None:
+    """Verify that the T2I pipeline produces a PIL Image output."""
+    sampling_params = OmniDiffusionSamplingParams(
+        seed=1234,
+        num_outputs_per_prompt=1,
+    )
+
+    prompt = build_prompt(
+        "A brown and white dog is running on the grass",
+        task="t2i_think",
+    )
+
+    outputs = omni.generate(
+        {"prompt": prompt, "modalities": ["image"]},
+        sampling_params,
+    )
+    assert outputs, "No outputs returned from Omni.generate()"
+
+    first_output = outputs[0]
+    images = getattr(first_output, "images", None)
+    # Fall back to `request_output.images` when the top-level list is missing
+    # *or empty* (assumed possible for multi-stage outputs — TODO confirm).
+    if not images:
+        request_output = getattr(first_output, "request_output", None)
+        assert request_output is not None, "Omni output has no images and no request_output to fall back to"
+        images = getattr(request_output, "images", None)
+
+    assert images, "No image was generated by the T2I pipeline"
+    assert isinstance(images[0], Image.Image), f"Expected PIL Image, got {type(images[0])}"
+    assert images[0].size[0] > 0 and images[0].size[1] > 0, "Generated image has zero dimensions"