diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 7355e2b4c7..24fc6dd3dc 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -113,7 +113,7 @@ steps: - "/fsx/hf_cache:/fsx/hf_cache" - label: "Diffusion Sequence Parallelism Test" - timeout_in_minutes: 20 + timeout_in_minutes: 25 depends_on: upload-merge-pipeline commands: - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py tests/diffusion/distributed/test_ulysses_uaa_perf.py diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 06b7c14ae1..31b3e17976 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -141,7 +141,6 @@ steps: - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py - buildkite-agent artifact upload "tests/dfx/perf/results/*.json" - - buildkite-agent artifact upload "tests/dfx/perf/results/*.html" agents: queue: "mithril-h100-pool" plugins: @@ -244,7 +243,7 @@ steps: - export DEFAULT_OUTPUT_DIR=tests/dfx/perf/results - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-omni-performance - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-qwen-image-performance - - buildkite-agent artifact download "tests/dfx/perf/results/*.html" . --step nightly-omni-performance + - buildkite-agent artifact download "tests/dfx/perf/results/*.html" . --step nightly-testcase-statistics - python tools/nightly/generate_nightly_perf_excel.py - python tools/nightly/generate_nightly_perf_html.py - python tools/nightly/send_nightly_email.py --report-file "tests/dfx/perf/results/*.xlsx, tests/dfx/perf/results/*.html" diff --git a/tests/conftest.py b/tests/conftest.py index 18a0ee57d9..9c739533b8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1771,8 +1771,12 @@ def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: st server_args = params.server_args or [] if params.use_omni and params.stage_init_timeout is not None: server_args = [*server_args, "--stage-init-timeout", str(params.stage_init_timeout)] + else: + server_args = [*server_args, "--stage-init-timeout", "600"] if params.init_timeout is not None: server_args = [*server_args, "--init-timeout", str(params.init_timeout)] + else: + server_args = [*server_args, "--init-timeout", "900"] if params.use_stage_cli: if not params.use_omni: raise ValueError("omni_server with use_stage_cli=True requires use_omni=True") @@ -2870,9 +2874,9 @@ def __init__( self, model_name: str, seed: int = 42, - stage_init_timeout: int = 300, + stage_init_timeout: int = 600, batch_timeout: int = 10, - init_timeout: int = 300, + init_timeout: int = 900, shm_threshold_bytes: int = 65536, log_stats: bool = False, stage_configs_path: str | None = None, diff --git a/tests/e2e/offline_inference/test_bagel_img2img.py b/tests/e2e/offline_inference/test_bagel_img2img.py index a0c3f6cc9f..63d2a37da7 100644 --- a/tests/e2e/offline_inference/test_bagel_img2img.py +++ b/tests/e2e/offline_inference/test_bagel_img2img.py @@ -22,9 +22,9 @@ from PIL import Image from vllm.assets.image import ImageAsset -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni +from vllm_omni import Omni from vllm_omni.platforms import current_omni_platform # Reference pixel data extracted from the known-good output image @@ -210,11 +210,10 @@ def test_bagel_img2img_shared_memory_connector(run_level): input_image = _load_input_image() config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") config_path = _resolve_stage_config(config_path, run_level) - omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300) - - try: - generated_image = _generate_bagel_img2img(omni, input_image) + with OmniRunner( + "ByteDance-Seed/BAGEL-7B-MoT", + stage_configs_path=config_path, + ) as runner: + generated_image = _generate_bagel_img2img(runner.omni, input_image) if run_level == "advanced_model": _validate_pixels(generated_image) - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_bagel_lora.py b/tests/e2e/offline_inference/test_bagel_lora.py index 593a640478..501d23eaa8 100644 --- a/tests/e2e/offline_inference/test_bagel_lora.py +++ b/tests/e2e/offline_inference/test_bagel_lora.py @@ -22,7 +22,6 @@ from vllm_omni.outputs import OmniRequestOutput os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" from pathlib import Path @@ -32,9 +31,9 @@ from PIL import Image from safetensors.torch import save_file -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni +from vllm_omni import Omni from vllm_omni.lora.request import LoRARequest from vllm_omni.lora.utils import stable_lora_int_id @@ -154,8 +153,8 @@ def _make_file_lora_request(adapter_dir: Path) -> LoRARequest: def test_bagel_lora_scale_and_deactivation(run_level, tmp_path): """Validate LoRA effect, bounded perturbation, and clean deactivation.""" config_path = _resolve_stage_config(BAGEL_STAGE_CONFIG, run_level) - omni = Omni(model=MODEL, stage_configs_path=config_path, stage_init_timeout=300) - try: + with OmniRunner(MODEL, stage_configs_path=config_path) as runner: + omni = runner.omni lora_request = _make_file_lora_request(tmp_path / "bagel_lora") # 1) Baseline (no LoRA) @@ -194,5 +193,3 @@ def test_bagel_lora_scale_and_deactivation(run_level, tmp_path): # (d) Deactivation fully restores base model assert diff_restored == 0.0, f"Base model not restored after LoRA deactivation: diff={diff_restored}" - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_bagel_text2img.py b/tests/e2e/offline_inference/test_bagel_text2img.py index 7cce8da3a7..e45d64f2ac 100644 --- a/tests/e2e/offline_inference/test_bagel_text2img.py +++ b/tests/e2e/offline_inference/test_bagel_text2img.py @@ -16,7 +16,6 @@ import os os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" import signal import socket import subprocess @@ -28,9 +27,9 @@ import pytest from PIL import Image -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni +from vllm_omni import Omni from vllm_omni.platforms import current_omni_platform # Reference pixel data extracted from the known-good output image @@ -199,14 +198,13 @@ def test_bagel_text2img_shared_memory_connector(run_level): """Test Bagel text2img with shared memory connector.""" config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") config_path = _resolve_stage_config(config_path, run_level) - omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300) - - try: - generated_image = _generate_bagel_image(omni) + with OmniRunner( + "ByteDance-Seed/BAGEL-7B-MoT", + stage_configs_path=config_path, + ) as runner: + generated_image = _generate_bagel_image(runner.omni) if run_level == "advanced_model": _validate_pixels(generated_image) - finally: - omni.close() def _wait_for_port(host: str, port: int, timeout: int = 30) -> bool: @@ -319,7 +317,6 @@ def test_bagel_text2img_mooncake_connector(run_level): mooncake_master_proc = None temp_config_file = None - omni = None try: _cleanup_mooncake_processes() @@ -349,15 +346,16 @@ def test_bagel_text2img_mooncake_connector(run_level): ) temp_config_file = _resolve_stage_config(temp_config_file, run_level) - omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=temp_config_file, stage_init_timeout=300) - - generated_image = _generate_bagel_image(omni) - if run_level == "advanced_model": - _validate_pixels(generated_image) + with OmniRunner( + "ByteDance-Seed/BAGEL-7B-MoT", + stage_configs_path=temp_config_file, + stage_init_timeout=300, + ) as runner: + generated_image = _generate_bagel_image(runner.omni) + if run_level == "advanced_model": + _validate_pixels(generated_image) finally: - if omni: - omni.close() if temp_config_file: try: os.unlink(temp_config_file) diff --git a/tests/e2e/offline_inference/test_bagel_understanding.py b/tests/e2e/offline_inference/test_bagel_understanding.py index 6f95e7ee00..bbee329807 100644 --- a/tests/e2e/offline_inference/test_bagel_understanding.py +++ b/tests/e2e/offline_inference/test_bagel_understanding.py @@ -21,15 +21,13 @@ import os os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" from pathlib import Path import pytest from vllm.assets.image import ImageAsset -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni MODEL_NAME = "ByteDance-Seed/BAGEL-7B-MoT" STAGE_CONFIG = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") @@ -76,13 +74,11 @@ def _extract_text(omni_outputs: list) -> str: def test_bagel_text2text(run_level): """Test Bagel text2text produces correct text output.""" config_path = _resolve_stage_config(STAGE_CONFIG, run_level) - omni = Omni( - model=MODEL_NAME, + with OmniRunner( + MODEL_NAME, stage_configs_path=config_path, - stage_init_timeout=300, - ) - - try: + ) as runner: + omni = runner.omni prompt = "<|im_start|>user\nWhere is the capital of France?<|im_end|>\n<|im_start|>assistant\n" params_list = omni.default_sampling_params_list omni_outputs = list( @@ -100,8 +96,6 @@ def test_bagel_text2text(run_level): assert text == REFERENCE_TEXT_TEXT2TEXT, ( f"Text mismatch: expected {REFERENCE_TEXT_TEXT2TEXT!r}, got {text!r}" ) - finally: - omni.close() @pytest.mark.core_model @@ -112,13 +106,12 @@ def test_bagel_img2text(run_level): """Test Bagel img2text produces correct text output.""" input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB") config_path = _resolve_stage_config(STAGE_CONFIG, run_level) - omni = Omni( - model=MODEL_NAME, + with OmniRunner( + MODEL_NAME, stage_configs_path=config_path, stage_init_timeout=300, - ) - - try: + ) as runner: + omni = runner.omni prompt = "<|im_start|>user\n<|image_pad|>\nPlease describe this image<|im_end|>\n<|im_start|>assistant\n" params_list = omni.default_sampling_params_list omni_outputs = list( @@ -140,5 +133,3 @@ def test_bagel_img2text(run_level): if run_level == "advanced_model": assert text == REFERENCE_TEXT_IMG2TEXT, f"Text mismatch: expected {REFERENCE_TEXT_IMG2TEXT!r}, got {text!r}" - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_cache_dit.py b/tests/e2e/offline_inference/test_cache_dit.py index 0e31413dc0..fc08da7bed 100644 --- a/tests/e2e/offline_inference/test_cache_dit.py +++ b/tests/e2e/offline_inference/test_cache_dit.py @@ -8,27 +8,15 @@ It uses minimal settings to keep test time short for CI. """ -import os -import sys -from pathlib import Path - import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams - -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" - # Use random weights model for testing models = ["riverclouds/qwen_image_random"] @@ -48,20 +36,17 @@ def test_cache_dit(model_name: str): "residual_diff_threshold": 0.24, "max_continuous_cached_steps": 3, } - m = None - try: - m = Omni( - model=model_name, - cache_backend="cache_dit", - cache_config=cache_config, - ) - + with OmniRunner( + model_name, + cache_backend="cache_dit", + cache_config=cache_config, + ) as runner: # Use minimal settings for fast testing height = 256 width = 256 num_inference_steps = 4 # Minimal steps for fast test - outputs = m.generate( + outputs = runner.omni.generate( "a photo of a cat sitting on a laptop keyboard", OmniDiffusionSamplingParams( height=height, @@ -90,9 +75,3 @@ def test_cache_dit(model_name: str): # Check image size assert images[0].width == width assert images[0].height == height - except Exception as e: - print(f"Test failed with error: {e}") - raise - finally: - if m is not None and hasattr(m, "close"): - m.close() diff --git a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py index f3830f02e9..257755ef8b 100644 --- a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py +++ b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py @@ -1,22 +1,14 @@ import gc -import sys -from pathlib import Path import pytest import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from tests.conftest import OmniRunner from tests.utils import DeviceMemoryMonitor, hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - models = ["riverclouds/qwen_image_random"] @@ -27,30 +19,29 @@ def inference(model_name: str, offload: bool = True): current_omni_platform.reset_peak_memory_stats() monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - m = Omni( - model=model_name, + with OmniRunner( + model_name, # TODO: we might want to add overlapped feature e2e tests # cache_backend="cache_dit", enable_cpu_offload=offload, - ) - current_omni_platform.reset_peak_memory_stats() - height = 256 - width = 256 + ) as runner: + current_omni_platform.reset_peak_memory_stats() + height = 256 + width = 256 - m.generate( - "a photo of a cat sitting on a laptop keyboard", - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=9, - guidance_scale=0.0, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - ), - ) + runner.omni.generate( + "a photo of a cat sitting on a laptop keyboard", + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=9, + guidance_scale=0.0, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + ), + ) peak = monitor.peak_used_mb monitor.stop() - del m gc.collect() current_omni_platform.empty_cache() diff --git a/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py b/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py index 6132f1bd0e..bdfd594c77 100644 --- a/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py +++ b/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py @@ -1,21 +1,12 @@ -import sys -from pathlib import Path - import pytest import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from tests.conftest import OmniRunner from tests.utils import DeviceMemoryMonitor from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - # Models to test and expected saved memory in MB, correspondingly MODELS_SAVED_MEMORY_MB = { "riverclouds/qwen_image_random": 4500, @@ -33,34 +24,33 @@ def run_inference( monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - m = Omni( - model=model_name, + with OmniRunner( + model_name, enable_layerwise_offload=layerwise_offload, # TODO: we might want to add overlapped feature e2e tests # cache_backend="cache_dit", boundary_ratio=0.875, flow_shift=5.0, - ) - - current_omni_platform.reset_peak_memory_stats() - - # Refer to tests/e2e/offline_inference/test_t2v_model.py - # Use minimal settings for testing - height = 480 - width = 640 - num_frames = 5 - - m.generate( - "A cat sitting on a table", - OmniDiffusionSamplingParams( - height=height, - width=width, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - guidance_scale=1.0, - num_inference_steps=num_inference_steps, - num_frames=num_frames, - ), - ) + ) as runner: + current_omni_platform.reset_peak_memory_stats() + + # Refer to tests/e2e/offline_inference/test_t2v_model.py + # Use minimal settings for testing + height = 480 + width = 640 + num_frames = 5 + + runner.omni.generate( + "A cat sitting on a table", + OmniDiffusionSamplingParams( + height=height, + width=width, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + guidance_scale=1.0, + num_inference_steps=num_inference_steps, + num_frames=num_frames, + ), + ) peak = monitor.peak_used_mb monitor.stop() diff --git a/tests/e2e/offline_inference/test_diffusion_lora.py b/tests/e2e/offline_inference/test_diffusion_lora.py index b414fe30ee..7edd03f20d 100644 --- a/tests/e2e/offline_inference/test_diffusion_lora.py +++ b/tests/e2e/offline_inference/test_diffusion_lora.py @@ -7,6 +7,7 @@ import torch from safetensors.torch import save_file +from tests.conftest import OmniRunner from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform @@ -16,15 +17,12 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) -from vllm_omni import Omni - os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" # This test is specific to Z-Image LoRA behavior. Keep it focused on a single # model to reduce runtime and avoid extra downloads. models = ["Tongyi-MAI/Z-Image-Turbo"] -DIFFUSION_INIT_TIMEOUT_S = 600 @pytest.mark.parametrize("model_name", models) @@ -77,12 +75,8 @@ def _write_zimage_lora(adapter_dir: Path) -> str: ) return str(adapter_dir) - m = Omni( - model=model_name, - stage_init_timeout=DIFFUSION_INIT_TIMEOUT_S, - init_timeout=DIFFUSION_INIT_TIMEOUT_S, - ) - try: + with OmniRunner(model_name) as runner: + m = runner.omni # high resolution may cause OOM on L4 height = 256 width = 256 @@ -140,5 +134,3 @@ def _write_zimage_lora(adapter_dir: Path) -> str: diff = np.abs(np.array(images[0], dtype=np.int16) - np.array(images_lora[0], dtype=np.int16)).mean() assert diff > 0.0 - finally: - m.close() diff --git a/tests/e2e/offline_inference/test_dynin_omni.py b/tests/e2e/offline_inference/test_dynin_omni.py index d17e7b8175..5388ac6746 100644 --- a/tests/e2e/offline_inference/test_dynin_omni.py +++ b/tests/e2e/offline_inference/test_dynin_omni.py @@ -18,7 +18,6 @@ import torch from transformers import AutoTokenizer -from tests.conftest import OmniRunner from tests.utils import hardware_test os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -37,6 +36,7 @@ pytestmark = [ pytest.mark.core_model, pytest.mark.omni, + pytest.mark.parametrize("omni_runner", test_params, indirect=True), ] @@ -291,20 +291,11 @@ def _numel(value: Any) -> int: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_t2i_decode_to_image(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_t2i_decode_to_image(omni_runner) -> None: _configure_dynin_config_env() prompt = _build_t2i_decode_prompt(dynin_config_path=DYNIN_CONFIG_PATH) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) image_output = _find_stage_output(outputs, "image") assert image_output is not None @@ -314,25 +305,16 @@ def test_dynin_t2i_decode_to_image(test_config: tuple[str, str]) -> None: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_mmu_to_text(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_mmu_to_text(omni_runner) -> None: _configure_dynin_config_env() - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(omni_runner.model_name, trust_remote_code=True) prompt = _build_mmu_prompt( tokenizer=tokenizer, question="What is 2 + 2? Answer in one short sentence.", dynin_config_path=DYNIN_CONFIG_PATH, ) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) text_output = _find_stage_output(outputs, "text") assert text_output is not None @@ -341,11 +323,9 @@ def test_dynin_mmu_to_text(test_config: tuple[str, str]) -> None: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_image_to_text(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_image_to_text(omni_runner) -> None: _configure_dynin_config_env() - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(omni_runner.model_name, trust_remote_code=True) prompt = _build_mmu_multimodal_prompt( tokenizer=tokenizer, question="Describe the image briefly in one sentence.", @@ -353,14 +333,7 @@ def test_dynin_image_to_text(test_config: tuple[str, str]) -> None: image=_generate_synthetic_image(), ) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) text_output = _find_stage_output(outputs, "text") assert text_output is not None @@ -369,11 +342,9 @@ def test_dynin_image_to_text(test_config: tuple[str, str]) -> None: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_speech_to_text(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_speech_to_text(omni_runner) -> None: _configure_dynin_config_env() - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(omni_runner.model_name, trust_remote_code=True) prompt = _build_mmu_multimodal_prompt( tokenizer=tokenizer, question="Transcribe the audio briefly in one sentence.", @@ -381,14 +352,7 @@ def test_dynin_speech_to_text(test_config: tuple[str, str]) -> None: audio=_generate_synthetic_audio(), ) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) text_output = _find_stage_output(outputs, "text") assert text_output is not None @@ -397,20 +361,11 @@ def test_dynin_speech_to_text(test_config: tuple[str, str]) -> None: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_t2s_decode_to_audio(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_t2s_decode_to_audio(omni_runner) -> None: _configure_dynin_config_env() prompt = _build_t2s_decode_prompt(dynin_config_path=DYNIN_CONFIG_PATH) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) audio_output = _find_stage_output(outputs, "audio") assert audio_output is not None diff --git a/tests/e2e/offline_inference/test_expert_parallel.py b/tests/e2e/offline_inference/test_expert_parallel.py index ba126986ec..29d84d7a3e 100644 --- a/tests/e2e/offline_inference/test_expert_parallel.py +++ b/tests/e2e/offline_inference/test_expert_parallel.py @@ -18,8 +18,8 @@ import torch.distributed as dist from PIL import Image +from tests.conftest import OmniRunner from tests.utils import hardware_test -from vllm_omni import Omni from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform @@ -96,12 +96,26 @@ def _run_inference( tensor_parallel_size=tensor_parallel_size, enable_expert_parallel=enable_expert_parallel, ) - omni = Omni(model=model_name, parallel_config=parallel_config) - try: - # Warmup run (not timed) - if warmup: - _ = omni.generate( + with OmniRunner(model_name, parallel_config=parallel_config) as runner: + omni = runner.omni + # Warmup run (not timed) + if warmup: + _ = omni.generate( + PROMPT, + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=DEFAULT_STEPS, + guidance_scale=guidance_scale, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), + num_outputs_per_prompt=1, + ), + ) + + # Timed run + start = time.time() + outputs = omni.generate( PROMPT, OmniDiffusionSamplingParams( height=height, @@ -112,28 +126,13 @@ def _run_inference( num_outputs_per_prompt=1, ), ) + elapsed_ms = (time.time() - start) * 1000 - # Timed run - start = time.time() - outputs = omni.generate( - PROMPT, - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=DEFAULT_STEPS, - guidance_scale=guidance_scale, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), - num_outputs_per_prompt=1, - ), - ) - elapsed_ms = (time.time() - start) * 1000 - - return InferenceResult( - images=outputs[0].images, - elapsed_ms=elapsed_ms, - ) + return InferenceResult( + images=outputs[0].images, + elapsed_ms=elapsed_ms, + ) finally: - omni.close() _cleanup_distributed() diff --git a/tests/e2e/offline_inference/test_flux_autoround_w4a16.py b/tests/e2e/offline_inference/test_flux_autoround_w4a16.py index 42aab7f26a..cbcd1009dd 100644 --- a/tests/e2e/offline_inference/test_flux_autoround_w4a16.py +++ b/tests/e2e/offline_inference/test_flux_autoround_w4a16.py @@ -8,31 +8,21 @@ """ import gc -import sys -from pathlib import Path +import os as _os import pytest import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from tests.conftest import OmniRunner from tests.utils import DeviceMemoryMonitor, hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - QUANTIZED_MODEL = "vllm-project-org/FLUX.1-dev-AutoRound-w4a16" BASELINE_MODEL = "black-forest-labs/FLUX.1-dev" -# Allow overriding via environment for local testing -import os as _os - QUANTIZED_MODEL = _os.environ.get("FLUX_AUTOROUND_MODEL", QUANTIZED_MODEL) BASELINE_MODEL = _os.environ.get("FLUX_BASELINE_MODEL", BASELINE_MODEL) @@ -51,19 +41,18 @@ def _generate_image(model_name: str, **extra_kwargs) -> tuple[list, float]: monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - m = Omni(model=model_name, enforce_eager=True, **extra_kwargs) - - current_omni_platform.reset_peak_memory_stats() - outputs = m.generate( - "a photo of a cat sitting on a laptop keyboard", - OmniDiffusionSamplingParams( - height=HEIGHT, - width=WIDTH, - num_inference_steps=NUM_STEPS, - guidance_scale=0.0, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - ), - ) + with OmniRunner(model_name, enforce_eager=True, **extra_kwargs) as runner: + current_omni_platform.reset_peak_memory_stats() + outputs = runner.omni.generate( + "a photo of a cat sitting on a laptop keyboard", + OmniDiffusionSamplingParams( + height=HEIGHT, + width=WIDTH, + num_inference_steps=NUM_STEPS, + guidance_scale=0.0, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + ), + ) peak = monitor.peak_used_mb monitor.stop() @@ -74,7 +63,6 @@ def _generate_image(model_name: str, **extra_kwargs) -> tuple[list, float]: assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images") images = req_out.images - del m gc.collect() current_omni_platform.empty_cache() diff --git a/tests/e2e/offline_inference/test_flux_kontext.py b/tests/e2e/offline_inference/test_flux_kontext.py index 93dca21c9a..cd711d6b81 100644 --- a/tests/e2e/offline_inference/test_flux_kontext.py +++ b/tests/e2e/offline_inference/test_flux_kontext.py @@ -9,23 +9,14 @@ - Image editing with text guidance """ -import os -import sys -from pathlib import Path - import pytest from PIL import Image +from vllm.assets.image import ImageAsset +from tests.conftest import OmniRunner from vllm_omni.diffusion.data import DiffusionParallelConfig -from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" - MODEL = "black-forest-labs/FLUX.1-Kontext-dev" @@ -33,17 +24,15 @@ @pytest.mark.diffusion def test_flux_kontext_text_to_image(): """Test FluxKontext text-to-image generation with real model.""" - omni = Omni( - model=MODEL, + with OmniRunner( + MODEL, parallel_config=DiffusionParallelConfig( tensor_parallel_size=2, ), enable_cpu_offload=False, - ) - - try: + ) as runner: omni_outputs = list( - omni.generate( + runner.omni.generate( prompts=["A photo of a cat sitting on a laptop"], sampling_params_list=OmniDiffusionSamplingParams( height=512, @@ -54,43 +43,37 @@ def test_flux_kontext_text_to_image(): ) ) - assert len(omni_outputs) > 0 - output = omni_outputs[0] - images = None - if output.images: - images = output.images - elif hasattr(output, "request_output") and output.request_output: - for stage_out in output.request_output: - if hasattr(stage_out, "images") and stage_out.images: - images = stage_out.images - break + assert len(omni_outputs) > 0 + output = omni_outputs[0] + images = None + if output.images: + images = output.images + elif hasattr(output, "request_output") and output.request_output: + for stage_out in output.request_output: + if hasattr(stage_out, "images") and stage_out.images: + images = stage_out.images + break - assert images is not None - assert len(images) > 0 - assert isinstance(images[0], Image.Image) - assert images[0].size == (512, 512) - finally: - omni.close() + assert images is not None + assert len(images) > 0 + assert isinstance(images[0], Image.Image) + assert images[0].size == (512, 512) @pytest.mark.core_model @pytest.mark.diffusion def test_flux_kontext_image_edit(): """Test FluxKontext image-to-image editing with real model.""" - from vllm.assets.image import ImageAsset - input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB") - omni = Omni( - model=MODEL, + with OmniRunner( + MODEL, parallel_config=DiffusionParallelConfig( tensor_parallel_size=2, ), enable_cpu_offload=False, - ) - - try: + ) as runner: omni_outputs = list( - omni.generate( + runner.omni.generate( prompts=[ { "prompt": "Transform this image into a Vincent van Gogh style painting", @@ -107,20 +90,18 @@ def test_flux_kontext_image_edit(): ) ) - assert len(omni_outputs) > 0 - output = omni_outputs[0] - images = None - if output.images: - images = output.images - elif hasattr(output, "request_output") and output.request_output: - for stage_out in output.request_output: - if hasattr(stage_out, "images") and stage_out.images: - images = stage_out.images - break - - assert images is not None - assert len(images) > 0 - assert isinstance(images[0], Image.Image) - assert images[0].size == (512, 512) - finally: - omni.close() + assert len(omni_outputs) > 0 + output = omni_outputs[0] + images = None + if output.images: + images = output.images + elif hasattr(output, "request_output") and output.request_output: + for stage_out in output.request_output: + if hasattr(stage_out, "images") and stage_out.images: + images = stage_out.images + break + + assert images is not None + assert len(images) > 0 + assert isinstance(images[0], Image.Image) + assert images[0].size == (512, 512) diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py index 5522f33eaa..79bb64dca1 100644 --- a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py +++ b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py @@ -8,6 +8,7 @@ from PIL import Image from transformers import CLIPModel, CLIPProcessor +from tests.conftest import OmniRunner from vllm_omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform @@ -271,16 +272,11 @@ def clip_bundle() -> tuple[CLIPModel, CLIPProcessor]: @pytest.fixture(scope="module") def omni() -> Generator[Omni, None, None]: - engine = Omni( - model=MODEL_NAME, + with OmniRunner( + MODEL_NAME, stage_configs_path=str(STAGE_CONFIG_PATH), - stage_init_timeout=600, - init_timeout=900, - ) - try: - yield engine - finally: - engine.close() + ) as runner: + yield runner.omni def _extract_generated_image(outputs: list[object]) -> Image.Image: diff --git a/tests/e2e/offline_inference/test_magi_human.py b/tests/e2e/offline_inference/test_magi_human.py index 8648216a92..abb7f9c163 100644 --- a/tests/e2e/offline_inference/test_magi_human.py +++ b/tests/e2e/offline_inference/test_magi_human.py @@ -8,9 +8,9 @@ import numpy as np import pytest +from tests.conftest import OmniRunner from tests.utils import hardware_test from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes -from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -49,12 +49,6 @@ def test_magi_human_e2e(run_level): model_path = "SII-GAIR/daVinci-MagiHuman-Base-1080p" - omni = Omni( - model=model_path, - init_timeout=1200, - tensor_parallel_size=2, - ) - prompt = ( "A young woman with long, wavy golden blonde hair and bright blue eyes, " "wearing a fitted ivory silk blouse with a delicate lace collar, sits " @@ -94,7 +88,12 @@ def test_magi_human_e2e(run_level): }, ) - try: + with OmniRunner( + model_path, + init_timeout=1200, + tensor_parallel_size=2, + ) as runner: + omni = runner.omni outputs = list( omni.generate( prompts=[prompt], @@ -140,5 +139,3 @@ def test_magi_human_e2e(run_level): assert len(video_bytes) > 1000, f"MP4 too small ({len(video_bytes)} bytes)" _validate_mp4(video_bytes) - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_mammoth_moda2.py b/tests/e2e/offline_inference/test_mammoth_moda2.py index 5293b5ed1b..ff744c86e1 100644 --- a/tests/e2e/offline_inference/test_mammoth_moda2.py +++ b/tests/e2e/offline_inference/test_mammoth_moda2.py @@ -23,10 +23,9 @@ import torch from vllm.sampling_params import SamplingParams +from tests.conftest import OmniRunner from tests.utils import hardware_test -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" - # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- @@ -116,8 +115,6 @@ def test_mammothmoda2_t2i_e2e(): - A fixed set of pixel values matches a golden reference (regenerate with ``UPDATE_GOLDEN=1``). """ - from vllm_omni import Omni - if not Path(MODEL_PATH).exists(): pytest.skip(f"Model weights not found at {MODEL_PATH}") if not Path(T2I_STAGE_CONFIG).exists(): @@ -135,8 +132,8 @@ def test_mammothmoda2_t2i_e2e(): prompt_text = "A cat sitting on a laptop keyboard" formatted_prompt = _format_t2i_prompt(prompt_text, ar_width, ar_height) - omni = Omni(model=MODEL_PATH, stage_configs_path=T2I_STAGE_CONFIG, trust_remote_code=True) - try: + with OmniRunner(MODEL_PATH, stage_configs_path=T2I_STAGE_CONFIG, trust_remote_code=True) as runner: + omni = runner.omni # Greedy / deterministic sampling so pixel values are reproducible. ar_sampling = SamplingParams( temperature=0.0, @@ -211,5 +208,3 @@ def test_mammothmoda2_t2i_e2e(): found_image = True assert found_image, "No image tensor found in pipeline output" - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_omnivoice.py b/tests/e2e/offline_inference/test_omnivoice.py index 4b093e357d..bb4c8a5dd7 100644 --- a/tests/e2e/offline_inference/test_omnivoice.py +++ b/tests/e2e/offline_inference/test_omnivoice.py @@ -16,6 +16,7 @@ import numpy as np import pytest +from tests.conftest import OmniRunner from tests.utils import hardware_test MODEL = "k2-fsa/OmniVoice" @@ -37,48 +38,42 @@ def test_omnivoice_text_to_audio() -> None: Input Modal: text Output Modal: audio """ - from vllm_omni.entrypoints.omni import Omni + from vllm_omni.inputs.data import OmniDiffusionSamplingParams - omni = Omni( - model=MODEL, + with OmniRunner( + MODEL, stage_configs_path=get_stage_config(), trust_remote_code=True, log_stats=True, - ) - - try: + ) as runner: prompts = {"prompt": "Hello, this is a test for text to audio."} - from vllm_omni.inputs.data import OmniDiffusionSamplingParams - sampling_params_list = [OmniDiffusionSamplingParams()] - outputs = list(omni.generate(prompts, sampling_params_list=sampling_params_list)) + outputs = list(runner.omni.generate(prompts, sampling_params_list=sampling_params_list)) - assert len(outputs) > 0, "No outputs generated" + assert len(outputs) > 0, "No outputs generated" - # Check final output has audio - final_output = outputs[-1] - ro = final_output.request_output - assert ro is not None, "No request_output" + # Check final output has audio + final_output = outputs[-1] + ro = final_output.request_output + assert ro is not None, "No request_output" - mm = getattr(ro, "multimodal_output", None) - if not mm and ro.outputs: - mm = getattr(ro.outputs[0], "multimodal_output", None) + mm = getattr(ro, "multimodal_output", None) + if not mm and ro.outputs: + mm = getattr(ro.outputs[0], "multimodal_output", None) - assert mm is not None, "No multimodal_output" - assert "audio" in mm, f"No 'audio' key in multimodal_output: {mm.keys()}" + assert mm is not None, "No multimodal_output" + assert "audio" in mm, f"No 'audio' key in multimodal_output: {mm.keys()}" - audio = mm["audio"] - if isinstance(audio, np.ndarray): - audio_np = audio - else: - audio_np = audio.cpu().numpy().squeeze() + audio = mm["audio"] + if isinstance(audio, np.ndarray): + audio_np = audio + else: + audio_np = audio.cpu().numpy().squeeze() - assert audio_np.size > 0, "Audio output is empty" - rms = np.sqrt(np.mean(audio_np**2)) - assert rms > 0.01, f"Audio RMS too low ({rms:.4f}), likely silence" + assert audio_np.size > 0, "Audio output is empty" + rms = np.sqrt(np.mean(audio_np**2)) + assert rms > 0.01, f"Audio RMS too low ({rms:.4f}), likely silence" - print(f"Generated audio: {len(audio_np) / 24000:.2f}s, rms={rms:.4f}") - finally: - omni.close() + print(f"Generated audio: {len(audio_np) / 24000:.2f}s, rms={rms:.4f}") diff --git a/tests/e2e/offline_inference/test_quantization_fp8.py b/tests/e2e/offline_inference/test_quantization_fp8.py index f71c53de74..291779fd93 100644 --- a/tests/e2e/offline_inference/test_quantization_fp8.py +++ b/tests/e2e/offline_inference/test_quantization_fp8.py @@ -29,7 +29,6 @@ import os os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" from pathlib import Path from typing import Any @@ -37,8 +36,8 @@ import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform @@ -61,16 +60,15 @@ def _generate_single_stage_image( Returns (images, peak_memory_gib). """ - omni_kwargs: dict[str, Any] = {"model": model, **extra_omni_kwargs} + omni_kwargs: dict[str, Any] = dict(extra_omni_kwargs) if quantization: omni_kwargs["quantization"] = quantization - omni = Omni(**omni_kwargs) - try: + with OmniRunner(model, **omni_kwargs) as runner: torch.cuda.reset_peak_memory_stats() generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(seed) - outputs = omni.generate( + outputs = runner.omni.generate( "a photo of a cat sitting on a laptop keyboard", OmniDiffusionSamplingParams( height=height, @@ -94,8 +92,6 @@ def _generate_single_stage_image( assert images[0].height == height return images, peak_mem - finally: - omni.close() def _generate_bagel_image( @@ -115,8 +111,9 @@ def _generate_bagel_image( if quantization_config: omni_kwargs["quantization_config"] = quantization_config - omni = Omni(**omni_kwargs) - try: + model_name = omni_kwargs.pop("model") + with OmniRunner(model_name, **omni_kwargs) as runner: + omni = runner.omni torch.cuda.reset_peak_memory_stats() params_list = omni.default_sampling_params_list @@ -168,8 +165,6 @@ def _generate_bagel_image( ) return generated_image, peak_mem - finally: - omni.close() # ─── Single-stage diffusion model tests ────────────────────────────────────── diff --git a/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py b/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py index d5f82f893e..f0b0b55c9f 100644 --- a/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py +++ b/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py @@ -28,7 +28,6 @@ import argparse import asyncio -import os import sys import time import uuid @@ -37,6 +36,7 @@ import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test from vllm_omni.entrypoints.async_omni import AsyncOmni from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -48,9 +48,6 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) -from vllm_omni import Omni - -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" # ------------------------------------------------------------------ models = ["tiny-random/Qwen-Image"] @@ -391,31 +388,28 @@ async def main(model: str, num_prompts: int, mode: str, batch_size: int = 1) -> def test_diffusion_batching_sync_sequential(model_name: str): """Test that synchronous Omni can generate images for multiple prompts submitted sequentially (one at a time) and each returns a valid image.""" - m = None try: - m = Omni(model=model_name) - sp = _default_sync_sampling_params() - prompts = TEST_PROMPTS[:4] + with OmniRunner(model_name) as runner: + m = runner.omni + sp = _default_sync_sampling_params() + prompts = TEST_PROMPTS[:4] - for i, prompt in enumerate(prompts): - outputs = m.generate(prompt, sp) - first_output = outputs[0] - assert first_output.final_output_type == "image", ( - f"Expected 'image', got '{first_output.final_output_type}'" - ) + for i, prompt in enumerate(prompts): + outputs = m.generate(prompt, sp) + first_output = outputs[0] + assert first_output.final_output_type == "image", ( + f"Expected 'image', got '{first_output.final_output_type}'" + ) - # Images are surfaced both at top-level and inside request_output - images = _extract_images(first_output) - assert len(images) >= 1, f"Expected at least 1 image for prompt {i}, got {len(images)}" - assert images[0].width == 256 - assert images[0].height == 256 - print(f" prompt {i}: OK ({len(images)} images)") + # Images are surfaced both at top-level and inside request_output + images = _extract_images(first_output) + assert len(images) >= 1, f"Expected at least 1 image for prompt {i}, got {len(images)}" + assert images[0].width == 256 + assert images[0].height == 256 + print(f" prompt {i}: OK ({len(images)} images)") except Exception as e: print(f"Test failed with error: {e}") raise - finally: - if m is not None and hasattr(m, "close"): - m.close() @pytest.mark.core_model @@ -431,34 +425,31 @@ def test_diffusion_batching_sync_multi_prompt(model_name: str): handling at the diffusion stage, not the explicit list-batch path (which is only available via AsyncOmni). """ - m = None try: - m = Omni(model=model_name) - sp = _default_sync_sampling_params() - prompts = TEST_PROMPTS[:4] + with OmniRunner(model_name) as runner: + m = runner.omni + sp = _default_sync_sampling_params() + prompts = TEST_PROMPTS[:4] - outputs = m.generate(prompts, sp) - assert len(outputs) == len(prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}" + outputs = m.generate(prompts, sp) + assert len(outputs) == len(prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}" - for i, output in enumerate(outputs): - assert output.final_output_type == "image", ( - f"Output {i} final_output_type expected 'image', got '{output.final_output_type}'" - ) - images = _extract_images(output) - assert images and len(images) >= 1, f"Expected at least 1 image for prompt {i}" - assert images[0].width == 256 - assert images[0].height == 256 - print(f" prompt {i}: OK ({len(images)} images, request_id={output.request_id})") - - # Verify all request_ids are distinct - request_ids = [o.request_id for o in outputs] - assert len(set(request_ids)) == len(request_ids), f"Duplicate request_ids found: {request_ids}" + for i, output in enumerate(outputs): + assert output.final_output_type == "image", ( + f"Output {i} final_output_type expected 'image', got '{output.final_output_type}'" + ) + images = _extract_images(output) + assert images and len(images) >= 1, f"Expected at least 1 image for prompt {i}" + assert images[0].width == 256 + assert images[0].height == 256 + print(f" prompt {i}: OK ({len(images)} images, request_id={output.request_id})") + + # Verify all request_ids are distinct + request_ids = [o.request_id for o in outputs] + assert len(set(request_ids)) == len(request_ids), f"Duplicate request_ids found: {request_ids}" except Exception as e: print(f"Test failed with error: {e}") raise - finally: - if m is not None and hasattr(m, "close"): - m.close() @pytest.mark.core_model @@ -552,32 +543,29 @@ async def _inner(): def test_diffusion_batching_num_outputs(model_name: str): """Test that the diffusion model respects num_outputs_per_prompt and generates the correct number of images per request.""" - m = None try: - m = Omni(model=model_name) - num_outputs = 2 - sp = _default_sync_sampling_params(num_outputs_per_prompt=num_outputs) - - outputs = m.generate( - "a photo of a cat sitting on a laptop keyboard", - sp, - ) + with OmniRunner(model_name) as runner: + m = runner.omni + num_outputs = 2 + sp = _default_sync_sampling_params(num_outputs_per_prompt=num_outputs) + + outputs = m.generate( + "a photo of a cat sitting on a laptop keyboard", + sp, + ) - first_output = outputs[0] - assert first_output.final_output_type == "image" - images = _extract_images(first_output) - assert images is not None and len(images) == num_outputs, ( - f"Expected {num_outputs} images, got {len(images) if images else 0}" - ) - for img in images: - assert img.width == 256 - assert img.height == 256 + first_output = outputs[0] + assert first_output.final_output_type == "image" + images = _extract_images(first_output) + assert images is not None and len(images) == num_outputs, ( + f"Expected {num_outputs} images, got {len(images) if images else 0}" + ) + for img in images: + assert img.width == 256 + assert img.height == 256 except Exception as e: print(f"Test failed with error: {e}") raise - finally: - if m is not None and hasattr(m, "close"): - m.close() @pytest.mark.core_model @@ -587,34 +575,31 @@ def test_diffusion_batching_num_outputs(model_name: str): def test_diffusion_batching_distinct_results(model_name: str): """Test that different prompts produce distinct images when batched, ensuring the batching logic does not mix up results across requests.""" - m = None try: - m = Omni(model=model_name) - sp = _default_sync_sampling_params() - prompts = [ - {"prompt": "a bright red apple on a white table", "negative_prompt": "blurry"}, - {"prompt": "a blue ocean with white waves crashing", "negative_prompt": "blurry"}, - ] - - outputs = m.generate(prompts, sp) - assert len(outputs) == len(prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}" - - # Verify each output has a unique request_id - request_ids = [o.request_id for o in outputs] - assert len(set(request_ids)) == len(request_ids), f"Duplicate request_ids: {request_ids}" - - # Verify each output has images - for i, output in enumerate(outputs): - images = _extract_images(output) - assert images and len(images) >= 1, f"No images for prompt {i}" - assert images[0].width == 256 - assert images[0].height == 256 + with OmniRunner(model_name) as runner: + m = runner.omni + sp = _default_sync_sampling_params() + prompts = [ + {"prompt": "a bright red apple on a white table", "negative_prompt": "blurry"}, + {"prompt": "a blue ocean with white waves crashing", "negative_prompt": "blurry"}, + ] + + outputs = m.generate(prompts, sp) + assert len(outputs) == len(prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}" + + # Verify each output has a unique request_id + request_ids = [o.request_id for o in outputs] + assert len(set(request_ids)) == len(request_ids), f"Duplicate request_ids: {request_ids}" + + # Verify each output has images + for i, output in enumerate(outputs): + images = _extract_images(output) + assert images and len(images) >= 1, f"No images for prompt {i}" + assert images[0].width == 256 + assert images[0].height == 256 except Exception as e: print(f"Test failed with error: {e}") raise - finally: - if m is not None and hasattr(m, "close"): - m.close() # ------------------------------------------------------------------ diff --git a/tests/e2e/offline_inference/test_sequence_parallel.py b/tests/e2e/offline_inference/test_sequence_parallel.py index 16239a1c52..d3abccd78c 100644 --- a/tests/e2e/offline_inference/test_sequence_parallel.py +++ b/tests/e2e/offline_inference/test_sequence_parallel.py @@ -20,8 +20,8 @@ import torch.distributed as dist from PIL import Image +from tests.conftest import OmniRunner from tests.utils import hardware_test -from vllm_omni import Omni from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform @@ -92,49 +92,48 @@ def _run_inference( warmup: If True, run one warmup iteration before the timed run. """ parallel_config = DiffusionParallelConfig(ulysses_degree=ulysses_degree, ring_degree=ring_degree) - omni = Omni( - model=model_name, - parallel_config=parallel_config, - dtype=dtype, - attention_backend=attn_backend, - ) - try: - # Warmup run (not timed) - if warmup: - _ = omni.generate( + with OmniRunner( + model_name, + parallel_config=parallel_config, + dtype=dtype, + attention_backend=attn_backend, + ) as runner: + omni = runner.omni + # Warmup run (not timed) + if warmup: + _ = omni.generate( + PROMPT, + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=DEFAULT_STEPS, + guidance_scale=0.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed + 1000), + num_outputs_per_prompt=1, + ), + ) + + # Timed run + start = time.time() + outputs = omni.generate( PROMPT, OmniDiffusionSamplingParams( height=height, width=width, num_inference_steps=DEFAULT_STEPS, guidance_scale=0.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed + 1000), + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), num_outputs_per_prompt=1, ), ) + elapsed_ms = (time.time() - start) * 1000 - # Timed run - start = time.time() - outputs = omni.generate( - PROMPT, - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=DEFAULT_STEPS, - guidance_scale=0.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), - num_outputs_per_prompt=1, - ), - ) - elapsed_ms = (time.time() - start) * 1000 - - return InferenceResult( - images=outputs[0].request_output.images, - elapsed_ms=elapsed_ms, - ) + return InferenceResult( + images=outputs[0].request_output.images, + elapsed_ms=elapsed_ms, + ) finally: - omni.close() _cleanup_distributed() diff --git a/tests/e2e/offline_inference/test_stable_audio_model.py b/tests/e2e/offline_inference/test_stable_audio_model.py index ff4d9b4017..21d75aad52 100644 --- a/tests/e2e/offline_inference/test_stable_audio_model.py +++ b/tests/e2e/offline_inference/test_stable_audio_model.py @@ -1,6 +1,3 @@ -import sys -from pathlib import Path - import numpy as np import pytest import torch @@ -10,31 +7,25 @@ from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - # Use random weights model for CI testing (small, no authentication required) models = ["linyueqian/stable_audio_random"] +# omni_runner expects (model, stage_configs_path); single-stage diffusion has no YAML. +test_params = [(m, None) for m in models] + @pytest.mark.core_model @pytest.mark.diffusion @hardware_test(res={"cuda": "L4", "xpu": "B60"}) -@pytest.mark.parametrize("model_name", models) -def test_stable_audio_model(model_name: str): - m = Omni(model=model_name) - +@pytest.mark.parametrize("omni_runner", test_params, indirect=True) +def test_stable_audio_model(omni_runner): # Use minimal settings for testing # Generate a short 2-second audio clip with minimal inference steps audio_start_in_s = 0.0 audio_end_in_s = 2.0 # Short duration for fast testing sample_rate = 44100 # Stable Audio uses 44100 Hz - outputs = m.generate( + outputs = omni_runner.omni.generate( prompts={ "prompt": "The sound of a dog barking", "negative_prompt": "Low quality.", diff --git a/tests/e2e/offline_inference/test_t2i_model.py b/tests/e2e/offline_inference/test_t2i_model.py index 55a154f61b..fc54f9a7ff 100644 --- a/tests/e2e/offline_inference/test_t2i_model.py +++ b/tests/e2e/offline_inference/test_t2i_model.py @@ -1,7 +1,3 @@ -import os -import sys -from pathlib import Path - import pytest import torch @@ -10,14 +6,12 @@ from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) +# Match unprefixed HF id even when MODEL_PREFIX is set (omni_runner resolves full path). +_QWEN_IMAGE_RANDOM_ID = "riverclouds/qwen_image_random" -from vllm_omni import Omni -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" +def _is_qwen_image_random(model_path: str) -> bool: + return model_path.rstrip("/").endswith(_QWEN_IMAGE_RANDOM_ID) models = ["Tongyi-MAI/Z-Image-Turbo", "riverclouds/qwen_image_random"] @@ -27,56 +21,55 @@ if current_omni_platform.is_npu(): models = ["Tongyi-MAI/Z-Image-Turbo", "Qwen/Qwen-Image"] +# omni_runner expects (model, stage_configs_path); single-stage diffusion has no YAML. +test_params = [(m, None) for m in models] + @pytest.mark.core_model @pytest.mark.advanced_model @pytest.mark.diffusion @hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 1, "rocm": 1, "xpu": 2}) -@pytest.mark.parametrize("model_name", models) -def test_diffusion_model(model_name: str, run_level): - if run_level == "core_model" and model_name != "riverclouds/qwen_image_random": +@pytest.mark.parametrize("omni_runner", test_params, indirect=True) +def test_diffusion_model(omni_runner, run_level): + resolved = omni_runner.model_name + if run_level == "core_model" and not _is_qwen_image_random(resolved): pytest.skip() - if run_level == "advanced_model" and model_name == "riverclouds/qwen_image_random": + if run_level == "advanced_model" and _is_qwen_image_random(resolved): pytest.skip() - m = None - try: - m = Omni(model=model_name) - # high resolution may cause OOM on L4 - height = 256 - width = 256 - outputs = m.generate( - "a photo of a cat sitting on a laptop keyboard", - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=2, - guidance_scale=0.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), - num_outputs_per_prompt=2, - ), - ) - # Extract images from request_output['images'] - first_output = outputs[0] - assert first_output.final_output_type == "image" - if not hasattr(first_output, "request_output") or not first_output.request_output: - raise ValueError("No request_output found in OmniRequestOutput") - - req_out = first_output.request_output - if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"): - raise ValueError("Invalid request_output structure or missing 'images' key") - - images = req_out.images - - assert len(images) == 2 - # check image size - assert images[0].width == width - assert images[0].height == height - images[0].save("image_output.png") - except Exception as e: - print(f"Test failed with error: {e}") - raise - finally: - if m is not None and hasattr(m, "close"): - m.close() + # high resolution may cause OOM on L4 + height = 256 + width = 256 + sampling = OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=2, + guidance_scale=0.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), + num_outputs_per_prompt=2, + ) + + # OmniRunner.generate() is typed for list[TextPrompt]; diffusion uses Omni.generate(str, ...). + outputs = omni_runner.omni.generate( + "a photo of a cat sitting on a laptop keyboard", + sampling, + ) + + # Extract images from request_output['images'] + first_output = outputs[0] + assert first_output.final_output_type == "image" + if not hasattr(first_output, "request_output") or not first_output.request_output: + raise ValueError("No request_output found in OmniRequestOutput") + + req_out = first_output.request_output + if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"): + raise ValueError("Invalid request_output structure or missing 'images' key") + + images = req_out.images + + assert len(images) == 2 + # check image size + assert images[0].width == width + assert images[0].height == height + images[0].save("image_output.png") diff --git a/tests/e2e/offline_inference/test_t2v_model.py b/tests/e2e/offline_inference/test_t2v_model.py index 94c9dedf74..6fe623cfc8 100644 --- a/tests/e2e/offline_inference/test_t2v_model.py +++ b/tests/e2e/offline_inference/test_t2v_model.py @@ -1,22 +1,13 @@ import os -import sys -from pathlib import Path import pytest import torch +from tests.conftest import OmniRunner from vllm_omni.inputs.data import OmniDiffusionSamplingParams - -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" models = ["Wan-AI/Wan2.2-T2V-A14B-Diffusers"] @@ -24,28 +15,28 @@ @pytest.mark.parametrize("model_name", models) def test_video_diffusion_model(model_name: str): - m = Omni( - model=model_name, + with OmniRunner( + model_name, boundary_ratio=0.875, flow_shift=5.0, - ) - # Use minimal settings for testing - # num_frames must satisfy: num_frames % vae_scale_factor_temporal == 1 - # For Wan2.2, vae_scale_factor_temporal=4, so valid values are 5, 9, 13, 17, ... - height = 480 - width = 640 - num_frames = 5 - outputs = m.generate( - prompts="A cat sitting on a table", - sampling_params_list=OmniDiffusionSamplingParams( - height=height, - width=width, - num_frames=num_frames, - num_inference_steps=2, - guidance_scale=1.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), - ), - ) + ) as runner: + # Use minimal settings for testing + # num_frames must satisfy: num_frames % vae_scale_factor_temporal == 1 + # For Wan2.2, vae_scale_factor_temporal=4, so valid values are 5, 9, 13, 17, ... + height = 480 + width = 640 + num_frames = 5 + outputs = runner.omni.generate( + prompts="A cat sitting on a table", + sampling_params_list=OmniDiffusionSamplingParams( + height=height, + width=width, + num_frames=num_frames, + num_inference_steps=2, + guidance_scale=1.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), + ), + ) first_output = outputs[0] assert first_output.final_output_type == "image" if not hasattr(first_output, "request_output") or not first_output.request_output: diff --git a/tests/e2e/offline_inference/test_teacache.py b/tests/e2e/offline_inference/test_teacache.py index efc0e43e86..7cd1c5a479 100644 --- a/tests/e2e/offline_inference/test_teacache.py +++ b/tests/e2e/offline_inference/test_teacache.py @@ -8,26 +8,14 @@ It uses minimal settings to keep test time short for CI. """ -import os -import sys -from pathlib import Path - import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.platforms import current_omni_platform - -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni from vllm_omni.outputs import OmniRequestOutput - -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" +from vllm_omni.platforms import current_omni_platform # Use random weights model for testing models = ["riverclouds/qwen_image_random"] @@ -44,20 +32,17 @@ def test_teacache(model_name: str): cache_config = { "rel_l1_thresh": 0.2, # Default threshold } - m = None - try: - m = Omni( - model=model_name, - cache_backend="tea_cache", - cache_config=cache_config, - ) - + with OmniRunner( + model_name, + cache_backend="tea_cache", + cache_config=cache_config, + ) as runner: # Use minimal settings for fast testing height = 256 width = 256 num_inference_steps = 4 # Minimal steps for fast test - outputs = m.generate( + outputs = runner.omni.generate( "a photo of a cat sitting on a laptop keyboard", OmniDiffusionSamplingParams( height=height, @@ -86,9 +71,3 @@ def test_teacache(model_name: str): # Check image size assert images[0].width == width assert images[0].height == height - except Exception as e: - print(f"Test failed with error: {e}") - raise - finally: - if m is not None and hasattr(m, "close"): - m.close() diff --git a/tests/e2e/offline_inference/test_vae_decode_parallelism.py b/tests/e2e/offline_inference/test_vae_decode_parallelism.py index cee76fac2e..0fce28d669 100644 --- a/tests/e2e/offline_inference/test_vae_decode_parallelism.py +++ b/tests/e2e/offline_inference/test_vae_decode_parallelism.py @@ -18,7 +18,7 @@ import time -from vllm_omni import Omni +from tests.conftest import OmniRunner from vllm_omni.platforms import current_omni_platform # os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" @@ -72,23 +72,22 @@ def is_nextstep_model(model_name: str) -> bool: def model_run(model_configs, tp, out_height, out_width, out_frames, using_tile, vae_patch_parallel_size=1): - m = None - try: - parallel_config = DiffusionParallelConfig( - tensor_parallel_size=tp, - vae_patch_parallel_size=vae_patch_parallel_size, - ) + parallel_config = DiffusionParallelConfig( + tensor_parallel_size=tp, + vae_patch_parallel_size=vae_patch_parallel_size, + ) - omni_kwargs = { - "model": model_configs["model_name"], - "vae_use_tiling": using_tile, - "parallel_config": parallel_config, - } - use_nextstep = is_nextstep_model(model_configs["model_name"]) - if use_nextstep: - # NextStep-1.1 requires explicit pipeline class - omni_kwargs["model_class_name"] = "NextStep11Pipeline" - m = Omni(**omni_kwargs) + omni_kwargs = { + "vae_use_tiling": using_tile, + "parallel_config": parallel_config, + } + use_nextstep = is_nextstep_model(model_configs["model_name"]) + if use_nextstep: + # NextStep-1.1 requires explicit pipeline class + omni_kwargs["model_class_name"] = "NextStep11Pipeline" + + with OmniRunner(model_configs["model_name"], **omni_kwargs) as runner: + m = runner.omni image = Image.new("RGB", (out_width, out_height), (0, 0, 0)) start = time.perf_counter() outputs = m.generate( @@ -115,9 +114,6 @@ def model_run(model_configs, tp, out_height, out_width, out_frames, using_tile, # frames shape: (batch, num_frames, height, width, channels) cost = (end - start) * 1000 return frames, cost - finally: - if m is not None: - m.close() cleanup_dist_env_and_memory() diff --git a/tests/e2e/offline_inference/test_voxcpm2.py b/tests/e2e/offline_inference/test_voxcpm2.py index 7e17c6a369..4e4f635d5c 100644 --- a/tests/e2e/offline_inference/test_voxcpm2.py +++ b/tests/e2e/offline_inference/test_voxcpm2.py @@ -5,6 +5,7 @@ import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test VOXCPM2_MODEL = "openbmb/VoxCPM2" @@ -24,10 +25,8 @@ @pytest.fixture(scope="module") def voxcpm2_engine(): """Create VoxCPM2 engine for testing.""" - from vllm_omni import Omni - - engine = Omni(model=VOXCPM2_MODEL, stage_configs_path=STAGE_CONFIG) - yield engine + with OmniRunner(VOXCPM2_MODEL, stage_configs_path=STAGE_CONFIG) as runner: + yield runner.omni def _extract_audio(multimodal_output: dict) -> torch.Tensor: diff --git a/tests/e2e/offline_inference/test_voxtral_tts.py b/tests/e2e/offline_inference/test_voxtral_tts.py index b559cc252d..4f440f243b 100644 --- a/tests/e2e/offline_inference/test_voxtral_tts.py +++ b/tests/e2e/offline_inference/test_voxtral_tts.py @@ -19,7 +19,6 @@ import uuid os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" from pathlib import Path @@ -30,10 +29,9 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from vllm import SamplingParams -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test from vllm_omni.entrypoints.async_omni import AsyncOmni -from vllm_omni.entrypoints.omni import Omni MODEL = "mistralai/Voxtral-4B-TTS-2603" STAGE_CONFIG = str( @@ -83,14 +81,12 @@ def test_voxtral_tts_offline_basic(run_level): """Test basic Voxtral TTS offline inference with a voice preset.""" stage_config = _resolve_stage_config(run_level) - omni = Omni( - model=MODEL, + with OmniRunner( + MODEL, stage_configs_path=stage_config, - stage_init_timeout=300, enforce_eager=True, - ) - - try: + ) as runner: + omni = runner.omni inputs = _compose_request(MODEL, TEST_TEXT, VOICE) sampling_params = SamplingParams(max_tokens=2500) @@ -127,9 +123,6 @@ def test_voxtral_tts_offline_basic(run_level): # Verify audio isn't all zeros / silence assert np.max(np.abs(audio_array)) > 0.01, "Audio appears to be silence" - finally: - omni.close() - @pytest.mark.advanced_model @pytest.mark.omni diff --git a/tests/e2e/offline_inference/test_zimage_parallelism.py b/tests/e2e/offline_inference/test_zimage_parallelism.py index b685704ae4..27edc48f20 100644 --- a/tests/e2e/offline_inference/test_zimage_parallelism.py +++ b/tests/e2e/offline_inference/test_zimage_parallelism.py @@ -12,7 +12,6 @@ """ import os -import sys import time from pathlib import Path @@ -20,21 +19,14 @@ import pytest import torch from PIL import Image -from vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from tests.conftest import OmniRunner from tests.utils import DeviceMemoryMonitor, hardware_test -from vllm_omni import Omni from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" PROMPT = "a photo of a cat sitting on a laptop keyboard" @@ -97,61 +89,61 @@ def _run_zimage_generate( device_index = current_omni_platform.current_device() monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - m = Omni( - model=_get_zimage_model(), - parallel_config=DiffusionParallelConfig( - tensor_parallel_size=tp_size, - vae_patch_parallel_size=vae_patch_parallel_size, - ), - enforce_eager=enforce_eager, - vae_use_tiling=vae_use_tiling, - ) try: - # NOTE: Omni closes itself when a generate() call is exhausted. - # To avoid measuring teardown time (process shutdown, memory cleanup), - # we measure the latency to produce *subsequent* outputs within a single - # generator run. - # - # This also serves as a warmup: the first output may include extra - # compilation/caching overhead, while later outputs are closer to - # steady-state inference. - gen = m.generate( - [PROMPT] * num_requests, - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=0.0, - seed=seed, - num_outputs_per_prompt=1, + # Each run needs a distinct DiffusionParallelConfig; use OmniRunner per call (not the + # parametrized omni_runner fixture, which is fixed per module). + with OmniRunner( + _get_zimage_model(), + parallel_config=DiffusionParallelConfig( + tensor_parallel_size=tp_size, + vae_patch_parallel_size=vae_patch_parallel_size, ), - py_generator=True, - ) - - warmup_output = next(gen) - - t_prev = time.perf_counter() - per_request_times_s: list[float] = [] - last_output = warmup_output - for _ in range(num_requests - 1): - last_output = next(gen) - t_now = time.perf_counter() - per_request_times_s.append(t_now - t_prev) - t_prev = t_now - - # Ensure the generator is fully consumed so it can clean up. - for _ in gen: - pass - - median_time_s = float(np.median(per_request_times_s)) - - peak_memory_mb = monitor.peak_used_mb - - return _extract_single_image([last_output]), median_time_s, peak_memory_mb + enforce_eager=enforce_eager, + vae_use_tiling=vae_use_tiling, + ) as runner: + # NOTE: Omni closes itself when a generate() call is exhausted. + # To avoid measuring teardown time (process shutdown, memory cleanup), + # we measure the latency to produce *subsequent* outputs within a single + # generator run. + # + # This also serves as a warmup: the first output may include extra + # compilation/caching overhead, while later outputs are closer to + # steady-state inference. + gen = runner.omni.generate( + [PROMPT] * num_requests, + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=num_inference_steps, + guidance_scale=0.0, + seed=seed, + num_outputs_per_prompt=1, + ), + py_generator=True, + ) + + warmup_output = next(gen) + + t_prev = time.perf_counter() + per_request_times_s: list[float] = [] + last_output = warmup_output + for _ in range(num_requests - 1): + last_output = next(gen) + t_now = time.perf_counter() + per_request_times_s.append(t_now - t_prev) + t_prev = t_now + + # Ensure the generator is fully consumed so it can clean up. + for _ in gen: + pass + + median_time_s = float(np.median(per_request_times_s)) + + peak_memory_mb = monitor.peak_used_mb + + return _extract_single_image([last_output]), median_time_s, peak_memory_mb finally: monitor.stop() - m.close() - cleanup_dist_env_and_memory() @pytest.mark.advanced_model diff --git a/tests/e2e/online_serving/test_images_generations_lora.py b/tests/e2e/online_serving/test_images_generations_lora.py index 8c826591a5..fb1e3ea1e0 100644 --- a/tests/e2e/online_serving/test_images_generations_lora.py +++ b/tests/e2e/online_serving/test_images_generations_lora.py @@ -28,7 +28,7 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" MODEL = "Tongyi-MAI/Z-Image-Turbo" -DIFFUSION_INIT_TIMEOUT_S = 700 +DIFFUSION_INIT_TIMEOUT_S = 900 PROMPT = "a photo of a cat sitting on a laptop keyboard"