vllm-project · Gaohan123 · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026
@@ -113,7 +113,7 @@ steps:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
   - label: "Diffusion Sequence Parallelism Test"
-    timeout_in_minutes: 20
+    timeout_in_minutes: 25
     depends_on: upload-merge-pipeline
     commands:
       - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py tests/diffusion/distributed/test_ulysses_uaa_perf.py

@@ -141,7 +141,6 @@ steps:
           - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
           - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py
           - buildkite-agent artifact upload "tests/dfx/perf/results/*.json"
-          - buildkite-agent artifact upload "tests/dfx/perf/results/*.html"
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -244,7 +243,7 @@ steps:
       - export DEFAULT_OUTPUT_DIR=tests/dfx/perf/results
       - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-omni-performance
       - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-qwen-image-performance
-      - buildkite-agent artifact download "tests/dfx/perf/results/*.html" . --step nightly-omni-performance
+      - buildkite-agent artifact download "tests/dfx/perf/results/*.html" . --step nightly-testcase-statistics
       - python tools/nightly/generate_nightly_perf_excel.py
       - python tools/nightly/generate_nightly_perf_html.py
       - python tools/nightly/send_nightly_email.py --report-file "tests/dfx/perf/results/*.xlsx, tests/dfx/perf/results/*.html"

@@ -1771,8 +1771,12 @@ def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: st
         server_args = params.server_args or []
         if params.use_omni and params.stage_init_timeout is not None:
             server_args = [*server_args, "--stage-init-timeout", str(params.stage_init_timeout)]
+        elif params.use_omni:  # omni-only flag; keep it off non-omni servers
+            server_args = [*server_args, "--stage-init-timeout", "600"]
         if params.init_timeout is not None:
             server_args = [*server_args, "--init-timeout", str(params.init_timeout)]
+        else:
+            server_args = [*server_args, "--init-timeout", "900"]
         if params.use_stage_cli:
             if not params.use_omni:
                 raise ValueError("omni_server with use_stage_cli=True requires use_omni=True")
@@ -2870,9 +2874,9 @@ def __init__(
         self,
         model_name: str,
         seed: int = 42,
-        stage_init_timeout: int = 300,
+        stage_init_timeout: int = 600,
         batch_timeout: int = 10,
-        init_timeout: int = 300,
+        init_timeout: int = 900,
         shm_threshold_bytes: int = 65536,
         log_stats: bool = False,
         stage_configs_path: str | None = None,

@@ -22,9 +22,9 @@
 from PIL import Image
 from vllm.assets.image import ImageAsset
 
-from tests.conftest import modify_stage_config
+from tests.conftest import OmniRunner, modify_stage_config
 from tests.utils import hardware_test
-from vllm_omni.entrypoints.omni import Omni
+from vllm_omni import Omni
 from vllm_omni.platforms import current_omni_platform
 
 # Reference pixel data extracted from the known-good output image
@@ -210,11 +210,10 @@ def test_bagel_img2img_shared_memory_connector(run_level):
     input_image = _load_input_image()
     config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml")
     config_path = _resolve_stage_config(config_path, run_level)
-    omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300)
-
-    try:
-        generated_image = _generate_bagel_img2img(omni, input_image)
+    with OmniRunner(
+        "ByteDance-Seed/BAGEL-7B-MoT",
+        stage_configs_path=config_path,
+    ) as runner:
+        generated_image = _generate_bagel_img2img(runner.omni, input_image)
         if run_level == "advanced_model":
             _validate_pixels(generated_image)
-    finally:
-        omni.close()
@@ -22,7 +22,6 @@
 from vllm_omni.outputs import OmniRequestOutput
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
 
 from pathlib import Path
 
@@ -32,9 +31,9 @@
 from PIL import Image
 from safetensors.torch import save_file
 
-from tests.conftest import modify_stage_config
+from tests.conftest import OmniRunner, modify_stage_config
 from tests.utils import hardware_test
-from vllm_omni.entrypoints.omni import Omni
+from vllm_omni import Omni
 from vllm_omni.lora.request import LoRARequest
 from vllm_omni.lora.utils import stable_lora_int_id
 
@@ -154,8 +153,8 @@ def _make_file_lora_request(adapter_dir: Path) -> LoRARequest:
 def test_bagel_lora_scale_and_deactivation(run_level, tmp_path):
     """Validate LoRA effect, bounded perturbation, and clean deactivation."""
     config_path = _resolve_stage_config(BAGEL_STAGE_CONFIG, run_level)
-    omni = Omni(model=MODEL, stage_configs_path=config_path, stage_init_timeout=300)
-    try:
+    with OmniRunner(MODEL, stage_configs_path=config_path) as runner:
+        omni = runner.omni
         lora_request = _make_file_lora_request(tmp_path / "bagel_lora")
 
         # 1) Baseline (no LoRA)
@@ -194,5 +193,3 @@ def test_bagel_lora_scale_and_deactivation(run_level, tmp_path):
 
         # (d) Deactivation fully restores base model
         assert diff_restored == 0.0, f"Base model not restored after LoRA deactivation: diff={diff_restored}"
-    finally:
-        omni.close()
@@ -16,7 +16,6 @@
 import os
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
 import signal
 import socket
 import subprocess
@@ -28,9 +27,9 @@
 import pytest
 from PIL import Image
 
-from tests.conftest import modify_stage_config
+from tests.conftest import OmniRunner, modify_stage_config
 from tests.utils import hardware_test
-from vllm_omni.entrypoints.omni import Omni
+from vllm_omni import Omni
 from vllm_omni.platforms import current_omni_platform
 
 # Reference pixel data extracted from the known-good output image
@@ -199,14 +198,13 @@ def test_bagel_text2img_shared_memory_connector(run_level):
     """Test Bagel text2img with shared memory connector."""
     config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml")
     config_path = _resolve_stage_config(config_path, run_level)
-    omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300)
-
-    try:
-        generated_image = _generate_bagel_image(omni)
+    with OmniRunner(
+        "ByteDance-Seed/BAGEL-7B-MoT",
+        stage_configs_path=config_path,
+    ) as runner:
+        generated_image = _generate_bagel_image(runner.omni)
         if run_level == "advanced_model":
             _validate_pixels(generated_image)
-    finally:
-        omni.close()
 
 
 def _wait_for_port(host: str, port: int, timeout: int = 30) -> bool:
@@ -319,7 +317,6 @@ def test_bagel_text2img_mooncake_connector(run_level):
 
     mooncake_master_proc = None
     temp_config_file = None
-    omni = None
 
     try:
         _cleanup_mooncake_processes()
@@ -349,15 +346,15 @@ def test_bagel_text2img_mooncake_connector(run_level):
         )
 
         temp_config_file = _resolve_stage_config(temp_config_file, run_level)
-        omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=temp_config_file, stage_init_timeout=300)
-
-        generated_image = _generate_bagel_image(omni)
-        if run_level == "advanced_model":
-            _validate_pixels(generated_image)
+        with OmniRunner(
+            "ByteDance-Seed/BAGEL-7B-MoT",
+            stage_configs_path=temp_config_file,
+        ) as runner:  # no timeout override, same as the shared-memory test
+            generated_image = _generate_bagel_image(runner.omni)
+            if run_level == "advanced_model":
+                _validate_pixels(generated_image)
 
     finally:
-        if omni:
-            omni.close()
         if temp_config_file:
             try:
                 os.unlink(temp_config_file)

@@ -21,15 +21,13 @@
 import os
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
 from pathlib import Path
 
 import pytest
 from vllm.assets.image import ImageAsset
 
-from tests.conftest import modify_stage_config
+from tests.conftest import OmniRunner, modify_stage_config
 from tests.utils import hardware_test
-from vllm_omni.entrypoints.omni import Omni
 
 MODEL_NAME = "ByteDance-Seed/BAGEL-7B-MoT"
 STAGE_CONFIG = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml")
@@ -76,13 +74,11 @@ def _extract_text(omni_outputs: list) -> str:
 def test_bagel_text2text(run_level):
     """Test Bagel text2text produces correct text output."""
     config_path = _resolve_stage_config(STAGE_CONFIG, run_level)
-    omni = Omni(
-        model=MODEL_NAME,
+    with OmniRunner(
+        MODEL_NAME,
         stage_configs_path=config_path,
-        stage_init_timeout=300,
-    )
-
-    try:
+    ) as runner:
+        omni = runner.omni
         prompt = "<|im_start|>user\nWhere is the capital of France?<|im_end|>\n<|im_start|>assistant\n"
         params_list = omni.default_sampling_params_list
         omni_outputs = list(
@@ -100,8 +96,6 @@ def test_bagel_text2text(run_level):
             assert text == REFERENCE_TEXT_TEXT2TEXT, (
                 f"Text mismatch: expected {REFERENCE_TEXT_TEXT2TEXT!r}, got {text!r}"
             )
-    finally:
-        omni.close()
 
 
 @pytest.mark.core_model
@@ -112,13 +106,11 @@ def test_bagel_img2text(run_level):
     """Test Bagel img2text produces correct text output."""
     input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB")
     config_path = _resolve_stage_config(STAGE_CONFIG, run_level)
-    omni = Omni(
-        model=MODEL_NAME,
+    with OmniRunner(
+        MODEL_NAME,
         stage_configs_path=config_path,
-        stage_init_timeout=300,
-    )
-
-    try:
+    ) as runner:  # no timeout override, same as test_bagel_text2text
+        omni = runner.omni
         prompt = "<|im_start|>user\n<|image_pad|>\nPlease describe this image<|im_end|>\n<|im_start|>assistant\n"
         params_list = omni.default_sampling_params_list
         omni_outputs = list(
@@ -140,5 +132,3 @@ def test_bagel_img2text(run_level):
 
         if run_level == "advanced_model":
             assert text == REFERENCE_TEXT_IMG2TEXT, f"Text mismatch: expected {REFERENCE_TEXT_IMG2TEXT!r}, got {text!r}"
-    finally:
-        omni.close()
@@ -8,27 +8,15 @@
 It uses minimal settings to keep test time short for CI.
 """
 
-import os
-import sys
-from pathlib import Path
-
 import pytest
 import torch
 
+from tests.conftest import OmniRunner
 from tests.utils import hardware_test
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
-
-# ruff: noqa: E402
-REPO_ROOT = Path(__file__).resolve().parents[2]
-if str(REPO_ROOT) not in sys.path:
-    sys.path.insert(0, str(REPO_ROOT))
-
-from vllm_omni import Omni
 from vllm_omni.outputs import OmniRequestOutput
 from vllm_omni.platforms import current_omni_platform
 
-os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
-
 # Use random weights model for testing
 models = ["riverclouds/qwen_image_random"]
 
@@ -48,20 +36,17 @@ def test_cache_dit(model_name: str):
         "residual_diff_threshold": 0.24,
         "max_continuous_cached_steps": 3,
     }
-    m = None
-    try:
-        m = Omni(
-            model=model_name,
-            cache_backend="cache_dit",
-            cache_config=cache_config,
-        )
-
+    with OmniRunner(
+        model_name,
+        cache_backend="cache_dit",
+        cache_config=cache_config,
+    ) as runner:
         # Use minimal settings for fast testing
         height = 256
         width = 256
         num_inference_steps = 4  # Minimal steps for fast test
 
-        outputs = m.generate(
+        outputs = runner.omni.generate(
             "a photo of a cat sitting on a laptop keyboard",
             OmniDiffusionSamplingParams(
                 height=height,
@@ -90,9 +75,3 @@ def test_cache_dit(model_name: str):
         # Check image size
         assert images[0].width == width
         assert images[0].height == height
-    except Exception as e:
-        print(f"Test failed with error: {e}")
-        raise
-    finally:
-        if m is not None and hasattr(m, "close"):
-            m.close()
@@ -1,22 +1,14 @@
 import gc
-import sys
-from pathlib import Path
 
 import pytest
 import torch
 from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
 
+from tests.conftest import OmniRunner
 from tests.utils import DeviceMemoryMonitor, hardware_test
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 from vllm_omni.platforms import current_omni_platform
 
-# ruff: noqa: E402
-REPO_ROOT = Path(__file__).resolve().parents[2]
-if str(REPO_ROOT) not in sys.path:
-    sys.path.insert(0, str(REPO_ROOT))
-
-from vllm_omni import Omni
-
 models = ["riverclouds/qwen_image_random"]
 
 
@@ -27,30 +19,29 @@ def inference(model_name: str, offload: bool = True):
     current_omni_platform.reset_peak_memory_stats()
     monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02)
     monitor.start()
-    m = Omni(
-        model=model_name,
+    with OmniRunner(
+        model_name,
         # TODO: we might want to add overlapped feature e2e tests
         # cache_backend="cache_dit",
         enable_cpu_offload=offload,
-    )
-    current_omni_platform.reset_peak_memory_stats()
-    height = 256
-    width = 256
+    ) as runner:
+        current_omni_platform.reset_peak_memory_stats()
+        height = 256
+        width = 256
 
-    m.generate(
-        "a photo of a cat sitting on a laptop keyboard",
-        OmniDiffusionSamplingParams(
-            height=height,
-            width=width,
-            num_inference_steps=9,
-            guidance_scale=0.0,
-            generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42),
-        ),
-    )
+        runner.omni.generate(
+            "a photo of a cat sitting on a laptop keyboard",
+            OmniDiffusionSamplingParams(
+                height=height,
+                width=width,
+                num_inference_steps=9,
+                guidance_scale=0.0,
+                generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42),
+            ),
+        )
     peak = monitor.peak_used_mb
     monitor.stop()
 
-    del m
     gc.collect()
     current_omni_platform.empty_cache()