From 5ecac3de77ecadf4a4f6a5e786c1c1fdea93b4fb Mon Sep 17 00:00:00 2001
From: princepride <wangzhipeng628@gmail.com>
Date: Thu, 19 Mar 2026 03:25:23 +0000
Subject: [PATCH 1/3] [CI] Split BAGEL tests into dummy/real weight tiers
 (L2/L3)

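Add tiered testing for the BAGEL model, following the Qwen3-Omni pattern:
- Pre-merge (test-ready): run with load_format: dummy for fast validation;
  offline pixel-level checks are skipped (they run only at the
  advanced_model level)
- Post-merge (test-merge): run with real weights for pixel-level accuracy;
  text2img runs only the shared_memory variant (-k "shared_memory")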

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Signed-off-by: princepride <wangzhipeng628@gmail.com>
---
 .buildkite/test-merge.yml                     | 75 ++++++++++---------
 .buildkite/test-ready.yml                     |  6 +-
 .../stage_configs/bagel_mooncake_ci.yaml      |  2 +
 .../stage_configs/bagel_sharedmemory_ci.yaml  |  2 +
 .../offline_inference/test_bagel_img2img.py   | 27 ++++++-
 .../offline_inference/test_bagel_text2img.py  | 34 ++++++++-
 tests/e2e/online_serving/test_bagel_online.py | 32 +++++++-
 7 files changed, 131 insertions(+), 47 deletions(-)

diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml
index 5479f8ac1e8..5913ff5747e 100644
--- a/.buildkite/test-merge.yml
+++ b/.buildkite/test-merge.yml
@@ -272,37 +272,44 @@ steps:
                   path: /mnt/hf-cache
                   type: DirectoryOrCreate
 
-  # - label: "Bagel Text2Img Model Test with H100"
-  #   timeout_in_minutes: 30
-  #   depends_on: upload-merge-pipeline
-  #   commands:
-  #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  #     - pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
-  #   agents:
-  #     queue: "mithril-h100-pool"
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           containers:
-  #             - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-  #               resources:
-  #                 limits:
-  #                   nvidia.com/gpu: 1
-  #               volumeMounts:
-  #                 - name: devshm
-  #                   mountPath: /dev/shm
-  #                 - name: hf-cache
-  #                   mountPath: /root/.cache/huggingface
-  #               env:
-  #                 - name: HF_HOME
-  #                   value: /root/.cache/huggingface
-  #           nodeSelector:
-  #             node.kubernetes.io/instance-type: gpu-h100-sxm
-  #           volumes:
-  #             - name: devshm
-  #               emptyDir:
-  #                 medium: Memory
-  #             - name: hf-cache
-  #               hostPath:
-  #                 path: /mnt/hf-cache
-  #                 type: DirectoryOrCreate
+  - label: "Bagel Model Test with H100 (Real Weights)"
+    timeout_in_minutes: 60
+    depends_on: upload-merge-pipeline
+    commands:
+      - |
+        timeout 55m bash -c '
+          export VLLM_WORKER_MULTIPROC_METHOD=spawn
+          export VLLM_TEST_CLEAN_GPU_MEMORY=1
+          export VLLM_IMAGE_FETCH_TIMEOUT=60
+          pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory"
+          pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model"
+          pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model"
+        '
+    agents:
+      queue: "mithril-h100-pool"
+    plugins:
+      - kubernetes:
+          podSpec:
+            containers:
+              - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                resources:
+                  limits:
+                    nvidia.com/gpu: 1
+                volumeMounts:
+                  - name: devshm
+                    mountPath: /dev/shm
+                  - name: hf-cache
+                    mountPath: /root/.cache/huggingface
+                env:
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+            nodeSelector:
+              node.kubernetes.io/instance-type: gpu-h100-sxm
+            volumes:
+              - name: devshm
+                emptyDir:
+                  medium: Memory
+              - name: hf-cache
+                hostPath:
+                  path: /mnt/hf-cache
+                  type: DirectoryOrCreate
diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml
index c8579979c84..77fbcc07eaf 100644
--- a/.buildkite/test-ready.yml
+++ b/.buildkite/test-ready.yml
@@ -268,7 +268,7 @@ steps:
         timeout 30m bash -c '
           export VLLM_WORKER_MULTIPROC_METHOD=spawn
           export VLLM_TEST_CLEAN_GPU_MEMORY=1
-          pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
+          pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "core_model" --run-level "core_model"
         '
     agents:
       queue: "mithril-h100-pool"
@@ -306,7 +306,7 @@ steps:
         timeout 30m bash -c '
           export VLLM_WORKER_MULTIPROC_METHOD=spawn
           export VLLM_TEST_CLEAN_GPU_MEMORY=1
-          pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py
+          pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "core_model" --run-level "core_model"
         '
     agents:
       queue: "mithril-h100-pool"
@@ -345,7 +345,7 @@ steps:
           export VLLM_WORKER_MULTIPROC_METHOD=spawn
           export VLLM_TEST_CLEAN_GPU_MEMORY=1
           export VLLM_IMAGE_FETCH_TIMEOUT=60
-          pytest -s -v tests/e2e/online_serving/test_bagel_online.py
+          pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "core_model" --run-level "core_model"
         '
     agents:
       queue: "mithril-h100-pool"
diff --git a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml
index dbb93344b99..7f3a3a6f4e4 100644
--- a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml
+++ b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml
@@ -21,6 +21,7 @@ stage_args:
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       tensor_parallel_size: 1
+      load_format: dummy
       omni_kv_config:
         need_send_cache: true
         kv_transfer_criteria:
@@ -54,6 +55,7 @@ stage_args:
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       tensor_parallel_size: 1
+      load_format: dummy
       omni_kv_config:
         need_recv_cache: true
     engine_input_source: [0]
diff --git a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml
index 721c50248a0..aa3cc77188a 100644
--- a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml
+++ b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml
@@ -21,6 +21,7 @@ stage_args:
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       tensor_parallel_size: 1
+      load_format: dummy
       omni_kv_config:
         need_send_cache: true
         kv_transfer_criteria:
@@ -53,6 +54,7 @@ stage_args:
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       tensor_parallel_size: 1
+      load_format: dummy
       omni_kv_config:
         need_recv_cache: true
     engine_input_source: [0]
diff --git a/tests/e2e/offline_inference/test_bagel_img2img.py b/tests/e2e/offline_inference/test_bagel_img2img.py
index da9df2778fa..8c734c6a250 100644
--- a/tests/e2e/offline_inference/test_bagel_img2img.py
+++ b/tests/e2e/offline_inference/test_bagel_img2img.py
@@ -22,6 +22,7 @@
 from PIL import Image
 from vllm.assets.image import ImageAsset
 
+from tests.conftest import modify_stage_config
 from tests.utils import hardware_test
 from vllm_omni.entrypoints.omni import Omni
 
@@ -168,17 +169,39 @@ def _generate_bagel_img2img(
     return generated_image
 
 
+def _resolve_stage_config(config_path: str, run_level: str) -> str:
+    """Resolve stage config based on run level.
+
+    For advanced_model (real weights), strip load_format: dummy so the model
+    falls back to loading real weights from HuggingFace.
+    """
+    if run_level == "advanced_model":
+        return modify_stage_config(
+            config_path,
+            deletes={
+                "stage_args": {
+                    0: ["engine_args.load_format"],
+                    1: ["engine_args.load_format"],
+                }
+            },
+        )
+    return config_path
+
+
 @pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"})
-def test_bagel_img2img_shared_memory_connector():
+def test_bagel_img2img_shared_memory_connector(run_level):
     """Test Bagel img2img with shared memory connector."""
     input_image = _load_input_image()
     config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml")
+    config_path = _resolve_stage_config(config_path, run_level)
     omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300)
 
     try:
         generated_image = _generate_bagel_img2img(omni, input_image)
-        _validate_pixels(generated_image)
+        if run_level == "advanced_model":
+            _validate_pixels(generated_image)
     finally:
         omni.close()
diff --git a/tests/e2e/offline_inference/test_bagel_text2img.py b/tests/e2e/offline_inference/test_bagel_text2img.py
index 7990ac980e1..ed369aedd92 100644
--- a/tests/e2e/offline_inference/test_bagel_text2img.py
+++ b/tests/e2e/offline_inference/test_bagel_text2img.py
@@ -28,6 +28,7 @@
 import pytest
 from PIL import Image
 
+from tests.conftest import modify_stage_config
 from tests.utils import hardware_test
 from vllm_omni.entrypoints.omni import Omni
 
@@ -158,17 +159,39 @@ def _generate_bagel_image(omni: Omni, prompt: str = DEFAULT_PROMPT) -> Image.Ima
     return generated_image
 
 
+def _resolve_stage_config(config_path: str, run_level: str) -> str:
+    """Resolve stage config based on run level.
+
+    For advanced_model (real weights), strip load_format: dummy so the model
+    falls back to loading real weights from HuggingFace.
+    """
+    if run_level == "advanced_model":
+        return modify_stage_config(
+            config_path,
+            deletes={
+                "stage_args": {
+                    0: ["engine_args.load_format"],
+                    1: ["engine_args.load_format"],
+                }
+            },
+        )
+    return config_path
+
+
 @pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"})
-def test_bagel_text2img_shared_memory_connector():
+def test_bagel_text2img_shared_memory_connector(run_level):
     """Test Bagel text2img with shared memory connector."""
     config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml")
+    config_path = _resolve_stage_config(config_path, run_level)
     omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300)
 
     try:
         generated_image = _generate_bagel_image(omni)
-        _validate_pixels(generated_image)
+        if run_level == "advanced_model":
+            _validate_pixels(generated_image)
     finally:
         omni.close()
 
@@ -251,9 +274,10 @@ def _load_mooncake_config(host: str, rpc_port: int, http_port: int) -> str:
 
 
 @pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"})
-def test_bagel_text2img_mooncake_connector():
+def test_bagel_text2img_mooncake_connector(run_level):
     """Test Bagel text2img with Mooncake connector for inter-stage communication."""
     MOONCAKE_HOST = "127.0.0.1"
     MOONCAKE_RPC_PORT = _find_free_port()
@@ -291,10 +315,12 @@ def test_bagel_text2img_mooncake_connector():
             http_port=MOONCAKE_HTTP_PORT,
         )
 
+        temp_config_file = _resolve_stage_config(temp_config_file, run_level)
         omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=temp_config_file, stage_init_timeout=300)
 
         generated_image = _generate_bagel_image(omni)
-        _validate_pixels(generated_image)
+        if run_level == "advanced_model":
+            _validate_pixels(generated_image)
 
     finally:
         if omni:
diff --git a/tests/e2e/online_serving/test_bagel_online.py b/tests/e2e/online_serving/test_bagel_online.py
index 4056cfdef6d..06fcad56466 100644
--- a/tests/e2e/online_serving/test_bagel_online.py
+++ b/tests/e2e/online_serving/test_bagel_online.py
@@ -36,6 +36,7 @@
 from PIL import Image
 from vllm.assets.image import ImageAsset
 
+from tests.conftest import modify_stage_config
 from tests.utils import hardware_test
 
 MODEL = "ByteDance-Seed/BAGEL-7B-MoT"
@@ -47,6 +48,25 @@
 IMG2IMG_PROMPT = "Change the grass color to red"
 
 
+def _resolve_stage_config(config_path: str, run_level: str) -> str:
+    """Resolve stage config based on run level.
+
+    For advanced_model (real weights), strip load_format: dummy so the model
+    falls back to loading real weights from HuggingFace.
+    """
+    if run_level == "advanced_model":
+        return modify_stage_config(
+            config_path,
+            deletes={
+                "stage_args": {
+                    0: ["engine_args.load_format"],
+                    1: ["engine_args.load_format"],
+                }
+            },
+        )
+    return config_path
+
+
 class BagelOmniServer:
     """Context manager to start/stop a vLLM-Omni server for Bagel model tests."""
 
@@ -205,11 +225,13 @@ def _extract_image_from_response(data: dict[str, Any]) -> Image.Image | None:
 
 
 @pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"})
-def test_bagel_text2img_online():
+def test_bagel_text2img_online(run_level):
     """Test Bagel text2img via OpenAI-compatible chat completions API."""
-    with BagelOmniServer() as server:
+    stage_config = _resolve_stage_config(STAGE_CONFIGS_PATH, run_level)
+    with BagelOmniServer(stage_configs_path=stage_config) as server:
         response_data = _send_chat_request(
             server.base_url,
             TEXT2IMG_PROMPT,
@@ -225,13 +247,15 @@ def test_bagel_text2img_online():
 
 
 @pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"})
-def test_bagel_img2img_online():
+def test_bagel_img2img_online(run_level):
     """Test Bagel img2img via OpenAI-compatible chat completions API."""
     input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB")
 
-    with BagelOmniServer() as server:
+    stage_config = _resolve_stage_config(STAGE_CONFIGS_PATH, run_level)
+    with BagelOmniServer(stage_configs_path=stage_config) as server:
         response_data = _send_chat_request(
             server.base_url,
             IMG2IMG_PROMPT,

From d4551d9a285e60e430a9e5392127b7d8a8eb7f8a Mon Sep 17 00:00:00 2001
From: princepride <wangzhipeng628@gmail.com>
Date: Thu, 19 Mar 2026 03:35:26 +0000
Subject: [PATCH 2/3] [CI] Add set -e to BAGEL merge step to fail on first test
 failure

Without set -e, only the exit code of the last pytest command is
checked, masking failures in earlier test invocations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Signed-off-by: princepride <wangzhipeng628@gmail.com>
---
 .buildkite/test-merge.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml
index 5913ff5747e..434ef498812 100644
--- a/.buildkite/test-merge.yml
+++ b/.buildkite/test-merge.yml
@@ -278,6 +278,7 @@ steps:
     commands:
       - |
         timeout 55m bash -c '
+          set -e
           export VLLM_WORKER_MULTIPROC_METHOD=spawn
           export VLLM_TEST_CLEAN_GPU_MEMORY=1
           export VLLM_IMAGE_FETCH_TIMEOUT=60

From 9d7879a49983bdbc0397601ef9dd724887005a7e Mon Sep 17 00:00:00 2001
From: princepride <wangzhipeng628@gmail.com>
Date: Thu, 19 Mar 2026 03:55:17 +0000
Subject: [PATCH 3/3] [CI] Refactor BAGEL online test to use omni_server +
 openai_client fixtures

Replace custom BagelOmniServer with shared omni_server fixture and
openai_client.send_diffusion_request() to unify code style with
Qwen3-Omni tests.

Also fix omni_server fixture to dynamically detect stage IDs from config
instead of hardcoding stages 0/1/2, so it works for models with any
number of stages (e.g., BAGEL has 2, Qwen3-Omni has 3).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Signed-off-by: princepride <wangzhipeng628@gmail.com>
---
 tests/conftest.py                             |  13 +-
 tests/e2e/online_serving/test_bagel_online.py | 272 ++++--------------
 2 files changed, 64 insertions(+), 221 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 2460cfd5bda..a624462b346 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1236,15 +1236,14 @@ def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: st
         port = params.port
         stage_config_path = params.stage_config_path
         if run_level == "advanced_model" and stage_config_path is not None:
+            # Read the stage IDs from the config instead of assuming stages 0/1/2:
+            # hard-coded IDs raise KeyError for models with fewer stages (BAGEL has 2, Qwen3-Omni has 3).
+            with open(stage_config_path, encoding="utf-8") as f:
+                _cfg = yaml.safe_load(f) or {}
+            _stage_ids = [s["stage_id"] for s in _cfg.get("stage_args", []) if "stage_id" in s]
             stage_config_path = modify_stage_config(
                 stage_config_path,
-                deletes={
-                    "stage_args": {
-                        0: ["engine_args.load_format"],
-                        1: ["engine_args.load_format"],
-                        2: ["engine_args.load_format"],
-                    }
-                },
+                deletes={"stage_args": {sid: ["engine_args.load_format"] for sid in _stage_ids}},
             )
 
         server_args = params.server_args or []
diff --git a/tests/e2e/online_serving/test_bagel_online.py b/tests/e2e/online_serving/test_bagel_online.py
index 06fcad56466..a5e26db1ea1 100644
--- a/tests/e2e/online_serving/test_bagel_online.py
+++ b/tests/e2e/online_serving/test_bagel_online.py
@@ -22,23 +22,18 @@
 
 import base64
 import os
-import signal
-import socket
-import subprocess
-import sys
-import time
 from io import BytesIO
 from pathlib import Path
-from typing import Any
 
 import pytest
-import requests
-from PIL import Image
 from vllm.assets.image import ImageAsset
 
-from tests.conftest import modify_stage_config
+from tests.conftest import OmniServerParams
 from tests.utils import hardware_test
 
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
+
 MODEL = "ByteDance-Seed/BAGEL-7B-MoT"
 STAGE_CONFIGS_PATH = str(
     Path(__file__).parent.parent / "offline_inference" / "stage_configs" / "bagel_sharedmemory_ci.yaml"
@@ -47,225 +42,74 @@
 TEXT2IMG_PROMPT = "A cute cat"
 IMG2IMG_PROMPT = "Change the grass color to red"
 
-
-def _resolve_stage_config(config_path: str, run_level: str) -> str:
-    """Resolve stage config based on run level.
-
-    For advanced_model (real weights), strip load_format: dummy so the model
-    falls back to loading real weights from HuggingFace.
-    """
-    if run_level == "advanced_model":
-        return modify_stage_config(
-            config_path,
-            deletes={
-                "stage_args": {
-                    0: ["engine_args.load_format"],
-                    1: ["engine_args.load_format"],
-                }
-            },
-        )
-    return config_path
-
-
-class BagelOmniServer:
-    """Context manager to start/stop a vLLM-Omni server for Bagel model tests."""
-
-    def __init__(
-        self,
-        model: str = MODEL,
-        stage_configs_path: str = STAGE_CONFIGS_PATH,
-        env_dict: dict[str, str] | None = None,
-    ) -> None:
-        self.model = model
-        self.stage_configs_path = stage_configs_path
-        self.env_dict = env_dict
-        self.proc: subprocess.Popen | None = None
-        self.host = "127.0.0.1"
-        self.port = _find_free_port()
-
-    @property
-    def base_url(self) -> str:
-        return f"http://{self.host}:{self.port}"
-
-    def _start_server(self) -> None:
-        env = os.environ.copy()
-        if self.env_dict is not None:
-            env.update(self.env_dict)
-
-        cmd = [
-            sys.executable,
-            "-m",
-            "vllm_omni.entrypoints.cli.main",
-            "serve",
-            self.model,
-            "--omni",
-            "--host",
-            self.host,
-            "--port",
-            str(self.port),
-            "--stage-configs-path",
-            self.stage_configs_path,
-            "--stage-init-timeout",
-            "300",
-        ]
-
-        self.proc = subprocess.Popen(
-            cmd,
-            env=env,
-            start_new_session=True,
-        )
-
-        try:
-            if not _wait_for_port(self.host, self.port, timeout=600, proc=self.proc):
-                self.terminate()
-                raise RuntimeError(f"Server failed to start within 600 seconds on {self.host}:{self.port}")
-        except Exception:
-            self.terminate()
-            raise
-
-    def __enter__(self):
-        self._start_server()
-        return self
-
-    def terminate(self) -> None:
-        if self.proc:
-            try:
-                os.killpg(os.getpgid(self.proc.pid), signal.SIGTERM)
-            except ProcessLookupError:
-                pass
-            try:
-                self.proc.wait(timeout=30)
-            except subprocess.TimeoutExpired:
-                try:
-                    os.killpg(os.getpgid(self.proc.pid), signal.SIGKILL)
-                except ProcessLookupError:
-                    pass
-                self.proc.wait()
-            self.proc = None
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.terminate()
-
-
-def _find_free_port() -> int:
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.bind(("127.0.0.1", 0))
-        s.listen(1)
-        return s.getsockname()[1]
-
-
-def _wait_for_port(host: str, port: int, timeout: int = 600, proc: subprocess.Popen | None = None) -> bool:
-    start = time.time()
-    while time.time() - start < timeout:
-        if proc is not None and proc.poll() is not None:
-            # Server process exited early
-            return False
-        try:
-            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
-                sock.settimeout(1)
-                if sock.connect_ex((host, port)) == 0:
-                    return True
-        except Exception:
-            pass
-        time.sleep(2)
-    return False
-
-
-def _send_chat_request(
-    server_url: str,
-    prompt: str,
-    *,
-    modality: str = "text2img",
-    image: Image.Image | None = None,
-    timeout: int = 300,
-) -> dict[str, Any]:
-    """Send a chat completion request matching the openai_chat_client.py format."""
-    content: list[dict[str, Any]] = [{"type": "text", "text": f"<|im_start|>{prompt}<|im_end|>"}]
-
-    if image is not None:
-        buffer = BytesIO()
-        image.save(buffer, format="JPEG")
-        b64_data = base64.b64encode(buffer.getvalue()).decode("utf-8")
-        content.append(
-            {
-                "type": "image_url",
-                "image_url": {"url": f"data:image/jpeg;base64,{b64_data}"},
-            }
-        )
-
-    payload: dict[str, Any] = {
-        "messages": [{"role": "user", "content": content}],
-    }
-
-    if modality in ("text2img", "img2img"):
-        payload["modalities"] = ["image"]
-
-    resp = requests.post(
-        f"{server_url}/v1/chat/completions",
-        headers={"Content-Type": "application/json"},
-        json=payload,
-        timeout=timeout,
-    )
-    resp.raise_for_status()
-    return resp.json()
-
-
-def _extract_image_from_response(data: dict[str, Any]) -> Image.Image | None:
-    """Extract the generated PIL Image from a chat completion response."""
-    for choice in data.get("choices", []):
-        content = choice.get("message", {}).get("content")
-        if isinstance(content, list) and content:
-            first_item = content[0]
-            if isinstance(first_item, dict) and "image_url" in first_item:
-                url = first_item["image_url"].get("url", "")
-                if url.startswith("data:image"):
-                    _, b64 = url.split(",", 1)
-                    return Image.open(BytesIO(base64.b64decode(b64)))
-    return None
+# Parameters for the indirect omni_server fixture: BAGEL with the shared-memory CI stage config.
+test_params = [
+    OmniServerParams(
+        model=MODEL,
+        stage_config_path=STAGE_CONFIGS_PATH,
+        server_args=["--stage-init-timeout", "300"],
+    ),
+]
+
+
+def _build_text2img_messages(prompt: str) -> list[dict]:
+    """Build OpenAI-format messages for text2img generation."""
+    return [
+        {
+            "role": "user",
+            "content": [{"type": "text", "text": f"<|im_start|>{prompt}<|im_end|>"}],
+        }
+    ]
+
+
+def _build_img2img_messages(prompt: str, image_b64: str) -> list[dict]:
+    """Build OpenAI-format messages for img2img generation."""
+    return [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": f"<|im_start|>{prompt}<|im_end|>"},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
+                },
+            ],
+        }
+    ]
 
 
 @pytest.mark.core_model
 @pytest.mark.advanced_model
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"})
-def test_bagel_text2img_online(run_level):
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_bagel_text2img_online(omni_server, openai_client) -> None:
     """Test Bagel text2img via OpenAI-compatible chat completions API."""
-    stage_config = _resolve_stage_config(STAGE_CONFIGS_PATH, run_level)
-    with BagelOmniServer(stage_configs_path=stage_config) as server:
-        response_data = _send_chat_request(
-            server.base_url,
-            TEXT2IMG_PROMPT,
-            modality="text2img",
-        )
-
-        image = _extract_image_from_response(response_data)
-        assert image is not None, f"No image in response: {response_data}"
-        image.load()
+    request_config = {
+        "model": omni_server.model,
+        "messages": _build_text2img_messages(TEXT2IMG_PROMPT),
+        "modalities": ["image"],
+    }
 
-        w, h = image.size
-        assert w > 0 and h > 0, f"Invalid image size: {image.size}"
+    openai_client.send_diffusion_request(request_config)
 
 
 @pytest.mark.core_model
 @pytest.mark.advanced_model
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"})
-def test_bagel_img2img_online(run_level):
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_bagel_img2img_online(omni_server, openai_client) -> None:
     """Test Bagel img2img via OpenAI-compatible chat completions API."""
     input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB")
+    buffer = BytesIO()
+    input_image.save(buffer, format="JPEG")
+    image_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+    request_config = {
+        "model": omni_server.model,
+        "messages": _build_img2img_messages(IMG2IMG_PROMPT, image_b64),
+        "modalities": ["image"],
+    }
 
-    stage_config = _resolve_stage_config(STAGE_CONFIGS_PATH, run_level)
-    with BagelOmniServer(stage_configs_path=stage_config) as server:
-        response_data = _send_chat_request(
-            server.base_url,
-            IMG2IMG_PROMPT,
-            modality="img2img",
-            image=input_image,
-        )
-
-        image = _extract_image_from_response(response_data)
-        assert image is not None, f"No image in response: {response_data}"
-        image.load()
-
-        w, h = image.size
-        assert w > 0 and h > 0, f"Invalid image size: {image.size}"
+    openai_client.send_diffusion_request(request_config)