vllm-project · princepride · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026
@@ -272,37 +272,45 @@ steps:
                   path: /mnt/hf-cache
                   type: DirectoryOrCreate
 
-  # - label: "Bagel Text2Img Model Test with H100"
-  #   timeout_in_minutes: 30
-  #   depends_on: upload-merge-pipeline
-  #   commands:
-  #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  #     - pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
-  #   agents:
-  #     queue: "mithril-h100-pool"
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           containers:
-  #             - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-  #               resources:
-  #                 limits:
-  #                   nvidia.com/gpu: 1
-  #               volumeMounts:
-  #                 - name: devshm
-  #                   mountPath: /dev/shm
-  #                 - name: hf-cache
-  #                   mountPath: /root/.cache/huggingface
-  #               env:
-  #                 - name: HF_HOME
-  #                   value: /root/.cache/huggingface
-  #           nodeSelector:
-  #             node.kubernetes.io/instance-type: gpu-h100-sxm
-  #           volumes:
-  #             - name: devshm
-  #               emptyDir:
-  #                 medium: Memory
-  #             - name: hf-cache
-  #               hostPath:
-  #                 path: /mnt/hf-cache
-  #                 type: DirectoryOrCreate
+  # Runs the BAGEL offline (text2img, img2img) and online-serving e2e tests at
+  # the "advanced_model" run level on a single-GPU H100 pod.
+  - label: "Bagel Model Test with H100 (Real Weights)"
+    timeout_in_minutes: 60
+    depends_on: upload-merge-pipeline
+    commands:
+      # The inner `timeout 55m` is shorter than the 60-minute step timeout above.
+      # `set -e` stops the script at the first failing pytest invocation, so
+      # later test files do not run after a failure.
+      # The text2img run is narrowed with `-k "shared_memory"`; the img2img and
+      # online runs have no `-k` filter.
+      - |
+        timeout 55m bash -c '
+          set -e
+          export VLLM_WORKER_MULTIPROC_METHOD=spawn
+          export VLLM_TEST_CLEAN_GPU_MEMORY=1
+          export VLLM_IMAGE_FETCH_TIMEOUT=60
+          pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory"
+          pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model"
+          pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model"
+        '
+    agents:
+      queue: "mithril-h100-pool"
+    plugins:
+      - kubernetes:
+          podSpec:
+            containers:
+              # Test image is tagged with the commit under test.
+              - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                resources:
+                  limits:
+                    nvidia.com/gpu: 1
+                volumeMounts:
+                  - name: devshm
+                    mountPath: /dev/shm
+                  - name: hf-cache
+                    mountPath: /root/.cache/huggingface
+                env:
+                  # HF_HOME points at the hf-cache mount path above.
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+            nodeSelector:
+              node.kubernetes.io/instance-type: gpu-h100-sxm
+            volumes:
+              # Memory-backed emptyDir mounted at /dev/shm.
+              - name: devshm
+                emptyDir:
+                  medium: Memory
+              # Host directory /mnt/hf-cache (created if missing) mounted as
+              # the Hugging Face cache.
+              - name: hf-cache
+                hostPath:
+                  path: /mnt/hf-cache
+                  type: DirectoryOrCreate
@@ -268,7 +268,7 @@ steps:
         timeout 30m bash -c '
           export VLLM_WORKER_MULTIPROC_METHOD=spawn
           export VLLM_TEST_CLEAN_GPU_MEMORY=1
-          pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
+          pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "core_model" --run-level "core_model"
         '
     agents:
       queue: "mithril-h100-pool"
@@ -306,7 +306,7 @@ steps:
         timeout 30m bash -c '
           export VLLM_WORKER_MULTIPROC_METHOD=spawn
           export VLLM_TEST_CLEAN_GPU_MEMORY=1
-          pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py
+          pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "core_model" --run-level "core_model"
         '
     agents:
       queue: "mithril-h100-pool"
@@ -345,7 +345,7 @@ steps:
           export VLLM_WORKER_MULTIPROC_METHOD=spawn
           export VLLM_TEST_CLEAN_GPU_MEMORY=1
           export VLLM_IMAGE_FETCH_TIMEOUT=60
-          pytest -s -v tests/e2e/online_serving/test_bagel_online.py
+          pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "core_model" --run-level "core_model"
         '
     agents:
       queue: "mithril-h100-pool"

@@ -1236,15 +1236,14 @@ def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: st
         port = params.port
         stage_config_path = params.stage_config_path
         if run_level == "advanced_model" and stage_config_path is not None:
+            # Dynamically detect stages from config to avoid KeyError
+            # for models with fewer stages (e.g., BAGEL has 2, Qwen3-Omni has 3)
+            with open(stage_config_path, encoding="utf-8") as f:
+                _cfg = yaml.safe_load(f) or {}
+            _stage_ids = [s["stage_id"] for s in _cfg.get("stage_args", []) if "stage_id" in s]
             stage_config_path = modify_stage_config(
                 stage_config_path,
-                deletes={
-                    "stage_args": {
-                        0: ["engine_args.load_format"],
-                        1: ["engine_args.load_format"],
-                        2: ["engine_args.load_format"],
-                    }
-                },
+                deletes={"stage_args": {sid: ["engine_args.load_format"] for sid in _stage_ids}},
             )
 
         server_args = params.server_args or []

@@ -21,6 +21,7 @@ stage_args:
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       tensor_parallel_size: 1
+      load_format: dummy
       omni_kv_config:
         need_send_cache: true
         kv_transfer_criteria:
@@ -54,6 +55,7 @@ stage_args:
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       tensor_parallel_size: 1
+      load_format: dummy
       omni_kv_config:
         need_recv_cache: true
     engine_input_source: [0]

@@ -21,6 +21,7 @@ stage_args:
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       tensor_parallel_size: 1
+      load_format: dummy
       omni_kv_config:
         need_send_cache: true
         kv_transfer_criteria:
@@ -53,6 +54,7 @@ stage_args:
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       tensor_parallel_size: 1
+      load_format: dummy
       omni_kv_config:
         need_recv_cache: true
     engine_input_source: [0]

@@ -22,6 +22,7 @@
 from PIL import Image
 from vllm.assets.image import ImageAsset
 
+from tests.conftest import modify_stage_config
 from tests.utils import hardware_test
 from vllm_omni.entrypoints.omni import Omni
 
@@ -168,17 +169,39 @@ def _generate_bagel_img2img(
     return generated_image
 
 
+def _resolve_stage_config(config_path: str, run_level: str) -> str:
+    """Return the stage config path to use for the given run level.
+
+    For ``advanced_model`` (real weights), ``engine_args.load_format`` is
+    deleted from stage_args 0 and 1 through ``modify_stage_config``, so a
+    ``load_format: dummy`` entry in the config no longer applies and the model
+    falls back to loading real weights from HuggingFace.
+
+    Args:
+        config_path: Path to the stage config YAML file.
+        run_level: Run level name; only ``"advanced_model"`` changes the result.
+
+    Returns:
+        The value returned by ``modify_stage_config`` when ``run_level`` is
+        ``"advanced_model"``; otherwise ``config_path`` unchanged.
+    """
+    if run_level == "advanced_model":
+        # NOTE(review): stage ids 0 and 1 are hard-coded here; a config that
+        # uses other stage ids would not have load_format removed from them.
+        return modify_stage_config(
+            config_path,
+            deletes={
+                "stage_args": {
+                    0: ["engine_args.load_format"],
+                    1: ["engine_args.load_format"],
+                }
+            },
+        )
+    return config_path
+
+
 @pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"})
-def test_bagel_img2img_shared_memory_connector():
+def test_bagel_img2img_shared_memory_connector(run_level):
     """Test Bagel img2img with shared memory connector."""
     input_image = _load_input_image()
     config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml")
+    # Use the config path resolved for this run level (for advanced_model,
+    # engine_args.load_format is removed from stages 0 and 1).
+    config_path = _resolve_stage_config(config_path, run_level)
     omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300)
 
     try:
         generated_image = _generate_bagel_img2img(omni, input_image)
-        _validate_pixels(generated_image)
+        # Pixel validation runs only at the advanced_model run level; other
+        # run levels only check that generation completes.
+        # NOTE(review): presumably because those runs use dummy weights whose
+        # output pixels are not meaningful - confirm.
+        if run_level == "advanced_model":
+            _validate_pixels(generated_image)
     finally:
         omni.close()
@@ -28,6 +28,7 @@
 import pytest
 from PIL import Image
 
+from tests.conftest import modify_stage_config
 from tests.utils import hardware_test
 from vllm_omni.entrypoints.omni import Omni
 
@@ -158,17 +159,39 @@ def _generate_bagel_image(omni: Omni, prompt: str = DEFAULT_PROMPT) -> Image.Ima
     return generated_image
 
 
+def _resolve_stage_config(config_path: str, run_level: str) -> str:
+    """Return the stage config path to use for the given run level.
+
+    For ``advanced_model`` (real weights), ``engine_args.load_format`` is
+    deleted from stage_args 0 and 1 through ``modify_stage_config``, so a
+    ``load_format: dummy`` entry in the config no longer applies and the model
+    falls back to loading real weights from HuggingFace.
+
+    Args:
+        config_path: Path to the stage config YAML file.
+        run_level: Run level name; only ``"advanced_model"`` changes the result.
+
+    Returns:
+        The value returned by ``modify_stage_config`` when ``run_level`` is
+        ``"advanced_model"``; otherwise ``config_path`` unchanged.
+    """
+    if run_level == "advanced_model":
+        # NOTE(review): stage ids 0 and 1 are hard-coded here; a config that
+        # uses other stage ids would not have load_format removed from them.
+        return modify_stage_config(
+            config_path,
+            deletes={
+                "stage_args": {
+                    0: ["engine_args.load_format"],
+                    1: ["engine_args.load_format"],
+                }
+            },
+        )
+    return config_path
+
+
 @pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"})
-def test_bagel_text2img_shared_memory_connector():
+def test_bagel_text2img_shared_memory_connector(run_level):
     """Test Bagel text2img with shared memory connector."""
     config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml")
+    # Use the config path resolved for this run level (for advanced_model,
+    # engine_args.load_format is removed from stages 0 and 1).
+    config_path = _resolve_stage_config(config_path, run_level)
     omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300)
 
     try:
         generated_image = _generate_bagel_image(omni)
-        _validate_pixels(generated_image)
+        # Pixel validation runs only at the advanced_model run level; other
+        # run levels only check that generation completes.
+        # NOTE(review): presumably because those runs use dummy weights whose
+        # output pixels are not meaningful - confirm.
+        if run_level == "advanced_model":
+            _validate_pixels(generated_image)
     finally:
         omni.close()
 
@@ -251,9 +274,10 @@ def _load_mooncake_config(host: str, rpc_port: int, http_port: int) -> str:
 
 
 @pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"})
-def test_bagel_text2img_mooncake_connector():
+def test_bagel_text2img_mooncake_connector(run_level):
     """Test Bagel text2img with Mooncake connector for inter-stage communication."""
     MOONCAKE_HOST = "127.0.0.1"
     MOONCAKE_RPC_PORT = _find_free_port()
@@ -291,10 +315,12 @@ def test_bagel_text2img_mooncake_connector():
             http_port=MOONCAKE_HTTP_PORT,
         )
 
+        temp_config_file = _resolve_stage_config(temp_config_file, run_level)
         omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=temp_config_file, stage_init_timeout=300)
 
         generated_image = _generate_bagel_image(omni)
-        _validate_pixels(generated_image)
+        if run_level == "advanced_model":
+            _validate_pixels(generated_image)
 
     finally:
         if omni: