vllm-project · hsliuustc0106 · Apr 22, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 20, 2026
@@ -13,7 +13,7 @@ steps:
       - label: ":full_moon: Omni · Function Test"
         timeout_in_minutes: 90
         commands:
-          - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and H100 and omni" --run-level "advanced_model"
+          - pytest -s -v tests/e2e/ -m "full_model and H100 and omni" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -52,7 +52,7 @@ steps:
         timeout_in_minutes: 90
         commands:
           - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-          - pytest -s -v tests/examples/ -m "advanced_model and omni and L4" --run-level "advanced_model"
+          - pytest -s -v tests/examples/ -m "full_model and omni and L4" --run-level "full_model"
         agents:
           queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:
@@ -70,7 +70,7 @@ steps:
       - label: ":full_moon: Omni · Doc Test with H100"
         timeout_in_minutes: 90
         commands:
-          - pytest -s -v tests/examples/ -m "advanced_model and omni and H100" --run-level "advanced_model"
+          - pytest -s -v tests/examples/ -m "full_model and omni and H100" --run-level "full_model"
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -160,7 +160,7 @@ steps:
         timeout_in_minutes: 90
         commands:
           - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-          - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model"
+          - pytest -s -v tests/e2e/ -m "full_model and L4 and omni" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "gpu_1_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:
@@ -233,7 +233,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100"
         timeout_in_minutes: 120
         commands:
-          - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "advanced_model and diffusion and H100" --run-level "advanced_model"
+          - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -271,7 +271,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4"
         timeout_in_minutes: 60
         commands:
-          - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "advanced_model and diffusion and L4" --run-level "advanced_model"
+          - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:
@@ -290,7 +290,7 @@ steps:
         timeout_in_minutes: 60
         commands:
           - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
-          - pytest -s -v tests/examples/*/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model"
+          - pytest -s -v tests/examples/*/test_text_to_image.py -m "full_model and example and H100" --run-level "full_model"
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -328,7 +328,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · GEBench Accuracy Test"
         timeout_in_minutes: 60
         commands:
-          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1
+          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level full_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1
           - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json"
         agents:
           queue: "mithril-h100-pool"
@@ -367,7 +367,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · GEdit-Bench Accuracy Test"
         timeout_in_minutes: 60
         commands:
-          - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1
+          - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level full_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1
           - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv"
           - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json"
         agents:
@@ -409,7 +409,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · Accuracy Test"
         timeout_in_minutes: 180
         commands:
-          - pytest -s -v tests/e2e/accuracy/test_qwen_image*.py --run-level advanced_model
+          - pytest -s -v tests/e2e/accuracy/test_qwen_image*.py --run-level full_model
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -514,7 +514,7 @@ steps:
       - label: ":full_moon: Diffusion X2V · Function Test"
         timeout_in_minutes: 90
         commands:
-          - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py -m "advanced_model" --run-level "advanced_model"
+          - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py -m "full_model" --run-level "full_model"
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -552,7 +552,7 @@ steps:
       - label: ":full_moon: Diffusion X2V · Accuracy Test"
         timeout_in_minutes: 180
         commands:
-          - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py -m advanced_model --run-level advanced_model
+          - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py -m full_model --run-level full_model
         agents:
           queue: "mithril-h100-pool"
         plugins:

@@ -120,23 +120,6 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
-  - label: "Audio Generation Model Test"
-    depends_on: upload-ready-pipeline
-    commands:
-      - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model
-    agents:
-      queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
-    plugins:
-      - docker#v5.2.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          always-pull: true
-          propagate-environment: true
-          environment:
-            - "HF_HOME=/fsx/hf_cache"
-            - "HF_TOKEN"
-          volumes:
-            - "/fsx/hf_cache:/fsx/hf_cache"
-
   - label: "Diffusion Cache Backend Test"
     depends_on: upload-ready-pipeline
     commands:

@@ -23,5 +23,5 @@ Test guidance:
 - Local static/self-checks live in `tests/benchmarks/test_accuracy_bench_utils.py`.
 - End-to-end generation/evaluation should be validated in a remote GPU
   environment. In the current repo marker system there is `L4` but no `L5`
-  marker, so benchmark smoke tests should be wired as `advanced_model +
-  benchmark + L4` when GPU capacity is available.
+  marker, so benchmark smoke tests should be wired as `full_model +
+  benchmark + L4` for nightly when GPU capacity is available.
@@ -99,5 +99,5 @@ Notes:
 - This flow requires the optional Hugging Face `datasets` package.
 - `generate` writes `generation_manifest.json` with local output coverage.
 - The current repo marker set exposes `L4` but not `L5`, so if you promote an
-  end-to-end smoke test into CI, use the existing `advanced_model`, `benchmark`,
-  and `L4` markers or introduce a new repo-wide marker explicitly first.
+  end-to-end smoke test into CI, use the `full_model`, `benchmark`,
+  and `L4` markers for nightly (or `advanced_model` in place of `full_model` for merge), or introduce a new repo-wide marker explicitly first.
@@ -418,13 +418,13 @@ L3 level testing executes after code is merged into the main branch. Its core pu
 
     **Explanation**:
 
-    @pytest.mark.advanced_model: Marks the test as L3 or L4 level, indicating that this test case performs deep validation, using real models for performance, integration, and accuracy testing. This forms a "basic-advanced" correspondence with the core_model mark at the L2 level.
+    @pytest.mark.advanced_model: Marks the test as L3 merge level, indicating deep validation with real models. @pytest.mark.full_model: Marks L4 nightly-only suites (e.g. `test_*_expansion.py`, doc examples).
 
     @pytest.mark.core_model: Marks the test as L1 or L2 level, indicating that this test case validates the basic functionality of the core model. It uses mock weights and only checks if the relevant interface functions correctly.
 
     @pytest.mark.parametrize: A parameterization decorator that allows abstracting test data into parameters, enabling reuse of the same test logic across different data configurations. indirect=True indicates that parameters will be passed to the fixture for processing.
 
-    **Notes**: If you believe the test case only needs to execute basic run logic at the PR-level CI, you can mark it only with @pytest.mark.core_model. If you believe it only needs to execute deep validation run logic at the merge or nightly level, you can mark it only with @pytest.mark.advanced_model. If you believe the test case needs to accommodate both basic run and deep validation test logic, you should mark it with both @pytest.mark.core_model and @pytest.mark.advanced_model.
+    **Notes**: If you believe the test case only needs to execute basic run logic at the PR-level CI, you can mark it only with @pytest.mark.core_model. If you believe it only needs to execute deep validation at merge (L3), use @pytest.mark.advanced_model. For L4 nightly-only expansion and doc-example tests, use @pytest.mark.full_model with `--run-level full_model`. If the test case needs both basic run and deep validation, mark it with @pytest.mark.core_model and the appropriate L3/L4 marker (`advanced_model` and/or `full_model`).
 
     **2.4.2 Test Function Definition and Documentation**
 
@@ -516,9 +516,11 @@ L3 level testing executes after code is merged into the main branch. Its core pu
 
     **Single Request**: The comment clearly states this is a single-request completion test. For concurrent testing, it can be extended to multiple requests using request_num = n.
 
-    **Implicit Validation**: The `send_omni_request` and `send_diffusion_request` methods internally includes validation logic dynamically selected based on the --run-level parameter: core_model performs basic validation, while advanced_model performs deep validation.
+    **Implicit Validation**: The `send_omni_request` and `send_diffusion_request` methods internally include validation logic dynamically selected based on the --run-level parameter: core_model performs basic validation, while advanced_model and full_model perform deep validation.
 
--   ***Run Command***: `pytest -s -v /tests/e2e/online_serving/test_{model_name}.py -m advanced_model --run-level=advanced_model`
+-   ***Run Command (L3 merge)***: `pytest -s -v tests/e2e/online_serving/test_{model_name}.py -m advanced_model --run-level=advanced_model`
+
+-   ***Run Command (L4 nightly expansion)***: `pytest -s -v tests/e2e/online_serving/test_{model_name}_expansion.py -m full_model --run-level=full_model`
 
 ## Chapter 3: L4 Level Testing - Full Functionality, Performance, and Documentation Testing
 

@@ -37,7 +37,7 @@ Currently all the features are available in online serving mode. Hence, only nee
 **Code Style**
 
 - Validation: test that the multimodal output files of your model have the correct shapes. `OpenAIClientHandler.send_diffusion_request` should have taken care of this.
-- Test marks: always add `advanced_model` and `diffusion`. Add GPU-related marks if needed. Ref: [Markers for Tests](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/ci/tests_markers/).
+- Test marks: always add `full_model` and `diffusion` for L4 nightly `test_*_expansion.py` cases. Add GPU-related marks if needed. Ref: [Markers for Tests](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/ci/tests_markers/).
 - To maximize code reuse, you may refer to
     - `tests/conftest.py` for `omni_server` (running server in subprocess) and `openai_client` fixtures (sending requests and validating output), `generate_synthetic_image` and `assert_XXX_valid` helper.
     - `tests/helpers/mark.py` for `@hardware_test(...)` and `hardware_marks`.

@@ -42,31 +42,46 @@ Our test scripts use the pytest framework. First, please use `git clone https://
     ```
     The latest test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-ready.yml).
 
-=== "L3 level & L4 level"
+=== "L3 level"
 
     ```bash
     pytest -s -v -m "advanced_model" --run-level=advanced_model
     ```
-    If you only want to run L3 test case, you can use:
+    If you only want to run a specific test case, you can use:
+    ```bash
+    pytest -s -v test_xxxx.py --run-level=advanced_model
+    ```
+    If you only want to run specific test cases on a particular platform, you can use:
     ```bash
-    pytest -s -v e2e/ --ignore-glob='*expansion.py' -m "advanced_model" --run-level=advanced_model
+    pytest -s -v -m "advanced_model and distributed_cuda and L4"  --run-level=advanced_model
+    ```
+    The latest L3 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-merge.yml).
+
+
+=== "L4 level"
+
+    ```bash
+    cd tests
+    pytest -s -v -m "full_model" --run-level=full_model
     ```
     If you only want to run a specific test case, you can use:
     ```bash
-    pytest -s -v test_xxxx.py --run-level=advanced_model
+    pytest -s -v test_xxxx.py --run-level=full_model
     ```
     If you only want to run specific test cases on a particular platform, you can use:
     ```bash
-    pytest -s -v -m "core_model and distributed_cuda and L4"  --run-level=core_model
+    pytest -s -v -m "full_model and distributed_cuda and L4"  --run-level=full_model
     ```
     Note: To run performance tests (defaults to ``test_qwen_omni.json``; use ``--test-config-file tests/dfx/perf/tests/test_tts.json`` for TTS):
     ```bash
     pytest -s -v tests/dfx/perf/scripts/run_benchmark.py
     ```
+    The latest L4 (nightly) test commands use the `full_model` marker and `--run-level full_model` (see [test-nightly.yml](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-nightly.yml) and [test-nightly-diffusion.yml](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-nightly-diffusion.yml)). Example:
 
-    The latest L3 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-merge.yml).
-
-    The latest L4 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-nightly.yml).
+    ```bash
+    cd tests
+    pytest -s -v -m "full_model and omni and H100" --run-level=full_model
+    ```
 
 You can find more information about markers in the documentation: [marker doc](./tests_markers.md)
 

@@ -8,7 +8,8 @@ Defined in `pyproject.toml`:
 | Marker             | Description                                               |
 | ------------------ | --------------------------------------------------------- |
 | `core_model`       | L1&L2 tests (run in each PR)                              |
-| `advanced_model`   | L3&L4 level tests (run in each merge or nightly)          |
+| `advanced_model`   | L3 tests (run on each merge to main)                      |
+| `full_model`       | L4 tests (run nightly)                                    |
 | `diffusion`        | Diffusion model tests                                     |
 | `omni`             | Omni model tests                                          |
 | `cache`            | Cache backend tests                                       |

@@ -179,7 +179,8 @@ addopts = [
 markers = [
     # ci/cd required
     "core_model: L1&L2 tests (run in each PR)",
-    "advanced_model: L3&L4 level tests (run in each merge or nightly)",
+    "advanced_model: L3 level tests (run on each merge)",
+    "full_model: L4 level tests (run nightly)",
     # function module markers
     "diffusion: Diffusion model tests",
     "omni: Omni model tests",

@@ -6,6 +6,9 @@
 import pytest
 from PIL import Image
 
+pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]
+
+
 REPO_ROOT = Path(__file__).resolve().parents[2]
 if str(REPO_ROOT) not in sys.path:
     sys.path.insert(0, str(REPO_ROOT))
@@ -37,8 +40,6 @@
     summarize_gebench_results,
 )
 
-pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]
-
 
 def test_summarize_gebench_generated_records_groups_by_type():
     records = [

@@ -17,6 +17,9 @@
 )
 from tests.helpers.runtime import OmniServer
 
+pytestmark = [pytest.mark.full_model, pytest.mark.omni]
+
+
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
 
@@ -293,6 +296,7 @@ def assert_result(
                 print(f"ERROR: Test results exceeded baseline: {metric_name}: {current_value} < {baseline_value}")
 
 
+@pytest.mark.benchmark
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
 @pytest.mark.parametrize("benchmark_params", benchmark_indices, indirect=True)
 def test_performance_benchmark(omni_server, benchmark_params):

@@ -32,6 +32,8 @@
 import psutil
 import pytest
 
+pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]
+
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
 os.environ.setdefault("DIFFUSION_ATTENTION_BACKEND", "FLASH_ATTN")
@@ -662,8 +664,7 @@ def assert_result(result: dict[str, Any], params: dict[str, Any]) -> None:
 # ---------------------------------------------------------------------------
 # Test entry point
 # ---------------------------------------------------------------------------
-
-
+@pytest.mark.benchmark
 @pytest.mark.parametrize(
     "diffusion_server",
     server_params,

@@ -6,6 +6,8 @@
 
 from vllm_omni.diffusion.data import DiffusionParallelConfig
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 @pytest.fixture(scope="function", autouse=True)
 def setup_sp_groups(mocker):
@@ -24,9 +26,6 @@ def setup_sp_groups(mocker):
     yield
 
 
-pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
-
-
 def test_glm_image_sp_plan_defined():
     """Test that _sp_plan is properly defined on GlmImageTransformer2DModel."""
     from vllm_omni.diffusion.models.glm_image.glm_image_transformer import (

@@ -17,8 +17,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
-
 
 def _bind_router() -> tuple[zmq.Context, zmq.Socket, str]:
     ctx = zmq.Context.instance()