From c976fbbdb1dfaf10267f1ce2edcbdc09497f9e68 Mon Sep 17 00:00:00 2001
From: wangyu <410167048@qq.com>
Date: Thu, 9 Apr 2026 16:52:35 +0800
Subject: [PATCH 1/9] [CI] Update test markers and configurations to use
 'full_model' for L4 nightly tests

- Changed test markers from 'advanced_model' to 'full_model' across various test files to align with the new testing structure.
- Updated the 'pyproject.toml' to reflect the new marker definitions.
- Adjusted Buildkite configurations to run full model tests in nightly pipelines.
- Enhanced documentation to clarify the use of 'full_model' for nightly tests and 'advanced_model' for merge tests.

Signed-off-by: wangyu <410167048@qq.com>
---
 .buildkite/test-nightly-diffusion.yml         | 16 ++++----
 .buildkite/test-nightly.yml                   |  8 ++--
 benchmarks/accuracy/README.md                 |  4 +-
 benchmarks/accuracy/image_to_image/README.md  |  4 +-
 docs/contributing/ci/CI_5levels.md            | 10 +++--
 .../l4_functionality_tests.inc.md             |  2 +-
 docs/contributing/ci/test_guide.md            | 31 +++++++++++----
 docs/contributing/ci/tests_markers.md         |  3 +-
 pyproject.toml                                |  3 +-
 tests/conftest.py                             | 23 ++++++-----
 tests/dfx/perf/scripts/run_benchmark.py       |  3 ++
 .../perf/scripts/run_diffusion_benchmark.py   |  5 ++-
 tests/e2e/accuracy/test_gebench_h100_smoke.py |  2 +-
 .../accuracy/test_gedit_bench_h100_smoke.py   |  2 +-
 .../test_wan22_i2v_video_similarity.py        |  6 +--
 .../online_serving/test_bagel_expansion.py    |  2 +-
 .../online_serving/test_flux2_expansion.py    |  2 +-
 .../test_flux_2_dev_expansion.py              |  2 +-
 .../test_flux_kontext_expansion.py            | 10 ++---
 .../test_hunyuan_video_15_expansion.py        |  2 +-
 .../test_longcat_image_edit_expansion.py      |  2 +-
 .../test_longcat_image_expansion.py           |  2 +-
 .../test_qwen3_omni_expansion.py              | 38 +++++++++----------
 .../test_qwen3_tts_base_expansion.py          |  4 +-
 .../test_qwen3_tts_customvoice_expansion.py   |  8 ++--
 .../test_qwen_image_edit_expansion.py         |  4 +-
 .../test_qwen_image_expansion.py              |  4 +-
 .../test_qwen_image_layered_expansion.py      |  6 +--
 .../e2e/online_serving/test_sd3_expansion.py  |  2 +-
 .../online_serving/test_wan22_expansion.py    |  2 +-
 .../test_wan_2_1_vace_expansion.py            |  2 +-
 .../online_serving/test_zimage_expansion.py   |  2 +-
 .../offline_inference/test_text_to_image.py   |  2 +-
 .../online_serving/test_qwen2_5_omni.py       | 16 ++++----
 .../online_serving/test_qwen3_omni.py         | 16 ++++----
 .../online_serving/test_text_to_image.py      |  2 +-
 36 files changed, 140 insertions(+), 112 deletions(-)

diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml
index 73bf4551136..68ae7d23fca 100644
--- a/.buildkite/test-nightly-diffusion.yml
+++ b/.buildkite/test-nightly-diffusion.yml
@@ -11,7 +11,7 @@ steps:
         if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
         commands:
           - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not test_qwen_image" -m "advanced_model and diffusion and H100" --run-level "advanced_model"
+          - pytest -s -v tests/e2e/ -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not test_qwen_image" -m "full_model and diffusion and H100" --run-level "full_model"
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -51,7 +51,7 @@ steps:
         if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
         commands:
           - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model"
+          - pytest -s -v tests/e2e/ -m "full_model and diffusion and L4" --run-level "full_model"
         agents:
           queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:
@@ -72,7 +72,7 @@ steps:
         commands:
           - export VLLM_WORKER_MULTIPROC_METHOD=spawn
           - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
-          - pytest -s -v tests/examples/online_serving/test_text_to_image.py tests/examples/offline_inference/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model"
+          - pytest -s -v tests/examples/online_serving/test_text_to_image.py tests/examples/offline_inference/test_text_to_image.py -m "full_model and example and H100" --run-level "full_model"
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -115,7 +115,7 @@ steps:
         if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
         commands:
           - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py -m "advanced_model" --run-level "advanced_model"
+          - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py -m "full_model" --run-level "full_model"
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -156,7 +156,7 @@ steps:
         if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
         commands:
           - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model
+          - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level full_model
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -199,7 +199,7 @@ steps:
         if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
         commands:
           - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - pytest -s -v tests/e2e/online_serving/test_qwen_image*_expansion.py -m "advanced_model and diffusion and H100" --run-level "advanced_model"
+          - pytest -s -v tests/e2e/online_serving/test_qwen_image* -m "full_model and diffusion and H100" --run-level "full_model"
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -240,7 +240,7 @@ steps:
         if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
         commands:
           - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1
+          - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level full_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1
           - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json"
         agents:
           queue: "mithril-h100-pool"
@@ -282,7 +282,7 @@ steps:
         if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
         commands:
           - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1
+          - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level full_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1
           - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv"
           - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json"
         agents:
diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index 62f6e4dceb6..b6c0d45c280 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -9,7 +9,7 @@ steps:
         if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
         commands:
           - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and H100 and omni" --run-level "advanced_model"
+          - pytest -s -v tests/e2e/ -m "full_model and H100 and omni" --run-level "full_model"
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -51,7 +51,7 @@ steps:
         commands:
           - export VLLM_WORKER_MULTIPROC_METHOD=spawn
           - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-          - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model"
+          - pytest -s -v tests/e2e/ -m "full_model and L4 and omni" --run-level "full_model"
         agents:
           queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:
@@ -73,7 +73,7 @@ steps:
         commands:
           - export VLLM_WORKER_MULTIPROC_METHOD=spawn
           - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-          - pytest -s -v tests/examples/ -m "advanced_model and omni and L4" --run-level "advanced_model"
+          - pytest -s -v tests/examples/ -m "full_model and omni and L4" --run-level "full_model"
         agents:
           queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:
@@ -94,7 +94,7 @@ steps:
         if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
         commands:
           - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - pytest -s -v tests/examples/ -m "advanced_model and omni and H100" --run-level "advanced_model"
+          - pytest -s -v tests/examples/ -m "full_model and omni and H100" --run-level "full_model"
         agents:
           queue: "mithril-h100-pool"
         plugins:
diff --git a/benchmarks/accuracy/README.md b/benchmarks/accuracy/README.md
index 0d73215b692..dbe20916a77 100644
--- a/benchmarks/accuracy/README.md
+++ b/benchmarks/accuracy/README.md
@@ -23,5 +23,5 @@ Test guidance:
 - Local static/self-checks live in `tests/benchmarks/test_accuracy_bench_utils.py`.
 - End-to-end generation/evaluation should be validated in a remote GPU
   environment. In the current repo marker system there is `L4` but no `L5`
-  marker, so benchmark smoke tests should be wired as `advanced_model +
-  benchmark + L4` when GPU capacity is available.
+  marker, so benchmark smoke tests should be wired as `full_model +
+  benchmark + L4` for nightly runs when GPU capacity is available.
diff --git a/benchmarks/accuracy/image_to_image/README.md b/benchmarks/accuracy/image_to_image/README.md
index ee1d58f108b..86e7b0cf328 100644
--- a/benchmarks/accuracy/image_to_image/README.md
+++ b/benchmarks/accuracy/image_to_image/README.md
@@ -99,5 +99,5 @@ Notes:
 - This flow requires the optional Hugging Face `datasets` package.
 - `generate` writes `generation_manifest.json` with local output coverage.
 - The current repo marker set exposes `L4` but not `L5`, so if you promote an
-  end-to-end smoke test into CI, use the existing `advanced_model`, `benchmark`,
-  and `L4` markers or introduce a new repo-wide marker explicitly first.
+  end-to-end smoke test into CI, use the `full_model`, `benchmark`,
+  and `L4` markers for nightly runs (or `advanced_model` in place of `full_model` for merge runs), or introduce a new repo-wide marker explicitly first.
diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md
index 74ae1a38eb8..047ec561f2b 100644
--- a/docs/contributing/ci/CI_5levels.md
+++ b/docs/contributing/ci/CI_5levels.md
@@ -417,13 +417,13 @@ L3 level testing executes after code is merged into the main branch. Its core pu
 
     **Explanation**:
 
-    @pytest.mark.advanced_model: Marks the test as L3 or L4 level, indicating that this test case performs deep validation, using real models for performance, integration, and accuracy testing. This forms a "basic-advanced" correspondence with the core_model mark at the L2 level.
+    @pytest.mark.advanced_model: Marks the test as L3 merge level, indicating deep validation with real models. @pytest.mark.full_model: Marks L4 nightly-only suites (e.g. `test_*_expansion.py`, doc examples).
 
     @pytest.mark.core_model: Marks the test as L1 or L2 level, indicating that this test case validates the basic functionality of the core model. It uses mock weights and only checks if the relevant interface functions correctly.
 
     @pytest.mark.parametrize: A parameterization decorator that allows abstracting test data into parameters, enabling reuse of the same test logic across different data configurations. indirect=True indicates that parameters will be passed to the fixture for processing.
 
-    **Notes**: If you believe the test case only needs to execute basic run logic at the PR-level CI, you can mark it only with @pytest.mark.core_model. If you believe it only needs to execute deep validation run logic at the merge or nightly level, you can mark it only with @pytest.mark.advanced_model. If you believe the test case needs to accommodate both basic run and deep validation test logic, you should mark it with both @pytest.mark.core_model and @pytest.mark.advanced_model.
+    **Notes**: If you believe the test case only needs to execute basic run logic at the PR-level CI, you can mark it only with @pytest.mark.core_model. If you believe it only needs to execute deep validation at merge (L3), use @pytest.mark.advanced_model. For L4 nightly-only expansion and doc-example tests, use @pytest.mark.full_model with `--run-level full_model`. If the test case needs both basic run and deep validation, mark with @pytest.mark.core_model and the appropriate L3/L4 marker (`advanced_model` and/or `full_model`).
 
     **2.4.2 Test Function Definition and Documentation**
 
@@ -515,9 +515,11 @@ L3 level testing executes after code is merged into the main branch. Its core pu
 
     **Single Request**: The comment clearly states this is a single-request completion test. For concurrent testing, it can be extended to multiple requests using request_num = n.
 
-    **Implicit Validation**: The `send_omni_request` and `send_diffusion_request` methods internally includes validation logic dynamically selected based on the --run-level parameter: core_model performs basic validation, while advanced_model performs deep validation.
+    **Implicit Validation**: The `send_omni_request` and `send_diffusion_request` methods internally include validation logic dynamically selected based on the --run-level parameter: core_model performs basic validation, while advanced_model and full_model perform deep validation.
 
--   ***Run Command***: `pytest -s -v /tests/e2e/online_serving/test_{model_name}.py -m advanced_model --run-level=advanced_model`
+-   ***Run Command (L3 merge)***: `pytest -s -v /tests/e2e/online_serving/test_{model_name}.py -m advanced_model --run-level=advanced_model`
+
+-   ***Run Command (L4 nightly expansion)***: `pytest -s -v /tests/e2e/online_serving/test_{model_name}_expansion.py -m full_model --run-level=full_model`
 
 ## Chapter 3: L4 Level Testing - Full Functionality, Performance, and Documentation Testing
 
diff --git a/docs/contributing/ci/test_examples/l4_functionality_tests.inc.md b/docs/contributing/ci/test_examples/l4_functionality_tests.inc.md
index 69d6ad82871..d90275afc99 100644
--- a/docs/contributing/ci/test_examples/l4_functionality_tests.inc.md
+++ b/docs/contributing/ci/test_examples/l4_functionality_tests.inc.md
@@ -37,7 +37,7 @@ Currently all the features are available in online serving mode. Hence, only nee
 **Code Style**
 
 - Validation: test that the multimodal output files of your model have the correct shapes. `OpenAIClientHandler.send_diffusion_request` should have taken care of this.
-- Test marks: always add `advanced_model` and `diffusion`. Add GPU-related marks if needed. Ref: [Markers for Tests](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/ci/tests_markers/).
+- Test marks: always add `full_model` and `diffusion` for L4 nightly `test_*_expansion.py` cases. Add GPU-related marks if needed. Ref: [Markers for Tests](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/ci/tests_markers/).
 - To maximize code reuse, you may refer to
     - `tests/conftest.py` for `omni_server` (running server in subprocess) and `openai_client` fixtures (sending requests and validating output), `generate_synthetic_image` and `assert_XXX_valid` helper.
     - `tests/utils.py` for `@hardware_test(...)` and `hardware_marks`.
diff --git a/docs/contributing/ci/test_guide.md b/docs/contributing/ci/test_guide.md
index 425f24332c2..b429d732658 100644
--- a/docs/contributing/ci/test_guide.md
+++ b/docs/contributing/ci/test_guide.md
@@ -42,32 +42,47 @@ Our test scripts use the pytest framework. First, please use `git clone https://
     ```
     The latest test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-ready.yml).
 
-=== "L3 level & L4 level"
+=== "L3 level"
 
     ```bash
     cd tests
     pytest -s -v -m "advanced_model" --run-level=advanced_model
     ```
-    If you only want to run L3 test case, you can use:
+    If you only want to run a specific test case, you can use:
+    ```bash
+    pytest -s -v test_xxxx.py --run-level=advanced_model
+    ```
+    If you only want to run specific test cases on a particular platform, you can use:
     ```bash
-    pytest -s -v e2e/ --ignore-glob='*expansion.py' -m "advanced_model" --run-level=advanced_model
+    pytest -s -v -m "advanced_model and distributed_cuda and L4"  --run-level=advanced_model
+    ```
+    The latest L3 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-merge.yml).
+
+
+=== "L4 level"
+
+    ```bash
+    cd tests
+    pytest -s -v -m "full_model" --run-level=full_model
     ```
     If you only want to run a specific test case, you can use:
     ```bash
-    pytest -s -v test_xxxx.py --run-level=advanced_model
+    pytest -s -v test_xxxx.py --run-level=full_model
     ```
     If you only want to run specific test cases on a particular platform, you can use:
     ```bash
-    pytest -s -v -m "core_model and distributed_cuda and L4"  --run-level=core_model
+    pytest -s -v -m "full_model and distributed_cuda and L4"  --run-level=full_model
     ```
     Note: To run performance tests, use:
     ```bash
     pytest -s -v perf/scripts/run_benchmark.py
     ```
+    The latest L4 (nightly) test commands use the `full_model` marker and `--run-level full_model` (see [test-nightly.yml](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-nightly.yml) and [test-nightly-diffusion.yml](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-nightly-diffusion.yml)). Example:
 
-    The latest L3 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-merge.yml).
-
-    The latest L4 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-nightly.yml).
+    ```bash
+    cd tests
+    pytest -s -v -m "full_model and omni and H100" --run-level=full_model
+    ```
 
 You can find more information about markers in the documentation: [marker doc](./tests_markers.md)
 
diff --git a/docs/contributing/ci/tests_markers.md b/docs/contributing/ci/tests_markers.md
index 7c1ba1c73bd..b651946d79b 100644
--- a/docs/contributing/ci/tests_markers.md
+++ b/docs/contributing/ci/tests_markers.md
@@ -8,7 +8,8 @@ Defined in `pyproject.toml`:
 | Marker             | Description                                               |
 | ------------------ | --------------------------------------------------------- |
 | `core_model`       | L1&L2 tests (run in each PR)                              |
-| `advanced_model`   | L3&L4 level tests (run in each merge or nightly)          |
+| `advanced_model`   | L3 tests (run on each merge to main)                      |
+| `full_model`       | L4 tests (run nightly)                                    |
 | `diffusion`        | Diffusion model tests                                     |
 | `omni`             | Omni model tests                                          |
 | `cache`            | Cache backend tests                                       |
diff --git a/pyproject.toml b/pyproject.toml
index e49aa6e3251..2f160b21a50 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -163,7 +163,8 @@ addopts = [
 markers = [
     # ci/cd required
     "core_model: L1&L2 tests (run in each PR)",
-    "advanced_model: L3&L4 level tests (run in each merge or nightly)",
+    "advanced_model: L3 level tests (run on each merge)",
+    "full_model: L4 level tests (run nightly)",
     # function module markers
     "diffusion: Diffusion model tests",
     "omni: Omni model tests",
diff --git a/tests/conftest.py b/tests/conftest.py
index 8ac790f137d..b17d52d29aa 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -54,6 +54,11 @@
 logger = init_logger(__name__)
 
 
+def _is_deep_run_level(run_level: str | None) -> bool:
+    """True for merge/nightly deep validation (--run-level advanced_model or full_model)."""
+    return run_level in ("advanced_model", "full_model")
+
+
 PromptAudioInput = list[tuple[Any, int]] | tuple[Any, int] | None
 PromptImageInput = list[Any] | Any | None
 PromptVideoInput = list[Any] | Any | None
@@ -105,7 +110,7 @@ def assert_image_diffusion_response(
             f"Expected {num_outputs_per_prompt} images, got {len(response.images)}"
         )
 
-    if run_level == "advanced_model":
+    if _is_deep_run_level(run_level):
         width = extra_body.get("width")
         height = extra_body.get("height")
 
@@ -359,7 +364,7 @@ def _run_pre_test_cleanup(enable_force=False):
 
 def _run_post_test_cleanup(enable_force=False):
     if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1" and not enable_force:
-        print("GPU cleanup disabled")
+        print("\nPost-test GPU cleanup skipped(Default off is typical when one worker/instance runs many tests.)\n")
         return
 
     if torch.cuda.is_available():
@@ -1551,8 +1556,8 @@ def pytest_addoption(parser):
         "--run-level",
         action="store",
         default="core_model",
-        choices=["core_model", "advanced_model"],
-        help="Test level to run: L2, L3",
+        choices=["core_model", "advanced_model", "full_model"],
+        help="Test level to run: L2, L3 (merge), L4 nightly (full_model)",
     )
 
 
@@ -1577,7 +1582,7 @@ def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: st
         model = model_prefix + params.model
         port = params.port
         stage_config_path = params.stage_config_path
-        if run_level == "advanced_model" and stage_config_path is not None:
+        if _is_deep_run_level(run_level) and stage_config_path is not None:
             with open(stage_config_path, encoding="utf-8") as f:
                 cfg = yaml.safe_load(f) or {}
             stage_ids = [stage["stage_id"] for stage in cfg.get("stage_args", []) if "stage_id" in stage]
@@ -1867,7 +1872,7 @@ def assert_omni_response(response: OmniResponse, request_config: dict[str, Any],
 
     modalities = request_config.get("modalities", ["text", "audio"])
 
-    if run_level == "advanced_model":
+    if _is_deep_run_level(run_level):
         if "audio" in modalities:
             assert response.audio_content is not None, "No audio output is generated"
             print(f"audio content is: {response.audio_content}")
@@ -1915,7 +1920,7 @@ def assert_audio_speech_response(
 ) -> None:
     """
     Validate /v1/audio/speech response: success, optional format check, transcription similarity
-    and gender (non-PCM only for advanced_model), and int16 PCM HNR when response_format is pcm.
+    and gender (non-PCM only for advanced/full run level), and int16 PCM HNR when response_format is pcm.
     """
     assert response.success, "The request failed."
 
@@ -1937,7 +1942,7 @@ def assert_audio_speech_response(
     if e2e_latency is not None:
         print(f"the avg e2e latency is: {e2e_latency}")
 
-    if run_level == "advanced_model" and req_fmt != "pcm":
+    if _is_deep_run_level(run_level) and req_fmt != "pcm":
         # Text–audio semantic similarity check (skipped for raw PCM: no Whisper transcript).
         expected_text = request_config.get("input")
         if expected_text:
@@ -1967,7 +1972,7 @@ def assert_diffusion_response(response: DiffusionResponse, request_config: dict[
     Args:
         response: DiffusionResponse object.
         request_config: Request configuration dictionary.
-        run_level: Test run level (e.g. "core_model", "advanced_model")
+        run_level: Test run level (e.g. "core_model", "advanced_model", "full_model")
 
     Raises:
         AssertionError: When the response does not meet validation criteria
diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py
index 9e375fa9fec..8717ee457c1 100644
--- a/tests/dfx/perf/scripts/run_benchmark.py
+++ b/tests/dfx/perf/scripts/run_benchmark.py
@@ -194,6 +194,9 @@ def assert_result(
                 print(f"ERROR: Test results exceeded baseline: {metric_name}: {current_value} < {baseline_value}")
 
 
+@pytest.mark.full_model
+@pytest.mark.benchmark
+@pytest.mark.omni
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
 @pytest.mark.parametrize("benchmark_params", benchmark_indices, indirect=True)
 def test_performance_benchmark(omni_server, benchmark_params):
diff --git a/tests/dfx/perf/scripts/run_diffusion_benchmark.py b/tests/dfx/perf/scripts/run_diffusion_benchmark.py
index 1bd9bf1a143..8a371efc3f5 100644
--- a/tests/dfx/perf/scripts/run_diffusion_benchmark.py
+++ b/tests/dfx/perf/scripts/run_diffusion_benchmark.py
@@ -534,8 +534,9 @@ def assert_result(result: dict[str, Any], params: dict[str, Any]) -> None:
 # ---------------------------------------------------------------------------
 # Test entry point
 # ---------------------------------------------------------------------------
-
-
+@pytest.mark.full_model
+@pytest.mark.benchmark
+@pytest.mark.diffusion
 @pytest.mark.parametrize(
     "diffusion_server",
     server_params,
diff --git a/tests/e2e/accuracy/test_gebench_h100_smoke.py b/tests/e2e/accuracy/test_gebench_h100_smoke.py
index b4b83187135..09c01807bed 100644
--- a/tests/e2e/accuracy/test_gebench_h100_smoke.py
+++ b/tests/e2e/accuracy/test_gebench_h100_smoke.py
@@ -10,7 +10,7 @@
 from tests.utils import hardware_test
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.benchmark
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"}, num_cards=1)
diff --git a/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py b/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py
index ac5f2cb3cfd..9f9d1bbcde0 100644
--- a/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py
+++ b/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py
@@ -11,7 +11,7 @@
 from tests.utils import hardware_test
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.benchmark
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"}, num_cards=1)
diff --git a/tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py b/tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py
index 3cdda1f9ffa..21bb6c0921b 100644
--- a/tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py
+++ b/tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py
@@ -537,7 +537,7 @@ def _generate_offline_video(*, image_source: str) -> tuple[Path, Path]:
     return offline_path, offline_metadata_path
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.benchmark
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"}, num_cards=1)
@@ -563,7 +563,7 @@ def test_wan22_i2v_diffusers_offline_generates_video(
     assert offline_metadata["frame_count"] == NUM_FRAMES
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.benchmark
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"}, num_cards=2)
@@ -594,7 +594,7 @@ def test_wan22_i2v_online_serving_generates_video(
     assert online_metadata["frame_count"] == NUM_FRAMES
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.benchmark
 @pytest.mark.diffusion
 @hardware_test(res={"cuda": "H100"}, num_cards=2)
diff --git a/tests/e2e/online_serving/test_bagel_expansion.py b/tests/e2e/online_serving/test_bagel_expansion.py
index e2d75e0d199..4da9e0a5efa 100644
--- a/tests/e2e/online_serving/test_bagel_expansion.py
+++ b/tests/e2e/online_serving/test_bagel_expansion.py
@@ -117,7 +117,7 @@ def _get_diffusion_feature_cases(model: str):
     ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/e2e/online_serving/test_flux2_expansion.py b/tests/e2e/online_serving/test_flux2_expansion.py
index 336bd83a1d2..5a202fa983d 100644
--- a/tests/e2e/online_serving/test_flux2_expansion.py
+++ b/tests/e2e/online_serving/test_flux2_expansion.py
@@ -90,7 +90,7 @@ def _get_diffusion_feature_cases(model: str):
     ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/e2e/online_serving/test_flux_2_dev_expansion.py b/tests/e2e/online_serving/test_flux_2_dev_expansion.py
index eba0fbda225..5c54f74e77d 100644
--- a/tests/e2e/online_serving/test_flux_2_dev_expansion.py
+++ b/tests/e2e/online_serving/test_flux_2_dev_expansion.py
@@ -50,7 +50,7 @@ def _get_flux_2_dev_feature_cases(model: str):
     ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/e2e/online_serving/test_flux_kontext_expansion.py b/tests/e2e/online_serving/test_flux_kontext_expansion.py
index c13e1e8189d..177c36a0445 100644
--- a/tests/e2e/online_serving/test_flux_kontext_expansion.py
+++ b/tests/e2e/online_serving/test_flux_kontext_expansion.py
@@ -34,7 +34,7 @@ def _get_diffusion_feature_cases(model: str):
     ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
@@ -59,7 +59,7 @@ def test_flux_kontext_text_to_image(omni_server: OmniServer, openai_client: Open
     openai_client.send_diffusion_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
@@ -88,7 +88,7 @@ def test_flux_kontext_image_edit(omni_server: OmniServer, openai_client: OpenAIC
     openai_client.send_diffusion_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
@@ -115,7 +115,7 @@ def test_flux_kontext_image_edit_no_negative(omni_server: OmniServer, openai_cli
     openai_client.send_diffusion_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
@@ -140,7 +140,7 @@ def test_flux_kontext_high_resolution(omni_server: OmniServer, openai_client: Op
     openai_client.send_diffusion_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/e2e/online_serving/test_hunyuan_video_15_expansion.py b/tests/e2e/online_serving/test_hunyuan_video_15_expansion.py
index de950edb900..7da68b38cd8 100644
--- a/tests/e2e/online_serving/test_hunyuan_video_15_expansion.py
+++ b/tests/e2e/online_serving/test_hunyuan_video_15_expansion.py
@@ -66,7 +66,7 @@ def _get_diffusion_feature_cases(model: str):
     ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/e2e/online_serving/test_longcat_image_edit_expansion.py b/tests/e2e/online_serving/test_longcat_image_edit_expansion.py
index 8a2cfbcc145..3a40dfebfe1 100644
--- a/tests/e2e/online_serving/test_longcat_image_edit_expansion.py
+++ b/tests/e2e/online_serving/test_longcat_image_edit_expansion.py
@@ -55,7 +55,7 @@ def _get_diffusion_feature_cases(model: str):
     ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/e2e/online_serving/test_longcat_image_expansion.py b/tests/e2e/online_serving/test_longcat_image_expansion.py
index 161e7cd2e65..60c4ef7cadb 100644
--- a/tests/e2e/online_serving/test_longcat_image_expansion.py
+++ b/tests/e2e/online_serving/test_longcat_image_expansion.py
@@ -56,7 +56,7 @@ def _get_diffusion_feature_cases(model: str):
     ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
index 0bcc86840ba..2c96dab57c1 100644
--- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
@@ -113,7 +113,7 @@ def get_max_batch_size(size_type="few"):
     return batch_sizes.get(size_type, 5)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -137,7 +137,7 @@ def test_text_to_audio_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True)
@@ -159,7 +159,7 @@ def test_text_to_text_audio_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config, request_num=get_max_batch_size())
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -184,7 +184,7 @@ def test_image_to_text_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -208,7 +208,7 @@ def test_image_to_audio_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -232,7 +232,7 @@ def test_image_to_text_audio_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config, request_num=get_max_batch_size())
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -256,7 +256,7 @@ def test_video_to_text_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -280,7 +280,7 @@ def test_video_to_audio_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -304,7 +304,7 @@ def test_video_to_text_audio_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config, request_num=get_max_batch_size())
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True)
@@ -329,7 +329,7 @@ def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True)
@@ -355,7 +355,7 @@ def test_text_image_to_text_audio_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True)
@@ -383,7 +383,7 @@ def test_text_video_to_text_audio_001(omni_server, openai_client) -> None:
 
 
 @pytest.mark.skip(reason="There is a known issue with shape mismatch error.")
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True)
@@ -414,7 +414,7 @@ def test_mix_to_text_audio_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config, request_num=get_max_batch_size())
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -442,7 +442,7 @@ def test_audio_in_video_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -471,7 +471,7 @@ def test_audio_in_video_002(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config, request_num=get_max_batch_size())
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -507,7 +507,7 @@ def test_one_word_prompt_001(omni_server, openai_client) -> None:
             print(f"Similarity assertion failed, retrying {attempt + 2}/{_max_retries}: {e!r}")
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -534,7 +534,7 @@ def test_speaker_001(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -571,7 +571,7 @@ def test_speaker_002(omni_server, openai_client) -> None:
             print(f"Gender assertion failed, retrying {attempt + 2}/{_max_retries}: {e!r}")
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -598,7 +598,7 @@ def test_speaker_003(omni_server, openai_client) -> None:
     openai_client.send_omni_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
diff --git a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
index 3c33485e4f4..36610534890 100644
--- a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
@@ -64,7 +64,7 @@ def get_max_batch_size(size_type="few"):
 ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.core_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
@@ -92,7 +92,7 @@ def test_voice_clone_streaming_001(omni_server, openai_client) -> None:
     openai_client.send_audio_speech_request(request_config, request_num=get_max_batch_size("few"))
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.core_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
diff --git a/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py
index 03a985896e4..3967e317651 100644
--- a/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py
@@ -62,7 +62,7 @@ def get_max_batch_size(size_type="few"):
 ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
 @pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
@@ -95,7 +95,7 @@ def test_voice_001(omni_server, openai_client) -> None:
             raise
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
 @pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
@@ -120,7 +120,7 @@ def test_voice_002(omni_server, openai_client) -> None:
     openai_client.send_audio_speech_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
 @pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
@@ -145,7 +145,7 @@ def test_voice_003(omni_server, openai_client) -> None:
     openai_client.send_audio_speech_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
 @pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
diff --git a/tests/e2e/online_serving/test_qwen_image_edit_expansion.py b/tests/e2e/online_serving/test_qwen_image_edit_expansion.py
index 14e4c915b6b..2b371cea9e1 100644
--- a/tests/e2e/online_serving/test_qwen_image_edit_expansion.py
+++ b/tests/e2e/online_serving/test_qwen_image_edit_expansion.py
@@ -113,7 +113,7 @@ def _get_diffusion_feature_cases(model: str):
     ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
@@ -143,7 +143,7 @@ def test_qwen_image_edit(omni_server: OmniServer, openai_client: OpenAIClientHan
     openai_client.send_diffusion_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/e2e/online_serving/test_qwen_image_expansion.py b/tests/e2e/online_serving/test_qwen_image_expansion.py
index 88e56cc3e10..fe8771c18ed 100644
--- a/tests/e2e/online_serving/test_qwen_image_expansion.py
+++ b/tests/e2e/online_serving/test_qwen_image_expansion.py
@@ -122,7 +122,7 @@ def _get_diffusion_feature_cases(model: str):
     ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
@@ -147,7 +147,7 @@ def test_qwen_image(omni_server: OmniServer, openai_client: OpenAIClientHandler)
     openai_client.send_diffusion_request(request_config)
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/e2e/online_serving/test_qwen_image_layered_expansion.py b/tests/e2e/online_serving/test_qwen_image_layered_expansion.py
index fc73801c0e0..0e9c0074e48 100644
--- a/tests/e2e/online_serving/test_qwen_image_layered_expansion.py
+++ b/tests/e2e/online_serving/test_qwen_image_layered_expansion.py
@@ -77,7 +77,7 @@
 ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize("omni_server", FEATURE_CASES, indirect=True)
 def test_feature(omni_server: OmniServer, openai_client: OpenAIClientHandler):
@@ -155,7 +155,7 @@ def _collect_image_url_items(openai_client: OpenAIClientHandler, request_config:
     return image_items
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server, expected_layers",
@@ -230,7 +230,7 @@ def test_layered_output_image_count(
 ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize("omni_server", PROMPT_CASES, indirect=True)
 def test_empty_prompt(omni_server: OmniServer, openai_client: OpenAIClientHandler):
diff --git a/tests/e2e/online_serving/test_sd3_expansion.py b/tests/e2e/online_serving/test_sd3_expansion.py
index 3ed5cc5f308..819440dd027 100644
--- a/tests/e2e/online_serving/test_sd3_expansion.py
+++ b/tests/e2e/online_serving/test_sd3_expansion.py
@@ -39,7 +39,7 @@ def _get_diffusion_feature_cases(model: str):
     ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/e2e/online_serving/test_wan22_expansion.py b/tests/e2e/online_serving/test_wan22_expansion.py
index e5e2d748d58..cf49e9fb857 100644
--- a/tests/e2e/online_serving/test_wan22_expansion.py
+++ b/tests/e2e/online_serving/test_wan22_expansion.py
@@ -83,7 +83,7 @@ def _get_wan22_feature_cases():
     return cases
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/e2e/online_serving/test_wan_2_1_vace_expansion.py b/tests/e2e/online_serving/test_wan_2_1_vace_expansion.py
index 0de70afe862..c01f206d2f9 100644
--- a/tests/e2e/online_serving/test_wan_2_1_vace_expansion.py
+++ b/tests/e2e/online_serving/test_wan_2_1_vace_expansion.py
@@ -135,7 +135,7 @@ def _get_vace_feature_cases():
     ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/e2e/online_serving/test_zimage_expansion.py b/tests/e2e/online_serving/test_zimage_expansion.py
index 9f90ec855b6..05d5af4e4f5 100644
--- a/tests/e2e/online_serving/test_zimage_expansion.py
+++ b/tests/e2e/online_serving/test_zimage_expansion.py
@@ -105,7 +105,7 @@ def _get_diffusion_feature_cases():
     ]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.diffusion
 @pytest.mark.parametrize(
     "omni_server",
diff --git a/tests/examples/offline_inference/test_text_to_image.py b/tests/examples/offline_inference/test_text_to_image.py
index a08d16f1614..58a7d3894fc 100644
--- a/tests/examples/offline_inference/test_text_to_image.py
+++ b/tests/examples/offline_inference/test_text_to_image.py
@@ -11,7 +11,7 @@
 from tests.examples.conftest import EXAMPLES, ExampleRunner, ReadmeSnippet
 from tests.utils import hardware_marks
 
-pytestmark = [pytest.mark.advanced_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})]
+pytestmark = [pytest.mark.full_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})]
 
 T2I_SCRIPT = EXAMPLES / "offline_inference" / "text_to_image" / "text_to_image.py"
 README_PATH = T2I_SCRIPT.with_name("README.md")
diff --git a/tests/examples/online_serving/test_qwen2_5_omni.py b/tests/examples/online_serving/test_qwen2_5_omni.py
index a78ccf5924a..1d24426bf54 100644
--- a/tests/examples/online_serving/test_qwen2_5_omni.py
+++ b/tests/examples/online_serving/test_qwen2_5_omni.py
@@ -21,7 +21,7 @@
 )
 from tests.utils import hardware_test
 
-pytestmark = [pytest.mark.advanced_model, pytest.mark.example]
+pytestmark = [pytest.mark.full_model, pytest.mark.example]
 
 models = ["Qwen/Qwen2.5-Omni-7B"]
 
@@ -44,7 +44,7 @@
 common_args = ["python", os.path.join(example_dir, "openai_chat_completion_client_for_multimodal_generation.py")]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -81,7 +81,7 @@ def test_send_multimodal_request_001(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -118,7 +118,7 @@ def test_send_multimodal_request_002(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -145,7 +145,7 @@ def test_send_multimodal_request_003(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -175,7 +175,7 @@ def test_modality_control_001(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -204,7 +204,7 @@ def test_modality_control_002(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -242,7 +242,7 @@ def test_modality_control_003(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
diff --git a/tests/examples/online_serving/test_qwen3_omni.py b/tests/examples/online_serving/test_qwen3_omni.py
index 65f99d7bf28..81582da63e8 100644
--- a/tests/examples/online_serving/test_qwen3_omni.py
+++ b/tests/examples/online_serving/test_qwen3_omni.py
@@ -21,7 +21,7 @@
 )
 from tests.utils import hardware_test
 
-pytestmark = [pytest.mark.advanced_model, pytest.mark.example]
+pytestmark = [pytest.mark.full_model, pytest.mark.example]
 
 models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
 
@@ -42,7 +42,7 @@
 common_args = ["python", os.path.join(example_dir, "openai_chat_completion_client_for_multimodal_generation.py")]
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -72,7 +72,7 @@ def test_send_multimodal_request_001(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -105,7 +105,7 @@ def test_send_multimodal_request_002(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -122,7 +122,7 @@ def test_send_multimodal_request_003(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -146,7 +146,7 @@ def test_modality_control_001(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -170,7 +170,7 @@ def test_modality_control_002(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
@@ -202,7 +202,7 @@ def test_modality_control_003(omni_server) -> None:
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
-@pytest.mark.advanced_model
+@pytest.mark.full_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
diff --git a/tests/examples/online_serving/test_text_to_image.py b/tests/examples/online_serving/test_text_to_image.py
index 51b7ff61bc9..30261fff393 100644
--- a/tests/examples/online_serving/test_text_to_image.py
+++ b/tests/examples/online_serving/test_text_to_image.py
@@ -17,7 +17,7 @@
 from tests.examples.conftest import EXAMPLES, OUTPUT_DIR, run_cmd, write_zimage_lora
 from tests.utils import hardware_marks
 
-pytestmark = [pytest.mark.advanced_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})]
+pytestmark = [pytest.mark.full_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})]
 
 T2I_ONLINE_CLIENT = EXAMPLES / "online_serving" / "text_to_image" / "openai_chat_client.py"
 EXAMPLE_OUTPUT_SUBFOLDER = "example_online_t2i"

From 69799bafc59b288a14e649a0fbb5b260c129dce6 Mon Sep 17 00:00:00 2001
From: wangyu <410167048@qq.com>
Date: Mon, 20 Apr 2026 21:17:01 +0800
Subject: [PATCH 2/9] Refactor test files to remove unnecessary blank lines and
 ensure consistent pytestmark usage across various test modules.

Signed-off-by: wangyu <410167048@qq.com>
---
 tests/core/sched/test_omni_scheduler_mixin.py |   5 +-
 tests/diffusion/cache/test_cache_backends.py  | 272 ++----------------
 .../diffusion/models/bagel/test_bagel_lora.py |   1 -
 .../models/bagel/test_trajectory_recording.py |   1 -
 .../test_hunyuan_image3_sampler.py            |   1 -
 .../models/t5_encoder/test_t5_encoder_tp.py   |   1 -
 .../quantization/test_int8_config.py          |   1 -
 .../diffusion/test_diffusion_step_pipeline.py |   1 -
 .../test_mooncake_transfer_engine_buffer.py   |   3 +-
 .../test_mooncake_transfer_engine_rdma.py     |   3 +-
 .../custom_pipeline/test_worker_extension.py  |   5 +-
 .../e2e/offline_inference/test_dynin_omni.py  |   4 +-
 .../test_hunyuanimage3_text2img.py            |   4 +-
 .../test_dynin_omni_expansion.py              |   1 -
 tests/engine/test_output_modality.py          |   4 +-
 .../openai_api/test_image_server.py           |   1 -
 .../openai_api/test_serving_speech.py         |   1 -
 tests/entrypoints/test_pd_disaggregation.py   |   1 -
 .../offline_inference/test_text_to_image.py   |   4 +-
 .../online_serving/test_text_to_image.py      |   3 +-
 tests/examples/test_slerp_interpolation.py    |   4 +-
 .../test_mimo_audio_code2wav_batch_decode.py  |   1 -
 .../qwen2_5_omni/test_qwen2_5_omni_embed.py   |   1 -
 .../qwen3_tts/test_cuda_graph_decoder.py      |   1 -
 .../test_cuda_graph_acoustic_transformer.py   |   1 -
 .../voxtral_tts/test_text_preprocess.py       |   5 +-
 .../test_qwen3_tts_async_chunk.py             |   1 -
 tests/test_diffusion_config_fields.py         |   1 -
 28 files changed, 40 insertions(+), 292 deletions(-)

diff --git a/tests/core/sched/test_omni_scheduler_mixin.py b/tests/core/sched/test_omni_scheduler_mixin.py
index 096972694f6..e04a9c39fbc 100644
--- a/tests/core/sched/test_omni_scheduler_mixin.py
+++ b/tests/core/sched/test_omni_scheduler_mixin.py
@@ -21,11 +21,10 @@
 from vllm.v1.request import Request, RequestStatus, StreamingUpdate
 from vllm_omni.core.sched.omni_scheduler_mixin import OmniSchedulerMixin
 
-pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
-
-
 # isort: on
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 class _SchedulerStub(OmniSchedulerMixin):
     """Minimal scheduler surface required by OmniSchedulerMixin."""
diff --git a/tests/diffusion/cache/test_cache_backends.py b/tests/diffusion/cache/test_cache_backends.py
index ecb99394592..0b7ef723585 100644
--- a/tests/diffusion/cache/test_cache_backends.py
+++ b/tests/diffusion/cache/test_cache_backends.py
@@ -2,269 +2,39 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 """
-Unit tests for cache backends (cache-dit and teacache).
-
-This module tests the cache backend implementations:
-- CacheDiTBackend: cache-dit acceleration backend
-- TeaCacheBackend: TeaCache hook-based backend
-- Cache selector function: get_cache_backend
-- DiffusionCacheConfig: configuration dataclass
+Model specific tests for CacheDiT enablement.
 """
 
 from unittest.mock import Mock, patch
 
 import pytest
 
-from vllm_omni.diffusion.cache.cache_dit_backend import (
-    CUSTOM_DIT_ENABLERS,
-    CacheDiTBackend,
-)
-from vllm_omni.diffusion.cache.selector import get_cache_backend
-from vllm_omni.diffusion.cache.teacache.backend import TeaCacheBackend
+import vllm_omni.diffusion.cache.cache_dit_backend as cd_backend
 from vllm_omni.diffusion.data import DiffusionCacheConfig
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
+SEPARATE_CFG_ENABLERS = [
+    cd_backend.enable_cache_for_ltx2,
+    cd_backend.enable_cache_for_wan22,
+    cd_backend.enable_cache_for_longcat_image,
+]
 
-class TestCacheDiTBackend:
-    """Test CacheDiTBackend implementation."""
-
-    def test_init_with_dict(self):
-        """Test initialization with dictionary config."""
-        config_dict = {"Fn_compute_blocks": 4, "max_warmup_steps": 8}
-        backend = CacheDiTBackend(config_dict)
-        assert backend.config.Fn_compute_blocks == 4
-        assert backend.config.max_warmup_steps == 8
-        assert backend.enabled is False
-
-    def test_init_with_config_object(self):
-        """Test initialization with DiffusionCacheConfig object."""
-        config = DiffusionCacheConfig(Fn_compute_blocks=4)
-        backend = CacheDiTBackend(config)
-        assert backend.config.Fn_compute_blocks == 4
-        assert backend.enabled is False
-
-    @patch("vllm_omni.diffusion.cache.cache_dit_backend.cache_dit")
-    def test_enable_single_transformer(self, mock_cache_dit):
-        """Test enabling cache-dit on single-transformer pipeline."""
-        # Mock pipeline
-        mock_pipeline = Mock()
-        mock_pipeline.__class__.__name__ = "DiTPipeline"
-        mock_transformer = Mock()
-        mock_pipeline.transformer = mock_transformer
-
-        # Mock cache_dit functions
-        mock_cache_dit.enable_cache = Mock()
-        mock_cache_dit.refresh_context = Mock()
-
-        backend = CacheDiTBackend({"Fn_compute_blocks": 2})
-        backend.enable(mock_pipeline)
-
-        # Verify cache-dit was enabled
-        assert backend.enabled is True
-        assert backend._refresh_func is not None
-        mock_cache_dit.enable_cache.assert_called_once()
-
-    @patch("vllm_omni.diffusion.cache.cache_dit_backend.cache_dit")
-    def test_refresh(self, mock_cache_dit):
-        """Test refreshing cache context with SCM mask policy updates when num_inference_steps changes."""
-        # Mock pipeline
-        mock_pipeline = Mock()
-        mock_pipeline.__class__.__name__ = "DiTPipeline"
-        mock_transformer = Mock()
-        mock_pipeline.transformer = mock_transformer
-
-        # Mock cache_dit functions
-        mock_cache_dit.enable_cache = Mock()
-        mock_cache_dit.refresh_context = Mock()
-        mock_steps_mask_50 = [1, 0, 1, 0, 1] * 10  # Mock mask for 50 steps
-        mock_steps_mask_100 = [1, 0, 1, 0, 1] * 20  # Mock mask for 100 steps
-        mock_cache_dit.steps_mask = Mock(side_effect=[mock_steps_mask_50, mock_steps_mask_100])
-
-        # Enable cache-dit with SCM enabled (using mask policy)
-        config = DiffusionCacheConfig(
-            scm_steps_mask_policy="fast",
-            scm_steps_policy="dynamic",
-        )
-        backend = CacheDiTBackend(config)
-        backend.enable(mock_pipeline)
-
-        # First refresh with 50 steps
-        backend.refresh(mock_pipeline, num_inference_steps=50)
-        assert backend._last_num_inference_steps == 50
-
-        # Verify steps_mask was called with mask policy (not direct steps mask)
-        mock_cache_dit.steps_mask.assert_called_with(mask_policy="fast", total_steps=50)
-        assert mock_cache_dit.steps_mask.call_count == 1
-
-        # Verify refresh_context was called with cache_config (SCM path)
-        mock_cache_dit.refresh_context.assert_called_once()
-        call_args = mock_cache_dit.refresh_context.call_args
-        assert call_args[0][0] == mock_transformer
-        # Check that cache_config was passed (not num_inference_steps directly when SCM is enabled)
-        assert "cache_config" in call_args[1]
-        cache_config_arg = call_args[1]["cache_config"]
-        assert cache_config_arg is not None
-
-        # Change num_inference_steps and refresh again
-        mock_cache_dit.refresh_context.reset_mock()
-        backend.refresh(mock_pipeline, num_inference_steps=100)
-
-        # Verify steps_mask was called again with new num_inference_steps (using mask policy)
-        assert mock_cache_dit.steps_mask.call_count == 2
-        # Check the last call was with 100 steps and mask policy
-        assert mock_cache_dit.steps_mask.call_args_list[-1].kwargs["total_steps"] == 100
-        assert mock_cache_dit.steps_mask.call_args_list[-1].kwargs["mask_policy"] == "fast"
-
-        # Verify refresh_context was called again with updated mask
-        mock_cache_dit.refresh_context.assert_called_once()
-        call_args = mock_cache_dit.refresh_context.call_args
-        assert call_args[0][0] == mock_transformer
-        assert "cache_config" in call_args[1]
-        assert backend._last_num_inference_steps == 100
-
-    def test_hunyuan_custom_enabler_registered(self):
-        """Test HunyuanImage3 custom cache-dit enabler is registered."""
-        assert "HunyuanImage3Pipeline" in CUSTOM_DIT_ENABLERS
-
-    @patch("vllm_omni.diffusion.cache.cache_dit_backend.BlockAdapter")
-    @patch("vllm_omni.diffusion.cache.cache_dit_backend.cache_dit")
-    def test_enable_hunyuan_pipeline_uses_model_transformer(self, mock_cache_dit, mock_block_adapter):
-        """Test HunyuanImage3 custom enabler uses pipeline.model for cache enable/refresh."""
-        mock_pipeline = Mock()
-        mock_pipeline.__class__.__name__ = "HunyuanImage3Pipeline"
-        mock_pipeline.model = Mock()
-        mock_pipeline.model.layers = Mock()
-
-        mock_cache_dit.enable_cache = Mock()
-        mock_cache_dit.refresh_context = Mock()
-
-        backend = CacheDiTBackend({"Fn_compute_blocks": 2})
-        backend.enable(mock_pipeline)
-
-        assert backend.enabled is True
-        assert backend._refresh_func is not None
-        mock_block_adapter.assert_called_once()
-        adapter_kwargs = mock_block_adapter.call_args.kwargs
-        assert adapter_kwargs["transformer"] is mock_pipeline.model
-        assert adapter_kwargs["blocks"] is mock_pipeline.model.layers
-        assert adapter_kwargs["forward_pattern"] == adapter_kwargs["forward_pattern"].__class__.Pattern_4
-        assert len(adapter_kwargs["params_modifiers"]) == 1
-        mock_cache_dit.enable_cache.assert_called_once()
-
-        backend.refresh(mock_pipeline, num_inference_steps=12)
-        mock_cache_dit.refresh_context.assert_called_once()
-        call_args = mock_cache_dit.refresh_context.call_args
-        assert call_args[0][0] is mock_pipeline.model
-        assert call_args[1]["num_inference_steps"] == 12
-
-    def test_enable_hunyuan_pipeline_requires_model_layers(self):
-        """Test HunyuanImage3 enabler fails with a formatted pipeline class name."""
-        mock_pipeline = Mock()
-        mock_pipeline.__class__.__name__ = "HunyuanImage3Pipeline"
-        mock_pipeline.model = Mock(spec=[])
-
-        backend = CacheDiTBackend({"Fn_compute_blocks": 2})
-
-        with pytest.raises(ValueError, match="HunyuanImage3Pipeline"):
-            backend.enable(mock_pipeline)
-
-
-class TestTeaCacheBackend:
-    """Test TeaCacheBackend implementation."""
-
-    def test_init(self):
-        """Test initialization."""
-        config = DiffusionCacheConfig(rel_l1_thresh=0.3)
-        backend = TeaCacheBackend(config)
-        assert backend.config.rel_l1_thresh == 0.3
-        assert backend.enabled is False
-
-    @patch("vllm_omni.diffusion.cache.teacache.backend.apply_teacache_hook")
-    def test_enable(self, mock_apply_hook):
-        """Test enabling TeaCache on pipeline."""
-        # Mock pipeline
-        mock_pipeline = Mock()
-        mock_pipeline.__class__.__name__ = "QwenImagePipeline"
-        mock_transformer = Mock()
-        mock_transformer.__class__.__name__ = "QwenImageTransformer2DModel"
-        mock_pipeline.transformer = mock_transformer
-
-        config = DiffusionCacheConfig(rel_l1_thresh=0.3)
-        backend = TeaCacheBackend(config)
-        backend.enable(mock_pipeline)
-
-        # Verify hook was applied
-        assert backend.enabled is True
-        mock_apply_hook.assert_called_once()
-
-    @patch("vllm_omni.diffusion.cache.teacache.backend.apply_teacache_hook")
-    def test_enable_with_coefficients(self, mock_apply_hook):
-        """Test enabling TeaCache with custom coefficients."""
-        mock_pipeline = Mock()
-        mock_pipeline.__class__.__name__ = "QwenImagePipeline"
-        mock_transformer = Mock()
-        mock_transformer.__class__.__name__ = "QwenImageTransformer2DModel"
-        mock_pipeline.transformer = mock_transformer
-
-        config = DiffusionCacheConfig(rel_l1_thresh=0.3, coefficients=[1.0, 0.5, 0.2, 0.1, 0.05])
-        backend = TeaCacheBackend(config)
-        backend.enable(mock_pipeline)
-
-        assert backend.enabled is True
-        mock_apply_hook.assert_called_once()
-
-    @patch("vllm_omni.diffusion.cache.teacache.backend.apply_teacache_hook")
-    def test_refresh(self, mock_apply_hook):
-        """Test refreshing TeaCache state."""
-        mock_pipeline = Mock()
-        mock_pipeline.__class__.__name__ = "QwenImagePipeline"
-        mock_transformer = Mock()
-        mock_transformer.__class__.__name__ = "QwenImageTransformer2DModel"
-        mock_pipeline.transformer = mock_transformer
-
-        # Mock hook registry
-        mock_hook = Mock()
-        mock_registry = Mock()
-        mock_registry.get_hook = Mock(return_value=mock_hook)
-        mock_registry.reset_hook = Mock()
-        mock_transformer._hook_registry = mock_registry
-
-        config = DiffusionCacheConfig()
-        backend = TeaCacheBackend(config)
-        backend.enable(mock_pipeline)
-
-        # Test refresh
-        backend.refresh(mock_pipeline, num_inference_steps=50)
-        mock_registry.reset_hook.assert_called_once()
-
-
-class TestCacheSelector:
-    """Test cache backend selector function."""
-
-    def test_get_cache_backend_none(self):
-        """Test getting None backend."""
-        backend = get_cache_backend(None, None)
-        assert backend is None
+SAMPLE_CACHE_CONFIG = DiffusionCacheConfig()
 
-        backend = get_cache_backend("none", None)
-        assert backend is None
 
-    def test_get_cache_backend_cache_dit(self):
-        """Test getting cache-dit backend."""
-        config_dict = {"Fn_compute_blocks": 4}
-        backend = get_cache_backend("cache_dit", config_dict)
-        assert isinstance(backend, CacheDiTBackend)
-        assert backend.config.Fn_compute_blocks == 4
+@pytest.mark.parametrize("enabler", SEPARATE_CFG_ENABLERS)
+@patch("vllm_omni.diffusion.cache.cache_dit_backend.BlockAdapter")
+@patch("vllm_omni.diffusion.cache.cache_dit_backend.cache_dit")
+def test_separate_cfg(mock_cache_dit, mock_block_adapter, enabler):
+    """Ensure that custom enablers for models with separate CFG pass
+    the param through to cache_dit correctly.
 
-    def test_get_cache_backend_tea_cache(self):
-        """Test getting teacache backend."""
-        config_dict = {"rel_l1_thresh": 0.3}
-        backend = get_cache_backend("tea_cache", config_dict)
-        assert isinstance(backend, TeaCacheBackend)
-        assert backend.config.rel_l1_thresh == 0.3
+    Regression test for: https://github.com/vllm-project/vllm-omni/pull/2860
+    """
+    mock_pipeline = Mock()
+    enabler(mock_pipeline, SAMPLE_CACHE_CONFIG)
 
-    def test_get_cache_backend_invalid(self):
-        """Test getting invalid backend raises error."""
-        with pytest.raises(ValueError, match="Unsupported cache backend"):
-            get_cache_backend("invalid_backend", {})
+    mock_cache_dit.enable_cache.assert_called_once()
+    adapter_kwargs = mock_block_adapter.call_args.kwargs
+    assert adapter_kwargs["has_separate_cfg"] is True
diff --git a/tests/diffusion/models/bagel/test_bagel_lora.py b/tests/diffusion/models/bagel/test_bagel_lora.py
index 79d553df0f6..c285758fe86 100644
--- a/tests/diffusion/models/bagel/test_bagel_lora.py
+++ b/tests/diffusion/models/bagel/test_bagel_lora.py
@@ -21,7 +21,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-
 _FakeLinearBase = FakeLinearBase
 
 
diff --git a/tests/diffusion/models/bagel/test_trajectory_recording.py b/tests/diffusion/models/bagel/test_trajectory_recording.py
index 36781fc2e57..345eac10784 100644
--- a/tests/diffusion/models/bagel/test_trajectory_recording.py
+++ b/tests/diffusion/models/bagel/test_trajectory_recording.py
@@ -16,7 +16,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-
 NUM_TOKENS = 8
 HIDDEN_DIM = 16
 NUM_TIMESTEPS = 5
diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_sampler.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_sampler.py
index 2959b52a99a..51f6a85f580 100644
--- a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_sampler.py
+++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_sampler.py
@@ -8,7 +8,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-
 # Fake token IDs for testing (avoid importing the real model).
 END_OF_THINK = 100
 RECAPTION = 101
diff --git a/tests/diffusion/models/t5_encoder/test_t5_encoder_tp.py b/tests/diffusion/models/t5_encoder/test_t5_encoder_tp.py
index e9e13c970d7..b36acc68aa6 100644
--- a/tests/diffusion/models/t5_encoder/test_t5_encoder_tp.py
+++ b/tests/diffusion/models/t5_encoder/test_t5_encoder_tp.py
@@ -12,7 +12,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-
 _T5_MODULE = "vllm_omni.diffusion.models.t5_encoder.t5_encoder"
 
 SMALL_T5_CONFIG = dict(
diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py
index 0af875cb3d6..875277ece42 100644
--- a/tests/diffusion/quantization/test_int8_config.py
+++ b/tests/diffusion/quantization/test_int8_config.py
@@ -14,7 +14,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.diffusion]
 
-
 npu_available = pytest.mark.skipif(not current_omni_platform.is_npu(), reason="NPU platform not available.")
 
 cuda_available = pytest.mark.skipif(not current_omni_platform.is_cuda(), reason="GPU platform not available.")
diff --git a/tests/diffusion/test_diffusion_step_pipeline.py b/tests/diffusion/test_diffusion_step_pipeline.py
index 8048f52029a..06f8cd14dc8 100644
--- a/tests/diffusion/test_diffusion_step_pipeline.py
+++ b/tests/diffusion/test_diffusion_step_pipeline.py
@@ -45,7 +45,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.diffusion]
 
-
 # ---------------------------------------------------------------------------
 # Helpers & fixtures
 # ---------------------------------------------------------------------------
diff --git a/tests/distributed/omni_connectors/test_mooncake_transfer_engine_buffer.py b/tests/distributed/omni_connectors/test_mooncake_transfer_engine_buffer.py
index 3fadac13d2a..55351efcbc9 100755
--- a/tests/distributed/omni_connectors/test_mooncake_transfer_engine_buffer.py
+++ b/tests/distributed/omni_connectors/test_mooncake_transfer_engine_buffer.py
@@ -16,9 +16,8 @@
     ManagedBuffer,
 )
 
-pytestmark = [pytest.mark.cpu, pytest.mark.parallel, pytest.mark.core_model]
-
 # All tests in this file are pure-CPU unit tests for the memory allocator.
+pytestmark = [pytest.mark.cpu, pytest.mark.parallel, pytest.mark.core_model]
 
 
 @pytest.mark.core_model
diff --git a/tests/distributed/omni_connectors/test_mooncake_transfer_engine_rdma.py b/tests/distributed/omni_connectors/test_mooncake_transfer_engine_rdma.py
index 88744f41a0b..7b376da62b1 100755
--- a/tests/distributed/omni_connectors/test_mooncake_transfer_engine_rdma.py
+++ b/tests/distributed/omni_connectors/test_mooncake_transfer_engine_rdma.py
@@ -23,9 +23,8 @@
     TransferEngine,
 )
 
-pytestmark = [pytest.mark.parallel, pytest.mark.gpu]
-
 # All tests in this file require Mooncake TransferEngine and an RDMA environment.
+pytestmark = [pytest.mark.parallel, pytest.mark.gpu]
 
 # ---------------------------------------------------------------------------
 # Shared helpers
diff --git a/tests/e2e/offline_inference/custom_pipeline/test_worker_extension.py b/tests/e2e/offline_inference/custom_pipeline/test_worker_extension.py
index ef64146cf5c..653b35d7e2f 100644
--- a/tests/e2e/offline_inference/custom_pipeline/test_worker_extension.py
+++ b/tests/e2e/offline_inference/custom_pipeline/test_worker_extension.py
@@ -14,9 +14,6 @@
 from vllm_omni.diffusion.worker.diffusion_worker import CustomPipelineWorkerExtension
 from vllm_omni.entrypoints.async_omni import AsyncOmni
 
-pytestmark = [pytest.mark.core_model]
-
-
 CUSTOM_PIPELINE_CLASS = (
     "tests.e2e.offline_inference.custom_pipeline.qwen_image_pipeline_with_logprob.QwenImagePipelineWithLogProbForTest"
 )
@@ -25,6 +22,8 @@
 )
 MODEL = "tiny-random/Qwen-Image"
 
+pytestmark = [pytest.mark.core_model]
+
 
 @pytest.mark.cpu
 def test_worker_extension_inheritance():
diff --git a/tests/e2e/offline_inference/test_dynin_omni.py b/tests/e2e/offline_inference/test_dynin_omni.py
index 4c0f16023ba..f891fc4f12e 100644
--- a/tests/e2e/offline_inference/test_dynin_omni.py
+++ b/tests/e2e/offline_inference/test_dynin_omni.py
@@ -31,14 +31,14 @@
 stage_configs = [str(_DEFAULT_STAGE_CONFIG_PATH)]
 test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
 
+DYNIN_CONFIG_PATH = str(_DEFAULT_DYNIN_CONFIG_PATH) if _DEFAULT_DYNIN_CONFIG_PATH is not None else None
+
 pytestmark = [
     pytest.mark.core_model,
     pytest.mark.omni,
     pytest.mark.parametrize("omni_runner", test_params, indirect=True),
 ]
 
-DYNIN_CONFIG_PATH = str(_DEFAULT_DYNIN_CONFIG_PATH) if _DEFAULT_DYNIN_CONFIG_PATH is not None else None
-
 
 # prompting util
 def _build_mmu_prompt(tokenizer: Any, question: str, dynin_config_path: str | None) -> dict[str, Any]:
diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py
index 91fbb8afa22..bd0d132d093 100644
--- a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py
+++ b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py
@@ -13,15 +13,13 @@
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 from vllm_omni.platforms import current_omni_platform
 
-pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion]
-
-
 PROMPT = "A brown and white dog is running on the grass"
 MODEL_NAME = "tencent/HunyuanImage-3.0"
 LOCAL_CLIP_PATH = "openai/clip-vit-base-patch32"
 REPO_ROOT = Path(__file__).resolve().parents[3]
 STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image3_t2i.yaml"
 
+pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion]
 
 # System prompt type. Options: None, dynamic, en_vanilla, en_recaption, en_think_recaption, en_unified
 # Below are the CLIP embedding tensors from the official HunyuanImage model (seed=1234, prompt: "A brown and white dog is running on the grass").
diff --git a/tests/e2e/online_serving/test_dynin_omni_expansion.py b/tests/e2e/online_serving/test_dynin_omni_expansion.py
index c4f8054947d..de95f72d9d4 100644
--- a/tests/e2e/online_serving/test_dynin_omni_expansion.py
+++ b/tests/e2e/online_serving/test_dynin_omni_expansion.py
@@ -145,7 +145,6 @@ def test_send_t2i_request_001(omni_server, openai_client) -> None:
     openai_client.send_diffusion_request(request_config)
 
 
-@pytest.mark.core_model
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"})
 @pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True)
 def test_send_t2s_request_001(omni_server, dynin_t2s_openai_client) -> None:
diff --git a/tests/engine/test_output_modality.py b/tests/engine/test_output_modality.py
index afb22ec27c8..7a9c765028f 100644
--- a/tests/engine/test_output_modality.py
+++ b/tests/engine/test_output_modality.py
@@ -11,10 +11,8 @@
 import pytest
 import torch
 
-pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
-
-
 # ── Load modules without triggering vllm_omni.__init__ ─────────────
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 _ENGINE_DIR = Path(__file__).resolve().parents[2] / "vllm_omni" / "engine"
 
diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py
index 8e31ce43ce1..607b3eaa813 100644
--- a/tests/entrypoints/openai_api/test_image_server.py
+++ b/tests/entrypoints/openai_api/test_image_server.py
@@ -26,7 +26,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-
 # Unit Tests
 
 
diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py
index edf6147755d..b388b18606b 100644
--- a/tests/entrypoints/openai_api/test_serving_speech.py
+++ b/tests/entrypoints/openai_api/test_serving_speech.py
@@ -38,7 +38,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-
 logger = logging.getLogger(__name__)
 
 
diff --git a/tests/entrypoints/test_pd_disaggregation.py b/tests/entrypoints/test_pd_disaggregation.py
index 28f014923f6..5ffabfbf2af 100644
--- a/tests/entrypoints/test_pd_disaggregation.py
+++ b/tests/entrypoints/test_pd_disaggregation.py
@@ -24,7 +24,6 @@
 
 pytestmark = pytest.mark.skip(reason="Temporarily skip PD entrypoint tests while PD config is being removed.")
 
-
 # Suppress noisy DeprecationWarnings from optional Swig bindings imported by vLLM dependencies.
 warnings.filterwarnings(
     "ignore",
diff --git a/tests/examples/offline_inference/test_text_to_image.py b/tests/examples/offline_inference/test_text_to_image.py
index 6e736176848..041c32dc4ef 100644
--- a/tests/examples/offline_inference/test_text_to_image.py
+++ b/tests/examples/offline_inference/test_text_to_image.py
@@ -9,8 +9,10 @@
 
 from tests.examples.helpers import EXAMPLES, ExampleRunner, ReadmeSnippet
 from tests.helpers.assertions import assert_image_valid
+from tests.helpers.mark import hardware_marks
+
+pytestmark = [pytest.mark.full_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})]
 
-pytestmark = [pytest.mark.full_model, pytest.mark.example]
 
 T2I_SCRIPT = EXAMPLES / "offline_inference" / "text_to_image" / "text_to_image.py"
 README_PATH = T2I_SCRIPT.with_name("README.md")
diff --git a/tests/examples/online_serving/test_text_to_image.py b/tests/examples/online_serving/test_text_to_image.py
index 29005afff30..ee0a1fedba7 100644
--- a/tests/examples/online_serving/test_text_to_image.py
+++ b/tests/examples/online_serving/test_text_to_image.py
@@ -15,9 +15,10 @@
 
 from tests.examples.helpers import EXAMPLES, OUTPUT_DIR, run_cmd, write_zimage_lora
 from tests.helpers.assertions import assert_image_valid
+from tests.helpers.mark import hardware_marks
 from tests.helpers.runtime import OmniServer, OmniServerParams
 
-pytestmark = [pytest.mark.full_model, pytest.mark.example]
+pytestmark = [pytest.mark.full_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})]
 
 T2I_ONLINE_CLIENT = EXAMPLES / "online_serving" / "text_to_image" / "openai_chat_client.py"
 EXAMPLE_OUTPUT_SUBFOLDER = "example_online_t2i"
diff --git a/tests/examples/test_slerp_interpolation.py b/tests/examples/test_slerp_interpolation.py
index c99c7037a42..fe7db74ccf6 100644
--- a/tests/examples/test_slerp_interpolation.py
+++ b/tests/examples/test_slerp_interpolation.py
@@ -8,13 +8,13 @@
 import numpy as np
 import pytest
 
-pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
-
 # Import the slerp function from the example script.
 _examples_dir = str(Path(__file__).parent.parent.parent / "examples" / "online_serving" / "qwen3_tts")
 sys.path.insert(0, _examples_dir)
 from speaker_embedding_interpolation import slerp  # noqa: E402
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 class TestSlerp:
     def test_endpoints(self):
diff --git a/tests/model_executor/models/mimo_audio/test_mimo_audio_code2wav_batch_decode.py b/tests/model_executor/models/mimo_audio/test_mimo_audio_code2wav_batch_decode.py
index 2795534920b..8858d1f8f16 100644
--- a/tests/model_executor/models/mimo_audio/test_mimo_audio_code2wav_batch_decode.py
+++ b/tests/model_executor/models/mimo_audio/test_mimo_audio_code2wav_batch_decode.py
@@ -16,7 +16,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-
 _GROUP = 4
 _AC = 8
 _GROUP_WIDTH = flat_codec_group_element_count(_GROUP, _AC)
diff --git a/tests/model_executor/models/qwen2_5_omni/test_qwen2_5_omni_embed.py b/tests/model_executor/models/qwen2_5_omni/test_qwen2_5_omni_embed.py
index 9e6d5f644c7..587e7f7f8b1 100644
--- a/tests/model_executor/models/qwen2_5_omni/test_qwen2_5_omni_embed.py
+++ b/tests/model_executor/models/qwen2_5_omni/test_qwen2_5_omni_embed.py
@@ -20,7 +20,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-
 # Fake token IDs
 AUDIO_TOKEN_ID = 1001
 IMAGE_TOKEN_ID = 1002
diff --git a/tests/model_executor/models/qwen3_tts/test_cuda_graph_decoder.py b/tests/model_executor/models/qwen3_tts/test_cuda_graph_decoder.py
index c8f63475ae3..86af757809d 100644
--- a/tests/model_executor/models/qwen3_tts/test_cuda_graph_decoder.py
+++ b/tests/model_executor/models/qwen3_tts/test_cuda_graph_decoder.py
@@ -22,7 +22,6 @@
 
 pytestmark = [pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")]
 
-
 DEVICE = torch.device("cuda:0")
 NUM_QUANTIZERS = 8
 TOTAL_UPSAMPLE = 4
diff --git a/tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py b/tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py
index 3759d0c3d96..847adae06fa 100644
--- a/tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py
+++ b/tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py
@@ -26,7 +26,6 @@
     pytest.mark.L4,
 ]
 
-
 DEVICE = torch.device("cuda:0")
 HIDDEN_DIM = 64
 N_ACOUSTIC_CODEBOOK = 7
diff --git a/tests/model_executor/models/voxtral_tts/test_text_preprocess.py b/tests/model_executor/models/voxtral_tts/test_text_preprocess.py
index 58244a58187..2e7b7109121 100644
--- a/tests/model_executor/models/voxtral_tts/test_text_preprocess.py
+++ b/tests/model_executor/models/voxtral_tts/test_text_preprocess.py
@@ -12,13 +12,12 @@
 
 import pytest
 
-pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
-
-
 _HELPER_PATH = (
     Path(__file__).resolve().parents[4] / "examples" / "online_serving" / "voxtral_tts" / "text_preprocess.py"
 )
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 def _load_helper():
     spec = importlib.util.spec_from_file_location("voxtral_tts_text_preprocess_demo", _HELPER_PATH)
diff --git a/tests/model_executor/stage_input_processors/test_qwen3_tts_async_chunk.py b/tests/model_executor/stage_input_processors/test_qwen3_tts_async_chunk.py
index c11ce7d7431..95ee229298d 100644
--- a/tests/model_executor/stage_input_processors/test_qwen3_tts_async_chunk.py
+++ b/tests/model_executor/stage_input_processors/test_qwen3_tts_async_chunk.py
@@ -18,7 +18,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-
 _FRAME = [1, 2, 3, 4]
 _Q = len(_FRAME)
 
diff --git a/tests/test_diffusion_config_fields.py b/tests/test_diffusion_config_fields.py
index 912193481cb..b87ceec1df6 100644
--- a/tests/test_diffusion_config_fields.py
+++ b/tests/test_diffusion_config_fields.py
@@ -13,7 +13,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-
 try:
     from vllm_omni.diffusion.data import OmniDiffusionConfig
 except Exception:

From f394f01e4f9ce35fd60d916dbdacc70a2dfda2d5 Mon Sep 17 00:00:00 2001
From: wangyu <410167048@qq.com>
Date: Mon, 20 Apr 2026 21:23:58 +0800
Subject: [PATCH 3/9] Remove blank lines and restore cache backend unit tests

Signed-off-by: wangyu <410167048@qq.com>
---
 tests/diffusion/cache/test_cache_backends.py  | 272 ++++++++++++++++--
 tests/diffusion/cache/test_cache_dit.py       |   1 -
 .../distributed/test_sp_plan_hooks.py         |  13 +-
 .../flux/test_flux_prefix_propagation.py      |   1 -
 .../test_omni_coord_client_for_stage.py       |   3 -
 .../test_audio_tokenizer_parsing.py           |   1 -
 6 files changed, 257 insertions(+), 34 deletions(-)

diff --git a/tests/diffusion/cache/test_cache_backends.py b/tests/diffusion/cache/test_cache_backends.py
index 0b7ef723585..ecb99394592 100644
--- a/tests/diffusion/cache/test_cache_backends.py
+++ b/tests/diffusion/cache/test_cache_backends.py
@@ -2,39 +2,269 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 """
-Model specific tests for CacheDiT enablement.
+Unit tests for cache backends (cache-dit and teacache).
+
+This module tests the cache backend implementations:
+- CacheDiTBackend: cache-dit acceleration backend
+- TeaCacheBackend: TeaCache hook-based backend
+- Cache selector function: get_cache_backend
+- DiffusionCacheConfig: configuration dataclass
 """
 
 from unittest.mock import Mock, patch
 
 import pytest
 
-import vllm_omni.diffusion.cache.cache_dit_backend as cd_backend
+from vllm_omni.diffusion.cache.cache_dit_backend import (
+    CUSTOM_DIT_ENABLERS,
+    CacheDiTBackend,
+)
+from vllm_omni.diffusion.cache.selector import get_cache_backend
+from vllm_omni.diffusion.cache.teacache.backend import TeaCacheBackend
 from vllm_omni.diffusion.data import DiffusionCacheConfig
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-SEPARATE_CFG_ENABLERS = [
-    cd_backend.enable_cache_for_ltx2,
-    cd_backend.enable_cache_for_wan22,
-    cd_backend.enable_cache_for_longcat_image,
-]
 
-SAMPLE_CACHE_CONFIG = DiffusionCacheConfig()
+class TestCacheDiTBackend:
+    """Test CacheDiTBackend implementation."""
+
+    def test_init_with_dict(self):
+        """Test initialization with dictionary config."""
+        config_dict = {"Fn_compute_blocks": 4, "max_warmup_steps": 8}
+        backend = CacheDiTBackend(config_dict)
+        assert backend.config.Fn_compute_blocks == 4
+        assert backend.config.max_warmup_steps == 8
+        assert backend.enabled is False
+
+    def test_init_with_config_object(self):
+        """Test initialization with DiffusionCacheConfig object."""
+        config = DiffusionCacheConfig(Fn_compute_blocks=4)
+        backend = CacheDiTBackend(config)
+        assert backend.config.Fn_compute_blocks == 4
+        assert backend.enabled is False
+
+    @patch("vllm_omni.diffusion.cache.cache_dit_backend.cache_dit")
+    def test_enable_single_transformer(self, mock_cache_dit):
+        """Test enabling cache-dit on single-transformer pipeline."""
+        # Mock pipeline
+        mock_pipeline = Mock()
+        mock_pipeline.__class__.__name__ = "DiTPipeline"
+        mock_transformer = Mock()
+        mock_pipeline.transformer = mock_transformer
+
+        # Mock cache_dit functions
+        mock_cache_dit.enable_cache = Mock()
+        mock_cache_dit.refresh_context = Mock()
+
+        backend = CacheDiTBackend({"Fn_compute_blocks": 2})
+        backend.enable(mock_pipeline)
+
+        # Verify cache-dit was enabled
+        assert backend.enabled is True
+        assert backend._refresh_func is not None
+        mock_cache_dit.enable_cache.assert_called_once()
+
+    @patch("vllm_omni.diffusion.cache.cache_dit_backend.cache_dit")
+    def test_refresh(self, mock_cache_dit):
+        """Test refreshing cache context with SCM mask policy updates when num_inference_steps changes."""
+        # Mock pipeline
+        mock_pipeline = Mock()
+        mock_pipeline.__class__.__name__ = "DiTPipeline"
+        mock_transformer = Mock()
+        mock_pipeline.transformer = mock_transformer
+
+        # Mock cache_dit functions
+        mock_cache_dit.enable_cache = Mock()
+        mock_cache_dit.refresh_context = Mock()
+        mock_steps_mask_50 = [1, 0, 1, 0, 1] * 10  # Mock mask for 50 steps
+        mock_steps_mask_100 = [1, 0, 1, 0, 1] * 20  # Mock mask for 100 steps
+        mock_cache_dit.steps_mask = Mock(side_effect=[mock_steps_mask_50, mock_steps_mask_100])
+
+        # Enable cache-dit with SCM enabled (using mask policy)
+        config = DiffusionCacheConfig(
+            scm_steps_mask_policy="fast",
+            scm_steps_policy="dynamic",
+        )
+        backend = CacheDiTBackend(config)
+        backend.enable(mock_pipeline)
+
+        # First refresh with 50 steps
+        backend.refresh(mock_pipeline, num_inference_steps=50)
+        assert backend._last_num_inference_steps == 50
+
+        # Verify steps_mask was called with mask policy (not direct steps mask)
+        mock_cache_dit.steps_mask.assert_called_with(mask_policy="fast", total_steps=50)
+        assert mock_cache_dit.steps_mask.call_count == 1
+
+        # Verify refresh_context was called with cache_config (SCM path)
+        mock_cache_dit.refresh_context.assert_called_once()
+        call_args = mock_cache_dit.refresh_context.call_args
+        assert call_args[0][0] == mock_transformer
+        # Check that cache_config was passed (not num_inference_steps directly when SCM is enabled)
+        assert "cache_config" in call_args[1]
+        cache_config_arg = call_args[1]["cache_config"]
+        assert cache_config_arg is not None
+
+        # Change num_inference_steps and refresh again
+        mock_cache_dit.refresh_context.reset_mock()
+        backend.refresh(mock_pipeline, num_inference_steps=100)
+
+        # Verify steps_mask was called again with new num_inference_steps (using mask policy)
+        assert mock_cache_dit.steps_mask.call_count == 2
+        # Check the last call was with 100 steps and mask policy
+        assert mock_cache_dit.steps_mask.call_args_list[-1].kwargs["total_steps"] == 100
+        assert mock_cache_dit.steps_mask.call_args_list[-1].kwargs["mask_policy"] == "fast"
+
+        # Verify refresh_context was called again with updated mask
+        mock_cache_dit.refresh_context.assert_called_once()
+        call_args = mock_cache_dit.refresh_context.call_args
+        assert call_args[0][0] == mock_transformer
+        assert "cache_config" in call_args[1]
+        assert backend._last_num_inference_steps == 100
+
+    def test_hunyuan_custom_enabler_registered(self):
+        """Test HunyuanImage3 custom cache-dit enabler is registered."""
+        assert "HunyuanImage3Pipeline" in CUSTOM_DIT_ENABLERS
+
+    @patch("vllm_omni.diffusion.cache.cache_dit_backend.BlockAdapter")
+    @patch("vllm_omni.diffusion.cache.cache_dit_backend.cache_dit")
+    def test_enable_hunyuan_pipeline_uses_model_transformer(self, mock_cache_dit, mock_block_adapter):
+        """Test HunyuanImage3 custom enabler uses pipeline.model for cache enable/refresh."""
+        mock_pipeline = Mock()
+        mock_pipeline.__class__.__name__ = "HunyuanImage3Pipeline"
+        mock_pipeline.model = Mock()
+        mock_pipeline.model.layers = Mock()
+
+        mock_cache_dit.enable_cache = Mock()
+        mock_cache_dit.refresh_context = Mock()
+
+        backend = CacheDiTBackend({"Fn_compute_blocks": 2})
+        backend.enable(mock_pipeline)
+
+        assert backend.enabled is True
+        assert backend._refresh_func is not None
+        mock_block_adapter.assert_called_once()
+        adapter_kwargs = mock_block_adapter.call_args.kwargs
+        assert adapter_kwargs["transformer"] is mock_pipeline.model
+        assert adapter_kwargs["blocks"] is mock_pipeline.model.layers
+        assert adapter_kwargs["forward_pattern"] == adapter_kwargs["forward_pattern"].__class__.Pattern_4
+        assert len(adapter_kwargs["params_modifiers"]) == 1
+        mock_cache_dit.enable_cache.assert_called_once()
+
+        backend.refresh(mock_pipeline, num_inference_steps=12)
+        mock_cache_dit.refresh_context.assert_called_once()
+        call_args = mock_cache_dit.refresh_context.call_args
+        assert call_args[0][0] is mock_pipeline.model
+        assert call_args[1]["num_inference_steps"] == 12
+
+    def test_enable_hunyuan_pipeline_requires_model_layers(self):
+        """Test HunyuanImage3 enabler fails with a formatted pipeline class name."""
+        mock_pipeline = Mock()
+        mock_pipeline.__class__.__name__ = "HunyuanImage3Pipeline"
+        mock_pipeline.model = Mock(spec=[])
+
+        backend = CacheDiTBackend({"Fn_compute_blocks": 2})
+
+        with pytest.raises(ValueError, match="HunyuanImage3Pipeline"):
+            backend.enable(mock_pipeline)
+
+
+class TestTeaCacheBackend:
+    """Test TeaCacheBackend implementation."""
+
+    def test_init(self):
+        """Test initialization."""
+        config = DiffusionCacheConfig(rel_l1_thresh=0.3)
+        backend = TeaCacheBackend(config)
+        assert backend.config.rel_l1_thresh == 0.3
+        assert backend.enabled is False
+
+    @patch("vllm_omni.diffusion.cache.teacache.backend.apply_teacache_hook")
+    def test_enable(self, mock_apply_hook):
+        """Test enabling TeaCache on pipeline."""
+        # Mock pipeline
+        mock_pipeline = Mock()
+        mock_pipeline.__class__.__name__ = "QwenImagePipeline"
+        mock_transformer = Mock()
+        mock_transformer.__class__.__name__ = "QwenImageTransformer2DModel"
+        mock_pipeline.transformer = mock_transformer
+
+        config = DiffusionCacheConfig(rel_l1_thresh=0.3)
+        backend = TeaCacheBackend(config)
+        backend.enable(mock_pipeline)
+
+        # Verify hook was applied
+        assert backend.enabled is True
+        mock_apply_hook.assert_called_once()
+
+    @patch("vllm_omni.diffusion.cache.teacache.backend.apply_teacache_hook")
+    def test_enable_with_coefficients(self, mock_apply_hook):
+        """Test enabling TeaCache with custom coefficients."""
+        mock_pipeline = Mock()
+        mock_pipeline.__class__.__name__ = "QwenImagePipeline"
+        mock_transformer = Mock()
+        mock_transformer.__class__.__name__ = "QwenImageTransformer2DModel"
+        mock_pipeline.transformer = mock_transformer
+
+        config = DiffusionCacheConfig(rel_l1_thresh=0.3, coefficients=[1.0, 0.5, 0.2, 0.1, 0.05])
+        backend = TeaCacheBackend(config)
+        backend.enable(mock_pipeline)
+
+        assert backend.enabled is True
+        mock_apply_hook.assert_called_once()
+
+    @patch("vllm_omni.diffusion.cache.teacache.backend.apply_teacache_hook")
+    def test_refresh(self, mock_apply_hook):
+        """Test refreshing TeaCache state."""
+        mock_pipeline = Mock()
+        mock_pipeline.__class__.__name__ = "QwenImagePipeline"
+        mock_transformer = Mock()
+        mock_transformer.__class__.__name__ = "QwenImageTransformer2DModel"
+        mock_pipeline.transformer = mock_transformer
+
+        # Mock hook registry
+        mock_hook = Mock()
+        mock_registry = Mock()
+        mock_registry.get_hook = Mock(return_value=mock_hook)
+        mock_registry.reset_hook = Mock()
+        mock_transformer._hook_registry = mock_registry
+
+        config = DiffusionCacheConfig()
+        backend = TeaCacheBackend(config)
+        backend.enable(mock_pipeline)
+
+        # Test refresh
+        backend.refresh(mock_pipeline, num_inference_steps=50)
+        mock_registry.reset_hook.assert_called_once()
+
+
+class TestCacheSelector:
+    """Test cache backend selector function."""
+
+    def test_get_cache_backend_none(self):
+        """Test getting None backend."""
+        backend = get_cache_backend(None, None)
+        assert backend is None
 
+        backend = get_cache_backend("none", None)
+        assert backend is None
 
-@pytest.mark.parametrize("enabler", SEPARATE_CFG_ENABLERS)
-@patch("vllm_omni.diffusion.cache.cache_dit_backend.BlockAdapter")
-@patch("vllm_omni.diffusion.cache.cache_dit_backend.cache_dit")
-def test_separate_cfg(mock_cache_dit, mock_block_adapter, enabler):
-    """Ensure that custom enablers for models with separate CFG pass
-    the param through to cache_dit correctly.
+    def test_get_cache_backend_cache_dit(self):
+        """Test getting cache-dit backend."""
+        config_dict = {"Fn_compute_blocks": 4}
+        backend = get_cache_backend("cache_dit", config_dict)
+        assert isinstance(backend, CacheDiTBackend)
+        assert backend.config.Fn_compute_blocks == 4
 
-    Regression test for: https://github.com/vllm-project/vllm-omni/pull/2860
-    """
-    mock_pipeline = Mock()
-    enabler(mock_pipeline, SAMPLE_CACHE_CONFIG)
+    def test_get_cache_backend_tea_cache(self):
+        """Test getting teacache backend."""
+        config_dict = {"rel_l1_thresh": 0.3}
+        backend = get_cache_backend("tea_cache", config_dict)
+        assert isinstance(backend, TeaCacheBackend)
+        assert backend.config.rel_l1_thresh == 0.3
 
-    mock_cache_dit.enable_cache.assert_called_once()
-    adapter_kwargs = mock_block_adapter.call_args.kwargs
-    assert adapter_kwargs["has_separate_cfg"] is True
+    def test_get_cache_backend_invalid(self):
+        """Test getting invalid backend raises error."""
+        with pytest.raises(ValueError, match="Unsupported cache backend"):
+            get_cache_backend("invalid_backend", {})
diff --git a/tests/diffusion/cache/test_cache_dit.py b/tests/diffusion/cache/test_cache_dit.py
index 68e8bdea77b..0b7ef723585 100644
--- a/tests/diffusion/cache/test_cache_dit.py
+++ b/tests/diffusion/cache/test_cache_dit.py
@@ -14,7 +14,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-
 SEPARATE_CFG_ENABLERS = [
     cd_backend.enable_cache_for_ltx2,
     cd_backend.enable_cache_for_wan22,
diff --git a/tests/diffusion/distributed/test_sp_plan_hooks.py b/tests/diffusion/distributed/test_sp_plan_hooks.py
index 78b8b3679db..f95dbfe2b92 100644
--- a/tests/diffusion/distributed/test_sp_plan_hooks.py
+++ b/tests/diffusion/distributed/test_sp_plan_hooks.py
@@ -25,13 +25,6 @@
     validate_sp_plan,
 )
 
-pytestmark = [
-    pytest.mark.diffusion,
-    pytest.mark.parallel,
-    pytest.mark.core_model,
-    pytest.mark.cpu,
-]
-
 
 def is_distributed_initialized() -> bool:
     """Check if distributed environment is initialized."""
@@ -51,6 +44,12 @@ def is_distributed_initialized() -> bool:
 )
 
 # Module-level markers: these tests are diffusion + parallel related
+pytestmark = [
+    pytest.mark.diffusion,
+    pytest.mark.parallel,
+    pytest.mark.core_model,
+    pytest.mark.cpu,
+]
 
 # =============================================================================
 # Tests for sp_plan.py
diff --git a/tests/diffusion/models/flux/test_flux_prefix_propagation.py b/tests/diffusion/models/flux/test_flux_prefix_propagation.py
index 2441702384a..b51fc3384fe 100644
--- a/tests/diffusion/models/flux/test_flux_prefix_propagation.py
+++ b/tests/diffusion/models/flux/test_flux_prefix_propagation.py
@@ -15,7 +15,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]
 
-
 # Standard dimensions for a minimal FLUX block
 _DIM = 64
 _HEADS = 2
diff --git a/tests/distributed/omni_coordinator/test_omni_coord_client_for_stage.py b/tests/distributed/omni_coordinator/test_omni_coord_client_for_stage.py
index 156265da087..0ba19c7fff7 100644
--- a/tests/distributed/omni_coordinator/test_omni_coord_client_for_stage.py
+++ b/tests/distributed/omni_coordinator/test_omni_coord_client_for_stage.py
@@ -18,9 +18,6 @@
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
-pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
-
-
 def _bind_router() -> tuple[zmq.Context, zmq.Socket, str]:
     ctx = zmq.Context.instance()
     router = ctx.socket(zmq.ROUTER)
diff --git a/tests/model_executor/models/voxtral_tts/test_audio_tokenizer_parsing.py b/tests/model_executor/models/voxtral_tts/test_audio_tokenizer_parsing.py
index 2fc4b6f0544..5a560d95a1d 100644
--- a/tests/model_executor/models/voxtral_tts/test_audio_tokenizer_parsing.py
+++ b/tests/model_executor/models/voxtral_tts/test_audio_tokenizer_parsing.py
@@ -18,7 +18,6 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
-
 NUM_CODEBOOKS = 37
 
 

From 1e9ea0fdda5627fb6aabc01d131f932392bf3265 Mon Sep 17 00:00:00 2001
From: wangyu <410167048@qq.com>
Date: Tue, 21 Apr 2026 09:19:51 +0800
Subject: [PATCH 4/9] Update test-nightly.yml to use broader test directory and
 ignore accuracy tests; enhance run_args.py to include 'full_model' in
 run-level options.

Signed-off-by: wangyu <410167048@qq.com>
---
 .buildkite/test-nightly.yml        | 8 ++++----
 tests/helpers/fixtures/run_args.py | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index b37a9efc76d..e6912bd49c7 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -13,7 +13,7 @@ steps:
       - label: ":full_moon: Omni · Function Test"
         timeout_in_minutes: 90
         commands:
-          - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "full_model and H100 and omni" --run-level "full_model"
+          - pytest -s -v tests/e2e/ -m "full_model and H100 and omni" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -160,7 +160,7 @@ steps:
         timeout_in_minutes: 90
         commands:
           - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-          - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "full_model and L4 and omni" --run-level "full_model"
+          - pytest -s -v tests/e2e/ -m "full_model and L4 and omni" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "gpu_1_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:
@@ -233,7 +233,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100"
         timeout_in_minutes: 120
         commands:
-          - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model"
+          - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "mithril-h100-pool"
         plugins:
@@ -271,7 +271,7 @@ steps:
       - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4"
         timeout_in_minutes: 60
         commands:
-          - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model"
+          - pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy
         agents:
           queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
         plugins:
diff --git a/tests/helpers/fixtures/run_args.py b/tests/helpers/fixtures/run_args.py
index b18a64b9810..975584d206b 100644
--- a/tests/helpers/fixtures/run_args.py
+++ b/tests/helpers/fixtures/run_args.py
@@ -6,8 +6,8 @@ def pytest_addoption(parser):
         "--run-level",
         action="store",
         default="core_model",
-        choices=["core_model", "advanced_model"],
-        help="Test level to run: L2, L3",
+        choices=["core_model", "advanced_model", "full_model"],
+        help="Test level to run: L2, L3, L4",
     )
 
 

From 999e849e7a3561d01bce27b99e07a3e0ec0c4388 Mon Sep 17 00:00:00 2001
From: wangyu <410167048@qq.com>
Date: Tue, 21 Apr 2026 09:35:53 +0800
Subject: [PATCH 5/9] Remove redundant pytest.mark.core_model decorators from
 test_qwen3_tts_base_expansion.py to streamline test definitions.

Signed-off-by: wangyu <410167048@qq.com>
---
 tests/e2e/online_serving/test_qwen3_tts_base_expansion.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
index d89961c5f01..d86f96af099 100644
--- a/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_tts_base_expansion.py
@@ -63,7 +63,6 @@ def get_max_batch_size(size_type="few"):
 ]
 
 
-@pytest.mark.core_model
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
 @pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
 def test_voice_clone_streaming_001(omni_server, openai_client) -> None:
@@ -89,7 +88,6 @@ def test_voice_clone_streaming_001(omni_server, openai_client) -> None:
     openai_client.send_audio_speech_request(request_config, request_num=get_max_batch_size("few"))
 
 
-@pytest.mark.core_model
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
 @pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
 def test_response_format_001(omni_server, openai_client) -> None:

From 9a867a758e7fe839f8e38cee2b1f47ff0e811bc4 Mon Sep 17 00:00:00 2001
From: wangyu <410167048@qq.com>
Date: Tue, 21 Apr 2026 10:13:38 +0800
Subject: [PATCH 6/9] Remove Audio Generation Model Test step from
 test-ready.yml to streamline CI pipeline.

Signed-off-by: wangyu <410167048@qq.com>
---
 .buildkite/test-ready.yml | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml
index 3ca1747fe64..080f18885ef 100644
--- a/.buildkite/test-ready.yml
+++ b/.buildkite/test-ready.yml
@@ -120,23 +120,6 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
-  - label: "Audio Generation Model Test"
-    depends_on: upload-ready-pipeline
-    commands:
-      - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model
-    agents:
-      queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
-    plugins:
-      - docker#v5.2.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          always-pull: true
-          propagate-environment: true
-          environment:
-            - "HF_HOME=/fsx/hf_cache"
-            - "HF_TOKEN"
-          volumes:
-            - "/fsx/hf_cache:/fsx/hf_cache"
-
   - label: "Diffusion Cache Backend Test"
     depends_on: upload-ready-pipeline
     commands:

From cf6d5af0bea18970d59cb5ea5cf664b93b7b82ab Mon Sep 17 00:00:00 2001
From: wangyu <410167048@qq.com>
Date: Tue, 21 Apr 2026 15:24:11 +0800
Subject: [PATCH 7/9] Refactor audio handling in Dynin-Omni tests: replace T2S
 with T2A prompts, update request configurations, and streamline audio
 transcription process. Adjust pytestmark for diffusion tests.

Signed-off-by: wangyu <410167048@qq.com>
---
 .../test_dynin_omni_expansion.py              |  76 ++-----------
 tests/helpers/assertions.py                   | 100 ++++++++++++++----
 2 files changed, 87 insertions(+), 89 deletions(-)

diff --git a/tests/e2e/online_serving/test_dynin_omni_expansion.py b/tests/e2e/online_serving/test_dynin_omni_expansion.py
index de95f72d9d4..df62eb927e6 100644
--- a/tests/e2e/online_serving/test_dynin_omni_expansion.py
+++ b/tests/e2e/online_serving/test_dynin_omni_expansion.py
@@ -5,31 +5,26 @@
 """
 
 import base64
-import gc
 import os
 from io import BytesIO
 from pathlib import Path
 
-import numpy as np
 import pytest
-import soundfile as sf
 from vllm.assets.image import ImageAsset
 
 from tests.helpers.mark import hardware_test
-from tests.helpers.media import convert_audio_bytes_to_text
 from tests.helpers.runtime import OmniServerParams
 
-pytestmark = [pytest.mark.full_model, pytest.mark.omni]
+pytestmark = [pytest.mark.full_model, pytest.mark.diffusion]
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
 
 MODEL = "snu-aidas/Dynin-Omni"
 STAGE_CONFIG = str(Path(__file__).parent.parent / "stage_configs" / "dynin_omni_ci.yaml")
-_WHISPER_SAMPLE_RATE_HZ = 16_000
 
 T2I_PROMPT = "A high quality detailed living room interior photo."
-T2S_PROMPT = "Please read this sentence naturally: Hello from Dynin-Omni online serving."
+T2A_PROMPT = "Please read this sentence naturally: Hello from Dynin-Omni online serving."
 I2I_PROMPT = "Transform this outdoor nature boardwalk scene into a painting style with vivid colors."
 
 TEST_PARAMS = [OmniServerParams(model=MODEL, stage_config_path=STAGE_CONFIG, stage_init_timeout=600)]
@@ -37,70 +32,11 @@
 _I2I_STAGE_SAMPLING = {"max_tokens": 1, "temperature": 0.0, "top_p": 1.0, "detokenize": False}
 
 
-def _prepare_audio_waveform_for_whisper(audio_data: np.ndarray, samplerate: int) -> np.ndarray:
-    """Normalize decoded audio into a mono 16 kHz float32 waveform for Whisper."""
-    if samplerate <= 0:
-        raise ValueError(f"Invalid audio sample rate: {samplerate}")
-
-    waveform = np.asarray(audio_data, dtype=np.float32)
-    if waveform.ndim == 0:
-        raise ValueError("Audio waveform must have at least one dimension")
-    if waveform.ndim > 1:
-        waveform = np.mean(waveform, axis=1)
-    if waveform.size == 0:
-        raise ValueError("Empty audio waveform")
-
-    if samplerate != _WHISPER_SAMPLE_RATE_HZ:
-        target_num_samples = max(int(round(waveform.shape[0] * _WHISPER_SAMPLE_RATE_HZ / samplerate)), 1)
-        source_positions = np.arange(waveform.shape[0], dtype=np.float64)
-        target_positions = np.linspace(
-            0.0,
-            max(waveform.shape[0] - 1, 0),
-            num=target_num_samples,
-            dtype=np.float64,
-        )
-        waveform = np.interp(target_positions, source_positions, waveform).astype(np.float32)
-
-    return np.ascontiguousarray(np.clip(waveform, -1.0, 1.0), dtype=np.float32)
-
-
-def _convert_audio_bytes_to_text_without_ffmpeg(raw_bytes: bytes) -> str:
-    """Dynin t2s keeps Whisper transcription local to this test module and avoids ffmpeg."""
-    import whisper
-
-    data, samplerate = sf.read(BytesIO(raw_bytes), dtype="float32", always_2d=True)
-    audio_waveform = _prepare_audio_waveform_for_whisper(data, samplerate)
-
-    model = whisper.load_model("small", device="cpu")
-    try:
-        transcript = model.transcribe(
-            audio_waveform,
-            temperature=0.0,
-            word_timestamps=True,
-            condition_on_previous_text=False,
-        )["text"]
-    finally:
-        del model
-        gc.collect()
-
-    return transcript or ""
-
-
-@pytest.fixture
-def dynin_t2s_openai_client(openai_client, monkeypatch):
-    monkeypatch.setattr(
-        convert_audio_bytes_to_text,
-        "convert_audio_bytes_to_text",
-        _convert_audio_bytes_to_text_without_ffmpeg,
-    )
-    return openai_client
-
-
 def _build_t2i_messages(prompt: str) -> list[dict]:
     return [{"role": "user", "content": [{"type": "text", "text": f"<|t2i|> {prompt}"}]}]
 
 
-def _build_t2s_messages(prompt: str) -> list[dict]:
+def _build_t2a_messages(prompt: str) -> list[dict]:
     return [{"role": "user", "content": [{"type": "text", "text": f"<|t2s|> {prompt}"}]}]
 
 
@@ -147,10 +83,10 @@ def test_send_t2i_request_001(omni_server, openai_client) -> None:
 
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"})
 @pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True)
-def test_send_t2s_request_001(omni_server, dynin_t2s_openai_client) -> None:
+def test_send_t2a_request_001(omni_server, openai_client) -> None:
     request_config = {
         "model": omni_server.model,
-        "messages": _build_t2s_messages(T2S_PROMPT),
+        "messages": _build_t2a_messages(T2A_PROMPT),
         "modalities": ["audio"],
     }
-    dynin_t2s_openai_client.send_omni_request(request_config)
+    openai_client.send_diffusion_request(request_config)
diff --git a/tests/helpers/assertions.py b/tests/helpers/assertions.py
index 8f7f9ce4130..71e4631554d 100644
--- a/tests/helpers/assertions.py
+++ b/tests/helpers/assertions.py
@@ -11,10 +11,14 @@
 import soundfile as sf
 from PIL import Image
 
-from tests.helpers.media import cosine_similarity_text
+from tests.helpers.media import (
+    convert_audio_bytes_to_text,
+    cosine_similarity_text,
+)
 
 _GENDER_PIPELINE = None
 _GENDER_PIPELINE_LOCK = threading.Lock()
+_WHISPER_SAMPLE_RATE_HZ = 16_000
 _PCM_SPEECH_SAMPLE_RATE_HZ = 24_000
 _MIN_PCM_SPEECH_HNR_DB = 1.0
 _PRESET_VOICE_GENDER_MAP: dict[str, str] = {
@@ -106,15 +110,86 @@ def assert_video_diffusion_response(
         )
 
 
+def _prepare_audio_waveform_for_whisper(audio_data: np.ndarray, samplerate: int) -> np.ndarray:
+    """Normalize decoded audio into a mono 16 kHz float32 waveform for Whisper."""
+    if samplerate <= 0:
+        raise ValueError(f"Invalid audio sample rate: {samplerate}")
+
+    waveform = np.asarray(audio_data, dtype=np.float32)
+    if waveform.ndim == 0:
+        raise ValueError("Audio waveform must have at least one dimension")
+    if waveform.ndim > 1:
+        waveform = np.mean(waveform, axis=1)
+    if waveform.size == 0:
+        raise ValueError("Empty audio waveform")
+
+    if samplerate != _WHISPER_SAMPLE_RATE_HZ:
+        target_num_samples = max(int(round(waveform.shape[0] * _WHISPER_SAMPLE_RATE_HZ / samplerate)), 1)
+        source_positions = np.arange(waveform.shape[0], dtype=np.float64)
+        target_positions = np.linspace(
+            0.0,
+            max(waveform.shape[0] - 1, 0),
+            num=target_num_samples,
+            dtype=np.float64,
+        )
+        waveform = np.interp(target_positions, source_positions, waveform).astype(np.float32)
+
+    return np.ascontiguousarray(np.clip(waveform, -1.0, 1.0), dtype=np.float32)
+
+
+def _transcribe_audio_bytes_adapted(raw_bytes: bytes) -> str:
+    """Pre-normalize raw bytes then call :func:`convert_audio_bytes_to_text` once."""
+    data, samplerate = sf.read(io.BytesIO(raw_bytes), dtype="float32", always_2d=True)
+    audio_waveform = _prepare_audio_waveform_for_whisper(data, samplerate)
+    wav_buf = io.BytesIO()
+    sf.write(wav_buf, audio_waveform, _WHISPER_SAMPLE_RATE_HZ, format="WAV", subtype="PCM_16")
+    return convert_audio_bytes_to_text(wav_buf.getvalue())
+
+
 def assert_audio_diffusion_response(
     response,
     request_config: dict[str, Any],
     run_level: str = None,
 ) -> None:
     """
-    Validate audio diffusion response.
+    Validate audio from diffusion ``response.audios`` and/or omni ``response.audio_bytes``.
+
+    Transcription adapts raw bytes to mono 16 kHz WAV first, then calls
+    :func:`~tests.helpers.media.convert_audio_bytes_to_text`.
     """
-    raise NotImplementedError("Audio validation is not implemented yet")
+    audios = getattr(response, "audios", None)
+    modalities = request_config.get("modalities", ["text", "audio"])
+
+    if audios is not None:
+        assert len(audios) > 0, "No audio in diffusion response"
+
+    if run_level not in {"advanced_model", "full_model"}:
+        return
+
+    def _validate_transcript(transcript: str) -> None:
+        assert transcript.strip(), "No audio output is generated (empty Whisper transcript)"
+        print(f"audio content is: {transcript}")
+
+        word_types = ["text", "image", "audio", "video"]
+        keywords_dict = request_config.get("key_words", {})
+        if "text" not in modalities:
+            for word_type in word_types:
+                keywords = keywords_dict.get(word_type)
+                if keywords:
+                    audio_lower = transcript.lower()
+                    assert any(str(kw).lower() in audio_lower for kw in keywords), (
+                        "The output does not contain any of the keywords."
+                    )
+
+        if "text" in modalities and "audio" in modalities and getattr(response, "text_content", None) is not None:
+            similarity = cosine_similarity_text(transcript.lower(), response.text_content.lower())
+            assert similarity is not None and similarity > 0.9, "The audio content is not same as the text"
+            print(f"similarity is: {similarity}")
+
+    if "audio" in modalities and audios is not None:
+        transcript = _transcribe_audio_bytes_adapted(audios)
+        _validate_transcript(transcript)
+        return
 
 
 def _maybe_int(value: Any) -> int | None:
@@ -425,27 +500,14 @@ def assert_omni_response(response: Any, request_config: dict[str, Any], run_leve
         # Verify image description
         word_types = ["text", "image", "audio", "video"]
         keywords_dict = request_config.get("key_words", {})
-        for word_type in word_types:
-            keywords = keywords_dict.get(word_type)
-            if "text" in modalities:
+        if "text" in modalities:
+            for word_type in word_types:
+                keywords = keywords_dict.get(word_type)
                 if keywords:
                     text_lower = response.text_content.lower()
                     assert any(str(kw).lower() in text_lower for kw in keywords), (
                         "The output does not contain any of the keywords."
                     )
-            else:
-                if keywords:
-                    audio_lower = response.audio_content.lower()
-                    assert any(str(kw).lower() in audio_lower for kw in keywords), (
-                        "The output does not contain any of the keywords."
-                    )
-
-        # Verify similarity (Whisper transcript vs streamed/detokenized text)
-        if "text" in modalities and "audio" in modalities:
-            assert response.similarity is not None and response.similarity > 0.9, (
-                "The audio content is not same as the text"
-            )
-            print(f"similarity is: {response.similarity}")
 
 
 def assert_audio_speech_response(response: Any, request_config: dict[str, Any], run_level: str) -> None:

From 58014ee029b4fda1e1f05d9803ac71a15ea4211a Mon Sep 17 00:00:00 2001
From: wangyu <410167048@qq.com>
Date: Tue, 21 Apr 2026 18:04:22 +0800
Subject: [PATCH 8/9] Refactor Dynin-Omni test prompts and request handling:
 update T2A to T2S prompts, switch pytestmark from diffusion to omni, and
 check omni audio transcripts against audio_ref_text in assertions.

Signed-off-by: wangyu <410167048@qq.com>
---
 .../test_dynin_omni_expansion.py              |  13 ++-
 tests/helpers/assertions.py                   | 110 +++++-------------
 2 files changed, 37 insertions(+), 86 deletions(-)

diff --git a/tests/e2e/online_serving/test_dynin_omni_expansion.py b/tests/e2e/online_serving/test_dynin_omni_expansion.py
index df62eb927e6..da179dbc802 100644
--- a/tests/e2e/online_serving/test_dynin_omni_expansion.py
+++ b/tests/e2e/online_serving/test_dynin_omni_expansion.py
@@ -15,7 +15,7 @@
 from tests.helpers.mark import hardware_test
 from tests.helpers.runtime import OmniServerParams
 
-pytestmark = [pytest.mark.full_model, pytest.mark.diffusion]
+pytestmark = [pytest.mark.full_model, pytest.mark.omni]
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
@@ -24,7 +24,7 @@
 STAGE_CONFIG = str(Path(__file__).parent.parent / "stage_configs" / "dynin_omni_ci.yaml")
 
 T2I_PROMPT = "A high quality detailed living room interior photo."
-T2A_PROMPT = "Please read this sentence naturally: Hello from Dynin-Omni online serving."
+T2S_PROMPT = "Please read this sentence naturally: Hello from online serving."
 I2I_PROMPT = "Transform this outdoor nature boardwalk scene into a painting style with vivid colors."
 
 TEST_PARAMS = [OmniServerParams(model=MODEL, stage_config_path=STAGE_CONFIG, stage_init_timeout=600)]
@@ -36,7 +36,7 @@ def _build_t2i_messages(prompt: str) -> list[dict]:
     return [{"role": "user", "content": [{"type": "text", "text": f"<|t2i|> {prompt}"}]}]
 
 
-def _build_t2a_messages(prompt: str) -> list[dict]:
+def _build_t2s_messages(prompt: str) -> list[dict]:
     return [{"role": "user", "content": [{"type": "text", "text": f"<|t2s|> {prompt}"}]}]
 
 
@@ -83,10 +83,11 @@ def test_send_t2i_request_001(omni_server, openai_client) -> None:
 
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"})
 @pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True)
-def test_send_t2a_request_001(omni_server, openai_client) -> None:
+def test_send_t2s_request_001(omni_server, openai_client) -> None:
     request_config = {
         "model": omni_server.model,
-        "messages": _build_t2a_messages(T2A_PROMPT),
+        "messages": _build_t2s_messages(T2S_PROMPT),
         "modalities": ["audio"],
+        "audio_ref_text": T2S_PROMPT,
     }
-    openai_client.send_diffusion_request(request_config)
+    openai_client.send_omni_request(request_config)
diff --git a/tests/helpers/assertions.py b/tests/helpers/assertions.py
index 71e4631554d..57dadfff9ea 100644
--- a/tests/helpers/assertions.py
+++ b/tests/helpers/assertions.py
@@ -12,13 +12,11 @@
 from PIL import Image
 
 from tests.helpers.media import (
-    convert_audio_bytes_to_text,
     cosine_similarity_text,
 )
 
 _GENDER_PIPELINE = None
 _GENDER_PIPELINE_LOCK = threading.Lock()
-_WHISPER_SAMPLE_RATE_HZ = 16_000
 _PCM_SPEECH_SAMPLE_RATE_HZ = 24_000
 _MIN_PCM_SPEECH_HNR_DB = 1.0
 _PRESET_VOICE_GENDER_MAP: dict[str, str] = {
@@ -110,86 +108,15 @@ def assert_video_diffusion_response(
         )
 
 
-def _prepare_audio_waveform_for_whisper(audio_data: np.ndarray, samplerate: int) -> np.ndarray:
-    """Normalize decoded audio into a mono 16 kHz float32 waveform for Whisper."""
-    if samplerate <= 0:
-        raise ValueError(f"Invalid audio sample rate: {samplerate}")
-
-    waveform = np.asarray(audio_data, dtype=np.float32)
-    if waveform.ndim == 0:
-        raise ValueError("Audio waveform must have at least one dimension")
-    if waveform.ndim > 1:
-        waveform = np.mean(waveform, axis=1)
-    if waveform.size == 0:
-        raise ValueError("Empty audio waveform")
-
-    if samplerate != _WHISPER_SAMPLE_RATE_HZ:
-        target_num_samples = max(int(round(waveform.shape[0] * _WHISPER_SAMPLE_RATE_HZ / samplerate)), 1)
-        source_positions = np.arange(waveform.shape[0], dtype=np.float64)
-        target_positions = np.linspace(
-            0.0,
-            max(waveform.shape[0] - 1, 0),
-            num=target_num_samples,
-            dtype=np.float64,
-        )
-        waveform = np.interp(target_positions, source_positions, waveform).astype(np.float32)
-
-    return np.ascontiguousarray(np.clip(waveform, -1.0, 1.0), dtype=np.float32)
-
-
-def _transcribe_audio_bytes_adapted(raw_bytes: bytes) -> str:
-    """Pre-normalize raw bytes then call :func:`convert_audio_bytes_to_text` once."""
-    data, samplerate = sf.read(io.BytesIO(raw_bytes), dtype="float32", always_2d=True)
-    audio_waveform = _prepare_audio_waveform_for_whisper(data, samplerate)
-    wav_buf = io.BytesIO()
-    sf.write(wav_buf, audio_waveform, _WHISPER_SAMPLE_RATE_HZ, format="WAV", subtype="PCM_16")
-    return convert_audio_bytes_to_text(wav_buf.getvalue())
-
-
 def assert_audio_diffusion_response(
     response,
     request_config: dict[str, Any],
     run_level: str = None,
 ) -> None:
     """
-    Validate audio from diffusion ``response.audios`` and/or omni ``response.audio_bytes``.
-
-    Transcription adapts raw bytes to mono 16 kHz WAV first, then calls
-    :func:`~tests.helpers.media.convert_audio_bytes_to_text`.
+    Validate audio diffusion response.
     """
-    audios = getattr(response, "audios", None)
-    modalities = request_config.get("modalities", ["text", "audio"])
-
-    if audios is not None:
-        assert len(audios) > 0, "No audio in diffusion response"
-
-    if run_level not in {"advanced_model", "full_model"}:
-        return
-
-    def _validate_transcript(transcript: str) -> None:
-        assert transcript.strip(), "No audio output is generated (empty Whisper transcript)"
-        print(f"audio content is: {transcript}")
-
-        word_types = ["text", "image", "audio", "video"]
-        keywords_dict = request_config.get("key_words", {})
-        if "text" not in modalities:
-            for word_type in word_types:
-                keywords = keywords_dict.get(word_type)
-                if keywords:
-                    audio_lower = transcript.lower()
-                    assert any(str(kw).lower() in audio_lower for kw in keywords), (
-                        "The output does not contain any of the keywords."
-                    )
-
-        if "text" in modalities and "audio" in modalities and getattr(response, "text_content", None) is not None:
-            similarity = cosine_similarity_text(transcript.lower(), response.text_content.lower())
-            assert similarity is not None and similarity > 0.9, "The audio content is not same as the text"
-            print(f"similarity is: {similarity}")
-
-    if "audio" in modalities and audios is not None:
-        transcript = _transcribe_audio_bytes_adapted(audios)
-        _validate_transcript(transcript)
-        return
+    raise NotImplementedError("Audio validation is not implemented yet")
 
 
 def _maybe_int(value: Any) -> int | None:
@@ -483,6 +410,7 @@ def assert_omni_response(response: Any, request_config: dict[str, Any], run_leve
     modalities = request_config.get("modalities", ["text", "audio"])
 
     if run_level in {"advanced_model", "full_model"}:
+        # Verify output success
         if "audio" in modalities:
             assert response.audio_content is not None, "No audio output is generated"
             print(f"audio content is: {response.audio_content}")
@@ -492,22 +420,44 @@ def assert_omni_response(response: Any, request_config: dict[str, Any], run_leve
                     response.audio_bytes,
                     speaker,
                 )
-
         if "text" in modalities:
             assert response.text_content is not None, "No text output is generated"
             print(f"text content is: {response.text_content}")
 
-        # Verify image description
+        # Verify keywords in output
         word_types = ["text", "image", "audio", "video"]
         keywords_dict = request_config.get("key_words", {})
-        if "text" in modalities:
-            for word_type in word_types:
-                keywords = keywords_dict.get(word_type)
+        for word_type in word_types:
+            keywords = keywords_dict.get(word_type)
+            if "text" in modalities:
                 if keywords:
                     text_lower = response.text_content.lower()
                     assert any(str(kw).lower() in text_lower for kw in keywords), (
                         "The output does not contain any of the keywords."
                     )
+            else:
+                if keywords:
+                    audio_lower = response.audio_content.lower()
+                    assert any(str(kw).lower() in audio_lower for kw in keywords), (
+                        "The output does not contain any of the keywords."
+                    )
+
+        # Verify similarity (Whisper transcript vs streamed/detokenized text)
+        if "audio" in modalities:
+            audio_ref_text = request_config.get("audio_ref_text")
+            if "text" in modalities:
+                assert response.similarity is not None and response.similarity > 0.9, (
+                    "The audio content is not same as the text"
+                )
+                print(f"similarity is: {response.similarity}")
+            if audio_ref_text:
+                audio_similarity = cosine_similarity_text(
+                    response.audio_content.lower(),
+                    str(audio_ref_text).lower(),
+                )
+                assert audio_similarity > 0.9, (
+                    f"The audio content does not match reference text: similarity={audio_similarity:.3f}"
+                )
 
 
 def assert_audio_speech_response(response: Any, request_config: dict[str, Any], run_level: str) -> None:

From d8075cbd414cc0dc7caf5b0d74a1369c65e7af2f Mon Sep 17 00:00:00 2001
From: wangyu <410167048@qq.com>
Date: Tue, 21 Apr 2026 21:28:14 +0800
Subject: [PATCH 9/9] Refactor audio response validation in assertions: replace
 similarity attribute with direct cosine similarity calculation, ensuring more
 accurate audio-text comparison. Clean up unused similarity variable in
 runtime handling.

Signed-off-by: wangyu <410167048@qq.com>
---
 tests/helpers/assertions.py | 10 +++++++---
 tests/helpers/runtime.py    | 10 ----------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/tests/helpers/assertions.py b/tests/helpers/assertions.py
index 57dadfff9ea..604b76b62ec 100644
--- a/tests/helpers/assertions.py
+++ b/tests/helpers/assertions.py
@@ -446,10 +446,14 @@ def assert_omni_response(response: Any, request_config: dict[str, Any], run_leve
         if "audio" in modalities:
             audio_ref_text = request_config.get("audio_ref_text")
             if "text" in modalities:
-                assert response.similarity is not None and response.similarity > 0.9, (
-                    "The audio content is not same as the text"
+                transcript = (response.audio_content or "").strip()
+                text_output = (response.text_content or "").strip()
+                similarity = cosine_similarity_text(
+                    transcript.lower(),
+                    text_output.lower(),
                 )
-                print(f"similarity is: {response.similarity}")
+                assert similarity > 0.9, "The audio content is not same as the text"
+                print(f"similarity is: {similarity}")
             if audio_ref_text:
                 audio_similarity = cosine_similarity_text(
                     response.audio_content.lower(),
diff --git a/tests/helpers/runtime.py b/tests/helpers/runtime.py
index 9520aff0c53..190cb66b4af 100644
--- a/tests/helpers/runtime.py
+++ b/tests/helpers/runtime.py
@@ -35,7 +35,6 @@
 from tests.helpers.media import (
     _merge_base64_audio_to_segment,
     convert_audio_bytes_to_text,
-    cosine_similarity_text,
     decode_b64_image,
 )
 from vllm_omni.config.stage_config import resolve_deploy_yaml
@@ -500,7 +499,6 @@ class OmniResponse:
     audio_content: str | None = None
     audio_format: str | None = None
     audio_bytes: bytes | None = None
-    similarity: float | None = None
     e2e_latency: float | None = None
     success: bool = False
     error_message: str | None = None
@@ -542,19 +540,15 @@ def _process_stream_omni_response(self, chat_completion) -> OmniResponse:
                         text_content += content
             result.e2e_latency = time.perf_counter() - start_time
             audio_content = None
-            similarity = None
             if audio_data:
                 merged_seg = _merge_base64_audio_to_segment(audio_data)
                 wav_buf = BytesIO()
                 merged_seg.export(wav_buf, format="wav")
                 result.audio_bytes = wav_buf.getvalue()
                 audio_content = convert_audio_bytes_to_text(result.audio_bytes)
-            if audio_content and text_content:
-                similarity = cosine_similarity_text(audio_content.lower(), text_content.lower())
             result.text_content = text_content
             result.audio_data = audio_data
             result.audio_content = audio_content
-            result.similarity = similarity
             result.success = True
         except Exception as e:
             result.error_message = f"Stream processing error: {str(e)}"
@@ -578,15 +572,11 @@ def _process_non_stream_omni_response(self, chat_completion) -> OmniResponse:
                 result.cached_tokens = details.cached_tokens
             result.e2e_latency = time.perf_counter() - start_time
             audio_content = None
-            similarity = None
             if audio_data:
                 result.audio_bytes = base64.b64decode(audio_data)
                 audio_content = convert_audio_bytes_to_text(result.audio_bytes)
-            if audio_content and text_content:
-                similarity = cosine_similarity_text(audio_content.lower(), text_content.lower())
             result.text_content = text_content
             result.audio_content = audio_content
-            result.similarity = similarity
             result.success = True
         except Exception as e:
             result.error_message = f"Non-stream processing error: {str(e)}"