Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions .buildkite/test-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ steps:
- label: ":full_moon: Omni · Function Test"
timeout_in_minutes: 90
commands:
- pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and H100 and omni" --run-level "advanced_model"
- pytest -s -v tests/e2e/ -m "full_model and H100 and omni" --run-level "full_model" --ignore=tests/e2e/accuracy
agents:
queue: "mithril-h100-pool"
plugins:
Expand Down Expand Up @@ -52,7 +52,7 @@ steps:
timeout_in_minutes: 90
commands:
- export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
- pytest -s -v tests/examples/ -m "advanced_model and omni and L4" --run-level "advanced_model"
- pytest -s -v tests/examples/ -m "full_model and omni and L4" --run-level "full_model"
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
Expand All @@ -70,7 +70,7 @@ steps:
- label: ":full_moon: Omni · Doc Test with H100"
timeout_in_minutes: 90
commands:
- pytest -s -v tests/examples/ -m "advanced_model and omni and H100" --run-level "advanced_model"
- pytest -s -v tests/examples/ -m "full_model and omni and H100" --run-level "full_model"
agents:
queue: "mithril-h100-pool"
plugins:
Expand Down Expand Up @@ -160,7 +160,7 @@ steps:
timeout_in_minutes: 90
commands:
- export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
- pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model"
- pytest -s -v tests/e2e/ -m "full_model and L4 and omni" --run-level "full_model" --ignore=tests/e2e/accuracy
agents:
queue: "gpu_1_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
Expand Down Expand Up @@ -233,7 +233,7 @@ steps:
- label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100"
timeout_in_minutes: 120
commands:
- pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "advanced_model and diffusion and H100" --run-level "advanced_model"
- pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and H100" --run-level "full_model" --ignore=tests/e2e/accuracy
agents:
queue: "mithril-h100-pool"
plugins:
Expand Down Expand Up @@ -271,7 +271,7 @@ steps:
- label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4"
timeout_in_minutes: 60
commands:
- pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "advanced_model and diffusion and L4" --run-level "advanced_model"
- pytest -sv tests/e2e/ -k "not test_wan and not hunyuan" -m "full_model and diffusion and L4" --run-level "full_model" --ignore=tests/e2e/accuracy
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
Expand All @@ -290,7 +290,7 @@ steps:
timeout_in_minutes: 60
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY="1"
- pytest -s -v tests/examples/*/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model"
- pytest -s -v tests/examples/*/test_text_to_image.py -m "full_model and example and H100" --run-level "full_model"
agents:
queue: "mithril-h100-pool"
plugins:
Expand Down Expand Up @@ -328,7 +328,7 @@ steps:
- label: ":full_moon: Diffusion X2I(&A&T) · GEBench Accuracy Test"
timeout_in_minutes: 60
commands:
- pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1
- pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level full_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1
- buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json"
agents:
queue: "mithril-h100-pool"
Expand Down Expand Up @@ -367,7 +367,7 @@ steps:
- label: ":full_moon: Diffusion X2I(&A&T) · GEdit-Bench Accuracy Test"
timeout_in_minutes: 60
commands:
- pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1
- pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level full_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1
- buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv"
- buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json"
agents:
Expand Down Expand Up @@ -409,7 +409,7 @@ steps:
- label: ":full_moon: Diffusion X2I(&A&T) · Accuracy Test"
timeout_in_minutes: 180
commands:
- pytest -s -v tests/e2e/accuracy/test_qwen_image*.py --run-level advanced_model
- pytest -s -v tests/e2e/accuracy/test_qwen_image*.py --run-level full_model
agents:
queue: "mithril-h100-pool"
plugins:
Expand Down Expand Up @@ -514,7 +514,7 @@ steps:
- label: ":full_moon: Diffusion X2V · Function Test"
timeout_in_minutes: 90
commands:
- pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py -m "advanced_model" --run-level "advanced_model"
- pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py -m "full_model" --run-level "full_model"
agents:
queue: "mithril-h100-pool"
plugins:
Expand Down Expand Up @@ -552,7 +552,7 @@ steps:
- label: ":full_moon: Diffusion X2V · Accuracy Test"
timeout_in_minutes: 180
commands:
- pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py -m advanced_model --run-level advanced_model
- pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py -m full_model --run-level full_model
agents:
queue: "mithril-h100-pool"
plugins:
Expand Down
17 changes: 0 additions & 17 deletions .buildkite/test-ready.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,23 +120,6 @@ steps:
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"

- label: "Audio Generation Model Test"
depends_on: upload-ready-pipeline
commands:
- timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
- "HF_TOKEN"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"

- label: "Diffusion Cache Backend Test"
depends_on: upload-ready-pipeline
commands:
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/accuracy/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@ Test guidance:
- Local static/self-checks live in `tests/benchmarks/test_accuracy_bench_utils.py`.
- End-to-end generation/evaluation should be validated in a remote GPU
environment. In the current repo marker system there is `L4` but no `L5`
marker, so benchmark smoke tests should be wired as `advanced_model +
benchmark + L4` when GPU capacity is available.
marker, so benchmark smoke tests should be wired as `full_model +
benchmark + L4` for nightly when GPU capacity is available.
4 changes: 2 additions & 2 deletions benchmarks/accuracy/image_to_image/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,5 +99,5 @@ Notes:
- This flow requires the optional Hugging Face `datasets` package.
- `generate` writes `generation_manifest.json` with local output coverage.
- The current repo marker set exposes `L4` but not `L5`, so if you promote an
end-to-end smoke test into CI, use the existing `advanced_model`, `benchmark`,
and `L4` markers or introduce a new repo-wide marker explicitly first.
end-to-end smoke test into CI, use the `full_model`, `benchmark`,
and `L4` markers for nightly (or `advanced_model` instead of `full_model` for merge), or explicitly introduce a new repo-wide marker first.
10 changes: 6 additions & 4 deletions docs/contributing/ci/CI_5levels.md
Original file line number Diff line number Diff line change
Expand Up @@ -418,13 +418,13 @@ L3 level testing executes after code is merged into the main branch. Its core pu

**Explanation**:

@pytest.mark.advanced_model: Marks the test as L3 or L4 level, indicating that this test case performs deep validation, using real models for performance, integration, and accuracy testing. This forms a "basic-advanced" correspondence with the core_model mark at the L2 level.
@pytest.mark.advanced_model: Marks the test as L3 merge level, indicating deep validation with real models. @pytest.mark.full_model: Marks L4 nightly-only suites (e.g. `test_*_expansion.py`, doc examples).

@pytest.mark.core_model: Marks the test as L1 or L2 level, indicating that this test case validates the basic functionality of the core model. It uses mock weights and only checks if the relevant interface functions correctly.

@pytest.mark.parametrize: A parameterization decorator that allows abstracting test data into parameters, enabling reuse of the same test logic across different data configurations. indirect=True indicates that parameters will be passed to the fixture for processing.

**Notes**: If you believe the test case only needs to execute basic run logic at the PR-level CI, you can mark it only with @pytest.mark.core_model. If you believe it only needs to execute deep validation run logic at the merge or nightly level, you can mark it only with @pytest.mark.advanced_model. If you believe the test case needs to accommodate both basic run and deep validation test logic, you should mark it with both @pytest.mark.core_model and @pytest.mark.advanced_model.
**Notes**: If you believe the test case only needs to execute basic run logic at the PR-level CI, you can mark it only with @pytest.mark.core_model. If you believe it only needs to execute deep validation at merge (L3), use @pytest.mark.advanced_model. For L4 nightly-only expansion and doc-example tests, use @pytest.mark.full_model with `--run-level full_model`. If the test case needs both basic run and deep validation, mark it with @pytest.mark.core_model and the appropriate L3/L4 marker (`advanced_model` and/or `full_model`).

**2.4.2 Test Function Definition and Documentation**

Expand Down Expand Up @@ -516,9 +516,11 @@ L3 level testing executes after code is merged into the main branch. Its core pu

**Single Request**: The comment clearly states this is a single-request completion test. For concurrent testing, it can be extended to multiple requests using request_num = n.

**Implicit Validation**: The `send_omni_request` and `send_diffusion_request` methods internally includes validation logic dynamically selected based on the --run-level parameter: core_model performs basic validation, while advanced_model performs deep validation.
**Implicit Validation**: The `send_omni_request` and `send_diffusion_request` methods internally include validation logic dynamically selected based on the --run-level parameter: core_model performs basic validation, while advanced_model and full_model perform deep validation.

- ***Run Command***: `pytest -s -v /tests/e2e/online_serving/test_{model_name}.py -m advanced_model --run-level=advanced_model`
- ***Run Command (L3 merge)***: `pytest -s -v /tests/e2e/online_serving/test_{model_name}.py -m advanced_model --run-level=advanced_model`

- ***Run Command (L4 nightly expansion)***: `pytest -s -v /tests/e2e/online_serving/test_{model_name}_expansion.py -m full_model --run-level=full_model`

## Chapter 3: L4 Level Testing - Full Functionality, Performance, and Documentation Testing

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Currently all the features are available in online serving mode. Hence, only nee
**Code Style**

- Validation: test that the multimodal output files of your model have the correct shapes. `OpenAIClientHandler.send_diffusion_request` should have taken care of this.
- Test marks: always add `advanced_model` and `diffusion`. Add GPU-related marks if needed. Ref: [Markers for Tests](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/ci/tests_markers/).
- Test marks: always add `full_model` and `diffusion` for L4 nightly `test_*_expansion.py` cases. Add GPU-related marks if needed. Ref: [Markers for Tests](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/ci/tests_markers/).
- To maximize code reuse, you may refer to
- `tests/conftest.py` for `omni_server` (running server in subprocess) and `openai_client` fixtures (sending requests and validating output), `generate_synthetic_image` and `assert_XXX_valid` helper.
- `tests/helpers/mark.py` for `@hardware_test(...)` and `hardware_marks`.
Expand Down
31 changes: 23 additions & 8 deletions docs/contributing/ci/test_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,31 +42,46 @@ Our test scripts use the pytest framework. First, please use `git clone https://
```
The latest test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-ready.yml).

=== "L3 level & L4 level"
=== "L3 level"

```bash
pytest -s -v -m "advanced_model" --run-level=advanced_model
```
If you only want to run L3 test case, you can use:
If you only want to run a specific test case, you can use:
```bash
pytest -s -v test_xxxx.py --run-level=advanced_model
```
If you only want to run specific test cases on a particular platform, you can use:
```bash
pytest -s -v e2e/ --ignore-glob='*expansion.py' -m "advanced_model" --run-level=advanced_model
pytest -s -v -m "advanced_model and distributed_cuda and L4" --run-level=advanced_model
```
The latest L3 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-merge.yml).


=== "L4 level"

```bash
cd tests
pytest -s -v -m "full_model" --run-level=full_model
```
If you only want to run a specific test case, you can use:
```bash
pytest -s -v test_xxxx.py --run-level=advanced_model
pytest -s -v test_xxxx.py --run-level=full_model
```
If you only want to run specific test cases on a particular platform, you can use:
```bash
pytest -s -v -m "core_model and distributed_cuda and L4" --run-level=core_model
pytest -s -v -m "full_model and distributed_cuda and L4" --run-level=full_model
```
Note: To run performance tests (defaults to ``test_qwen_omni.json``; use ``--test-config-file tests/dfx/perf/tests/test_tts.json`` for TTS):
```bash
pytest -s -v tests/dfx/perf/scripts/run_benchmark.py
```
The latest L4 (nightly) test commands use the `full_model` marker and `--run-level full_model` (see [test-nightly.yml](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-nightly.yml) and [test-nightly-diffusion.yml](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-nightly-diffusion.yml)). Example:

The latest L3 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-merge.yml).

The latest L4 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-nightly.yml).
```bash
cd tests
pytest -s -v -m "full_model and omni and H100" --run-level=full_model
```

You can find more information about markers in the documentation: [marker doc](./tests_markers.md)

Expand Down
3 changes: 2 additions & 1 deletion docs/contributing/ci/tests_markers.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ Defined in `pyproject.toml`:
| Marker | Description |
| ------------------ | --------------------------------------------------------- |
| `core_model` | L1&L2 tests (run in each PR) |
| `advanced_model` | L3&L4 level tests (run in each merge or nightly) |
| `advanced_model` | L3 tests (run on each merge to main) |
| `full_model` | L4 tests (run nightly) |
| `diffusion` | Diffusion model tests |
| `omni` | Omni model tests |
| `cache` | Cache backend tests |
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ addopts = [
markers = [
# ci/cd required
"core_model: L1&L2 tests (run in each PR)",
"advanced_model: L3&L4 level tests (run in each merge or nightly)",
"advanced_model: L3 level tests (run on each merge)",
"full_model: L4 level tests (run nightly)",
# function module markers
"diffusion: Diffusion model tests",
"omni: Omni model tests",
Expand Down
5 changes: 3 additions & 2 deletions tests/benchmarks/test_accuracy_bench_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
import pytest
from PIL import Image

pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]


REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
Expand Down Expand Up @@ -37,8 +40,6 @@
summarize_gebench_results,
)

pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]


def test_summarize_gebench_generated_records_groups_by_type():
records = [
Expand Down
4 changes: 4 additions & 0 deletions tests/dfx/perf/scripts/run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
)
from tests.helpers.runtime import OmniServer

pytestmark = [pytest.mark.full_model, pytest.mark.omni]


os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"

Expand Down Expand Up @@ -293,6 +296,7 @@ def assert_result(
print(f"ERROR: Test results exceeded baseline: {metric_name}: {current_value} < {baseline_value}")


@pytest.mark.benchmark
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
@pytest.mark.parametrize("benchmark_params", benchmark_indices, indirect=True)
def test_performance_benchmark(omni_server, benchmark_params):
Expand Down
5 changes: 3 additions & 2 deletions tests/dfx/perf/scripts/run_diffusion_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
import psutil
import pytest

pytestmark = [pytest.mark.diffusion, pytest.mark.full_model]

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
os.environ.setdefault("DIFFUSION_ATTENTION_BACKEND", "FLASH_ATTN")
Expand Down Expand Up @@ -662,8 +664,7 @@ def assert_result(result: dict[str, Any], params: dict[str, Any]) -> None:
# ---------------------------------------------------------------------------
# Test entry point
# ---------------------------------------------------------------------------


@pytest.mark.benchmark
@pytest.mark.parametrize(
"diffusion_server",
server_params,
Expand Down
5 changes: 2 additions & 3 deletions tests/diffusion/models/glm_image/test_glm_image_sp.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

from vllm_omni.diffusion.data import DiffusionParallelConfig

pytestmark = [pytest.mark.core_model, pytest.mark.cpu]


@pytest.fixture(scope="function", autouse=True)
def setup_sp_groups(mocker):
Expand All @@ -24,9 +26,6 @@ def setup_sp_groups(mocker):
yield


pytestmark = [pytest.mark.core_model, pytest.mark.cpu]


def test_glm_image_sp_plan_defined():
"""Test that _sp_plan is properly defined on GlmImageTransformer2DModel."""
from vllm_omni.diffusion.models.glm_image.glm_image_transformer import (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@

pytestmark = [pytest.mark.core_model, pytest.mark.cpu]

pytestmark = [pytest.mark.core_model, pytest.mark.cpu]


def _bind_router() -> tuple[zmq.Context, zmq.Socket, str]:
ctx = zmq.Context.instance()
Expand Down
Loading
Loading