diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index e02d494729c..56feccbd664 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -20,19 +20,7 @@ steps: - label: "Simple Unit Test" depends_on: image-build commands: - - | - pytest -v -s \ - tests/entrypoints/ \ - tests/diffusion/cache/ \ - tests/diffusion/lora/ \ - tests/model_executor/models/qwen2_5_omni/test_audio_length.py \ - tests/worker/ \ - tests/distributed/omni_connectors/test_kv_flow.py \ - --cov=vllm_omni \ - --cov-branch \ - --cov-report=term-missing \ - --cov-report=html \ - --cov-report=xml + - "pytest -v -s -m 'core_model and cpu' --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml" agents: queue: "gpu_1_queue" plugins: @@ -118,7 +106,7 @@ steps: timeout_in_minutes: 15 depends_on: image-build commands: - - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py + - pytest -s -v -m 'core_model and cache and diffusion and not distributed_cuda and L4' agents: queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU plugins: diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index fac5c7268bf..e08562ea6b1 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -44,7 +44,7 @@ steps: - export GPU_ARCHS=gfx942 - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py + - pytest -s -v -m 'core_model and cache and diffusion and not distributed_rocm and MI325' - label: "Diffusion Sequence Parallelism Test" timeout_in_minutes: 20 diff --git a/docs/contributing/ci/tests_markers.md b/docs/contributing/ci/tests_markers.md index bf56914f8da..2afff846028 100644 --- a/docs/contributing/ci/tests_markers.md +++ b/docs/contributing/ci/tests_markers.md @@ -5,33 +5,33 @@ By adding markers before test functions, tests can later be executed 
uniformly b ## Current Markers Defined in `pyproject.toml`: -| Marker | Description | -| ------------------ | ------------------------------------------------------- | -| `core_model` | Core model tests (run in each PR) | -| `diffusion` | Diffusion model tests | -| `omni` | Omni model tests | -| `cache` | Cache backend tests | -| `parallel` | Parallelism/distributed tests | -| `cpu` | Tests that run on CPU | -| `gpu` | Tests that run on GPU (auto-added) | -| `cuda` | Tests that run on CUDA (auto-added) | -| `rocm` | Tests that run on AMD/ROCm (auto-added) | -| `npu` | Tests that run on NPU/Ascend (auto-added) | -| `H100` | Tests that require H100 GPU | -| `L4` | Tests that require L4 GPU | -| `MI325` | Tests that require MI325 GPU (AMD/ROCm) | -| `A2` | Tests that require A2 NPU | -| `A3` | Tests that require A3 NPU | -| `distributed_cuda` | Tests that require multi cards on CUDA platform | -| `distributed_rocm` | Tests that require multi cards on ROCm platform | -| `distributed_npu` | Tests that require multi cards on NPU platform | -| `skipif_cuda` | Skip if the num of CUDA cards is less than the required | -| `skipif_rocm` | Skip if the num of ROCm cards is less than the required | -| `skipif_npu` | Skip if the num of NPU cards is less than the required | -| `slow` | Slow tests (may skip in quick CI) | -| `benchmark` | Benchmark tests | - -For those markers shown as auto-added, they will be added by the `@hardware_test` decorator. 
+| Marker | Description | +| ------------------ | --------------------------------------------------------- | +| `core_model` | Core model tests (run in each PR) | +| `diffusion` | Diffusion model tests | +| `omni` | Omni model tests | +| `cache` | Cache backend tests | +| `parallel` | Parallelism/distributed tests | +| `cpu` | Tests that run on CPU | +| `gpu` | Tests that run on GPU * | +| `cuda` | Tests that run on CUDA * | +| `rocm` | Tests that run on AMD/ROCm * | +| `npu` | Tests that run on NPU/Ascend * | +| `H100` | Tests that require H100 GPU * | +| `L4` | Tests that require L4 GPU * | +| `MI325` | Tests that require MI325 GPU (AMD/ROCm) * | +| `A2` | Tests that require A2 NPU * | +| `A3` | Tests that require A3 NPU * | +| `distributed_cuda` | Tests that require multi cards on CUDA platform * | +| `distributed_rocm` | Tests that require multi cards on ROCm platform * | +| `distributed_npu` | Tests that require multi cards on NPU platform * | +| `skipif_cuda` | Skip if the num of CUDA cards is less than the required * | +| `skipif_rocm` | Skip if the num of ROCm cards is less than the required * | +| `skipif_npu` | Skip if the num of NPU cards is less than the required * | +| `slow` | Slow tests (may skip in quick CI) | +| `benchmark` | Benchmark tests | + +\* Markers followed by an asterisk are added automatically by the `@hardware_test` decorator. ### Example usage for markers @@ -71,10 +71,7 @@ This decorator is intended to make hardware-aware, cross-platform test authoring Support for `skipif_rocm` and `skipif_npu` will be implemented later. -5. **Runs each test in a new process** - Automatically wraps the distributed test with a decorator (`@create_new_process_for_each_test`) to ensure isolation and compatibility with multi-process hardware backends. - -6. **Works with pytest filtering** +5. 
**Works with pytest filtering** Allows tests to be filtered and selected at runtime using standard pytest marker expressions (e.g., `-m "distributed_cuda and L4"`). #### Example usage for decorator @@ -94,7 +91,6 @@ This decorator is intended to make hardware-aware, cross-platform test authoring ``` - `res` must be a dict; supported resources: CUDA (L4/H100), ROCm (MI325), NPU (A2/A3) - `num_cards` can be int (all platforms) or dict (per platform); defaults to 1 when missing -- `hardware_test` automatically applies `@create_new_process_for_each_test` for distributed tests. - Distributed markers (`distributed_cuda`, `distributed_rocm`, `distributed_npu`) are auto-added for multi-card cases - Filtering examples: - CUDA only: `pytest -m "distributed_cuda and L4"` diff --git a/pyproject.toml b/pyproject.toml index 706d0152e1c..483f7625a2e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -175,6 +175,10 @@ markers = [ "slow: Slow tests (may skip in quick CI)", "benchmark: Benchmark tests", ] +filterwarnings = [ + "ignore:.*does not have '__test__' attribute.*:UserWarning", + "ignore:.*does not have '__bases__' attribute.*:UserWarning", +] [tool.typos.default] extend-ignore-identifiers-re = [ diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index 2c624b8e76a..40244eb5726 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -4,6 +4,7 @@ import pytest from tests.conftest import OmniServer +from tests.utils import hardware_test models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_ci.yaml")] @@ -29,6 +30,9 @@ def omni_server(request): print("OmniServer stopped") +@pytest.mark.core_model +@pytest.mark.benchmark +@hardware_test(res={"cuda": "H100"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_bench_serve_chat(omni_server): command = [ diff --git 
a/tests/diffusion/cache/test_cache_backends.py b/tests/diffusion/cache/test_cache_backends.py index ed9301410ca..a9312f4b1ad 100644 --- a/tests/diffusion/cache/test_cache_backends.py +++ b/tests/diffusion/cache/test_cache_backends.py @@ -22,6 +22,8 @@ from vllm_omni.diffusion.cache.teacache.backend import TeaCacheBackend from vllm_omni.diffusion.data import DiffusionCacheConfig +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + class TestCacheDiTBackend: """Test CacheDiTBackend implementation.""" diff --git a/tests/diffusion/lora/test_base_linear.py b/tests/diffusion/lora/test_base_linear.py index 42bdf6526a5..9386d0909d9 100644 --- a/tests/diffusion/lora/test_base_linear.py +++ b/tests/diffusion/lora/test_base_linear.py @@ -5,10 +5,13 @@ from dataclasses import dataclass +import pytest import torch from vllm_omni.diffusion.lora.layers.base_linear import DiffusionBaseLinearLayerWithLoRA +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + @dataclass class _DummyLoRAConfig: diff --git a/tests/diffusion/lora/test_lora_manager.py b/tests/diffusion/lora/test_lora_manager.py index 84fafe3bc9e..e5f1c47003d 100644 --- a/tests/diffusion/lora/test_lora_manager.py +++ b/tests/diffusion/lora/test_lora_manager.py @@ -3,6 +3,7 @@ from __future__ import annotations +import pytest import torch from vllm.lora.lora_weights import LoRALayerWeights from vllm.lora.utils import get_supported_lora_modules @@ -11,6 +12,8 @@ from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager from vllm_omni.lora.request import LoRARequest +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + class _DummyLoRALayer: def __init__(self, n_slices: int, output_slices: tuple[int, ...]): diff --git a/tests/diffusion/test_diffusion_worker.py b/tests/diffusion/test_diffusion_worker.py index 220f210a3d5..6e57355a2e1 100644 --- a/tests/diffusion/test_diffusion_worker.py +++ b/tests/diffusion/test_diffusion_worker.py @@ -17,6 +17,8 @@ from vllm_omni.diffusion.worker.diffusion_worker 
import DiffusionWorker +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] + @pytest.fixture def mock_od_config(): diff --git a/tests/distributed/omni_connectors/test_kv_flow.py b/tests/distributed/omni_connectors/test_kv_flow.py index 8c7ff79ca54..2b35718e7c4 100644 --- a/tests/distributed/omni_connectors/test_kv_flow.py +++ b/tests/distributed/omni_connectors/test_kv_flow.py @@ -1,7 +1,6 @@ import pytest import torch -from tests.utils import hardware_test from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.distributed.omni_connectors.kv_transfer_manager import ( OmniKVCacheConfig, @@ -9,6 +8,8 @@ ) from vllm_omni.inputs.data import OmniDiffusionSamplingParams +pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.cache] + class MockConnector: def __init__(self): @@ -58,11 +59,6 @@ def common_constants(): } -@pytest.mark.cache -@hardware_test( - res={"cuda": "L4"}, - num_cards=2, -) def test_manager_extraction(kv_config, mock_connector, common_constants): """Test extraction and sending logic in OmniKVTransferManager.""" num_layers = common_constants["num_layers"] @@ -109,11 +105,6 @@ def test_manager_extraction(kv_config, mock_connector, common_constants): assert data["layer_blocks"]["key_cache"][0].shape == expected_shape -@pytest.mark.cache -@hardware_test( - res={"cuda": "L4"}, - num_cards=2, -) def test_manager_reception(kv_config, mock_connector, common_constants): """Test reception and injection logic in OmniKVTransferManager.""" num_layers = common_constants["num_layers"] @@ -171,11 +162,6 @@ def test_manager_reception(kv_config, mock_connector, common_constants): assert req.kv_metadata["seq_len"] == seq_len -@pytest.mark.cache -@hardware_test( - res={"cuda": "L4"}, - num_cards=2, -) def test_integration_flow(common_constants): """Simulate extraction -> connector -> reception.""" num_layers = common_constants["num_layers"] @@ -211,7 +197,8 @@ def test_integration_flow(common_constants): 
recv_timeout=1.0, ) receiver_manager = OmniKVTransferManager(receiver_config) - receiver_manager._connector = connector # Share the same mock connector instance + # Share the same mock connector instance + receiver_manager._connector = connector req = OmniDiffusionRequest( prompts=["test_integ"], @@ -228,11 +215,6 @@ def test_integration_flow(common_constants): assert req.kv_metadata["seq_len"] == 10 -@pytest.mark.cache -@hardware_test( - res={"cuda": "L4"}, - num_cards=2, -) def test_manager_extraction_no_connector(kv_config, common_constants): """Test extraction when connector is unavailable (should still return IDs).""" block_size = common_constants["block_size"] diff --git a/tests/e2e/offline_inference/test_cache_dit.py b/tests/e2e/offline_inference/test_cache_dit.py index 281ac48dcb4..65c59c6f7d8 100644 --- a/tests/e2e/offline_inference/test_cache_dit.py +++ b/tests/e2e/offline_inference/test_cache_dit.py @@ -15,6 +15,7 @@ import pytest import torch +from tests.utils import hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams # ruff: noqa: E402 @@ -32,6 +33,10 @@ models = ["riverclouds/qwen_image_random"] +@pytest.mark.core_model +@pytest.mark.diffusion +@pytest.mark.cache +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) @pytest.mark.parametrize("model_name", models) def test_cache_dit(model_name: str): """Test cache-dit backend with diffusion model.""" diff --git a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py index 35e106df81a..e46d7ec968c 100644 --- a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py +++ b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py @@ -5,7 +5,7 @@ import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory -from tests.utils import GPUMemoryMonitor +from tests.utils import GPUMemoryMonitor, hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import 
current_omni_platform @@ -45,6 +45,9 @@ def inference(model_name: str, offload: bool = True): return peak +@pytest.mark.core_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) @pytest.mark.skipif(current_omni_platform.is_npu() or current_omni_platform.is_rocm(), reason="Hardware not supported") @pytest.mark.parametrize("model_name", models) def test_cpu_offload_diffusion_model(model_name: str): diff --git a/tests/e2e/offline_inference/test_qwen2_5_omni.py b/tests/e2e/offline_inference/test_qwen2_5_omni.py index af9d793c1dc..eda2f28b55b 100644 --- a/tests/e2e/offline_inference/test_qwen2_5_omni.py +++ b/tests/e2e/offline_inference/test_qwen2_5_omni.py @@ -13,10 +13,10 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.multimodal.image import convert_image_mode +from tests.utils import create_new_process_for_each_test, hardware_test from vllm_omni.platforms import current_omni_platform from .conftest import OmniRunner -from .utils import create_new_process_for_each_test models = ["Qwen/Qwen2.5-Omni-3B"] @@ -34,8 +34,10 @@ @pytest.mark.core_model -@pytest.mark.parametrize("test_config", test_params) +@pytest.mark.omni +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) @create_new_process_for_each_test("spawn") +@pytest.mark.parametrize("test_config", test_params) def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None: """Test processing audio, image, and video together, generating audio output.""" model, stage_config_path = test_config @@ -94,8 +96,10 @@ def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: t @pytest.mark.core_model -@pytest.mark.parametrize("test_config", test_params) +@pytest.mark.omni +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) @create_new_process_for_each_test("spawn") +@pytest.mark.parametrize("test_config", test_params) def 
test_mixed_modalities_to_text_only(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None: """Test processing audio, image, and video together, generating audio output.""" model, stage_config_path = test_config diff --git a/tests/e2e/offline_inference/test_qwen3_omni.py b/tests/e2e/offline_inference/test_qwen3_omni.py index 2c5c66fe348..ecd09fdd322 100644 --- a/tests/e2e/offline_inference/test_qwen3_omni.py +++ b/tests/e2e/offline_inference/test_qwen3_omni.py @@ -14,6 +14,7 @@ import pytest from vllm.assets.video import VideoAsset +from tests.utils import hardware_test from vllm_omni.platforms import current_omni_platform from .conftest import OmniRunner @@ -31,6 +32,9 @@ test_params = [(model, stage_config) for model in models for stage_config in stage_configs] +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("test_config", test_params) def test_video_to_audio(omni_runner: type[OmniRunner], test_config) -> None: """Test processing video, generating audio output.""" diff --git a/tests/e2e/offline_inference/test_sequence_parallel.py b/tests/e2e/offline_inference/test_sequence_parallel.py index 3e7bb561799..866c2c6a184 100644 --- a/tests/e2e/offline_inference/test_sequence_parallel.py +++ b/tests/e2e/offline_inference/test_sequence_parallel.py @@ -20,16 +20,17 @@ import torch.distributed as dist from PIL import Image +from tests.utils import hardware_test +from vllm_omni import Omni +from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.platforms import current_omni_platform # ruff: noqa: E402 REPO_ROOT = Path(__file__).resolve().parents[3] if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) -from vllm_omni import Omni -from vllm_omni.diffusion.data import DiffusionParallelConfig -from vllm_omni.platforms import current_omni_platform # Test configuration MODELS = 
["riverclouds/qwen_image_random"] @@ -145,9 +146,11 @@ def _run_inference( # - warmup: whether to run warmup for this SP config # - is_perf_test: whether this is a performance test (show speedup metrics) SP_CONFIGS = [ - (2, 1, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True), # Ulysses-2 - performance test + # Ulysses-2 - performance test + (2, 1, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True), (1, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True), # Ring-2 - performance test - (2, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, False, False), # Hybrid - correctness only + # Hybrid - correctness only + (2, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, False, False), (4, 1, 272, 272, False, False), # Ulysses-4 - shape and correctness ] @@ -162,6 +165,10 @@ def _get_sp_mode(ulysses_degree: int, ring_degree: int) -> str: return f"hybrid-{ulysses_degree}x{ring_degree}" +@pytest.mark.core_model +@pytest.mark.diffusion +@pytest.mark.parallel +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) @pytest.mark.parametrize("model_name", MODELS) def test_sp_correctness(model_name: str): """Test that SP inference produces correct outputs and measure performance. 
diff --git a/tests/e2e/offline_inference/test_stable_audio_model.py b/tests/e2e/offline_inference/test_stable_audio_model.py index df2ca5e4283..ad879d75173 100644 --- a/tests/e2e/offline_inference/test_stable_audio_model.py +++ b/tests/e2e/offline_inference/test_stable_audio_model.py @@ -5,6 +5,7 @@ import pytest import torch +from tests.utils import hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput @@ -19,6 +20,9 @@ models = ["linyueqian/stable_audio_random"] +@pytest.mark.core_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "L4"}) @pytest.mark.parametrize("model_name", models) def test_stable_audio_model(model_name: str): m = Omni(model=model_name) diff --git a/tests/e2e/offline_inference/test_t2i_model.py b/tests/e2e/offline_inference/test_t2i_model.py index 74c1bd1ce2b..e2cb09f3068 100644 --- a/tests/e2e/offline_inference/test_t2i_model.py +++ b/tests/e2e/offline_inference/test_t2i_model.py @@ -5,6 +5,7 @@ import pytest import torch +from tests.utils import hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform @@ -32,6 +33,9 @@ models = ["Tongyi-MAI/Z-Image-Turbo"] +@pytest.mark.core_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 1, "rocm": 2}) @pytest.mark.parametrize("model_name", models) def test_diffusion_model(model_name: str): m = None diff --git a/tests/e2e/offline_inference/test_teacache.py b/tests/e2e/offline_inference/test_teacache.py index 7d626138819..d97991779e4 100644 --- a/tests/e2e/offline_inference/test_teacache.py +++ b/tests/e2e/offline_inference/test_teacache.py @@ -15,6 +15,7 @@ import pytest import torch +from tests.utils import hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams # ruff: noqa: E402 @@ -31,6 +32,10 @@ models = 
["riverclouds/qwen_image_random"] +@pytest.mark.core_model +@pytest.mark.diffusion +@pytest.mark.cache +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) @pytest.mark.parametrize("model_name", models) def test_teacache(model_name: str): """Test TeaCache backend with diffusion model.""" diff --git a/tests/e2e/offline_inference/test_zimage_tensor_parallel.py b/tests/e2e/offline_inference/test_zimage_tensor_parallel.py index 0e3b97ec39f..2d051e5aaf5 100644 --- a/tests/e2e/offline_inference/test_zimage_tensor_parallel.py +++ b/tests/e2e/offline_inference/test_zimage_tensor_parallel.py @@ -12,18 +12,18 @@ from PIL import Image from vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from tests.utils import GPUMemoryMonitor, hardware_test +from vllm_omni import Omni +from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput +from vllm_omni.platforms import current_omni_platform # ruff: noqa: E402 REPO_ROOT = Path(__file__).resolve().parents[2] if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) -from tests.utils import GPUMemoryMonitor -from vllm_omni import Omni -from vllm_omni.diffusion.data import DiffusionParallelConfig -from vllm_omni.outputs import OmniRequestOutput -from vllm_omni.platforms import current_omni_platform # os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -127,7 +127,10 @@ def _run_zimage_generate( cleanup_dist_env_and_memory() -@pytest.mark.integration +@pytest.mark.core_model +@pytest.mark.diffusion +@pytest.mark.parallel +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) def test_zimage_tensor_parallel_tp2(tmp_path: Path): if current_omni_platform.is_npu() or current_omni_platform.is_rocm(): pytest.skip("Z-Image TP e2e test is only supported on CUDA for now.") diff --git 
a/tests/e2e/online_serving/test_async_omni.py b/tests/e2e/online_serving/test_async_omni.py index d90727e1bc5..ba5084ad460 100644 --- a/tests/e2e/online_serving/test_async_omni.py +++ b/tests/e2e/online_serving/test_async_omni.py @@ -8,6 +8,7 @@ from vllm import SamplingParams from vllm.inputs import PromptType +from tests.utils import hardware_test from vllm_omni.entrypoints.async_omni import AsyncOmni, ClientRequestState os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -57,6 +58,9 @@ async def generate( return count, request_id +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.asyncio async def test_abort(): with ExitStack() as after: @@ -113,6 +117,9 @@ async def test_abort(): await asyncio.sleep(5) +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.asyncio async def test_build_and_log_summary(monkeypatch): from vllm_omni.entrypoints.utils import get_final_stage_id_for_e2e diff --git a/tests/e2e/online_serving/test_image_gen_edit.py b/tests/e2e/online_serving/test_image_gen_edit.py index 8db0d50fbe4..7db740f2037 100644 --- a/tests/e2e/online_serving/test_image_gen_edit.py +++ b/tests/e2e/online_serving/test_image_gen_edit.py @@ -22,6 +22,8 @@ from vllm.assets.image import ImageAsset from vllm.utils.network_utils import get_open_port +from tests.utils import hardware_test + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" # Increase timeout for downloading assets from S3 (default 5s is too short for CI) os.environ.setdefault("VLLM_IMAGE_FETCH_TIMEOUT", "60") @@ -178,6 +180,9 @@ def _decode_data_url_to_image_bytes(data_url: str) -> bytes: return base64.b64decode(b64_data) +@pytest.mark.core_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_i2i_multi_image_input_qwen_image_edit_2509( omni_server, 
diff --git a/tests/e2e/online_serving/test_images_generations_lora.py b/tests/e2e/online_serving/test_images_generations_lora.py index e912c420dc2..85b80cafbd3 100644 --- a/tests/e2e/online_serving/test_images_generations_lora.py +++ b/tests/e2e/online_serving/test_images_generations_lora.py @@ -23,6 +23,7 @@ from safetensors.torch import save_file from tests.conftest import OmniServer +from tests.utils import hardware_test os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -144,6 +145,9 @@ def _basic_payload() -> dict: } +@pytest.mark.core_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) def test_images_generations_per_request_lora_switching(omni_server: OmniServer, tmp_path: Path) -> None: # Base generation. base_img = _post_images(omni_server, _basic_payload()) diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py index 073419fb838..7f879939f5c 100644 --- a/tests/e2e/online_serving/test_qwen3_omni.py +++ b/tests/e2e/online_serving/test_qwen3_omni.py @@ -28,6 +28,7 @@ merge_base64_and_convert_to_text, modify_stage_config, ) +from tests.utils import hardware_test from vllm_omni.platforms import current_omni_platform models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] @@ -145,6 +146,9 @@ def get_max_batch_size(size_type="few"): return batch_sizes.get(size_type, 5) +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_mix_to_text_audio_001(client: openai.OpenAI, omni_server, request) -> None: """ @@ -215,6 +219,9 @@ def test_mix_to_text_audio_001(client: openai.OpenAI, omni_server, request) -> N assert similarity > 0.9, "The audio content is not same as the text" +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def 
test_text_to_text_audio_001(client: openai.OpenAI, omni_server) -> None: """ diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py index 6ff6782aea4..0c6479ccea7 100644 --- a/tests/entrypoints/openai_api/test_image_server.py +++ b/tests/entrypoints/openai_api/test_image_server.py @@ -23,6 +23,8 @@ ) from vllm_omni.inputs.data import OmniDiffusionSamplingParams +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + # Unit Tests diff --git a/tests/entrypoints/openai_api/test_serving_chat_sampling_params.py b/tests/entrypoints/openai_api/test_serving_chat_sampling_params.py index 240fd2051ed..58d4d253f5d 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_sampling_params.py +++ b/tests/entrypoints/openai_api/test_serving_chat_sampling_params.py @@ -11,6 +11,8 @@ import pytest from vllm.sampling_params import SamplingParams +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + @pytest.fixture def mock_comprehension_stage(): diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index da73acb5a8e..2db98c06869 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -16,6 +16,8 @@ ) from vllm_omni.outputs import OmniRequestOutput +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + logger = logging.getLogger(__name__) diff --git a/tests/entrypoints/test_async_omni_diffusion_config.py b/tests/entrypoints/test_async_omni_diffusion_config.py index 6b49eba2c60..ba0390040bf 100644 --- a/tests/entrypoints/test_async_omni_diffusion_config.py +++ b/tests/entrypoints/test_async_omni_diffusion_config.py @@ -1,9 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + from vllm_omni.entrypoints import omni as omni_module from vllm_omni.entrypoints.async_omni import AsyncOmni +pytestmark = 
[pytest.mark.core_model, pytest.mark.cpu] + def test_default_stage_config_includes_cache_backend(monkeypatch): """Ensure cache_backend/cache_config are preserved in default diffusion stage.""" diff --git a/tests/entrypoints/test_omni_diffusion.py b/tests/entrypoints/test_omni_diffusion.py index c4884e3abd1..ab1194a25c8 100644 --- a/tests/entrypoints/test_omni_diffusion.py +++ b/tests/entrypoints/test_omni_diffusion.py @@ -9,6 +9,8 @@ from vllm_omni.entrypoints.stage_utils import SHUTDOWN_TASK from vllm_omni.inputs.data import OmniDiffusionSamplingParams +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + # Suppress noisy DeprecationWarnings from optional Swig bindings imported by vLLM dependencies. warnings.filterwarnings( "ignore", diff --git a/tests/entrypoints/test_omni_input_preprocessor.py b/tests/entrypoints/test_omni_input_preprocessor.py index 77c84f06b3c..422154bf969 100644 --- a/tests/entrypoints/test_omni_input_preprocessor.py +++ b/tests/entrypoints/test_omni_input_preprocessor.py @@ -1,5 +1,9 @@ +import pytest + from vllm_omni.inputs.preprocess import OmniInputPreprocessor +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + def _make_preprocessor(monkeypatch): preprocessor = object.__new__(OmniInputPreprocessor) diff --git a/tests/entrypoints/test_omni_llm.py b/tests/entrypoints/test_omni_llm.py index f99c6d8336c..f33ca6d59cf 100644 --- a/tests/entrypoints/test_omni_llm.py +++ b/tests/entrypoints/test_omni_llm.py @@ -9,6 +9,8 @@ from vllm_omni.entrypoints.stage_utils import SHUTDOWN_TASK +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + # Suppress noisy DeprecationWarnings from optional Swig bindings imported by vLLM dependencies. 
warnings.filterwarnings( "ignore", diff --git a/tests/entrypoints/test_omni_new_request_data.py b/tests/entrypoints/test_omni_new_request_data.py index 776509d5bba..b1ad56cddf6 100644 --- a/tests/entrypoints/test_omni_new_request_data.py +++ b/tests/entrypoints/test_omni_new_request_data.py @@ -1,9 +1,12 @@ from types import SimpleNamespace +import pytest import torch from vllm_omni.core.sched.output import OmniNewRequestData +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + def test_omni_new_request_data_copies_payloads(): prompt_embeds = torch.randn(2, 3) diff --git a/tests/entrypoints/test_omni_stage_diffusion_config.py b/tests/entrypoints/test_omni_stage_diffusion_config.py index 5fe04cbbd88..f464c55fd69 100644 --- a/tests/entrypoints/test_omni_stage_diffusion_config.py +++ b/tests/entrypoints/test_omni_stage_diffusion_config.py @@ -1,8 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + from vllm_omni.entrypoints.omni_stage import _build_od_config +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + def test_build_od_config_includes_diffusion_fields(): engine_args = { diff --git a/tests/entrypoints/test_stage_utils.py b/tests/entrypoints/test_stage_utils.py index ac503639be6..ab7358d9f74 100644 --- a/tests/entrypoints/test_stage_utils.py +++ b/tests/entrypoints/test_stage_utils.py @@ -6,6 +6,8 @@ from vllm_omni.entrypoints.stage_utils import set_stage_devices +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + def _make_dummy_torch(call_log): class _Props: diff --git a/tests/model_executor/models/qwen2_5_omni/test_audio_length.py b/tests/model_executor/models/qwen2_5_omni/test_audio_length.py index dd5f098172c..6156ed3a97f 100644 --- a/tests/model_executor/models/qwen2_5_omni/test_audio_length.py +++ b/tests/model_executor/models/qwen2_5_omni/test_audio_length.py @@ -3,6 +3,8 @@ import pytest +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + def 
test_resolve_max_mel_frames_default(): from vllm_omni.model_executor.models.qwen2_5_omni.audio_length import resolve_max_mel_frames diff --git a/tests/utils.py b/tests/utils.py index f5c513a4d36..3219821b6ff 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -495,18 +495,8 @@ def test_multi_platform(): raise ValueError(f"Unsupported platform: {platform}") all_marks.extend(marks) - create_new_process_flag = False - for cards in num_cards_dict.values(): - if cards > 1: - create_new_process_flag = True - break - def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: - if create_new_process_flag: - # only for distributed tests - func = create_new_process_for_each_test()(f) - else: - func = f + func = f for mark in reversed(all_marks): func = mark(func) return func diff --git a/tests/worker/test_gpu_generation_model_runner.py b/tests/worker/test_gpu_generation_model_runner.py index 25ed1ae861d..5c44889a41e 100644 --- a/tests/worker/test_gpu_generation_model_runner.py +++ b/tests/worker/test_gpu_generation_model_runner.py @@ -1,7 +1,10 @@ +import pytest import torch from vllm_omni.worker.gpu_generation_model_runner import GPUGenerationModelRunner +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + class _DummyInputBatch: def __init__(self): diff --git a/tests/worker/test_omni_gpu_model_runner.py b/tests/worker/test_omni_gpu_model_runner.py index eb1adf227d1..c7836123a64 100644 --- a/tests/worker/test_omni_gpu_model_runner.py +++ b/tests/worker/test_omni_gpu_model_runner.py @@ -1,10 +1,13 @@ from contextlib import contextmanager from types import SimpleNamespace +import pytest import torch from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + class DummyBuffer: """A minimal buffer wrapper that exposes the `.gpu` attribute."""