diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index e02d494729c..56feccbd664 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -20,19 +20,7 @@ steps:
   - label: "Simple Unit Test"
     depends_on: image-build
     commands:
-    - |
-      pytest -v -s \
-        tests/entrypoints/ \
-        tests/diffusion/cache/ \
-        tests/diffusion/lora/ \
-        tests/model_executor/models/qwen2_5_omni/test_audio_length.py \
-        tests/worker/ \
-        tests/distributed/omni_connectors/test_kv_flow.py \
-        --cov=vllm_omni \
-        --cov-branch \
-        --cov-report=term-missing \
-        --cov-report=html \
-        --cov-report=xml
+      - "pytest -v -s -m 'core_model and cpu' --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml"
     agents:
       queue: "gpu_1_queue"
     plugins:
@@ -118,7 +106,7 @@ steps:
     timeout_in_minutes: 15
     depends_on: image-build
     commands:
-      - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
+      - pytest -s -v -m 'core_model and cache and diffusion and not distributed_cuda and L4'
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
     plugins:
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index fac5c7268bf..e08562ea6b1 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -44,7 +44,7 @@ steps:
     - export GPU_ARCHS=gfx942
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
+    - pytest -s -v -m 'core_model and cache and diffusion and not distributed_rocm and MI325'
 
 - label: "Diffusion Sequence Parallelism Test"
   timeout_in_minutes: 20
diff --git a/docs/contributing/ci/tests_markers.md b/docs/contributing/ci/tests_markers.md
index bf56914f8da..2afff846028 100644
--- a/docs/contributing/ci/tests_markers.md
+++ b/docs/contributing/ci/tests_markers.md
@@ -5,33 +5,33 @@ By adding markers before test functions, tests can later be executed uniformly b
 ## Current Markers
 Defined in `pyproject.toml`:
 
-| Marker             | Description                                             |
-| ------------------ | ------------------------------------------------------- |
-| `core_model`       | Core model tests (run in each PR)                       |
-| `diffusion`        | Diffusion model tests                                   |
-| `omni`             | Omni model tests                                        |
-| `cache`            | Cache backend tests                                     |
-| `parallel`         | Parallelism/distributed tests                           |
-| `cpu`              | Tests that run on CPU                                   |
-| `gpu`              | Tests that run on GPU (auto-added)                      |
-| `cuda`             | Tests that run on CUDA (auto-added)                     |
-| `rocm`             | Tests that run on AMD/ROCm (auto-added)                 |
-| `npu`              | Tests that run on NPU/Ascend (auto-added)               |
-| `H100`             | Tests that require H100 GPU                             |
-| `L4`               | Tests that require L4 GPU                               |
-| `MI325`            | Tests that require MI325 GPU (AMD/ROCm)                 |
-| `A2`               | Tests that require A2 NPU                               |
-| `A3`               | Tests that require A3 NPU                               |
-| `distributed_cuda` | Tests that require multi cards on CUDA platform         |
-| `distributed_rocm` | Tests that require multi cards on ROCm platform         |
-| `distributed_npu`  | Tests that require multi cards on NPU platform          |
-| `skipif_cuda`      | Skip if the num of CUDA cards is less than the required |
-| `skipif_rocm`      | Skip if the num of ROCm cards is less than the required |
-| `skipif_npu`       | Skip if the num of NPU cards is less than the required  |
-| `slow`             | Slow tests (may skip in quick CI)                       |
-| `benchmark`        | Benchmark tests                                         |
-
-For those markers shown as auto-added, they will be added by the `@hardware_test` decorator.
+| Marker             | Description                                               |
+| ------------------ | --------------------------------------------------------- |
+| `core_model`       | Core model tests (run in each PR)                         |
+| `diffusion`        | Diffusion model tests                                     |
+| `omni`             | Omni model tests                                          |
+| `cache`            | Cache backend tests                                       |
+| `parallel`         | Parallelism/distributed tests                             |
+| `cpu`              | Tests that run on CPU                                     |
+| `gpu`              | Tests that run on GPU *                                   |
+| `cuda`             | Tests that run on CUDA *                                  |
+| `rocm`             | Tests that run on AMD/ROCm *                              |
+| `npu`              | Tests that run on NPU/Ascend *                            |
+| `H100`             | Tests that require H100 GPU *                             |
+| `L4`               | Tests that require L4 GPU *                               |
+| `MI325`            | Tests that require MI325 GPU (AMD/ROCm) *                 |
+| `A2`               | Tests that require A2 NPU *                               |
+| `A3`               | Tests that require A3 NPU *                               |
+| `distributed_cuda` | Tests that require multi cards on CUDA platform *         |
+| `distributed_rocm` | Tests that require multi cards on ROCm platform *         |
+| `distributed_npu`  | Tests that require multi cards on NPU platform *          |
+| `skipif_cuda`      | Skip if the num of CUDA cards is less than the required * |
+| `skipif_rocm`      | Skip if the num of ROCm cards is less than the required * |
+| `skipif_npu`       | Skip if the num of NPU cards is less than the required *  |
+| `slow`             | Slow tests (may skip in quick CI)                         |
+| `benchmark`        | Benchmark tests                                           |
+
+\* Markers flagged with an asterisk are auto-added by the `@hardware_test` decorator.
 
 ### Example usage for markers
 
@@ -71,10 +71,7 @@ This decorator is intended to make hardware-aware, cross-platform test authoring
    Support for `skipif_rocm` and `skipif_npu` will be implemented later.
 
 
-5. **Runs each test in a new process**  
-   Automatically wraps the distributed test with a decorator (`@create_new_process_for_each_test`) to ensure isolation and compatibility with multi-process hardware backends.
-
-6. **Works with pytest filtering**  
+5. **Works with pytest filtering**  
    Allows tests to be filtered and selected at runtime using standard pytest marker expressions (e.g., `-m "distributed_cuda and L4"`).
 
 #### Example usage for decorator
@@ -94,7 +91,6 @@ This decorator is intended to make hardware-aware, cross-platform test authoring
     ```
 - `res` must be a dict; supported resources: CUDA (L4/H100), ROCm (MI325), NPU (A2/A3)
 - `num_cards` can be int (all platforms) or dict (per platform); defaults to 1 when missing
-- `hardware_test` automatically applies `@create_new_process_for_each_test` for distributed tests.
 - Distributed markers (`distributed_cuda`, `distributed_rocm`, `distributed_npu`) are auto-added for multi-card cases
 - Filtering examples:
     - CUDA only: `pytest -m "distributed_cuda and L4"`
diff --git a/pyproject.toml b/pyproject.toml
index 706d0152e1c..483f7625a2e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -175,6 +175,10 @@ markers = [
     "slow: Slow tests (may skip in quick CI)",
     "benchmark: Benchmark tests",
 ]
+filterwarnings = [
+    "ignore:.*does not have '__test__' attribute.*:UserWarning",
+    "ignore:.*does not have '__bases__' attribute.*:UserWarning",
+]
 
 [tool.typos.default]
 extend-ignore-identifiers-re = [
diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py
index 2c624b8e76a..40244eb5726 100644
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -4,6 +4,7 @@
 import pytest
 
 from tests.conftest import OmniServer
+from tests.utils import hardware_test
 
 models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
 stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "qwen3_omni_ci.yaml")]
@@ -29,6 +30,9 @@ def omni_server(request):
         print("OmniServer stopped")
 
 
+@pytest.mark.core_model
+@pytest.mark.benchmark
+@hardware_test(res={"cuda": "H100"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
 def test_bench_serve_chat(omni_server):
     command = [
diff --git a/tests/diffusion/cache/test_cache_backends.py b/tests/diffusion/cache/test_cache_backends.py
index ed9301410ca..a9312f4b1ad 100644
--- a/tests/diffusion/cache/test_cache_backends.py
+++ b/tests/diffusion/cache/test_cache_backends.py
@@ -22,6 +22,8 @@
 from vllm_omni.diffusion.cache.teacache.backend import TeaCacheBackend
 from vllm_omni.diffusion.data import DiffusionCacheConfig
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 class TestCacheDiTBackend:
     """Test CacheDiTBackend implementation."""
diff --git a/tests/diffusion/lora/test_base_linear.py b/tests/diffusion/lora/test_base_linear.py
index 42bdf6526a5..9386d0909d9 100644
--- a/tests/diffusion/lora/test_base_linear.py
+++ b/tests/diffusion/lora/test_base_linear.py
@@ -5,10 +5,13 @@
 
 from dataclasses import dataclass
 
+import pytest
 import torch
 
 from vllm_omni.diffusion.lora.layers.base_linear import DiffusionBaseLinearLayerWithLoRA
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 @dataclass
 class _DummyLoRAConfig:
diff --git a/tests/diffusion/lora/test_lora_manager.py b/tests/diffusion/lora/test_lora_manager.py
index 84fafe3bc9e..e5f1c47003d 100644
--- a/tests/diffusion/lora/test_lora_manager.py
+++ b/tests/diffusion/lora/test_lora_manager.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import pytest
 import torch
 from vllm.lora.lora_weights import LoRALayerWeights
 from vllm.lora.utils import get_supported_lora_modules
@@ -11,6 +12,8 @@
 from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager
 from vllm_omni.lora.request import LoRARequest
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 class _DummyLoRALayer:
     def __init__(self, n_slices: int, output_slices: tuple[int, ...]):
diff --git a/tests/diffusion/test_diffusion_worker.py b/tests/diffusion/test_diffusion_worker.py
index 220f210a3d5..6e57355a2e1 100644
--- a/tests/diffusion/test_diffusion_worker.py
+++ b/tests/diffusion/test_diffusion_worker.py
@@ -17,6 +17,8 @@
 
 from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker
 
+pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]
+
 
 @pytest.fixture
 def mock_od_config():
diff --git a/tests/distributed/omni_connectors/test_kv_flow.py b/tests/distributed/omni_connectors/test_kv_flow.py
index 8c7ff79ca54..2b35718e7c4 100644
--- a/tests/distributed/omni_connectors/test_kv_flow.py
+++ b/tests/distributed/omni_connectors/test_kv_flow.py
@@ -1,7 +1,6 @@
 import pytest
 import torch
 
-from tests.utils import hardware_test
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.distributed.omni_connectors.kv_transfer_manager import (
     OmniKVCacheConfig,
@@ -9,6 +8,8 @@
 )
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.cache]
+
 
 class MockConnector:
     def __init__(self):
@@ -58,11 +59,6 @@ def common_constants():
     }
 
 
-@pytest.mark.cache
-@hardware_test(
-    res={"cuda": "L4"},
-    num_cards=2,
-)
 def test_manager_extraction(kv_config, mock_connector, common_constants):
     """Test extraction and sending logic in OmniKVTransferManager."""
     num_layers = common_constants["num_layers"]
@@ -109,11 +105,6 @@ def test_manager_extraction(kv_config, mock_connector, common_constants):
     assert data["layer_blocks"]["key_cache"][0].shape == expected_shape
 
 
-@pytest.mark.cache
-@hardware_test(
-    res={"cuda": "L4"},
-    num_cards=2,
-)
 def test_manager_reception(kv_config, mock_connector, common_constants):
     """Test reception and injection logic in OmniKVTransferManager."""
     num_layers = common_constants["num_layers"]
@@ -171,11 +162,6 @@ def test_manager_reception(kv_config, mock_connector, common_constants):
     assert req.kv_metadata["seq_len"] == seq_len
 
 
-@pytest.mark.cache
-@hardware_test(
-    res={"cuda": "L4"},
-    num_cards=2,
-)
 def test_integration_flow(common_constants):
     """Simulate extraction -> connector -> reception."""
     num_layers = common_constants["num_layers"]
@@ -211,7 +197,8 @@ def test_integration_flow(common_constants):
         recv_timeout=1.0,
     )
     receiver_manager = OmniKVTransferManager(receiver_config)
-    receiver_manager._connector = connector  # Share the same mock connector instance
+    # Share the same mock connector instance
+    receiver_manager._connector = connector
 
     req = OmniDiffusionRequest(
         prompts=["test_integ"],
@@ -228,11 +215,6 @@ def test_integration_flow(common_constants):
     assert req.kv_metadata["seq_len"] == 10
 
 
-@pytest.mark.cache
-@hardware_test(
-    res={"cuda": "L4"},
-    num_cards=2,
-)
 def test_manager_extraction_no_connector(kv_config, common_constants):
     """Test extraction when connector is unavailable (should still return IDs)."""
     block_size = common_constants["block_size"]
diff --git a/tests/e2e/offline_inference/test_cache_dit.py b/tests/e2e/offline_inference/test_cache_dit.py
index 281ac48dcb4..65c59c6f7d8 100644
--- a/tests/e2e/offline_inference/test_cache_dit.py
+++ b/tests/e2e/offline_inference/test_cache_dit.py
@@ -15,6 +15,7 @@
 import pytest
 import torch
 
+from tests.utils import hardware_test
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
 # ruff: noqa: E402
@@ -32,6 +33,10 @@
 models = ["riverclouds/qwen_image_random"]
 
 
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@pytest.mark.cache
+@hardware_test(res={"cuda": "L4", "rocm": "MI325"})
 @pytest.mark.parametrize("model_name", models)
 def test_cache_dit(model_name: str):
     """Test cache-dit backend with diffusion model."""
diff --git a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py
index 35e106df81a..e46d7ec968c 100644
--- a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py
+++ b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py
@@ -5,7 +5,7 @@
 import torch
 from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
 
-from tests.utils import GPUMemoryMonitor
+from tests.utils import GPUMemoryMonitor, hardware_test
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 from vllm_omni.platforms import current_omni_platform
 
@@ -45,6 +45,9 @@ def inference(model_name: str, offload: bool = True):
     return peak
 
 
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "L4", "rocm": "MI325"})
 @pytest.mark.skipif(current_omni_platform.is_npu() or current_omni_platform.is_rocm(), reason="Hardware not supported")
 @pytest.mark.parametrize("model_name", models)
 def test_cpu_offload_diffusion_model(model_name: str):
diff --git a/tests/e2e/offline_inference/test_qwen2_5_omni.py b/tests/e2e/offline_inference/test_qwen2_5_omni.py
index af9d793c1dc..eda2f28b55b 100644
--- a/tests/e2e/offline_inference/test_qwen2_5_omni.py
+++ b/tests/e2e/offline_inference/test_qwen2_5_omni.py
@@ -13,10 +13,10 @@
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.multimodal.image import convert_image_mode
 
+from tests.utils import create_new_process_for_each_test, hardware_test
 from vllm_omni.platforms import current_omni_platform
 
 from .conftest import OmniRunner
-from .utils import create_new_process_for_each_test
 
 models = ["Qwen/Qwen2.5-Omni-3B"]
 
@@ -34,8 +34,10 @@
 
 
 @pytest.mark.core_model
-@pytest.mark.parametrize("test_config", test_params)
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
 @create_new_process_for_each_test("spawn")
+@pytest.mark.parametrize("test_config", test_params)
 def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
     """Test processing audio, image, and video together, generating audio output."""
     model, stage_config_path = test_config
@@ -94,8 +96,10 @@ def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: t
 
 
 @pytest.mark.core_model
-@pytest.mark.parametrize("test_config", test_params)
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
 @create_new_process_for_each_test("spawn")
+@pytest.mark.parametrize("test_config", test_params)
 def test_mixed_modalities_to_text_only(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
     """Test processing audio, image, and video together, generating audio output."""
     model, stage_config_path = test_config
diff --git a/tests/e2e/offline_inference/test_qwen3_omni.py b/tests/e2e/offline_inference/test_qwen3_omni.py
index 2c5c66fe348..ecd09fdd322 100644
--- a/tests/e2e/offline_inference/test_qwen3_omni.py
+++ b/tests/e2e/offline_inference/test_qwen3_omni.py
@@ -14,6 +14,7 @@
 import pytest
 from vllm.assets.video import VideoAsset
 
+from tests.utils import hardware_test
 from vllm_omni.platforms import current_omni_platform
 
 from .conftest import OmniRunner
@@ -31,6 +32,9 @@
 test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
 
 
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("test_config", test_params)
 def test_video_to_audio(omni_runner: type[OmniRunner], test_config) -> None:
     """Test processing video, generating audio output."""
diff --git a/tests/e2e/offline_inference/test_sequence_parallel.py b/tests/e2e/offline_inference/test_sequence_parallel.py
index 3e7bb561799..866c2c6a184 100644
--- a/tests/e2e/offline_inference/test_sequence_parallel.py
+++ b/tests/e2e/offline_inference/test_sequence_parallel.py
@@ -20,16 +20,17 @@
 import torch.distributed as dist
 from PIL import Image
 
+from tests.utils import hardware_test
+from vllm_omni import Omni
+from vllm_omni.diffusion.data import DiffusionParallelConfig
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+from vllm_omni.platforms import current_omni_platform
 
 # ruff: noqa: E402
 REPO_ROOT = Path(__file__).resolve().parents[3]
 if str(REPO_ROOT) not in sys.path:
     sys.path.insert(0, str(REPO_ROOT))
 
-from vllm_omni import Omni
-from vllm_omni.diffusion.data import DiffusionParallelConfig
-from vllm_omni.platforms import current_omni_platform
 
 # Test configuration
 MODELS = ["riverclouds/qwen_image_random"]
@@ -145,9 +146,11 @@ def _run_inference(
 # - warmup: whether to run warmup for this SP config
 # - is_perf_test: whether this is a performance test (show speedup metrics)
 SP_CONFIGS = [
-    (2, 1, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True),  # Ulysses-2 - performance test
+    # Ulysses-2 - performance test
+    (2, 1, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True),
     (1, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, True, True),  # Ring-2 - performance test
-    (2, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, False, False),  # Hybrid - correctness only
+    # Hybrid - correctness only
+    (2, 2, DEFAULT_HEIGHT, DEFAULT_WIDTH, False, False),
     (4, 1, 272, 272, False, False),  # Ulysses-4 - shape and correctness
 ]
 
@@ -162,6 +165,10 @@ def _get_sp_mode(ulysses_degree: int, ring_degree: int) -> str:
         return f"hybrid-{ulysses_degree}x{ring_degree}"
 
 
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@pytest.mark.parallel
+@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
 @pytest.mark.parametrize("model_name", MODELS)
 def test_sp_correctness(model_name: str):
     """Test that SP inference produces correct outputs and measure performance.
diff --git a/tests/e2e/offline_inference/test_stable_audio_model.py b/tests/e2e/offline_inference/test_stable_audio_model.py
index df2ca5e4283..ad879d75173 100644
--- a/tests/e2e/offline_inference/test_stable_audio_model.py
+++ b/tests/e2e/offline_inference/test_stable_audio_model.py
@@ -5,6 +5,7 @@
 import pytest
 import torch
 
+from tests.utils import hardware_test
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 from vllm_omni.outputs import OmniRequestOutput
 
@@ -19,6 +20,9 @@
 models = ["linyueqian/stable_audio_random"]
 
 
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "L4"})
 @pytest.mark.parametrize("model_name", models)
 def test_stable_audio_model(model_name: str):
     m = Omni(model=model_name)
diff --git a/tests/e2e/offline_inference/test_t2i_model.py b/tests/e2e/offline_inference/test_t2i_model.py
index 74c1bd1ce2b..e2cb09f3068 100644
--- a/tests/e2e/offline_inference/test_t2i_model.py
+++ b/tests/e2e/offline_inference/test_t2i_model.py
@@ -5,6 +5,7 @@
 import pytest
 import torch
 
+from tests.utils import hardware_test
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 from vllm_omni.outputs import OmniRequestOutput
 from vllm_omni.platforms import current_omni_platform
@@ -32,6 +33,9 @@
     models = ["Tongyi-MAI/Z-Image-Turbo"]
 
 
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 1, "rocm": 2})
 @pytest.mark.parametrize("model_name", models)
 def test_diffusion_model(model_name: str):
     m = None
diff --git a/tests/e2e/offline_inference/test_teacache.py b/tests/e2e/offline_inference/test_teacache.py
index 7d626138819..d97991779e4 100644
--- a/tests/e2e/offline_inference/test_teacache.py
+++ b/tests/e2e/offline_inference/test_teacache.py
@@ -15,6 +15,7 @@
 import pytest
 import torch
 
+from tests.utils import hardware_test
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
 # ruff: noqa: E402
@@ -31,6 +32,10 @@
 models = ["riverclouds/qwen_image_random"]
 
 
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@pytest.mark.cache
+@hardware_test(res={"cuda": "L4", "rocm": "MI325"})
 @pytest.mark.parametrize("model_name", models)
 def test_teacache(model_name: str):
     """Test TeaCache backend with diffusion model."""
diff --git a/tests/e2e/offline_inference/test_zimage_tensor_parallel.py b/tests/e2e/offline_inference/test_zimage_tensor_parallel.py
index 0e3b97ec39f..2d051e5aaf5 100644
--- a/tests/e2e/offline_inference/test_zimage_tensor_parallel.py
+++ b/tests/e2e/offline_inference/test_zimage_tensor_parallel.py
@@ -12,18 +12,18 @@
 from PIL import Image
 from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
 
+from tests.utils import GPUMemoryMonitor, hardware_test
+from vllm_omni import Omni
+from vllm_omni.diffusion.data import DiffusionParallelConfig
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.platforms import current_omni_platform
 
 # ruff: noqa: E402
 REPO_ROOT = Path(__file__).resolve().parents[2]
 if str(REPO_ROOT) not in sys.path:
     sys.path.insert(0, str(REPO_ROOT))
 
-from tests.utils import GPUMemoryMonitor
-from vllm_omni import Omni
-from vllm_omni.diffusion.data import DiffusionParallelConfig
-from vllm_omni.outputs import OmniRequestOutput
-from vllm_omni.platforms import current_omni_platform
 
 # os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -127,7 +127,10 @@ def _run_zimage_generate(
         cleanup_dist_env_and_memory()
 
 
-@pytest.mark.integration
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@pytest.mark.parallel
+@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
 def test_zimage_tensor_parallel_tp2(tmp_path: Path):
     if current_omni_platform.is_npu() or current_omni_platform.is_rocm():
         pytest.skip("Z-Image TP e2e test is only supported on CUDA for now.")
diff --git a/tests/e2e/online_serving/test_async_omni.py b/tests/e2e/online_serving/test_async_omni.py
index d90727e1bc5..ba5084ad460 100644
--- a/tests/e2e/online_serving/test_async_omni.py
+++ b/tests/e2e/online_serving/test_async_omni.py
@@ -8,6 +8,7 @@
 from vllm import SamplingParams
 from vllm.inputs import PromptType
 
+from tests.utils import hardware_test
 from vllm_omni.entrypoints.async_omni import AsyncOmni, ClientRequestState
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -57,6 +58,9 @@ async def generate(
     return count, request_id
 
 
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.asyncio
 async def test_abort():
     with ExitStack() as after:
@@ -113,6 +117,9 @@ async def test_abort():
     await asyncio.sleep(5)
 
 
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.asyncio
 async def test_build_and_log_summary(monkeypatch):
     from vllm_omni.entrypoints.utils import get_final_stage_id_for_e2e
diff --git a/tests/e2e/online_serving/test_image_gen_edit.py b/tests/e2e/online_serving/test_image_gen_edit.py
index 8db0d50fbe4..7db740f2037 100644
--- a/tests/e2e/online_serving/test_image_gen_edit.py
+++ b/tests/e2e/online_serving/test_image_gen_edit.py
@@ -22,6 +22,8 @@
 from vllm.assets.image import ImageAsset
 from vllm.utils.network_utils import get_open_port
 
+from tests.utils import hardware_test
+
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 # Increase timeout for downloading assets from S3 (default 5s is too short for CI)
 os.environ.setdefault("VLLM_IMAGE_FETCH_TIMEOUT", "60")
@@ -178,6 +180,9 @@ def _decode_data_url_to_image_bytes(data_url: str) -> bytes:
     return base64.b64decode(b64_data)
 
 
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"})
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
 def test_i2i_multi_image_input_qwen_image_edit_2509(
     omni_server,
diff --git a/tests/e2e/online_serving/test_images_generations_lora.py b/tests/e2e/online_serving/test_images_generations_lora.py
index e912c420dc2..85b80cafbd3 100644
--- a/tests/e2e/online_serving/test_images_generations_lora.py
+++ b/tests/e2e/online_serving/test_images_generations_lora.py
@@ -23,6 +23,7 @@
 from safetensors.torch import save_file
 
 from tests.conftest import OmniServer
+from tests.utils import hardware_test
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
@@ -144,6 +145,9 @@ def _basic_payload() -> dict:
     }
 
 
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "L4", "rocm": "MI325"})
 def test_images_generations_per_request_lora_switching(omni_server: OmniServer, tmp_path: Path) -> None:
     # Base generation.
     base_img = _post_images(omni_server, _basic_payload())
diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py
index 073419fb838..7f879939f5c 100644
--- a/tests/e2e/online_serving/test_qwen3_omni.py
+++ b/tests/e2e/online_serving/test_qwen3_omni.py
@@ -28,6 +28,7 @@
     merge_base64_and_convert_to_text,
     modify_stage_config,
 )
+from tests.utils import hardware_test
 from vllm_omni.platforms import current_omni_platform
 
 models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
@@ -145,6 +146,9 @@ def get_max_batch_size(size_type="few"):
     return batch_sizes.get(size_type, 5)
 
 
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
 def test_mix_to_text_audio_001(client: openai.OpenAI, omni_server, request) -> None:
     """
@@ -215,6 +219,9 @@ def test_mix_to_text_audio_001(client: openai.OpenAI, omni_server, request) -> N
     assert similarity > 0.9, "The audio content is not same as the text"
 
 
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
 def test_text_to_text_audio_001(client: openai.OpenAI, omni_server) -> None:
     """
diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py
index 6ff6782aea4..0c6479ccea7 100644
--- a/tests/entrypoints/openai_api/test_image_server.py
+++ b/tests/entrypoints/openai_api/test_image_server.py
@@ -23,6 +23,8 @@
 )
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 # Unit Tests
 
 
diff --git a/tests/entrypoints/openai_api/test_serving_chat_sampling_params.py b/tests/entrypoints/openai_api/test_serving_chat_sampling_params.py
index 240fd2051ed..58d4d253f5d 100644
--- a/tests/entrypoints/openai_api/test_serving_chat_sampling_params.py
+++ b/tests/entrypoints/openai_api/test_serving_chat_sampling_params.py
@@ -11,6 +11,8 @@
 import pytest
 from vllm.sampling_params import SamplingParams
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 @pytest.fixture
 def mock_comprehension_stage():
diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py
index da73acb5a8e..2db98c06869 100644
--- a/tests/entrypoints/openai_api/test_serving_speech.py
+++ b/tests/entrypoints/openai_api/test_serving_speech.py
@@ -16,6 +16,8 @@
 )
 from vllm_omni.outputs import OmniRequestOutput
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 logger = logging.getLogger(__name__)
 
 
diff --git a/tests/entrypoints/test_async_omni_diffusion_config.py b/tests/entrypoints/test_async_omni_diffusion_config.py
index 6b49eba2c60..ba0390040bf 100644
--- a/tests/entrypoints/test_async_omni_diffusion_config.py
+++ b/tests/entrypoints/test_async_omni_diffusion_config.py
@@ -1,9 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
+
 from vllm_omni.entrypoints import omni as omni_module
 from vllm_omni.entrypoints.async_omni import AsyncOmni
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 def test_default_stage_config_includes_cache_backend(monkeypatch):
     """Ensure cache_backend/cache_config are preserved in default diffusion stage."""
diff --git a/tests/entrypoints/test_omni_diffusion.py b/tests/entrypoints/test_omni_diffusion.py
index c4884e3abd1..ab1194a25c8 100644
--- a/tests/entrypoints/test_omni_diffusion.py
+++ b/tests/entrypoints/test_omni_diffusion.py
@@ -9,6 +9,8 @@
 from vllm_omni.entrypoints.stage_utils import SHUTDOWN_TASK
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 # Suppress noisy DeprecationWarnings from optional Swig bindings imported by vLLM dependencies.
 warnings.filterwarnings(
     "ignore",
diff --git a/tests/entrypoints/test_omni_input_preprocessor.py b/tests/entrypoints/test_omni_input_preprocessor.py
index 77c84f06b3c..422154bf969 100644
--- a/tests/entrypoints/test_omni_input_preprocessor.py
+++ b/tests/entrypoints/test_omni_input_preprocessor.py
@@ -1,5 +1,9 @@
+import pytest
+
 from vllm_omni.inputs.preprocess import OmniInputPreprocessor
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 def _make_preprocessor(monkeypatch):
     preprocessor = object.__new__(OmniInputPreprocessor)
diff --git a/tests/entrypoints/test_omni_llm.py b/tests/entrypoints/test_omni_llm.py
index f99c6d8336c..f33ca6d59cf 100644
--- a/tests/entrypoints/test_omni_llm.py
+++ b/tests/entrypoints/test_omni_llm.py
@@ -9,6 +9,8 @@
 
 from vllm_omni.entrypoints.stage_utils import SHUTDOWN_TASK
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 # Suppress noisy DeprecationWarnings from optional Swig bindings imported by vLLM dependencies.
 warnings.filterwarnings(
     "ignore",
diff --git a/tests/entrypoints/test_omni_new_request_data.py b/tests/entrypoints/test_omni_new_request_data.py
index 776509d5bba..b1ad56cddf6 100644
--- a/tests/entrypoints/test_omni_new_request_data.py
+++ b/tests/entrypoints/test_omni_new_request_data.py
@@ -1,9 +1,12 @@
 from types import SimpleNamespace
 
+import pytest
 import torch
 
 from vllm_omni.core.sched.output import OmniNewRequestData
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 def test_omni_new_request_data_copies_payloads():
     prompt_embeds = torch.randn(2, 3)
diff --git a/tests/entrypoints/test_omni_stage_diffusion_config.py b/tests/entrypoints/test_omni_stage_diffusion_config.py
index 5fe04cbbd88..f464c55fd69 100644
--- a/tests/entrypoints/test_omni_stage_diffusion_config.py
+++ b/tests/entrypoints/test_omni_stage_diffusion_config.py
@@ -1,8 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import pytest
+
 from vllm_omni.entrypoints.omni_stage import _build_od_config
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 def test_build_od_config_includes_diffusion_fields():
     engine_args = {
diff --git a/tests/entrypoints/test_stage_utils.py b/tests/entrypoints/test_stage_utils.py
index ac503639be6..ab7358d9f74 100644
--- a/tests/entrypoints/test_stage_utils.py
+++ b/tests/entrypoints/test_stage_utils.py
@@ -6,6 +6,8 @@
 
 from vllm_omni.entrypoints.stage_utils import set_stage_devices
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 def _make_dummy_torch(call_log):
     class _Props:
diff --git a/tests/model_executor/models/qwen2_5_omni/test_audio_length.py b/tests/model_executor/models/qwen2_5_omni/test_audio_length.py
index dd5f098172c..6156ed3a97f 100644
--- a/tests/model_executor/models/qwen2_5_omni/test_audio_length.py
+++ b/tests/model_executor/models/qwen2_5_omni/test_audio_length.py
@@ -3,6 +3,8 @@
 
 import pytest
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 def test_resolve_max_mel_frames_default():
     from vllm_omni.model_executor.models.qwen2_5_omni.audio_length import resolve_max_mel_frames
diff --git a/tests/utils.py b/tests/utils.py
index f5c513a4d36..3219821b6ff 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -495,18 +495,8 @@ def test_multi_platform():
             raise ValueError(f"Unsupported platform: {platform}")
         all_marks.extend(marks)
 
-    create_new_process_flag = False
-    for cards in num_cards_dict.values():
-        if cards > 1:
-            create_new_process_flag = True
-            break
-
     def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
-        if create_new_process_flag:
-            # only for distributed tests
-            func = create_new_process_for_each_test()(f)
-        else:
-            func = f
+        func = f
         for mark in reversed(all_marks):
             func = mark(func)
         return func
diff --git a/tests/worker/test_gpu_generation_model_runner.py b/tests/worker/test_gpu_generation_model_runner.py
index 25ed1ae861d..5c44889a41e 100644
--- a/tests/worker/test_gpu_generation_model_runner.py
+++ b/tests/worker/test_gpu_generation_model_runner.py
@@ -1,7 +1,10 @@
+import pytest
 import torch
 
 from vllm_omni.worker.gpu_generation_model_runner import GPUGenerationModelRunner
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 class _DummyInputBatch:
     def __init__(self):
diff --git a/tests/worker/test_omni_gpu_model_runner.py b/tests/worker/test_omni_gpu_model_runner.py
index eb1adf227d1..c7836123a64 100644
--- a/tests/worker/test_omni_gpu_model_runner.py
+++ b/tests/worker/test_omni_gpu_model_runner.py
@@ -1,10 +1,13 @@
 from contextlib import contextmanager
 from types import SimpleNamespace
 
+import pytest
 import torch
 
 from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 class DummyBuffer:
     """A minimal buffer wrapper that exposes the `.gpu` attribute."""