.buildkite/scripts/hardware_ci/run-cpu-test.sh (12 additions, 12 deletions)

@@ -48,10 +48,16 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
+    # Note: disable until supports V1
+    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+
+    # Note: disable Bart until supports V1
+    pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+
     pytest -v -s tests/models/language/pooling -m cpu_model
     pytest -v -s tests/models/multimodal/generation \
       --ignore=tests/models/multimodal/generation/test_mllama.py \
@@ -62,21 +68,15 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+
-  # Note: disable it until supports V1
-  # Run AWQ test
-  # docker exec cpu-test-"$NUMA_NODE" bash -c "
-  #   set -e
-  #   VLLM_USE_V1=0 pytest -s -v \
-  #   tests/quantization/test_ipex_quant.py"

   # Run chunked-prefill and prefix-cache test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v -k cpu_model \
       tests/basic_correctness/test_chunked_prefill.py"

   # online serving
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
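Aside: the `-m cpu_model` filter used throughout this script relies on a registered pytest marker. Below is a minimal sketch of how such a marker could be declared in a `conftest.py`; this scaffolding is hypothetical and not part of this PR (vLLM registers its markers in its own configuration).

```python
# conftest.py -- hypothetical sketch of marker registration.
def pytest_configure(config):
    # Registering the marker lets `pytest -m cpu_model` select exactly the
    # tests tagged with @pytest.mark.cpu_model, without unknown-mark warnings.
    config.addinivalue_line(
        "markers", "cpu_model: test is expected to pass on the CPU backend")
```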
tests/models/language/generation/test_common.py (6 additions, 2 deletions)

@@ -39,7 +39,7 @@
     [
         pytest.param(
             "bigscience/bloom-560m",  # bloom - testing alibi slopes
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
         ),
         pytest.param(
             "openai-community/gpt2",  # gpt2
@@ -87,7 +87,11 @@
         pytest.param("bigcode/starcoder2-3b"),  # starcoder2
         pytest.param(
             "TitanML/tiny-mixtral",  # mixtral
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
         ),
+        pytest.param(
+            "Qwen/Qwen1.5-MoE-A2.7B-Chat",
+            marks=[pytest.mark.cpu_model],
+        )
     ])
 @pytest.mark.parametrize("max_tokens", [32])
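The parametrizations above attach markers per parameter, so one test function can belong to several suites. Here is a self-contained sketch of the mechanism; the model ids come from the diff, but the test body is illustrative only.

```python
import pytest


@pytest.mark.parametrize("model", [
    # Selected by `pytest -m core_model`:
    pytest.param("TitanML/tiny-mixtral", marks=[pytest.mark.core_model]),
    # Selected by `pytest -m cpu_model`, as in the CPU CI script:
    pytest.param("Qwen/Qwen1.5-MoE-A2.7B-Chat", marks=[pytest.mark.cpu_model]),
])
def test_model_id_is_well_formed(model: str) -> None:
    org, _, name = model.partition("/")
    assert org and name  # Hugging Face model ids have the form "<org>/<name>"
```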
tests/models/language/pooling/test_embedding.py (11 additions, 12 deletions)

@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os

 import pytest

@@ -28,20 +27,24 @@ def v1(run_with_both_engines):
     # [Decoder-only]
     pytest.param("BAAI/bge-multilingual-gemma2",
                  marks=[pytest.mark.core_model]),
-    pytest.param("intfloat/e5-mistral-7b-instruct",
-                 marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+    pytest.param(
+        "intfloat/e5-mistral-7b-instruct",
+        # CPU v1 doesn't support sliding window
+        marks=[pytest.mark.core_model]),
     # the qwen models interfere with each other (see PR
     # https://github.com/vllm-project/vllm/pull/18720).
     # To avoid this problem, for now we skip v0 since it will be
     # deprecated anyway.
     pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                  marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
     # [Encoder-only]
-    pytest.param("BAAI/bge-base-en-v1.5",
-                 marks=[
-                     pytest.mark.core_model, pytest.mark.cpu_model,
-                     pytest.mark.skip_v1
-                 ]),
+    pytest.param(
+        "BAAI/bge-base-en-v1.5",
+        marks=[
+            # CPU only supports V1
+            pytest.mark.core_model,
+            pytest.mark.skip_v1
+        ]),
[Review comment on lines 42 to 47, by a Contributor (severity: high)]
The BAAI/bge-base-en-v1.5 model is marked with pytest.mark.cpu_model but also pytest.mark.skip_v1. Given the comment "CPU only supports V1", this combination means the test will never actually run on CPU. This appears to be a logical contradiction in the test configuration.
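To make the flagged interaction concrete, here is a hypothetical model of how the two marks combine, assuming the CPU backend always runs the V1 engine. The helper below is illustrative only, not a vLLM API.

```python
def runs_on_cpu(marks: set[str], cpu_engine: str = "v1") -> bool:
    # Assumption: the CPU backend forces V1, so a skip_v1 test can never
    # execute there, even when it also carries the cpu_model mark.
    if "skip_v1" in marks and cpu_engine == "v1":
        return False
    return "cpu_model" in marks


assert runs_on_cpu({"cpu_model"}) is True
assert runs_on_cpu({"cpu_model", "skip_v1"}) is False  # the contradiction
```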

     pytest.param("sentence-transformers/all-MiniLM-L12-v2",
                  marks=[pytest.mark.skip_v1]),
     pytest.param("intfloat/multilingual-e5-small",

@@ -60,10 +63,6 @@ def test_models(
     model,
     monkeypatch,
 ) -> None:
-    if model == "intfloat/e5-mistral-7b-instruct" and current_platform.is_cpu(
-    ) and os.environ.get("VLLM_USE_V1", "0") == "1":
-        pytest.skip("CPU V1 doesn't support sliding window")
-
     if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
         # ROCm Triton FA does not currently support sliding window attention
         # switch to use ROCm CK FA backend
tests/models/language/pooling/test_reward.py (5 additions, 0 deletions)

@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
 import pytest
 import torch
 import torch.nn.functional as F
@@ -84,6 +86,9 @@ def test_prm_models(
     dtype: str,
     monkeypatch,
 ) -> None:
+    if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
+        pytest.skip("CPU only supports V1")
+
     if current_platform.is_rocm():
         # ROCm Triton FA does not currently support sliding window attention
         # switch to use ROCm CK FA backend
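The guard added above is a pattern this PR repeats. Below is a hedged sketch of the same check factored into a helper; `skip_if_cpu_v0` is a name invented here, not an existing vLLM utility.

```python
import os

import pytest
from vllm.platforms import current_platform  # assumed import path


def skip_if_cpu_v0() -> None:
    # Mirror the inline guard: the CPU backend only supports the V1 engine,
    # so skip whenever V0 is requested (or implied by the default).
    if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
        pytest.skip("CPU only supports V1")
```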
tests/quantization/test_compressed_tensors.py (2 additions, 1 deletion)

@@ -45,7 +45,8 @@ def use_v0_only(monkeypatch):
"""
This module relies on V0 internals, so set VLLM_USE_V1=0.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
if not current_platform.is_cpu():
monkeypatch.setenv('VLLM_USE_V1', '0')


@pytest.mark.parametrize(
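Taken together with its docstring, the fixture after this change forces V0 everywhere except on CPU, where only V1 exists. A minimal standalone sketch of the resulting behavior follows (fixture body as in the diff; the import path is assumed from vLLM's test conventions).

```python
import pytest

from vllm.platforms import current_platform  # assumed import path


@pytest.fixture(autouse=True)
def use_v0_only(monkeypatch):
    """Force V0 on non-CPU platforms; the CPU backend only supports V1."""
    if not current_platform.is_cpu():
        monkeypatch.setenv('VLLM_USE_V1', '0')
```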