From ed82c25f76fb587b3b84ad69b037f047b0c40a72 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Tue, 16 Apr 2024 17:47:26 +0800
Subject: [PATCH 01/25] [CI] enable intel queue for longer CPU tests

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/test-template.j2 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index 265833e2ccf6..7e986c988407 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -40,6 +40,8 @@ steps:
 
   - label: "Intel Test"
     depends_on: ~
+    agents:
+      queue: intel
     command: bash .buildkite/run-cpu-test.sh
 
   {% for step in steps %}

From e6d9507099be7e2391601938327b068d3c502c2d Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Wed, 17 Apr 2024 20:50:04 +0800
Subject: [PATCH 02/25] enable more tests

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 414045fe163e..6f3be0b34701 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -11,4 +11,24 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
+<<<<<<< HEAD
 docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
+=======
+docker run -itd --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+
+# offline inference
+docker exec cpu-test bash -c "python3 examples/offline_inference.py"
+
+# async engine test
+#docker exec cpu-test bash -c "cd tests; pytest -v -s async_engine"
+
+# Run basic model test
+docker exec cpu-test bash -c "cd tests;
+  pip install pytest Pillow
+  rm -f __init__.py
+  sed -i '/*stablelm-3b-4e1t/d' models/test_models.py
+  sed -i '/torch.cuda.empty_cache/d' conftest.py
+  sed -i 's/cuda/cpu/g' conftest.py
+  bash ../.buildkite/download-images.sh
+  pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py"
+>>>>>>> ca0870d3... enable more tests

From b04c6b1c1b63abaf40f506a58e189bebe1b1bff6 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Thu, 18 Apr 2024 08:10:58 +0800
Subject: [PATCH 03/25] ignore models

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh  | 10 ++++------
 tests/models/test_models.py |  1 -
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 6f3be0b34701..ad4ec6ddca12 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -11,9 +11,7 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-<<<<<<< HEAD
-docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
-=======
+
 docker run -itd --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
 
 # offline inference
@@ -26,9 +24,9 @@ docker exec cpu-test bash -c "python3 examples/offline_inference.py"
 docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow
   rm -f __init__.py
-  sed -i '/*stablelm-3b-4e1t/d' models/test_models.py
+  sed -i '/\"stabilityai/d' models/test_models.py
   sed -i '/torch.cuda.empty_cache/d' conftest.py
   sed -i 's/cuda/cpu/g' conftest.py
   bash ../.buildkite/download-images.sh
-  pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py"
->>>>>>> ca0870d3... enable more tests
+  pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py --ignore=models/test_big_models.py"
+
diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index e4609620387f..539bddc23f19 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -14,7 +14,6 @@
     "EleutherAI/pythia-70m",
     "bigscience/bloom-560m",  # Testing alibi slopes.
     "microsoft/phi-2",
-    "stabilityai/stablelm-3b-4e1t",
     # "allenai/OLMo-1B",  # Broken
     "bigcode/starcoder2-3b",
 ]

From 2520dbff725be43fe78dea68e189920cde86b6e3 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Thu, 18 Apr 2024 13:34:25 +0800
Subject: [PATCH 04/25] adding big model test

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index ad4ec6ddca12..3ca9160e5daf 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -30,3 +30,7 @@ docker exec cpu-test bash -c "cd tests;
   bash ../.buildkite/download-images.sh
   pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py --ignore=models/test_big_models.py"
 
+# Run big model test
+docker exec cpu-test bash -c "cd tests;
+  sed -i 's/half/float/g' models/test_big_models.py
+  pytest -v -s models/test_big_models.py"

From 06a5eae867889f1c42521599d5bba0974203b593 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Thu, 18 Apr 2024 14:01:55 +0800
Subject: [PATCH 05/25] disable big model test

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 3ca9160e5daf..014033a280b0 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -31,6 +31,6 @@ docker exec cpu-test bash -c "cd tests;
   pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py --ignore=models/test_big_models.py"
 
 # Run big model test
-docker exec cpu-test bash -c "cd tests;
-  sed -i 's/half/float/g' models/test_big_models.py
-  pytest -v -s models/test_big_models.py"
+#docker exec cpu-test bash -c "cd tests;
+#  sed -i 's/half/float/g' models/test_big_models.py
+#  pytest -v -s models/test_big_models.py"

From 997f32b10071e5d31f86691dce077b4ebf880884 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Thu, 18 Apr 2024 15:03:28 +0800
Subject: [PATCH 06/25] add device check in conftest

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh  |  5 +----
 tests/conftest.py           | 25 +++++++++++++++++--------
 tests/models/test_models.py |  1 +
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 014033a280b0..f761ab8a365a 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -11,8 +11,7 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-
-docker run -itd --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
 
 # offline inference
 docker exec cpu-test bash -c "python3 examples/offline_inference.py"
@@ -25,8 +24,6 @@ docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow
   rm -f __init__.py
   sed -i '/\"stabilityai/d' models/test_models.py
-  sed -i '/torch.cuda.empty_cache/d' conftest.py
-  sed -i 's/cuda/cpu/g' conftest.py
   bash ../.buildkite/download-images.sh
   pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py --ignore=models/test_big_models.py"
 
diff --git a/tests/conftest.py b/tests/conftest.py
index e749338e1095..c25d5038bf1c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -20,6 +20,8 @@
 from vllm.sequence import SampleLogprobs
 
 logger = init_logger(__name__)
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils import is_cpu
 
 _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
@@ -58,7 +60,8 @@ def cleanup():
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
     gc.collect()
-    torch.cuda.empty_cache()
+    if not is_cpu():
+        torch.cuda.empty_cache()
 
 
 @pytest.fixture()
@@ -151,6 +154,12 @@ def example_long_prompts() -> List[str]:
 
 class HfRunner:
 
+    def wrap_device(self, input: any):
+        if is_cpu():
+            return input.cpu()
+        else:
+            return input.cuda()
+
     def __init__(
         self,
         model_name: str,
@@ -164,16 +173,16 @@ def __init__(
         if model_name in _EMBEDDING_MODELS:
             # Lazy init required for AMD CI
             from sentence_transformers import SentenceTransformer
-            self.model = SentenceTransformer(
+            self.model = self.wrap_device(SentenceTransformer(
                 model_name,
                 device="cpu",
-            ).to(dtype=torch_dtype).cuda()
+            ).to(dtype=torch_dtype))
         else:
-            self.model = AutoModelForCausalLM.from_pretrained(
+            self.model = self.wrap_device(AutoModelForCausalLM.from_pretrained(
                 model_name,
                 torch_dtype=torch_dtype,
                 trust_remote_code=True,
-            ).cuda()
+            ))
 
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_name,
@@ -214,7 +223,7 @@ def generate(
             inputs = self.processor(**processor_kwargs)
 
             output_ids = self.model.generate(
-                **inputs.to("cuda"),
+                self.wrap_device(**inputs),
                 use_cache=True,
                 **kwargs,
             )
@@ -271,7 +280,7 @@ def generate_greedy_logprobs(
         for prompt in prompts:
             input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
             output = self.model.generate(
-                input_ids.cuda(),
+                self.wrap_device(input_ids),
                 use_cache=True,
                 do_sample=False,
                 max_new_tokens=max_tokens,
@@ -306,7 +315,7 @@ def generate_greedy_logprobs_limit(
         for prompt in prompts:
             input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
             output = self.model.generate(
-                input_ids.cuda(),
+                self.wrap_device(input_ids),
                 use_cache=True,
                 do_sample=False,
                 max_new_tokens=max_tokens,
diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index 539bddc23f19..e4609620387f 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -14,6 +14,7 @@
     "EleutherAI/pythia-70m",
     "bigscience/bloom-560m",  # Testing alibi slopes.
     "microsoft/phi-2",
+    "stabilityai/stablelm-3b-4e1t",
     # "allenai/OLMo-1B",  # Broken
     "bigcode/starcoder2-3b",
 ]

From e9fd4cb0d49af8f6f02e56a8bd2a31ae347d5769 Mon Sep 17 00:00:00 2001
From: "jiang1.li" <jiang1.li@intel.com>
Date: Thu, 18 Apr 2024 15:07:22 +0000
Subject: [PATCH 07/25] Fix corner case in pos_encoding.

---
 csrc/cpu/pos_encoding.cpp | 107 +++++++++++++++++++-------------------
 1 file changed, 54 insertions(+), 53 deletions(-)

diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp
index 73bf77e46f53..dad1c6055b31 100644
--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@@ -21,73 +21,74 @@ void rotary_embedding_impl(
   constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
 
   const int embed_dim = rot_dim / 2;
-  TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0);
+  bool flag = (embed_dim % VEC_ELEM_NUM == 0);
+  const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM;
 
-#pragma omp parallel for
-  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
-    int64_t pos = positions[token_idx];
-    const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
-
-    for (int i = 0; i < num_heads; ++i) {
-      const int head_idx = i;
-      const int64_t token_head =
-          token_idx * query_stride + head_idx * head_size;
-      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
-        const int rot_offset = j;
-        const int x_index = rot_offset;
-        const int y_index = embed_dim + rot_offset;
+  auto compute_loop = [&](const int64_t token_head, const scalar_t *cache_ptr,
+                          scalar_t *qk) {
+    int j = 0;
+    for (; j < loop_upper; j += VEC_ELEM_NUM) {
+      const int rot_offset = j;
+      const int x_index = rot_offset;
+      const int y_index = embed_dim + rot_offset;
 
-        const int64_t out_x = token_head + x_index;
-        const int64_t out_y = token_head + y_index;
+      const int64_t out_x = token_head + x_index;
+      const int64_t out_y = token_head + y_index;
 
-        const scalar_vec_t cos(cache_ptr + x_index);
-        const scalar_vec_t sin(cache_ptr + y_index);
+      const scalar_vec_t cos(cache_ptr + x_index);
+      const scalar_vec_t sin(cache_ptr + y_index);
 
-        const scalar_vec_t q_x(query + out_x);
-        const scalar_vec_t q_y(query + out_y);
+      const scalar_vec_t q_x(qk + out_x);
+      const scalar_vec_t q_y(qk + out_y);
 
-        vec_op::FP32Vec8 fp32_cos(cos);
-        vec_op::FP32Vec8 fp32_sin(sin);
+      vec_op::FP32Vec8 fp32_cos(cos);
+      vec_op::FP32Vec8 fp32_sin(sin);
 
-        vec_op::FP32Vec8 fp32_q_x(q_x);
-        vec_op::FP32Vec8 fp32_q_y(q_y);
+      vec_op::FP32Vec8 fp32_q_x(q_x);
+      vec_op::FP32Vec8 fp32_q_y(q_y);
 
-        auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
-        scalar_vec_t(out1).save(query + out_x);
+      auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
+      scalar_vec_t(out1).save(qk + out_x);
 
-        auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
-        scalar_vec_t(out2).save(query + out_y);
-      }
+      auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
+      scalar_vec_t(out2).save(qk + out_y);
     }
-
-    for (int i = 0; i < num_kv_heads; ++i) {
-      const int head_idx = i;
-      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
-      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
-        const int rot_offset = j;
-        const int x_index = rot_offset;
-        const int y_index = embed_dim + rot_offset;
+    if (!flag) {
+      for (; j < embed_dim; ++j) {
+        const int x_index = j;
+        const int y_index = embed_dim + j;
 
         const int64_t out_x = token_head + x_index;
         const int64_t out_y = token_head + y_index;
 
-        const scalar_vec_t cos(cache_ptr + x_index);
-        const scalar_vec_t sin(cache_ptr + y_index);
+        const float fp32_cos = cache_ptr[x_index];
+        const float fp32_sin = cache_ptr[y_index];
 
-        const scalar_vec_t k_x(key + out_x);
-        const scalar_vec_t k_y(key + out_y);
+        const float fp32_q_x = qk[out_x];
+        const float fp32_q_y = qk[out_y];
 
-        vec_op::FP32Vec8 fp32_cos(cos);
-        vec_op::FP32Vec8 fp32_sin(sin);
+        qk[out_x] = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
+        qk[out_y] = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
+      }
+    }
+  };
 
-        vec_op::FP32Vec8 fp32_k_x(k_x);
-        vec_op::FP32Vec8 fp32_k_y(k_y);
+#pragma omp parallel for
+  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
+    int64_t pos = positions[token_idx];
+    const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;
 
-        auto out1 = fp32_k_x * fp32_cos - fp32_k_y * fp32_sin;
-        scalar_vec_t(out1).save(key + out_x);
-        auto out2 = fp32_k_y * fp32_cos + fp32_k_x * fp32_sin;
-        scalar_vec_t(out2).save(key + out_y);
-      }
+    for (int i = 0; i < num_heads; ++i) {
+      const int head_idx = i;
+      const int64_t token_head =
+          token_idx * query_stride + head_idx * head_size;
+      compute_loop(token_head, cache_ptr, query);
+    }
+
+    for (int i = 0; i < num_kv_heads; ++i) {
+      const int head_idx = i;
+      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
+      compute_loop(token_head, cache_ptr, key);
     }
   }
 }
@@ -166,9 +167,9 @@ void rotary_embedding_gptj_impl(
 }
 };  // namespace
 
-void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                      torch::Tensor& key, int head_size,
-                      torch::Tensor& cos_sin_cache, bool is_neox) {
+void rotary_embedding(torch::Tensor &positions, torch::Tensor &query,
+                      torch::Tensor &key, int head_size,
+                      torch::Tensor &cos_sin_cache, bool is_neox) {
   int num_tokens = query.numel() / query.size(-1);
   int rot_dim = cos_sin_cache.size(1);
   int num_heads = query.size(-1) / head_size;

From fb00ea2267a67e8b0a4c3978ca3a6355f8a24f20 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Fri, 19 Apr 2024 07:54:30 +0800
Subject: [PATCH 08/25] enable stabilityai model

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index f761ab8a365a..425220df57c2 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -23,7 +23,6 @@ docker exec cpu-test bash -c "python3 examples/offline_inference.py"
 docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow
   rm -f __init__.py
-  sed -i '/\"stabilityai/d' models/test_models.py
   bash ../.buildkite/download-images.sh
   pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py --ignore=models/test_big_models.py"
 

From 3ed0edeaad6c6d7afc354e0fb15bf5035a9d6b1d Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Sat, 20 Apr 2024 08:33:21 +0800
Subject: [PATCH 09/25] using torch.cuda.is_available() to check the device
 type

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 tests/conftest.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index c25d5038bf1c..be555ba6cc03 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -21,7 +21,6 @@
 
 logger = init_logger(__name__)
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from vllm.utils import is_cpu
 
 _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
@@ -60,7 +59,7 @@ def cleanup():
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
     gc.collect()
-    if not is_cpu():
+    if torch.cuda.is_available():
         torch.cuda.empty_cache()
 
 
@@ -155,10 +154,10 @@ def example_long_prompts() -> List[str]:
 class HfRunner:
 
     def wrap_device(self, input: any):
-        if is_cpu():
-            return input.cpu()
-        else:
+        if torch.cuda.is_available():
             return input.cuda()
+        else:
+            return input.cpu()
 
     def __init__(
         self,

From 0ec2b79008ccab4e31c933aaedc2713d554a0b66 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Thu, 23 May 2024 13:35:15 +0800
Subject: [PATCH 10/25] fix format

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 csrc/cpu/pos_encoding.cpp | 12 ++++++------
 tests/conftest.py         | 21 +++++++++++----------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp
index dad1c6055b31..e8aead17ae5a 100644
--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@@ -24,8 +24,8 @@ void rotary_embedding_impl(
   bool flag = (embed_dim % VEC_ELEM_NUM == 0);
   const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM;
 
-  auto compute_loop = [&](const int64_t token_head, const scalar_t *cache_ptr,
-                          scalar_t *qk) {
+  auto compute_loop = [&](const int64_t token_head, const scalar_t* cache_ptr,
+                          scalar_t* qk) {
     int j = 0;
     for (; j < loop_upper; j += VEC_ELEM_NUM) {
       const int rot_offset = j;
@@ -76,7 +76,7 @@ void rotary_embedding_impl(
 #pragma omp parallel for
   for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
     int64_t pos = positions[token_idx];
-    const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;
+    const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
 
     for (int i = 0; i < num_heads; ++i) {
       const int head_idx = i;
@@ -167,9 +167,9 @@ void rotary_embedding_gptj_impl(
 }
 };  // namespace
 
-void rotary_embedding(torch::Tensor &positions, torch::Tensor &query,
-                      torch::Tensor &key, int head_size,
-                      torch::Tensor &cos_sin_cache, bool is_neox) {
+void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
+                      torch::Tensor& key, int head_size,
+                      torch::Tensor& cos_sin_cache, bool is_neox) {
   int num_tokens = query.numel() / query.size(-1);
   int rot_dim = cos_sin_cache.size(1);
   int num_heads = query.size(-1) / head_size;
diff --git a/tests/conftest.py b/tests/conftest.py
index be555ba6cc03..e09034d4ffeb 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -20,7 +20,6 @@
 from vllm.sequence import SampleLogprobs
 
 logger = init_logger(__name__)
-from vllm.transformers_utils.tokenizer import get_tokenizer
 
 _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
@@ -172,16 +171,18 @@ def __init__(
         if model_name in _EMBEDDING_MODELS:
             # Lazy init required for AMD CI
             from sentence_transformers import SentenceTransformer
-            self.model = self.wrap_device(SentenceTransformer(
-                model_name,
-                device="cpu",
-            ).to(dtype=torch_dtype))
+            self.model = self.wrap_device(
+                SentenceTransformer(
+                    model_name,
+                    device="cpu",
+                ).to(dtype=torch_dtype))
         else:
-            self.model = self.wrap_device(AutoModelForCausalLM.from_pretrained(
-                model_name,
-                torch_dtype=torch_dtype,
-                trust_remote_code=True,
-            ))
+            self.model = self.wrap_device(
+                AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    torch_dtype=torch_dtype,
+                    trust_remote_code=True,
+                ))
 
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_name,

From 9de3c52236f491038890650dae0985a40d3bc564 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Thu, 23 May 2024 14:12:50 +0800
Subject: [PATCH 11/25] ignore failing tests for now

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 425220df57c2..e6e00bb1bb20 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -24,7 +24,9 @@ docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow
   rm -f __init__.py
   bash ../.buildkite/download-images.sh
-  pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py --ignore=models/test_big_models.py"
+  pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py \
+    --ignore=models/test_big_models.py --ignore=models/test_aqlm.py --ignore=models/test_fp8.py --ignore=models/test_gptq_marlin.py \
+    --ignore=models/test_gptq_marlin_24.py"
 
 # Run big model test
 #docker exec cpu-test bash -c "cd tests;

From 0bda6d91e282a3fa86f25cc204d575c0f16dbd5b Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Thu, 23 May 2024 14:45:06 +0800
Subject: [PATCH 12/25] ignore embedding test

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index e6e00bb1bb20..ccdec1203000 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -26,7 +26,7 @@ docker exec cpu-test bash -c "cd tests;
   bash ../.buildkite/download-images.sh
   pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py \
     --ignore=models/test_big_models.py --ignore=models/test_aqlm.py --ignore=models/test_fp8.py --ignore=models/test_gptq_marlin.py \
-    --ignore=models/test_gptq_marlin_24.py"
+    --ignore=models/test_gptq_marlin_24.py --ignore=models/test_embedding.py"
 
 # Run big model test
 #docker exec cpu-test bash -c "cd tests;

From a93ad948b38d1ad3f20c34ef93fc2f2b41ba32aa Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Mon, 27 May 2024 08:33:37 +0800
Subject: [PATCH 13/25] fix test dir

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index ccdec1203000..e755bba8cea8 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -14,13 +14,13 @@ remove_docker_container
 docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
 
 # offline inference
-docker exec cpu-test bash -c "python3 examples/offline_inference.py"
+docker exec cpu-test bash -c "python3 vllm/examples/offline_inference.py"
 
 # async engine test
 #docker exec cpu-test bash -c "cd tests; pytest -v -s async_engine"
 
 # Run basic model test
-docker exec cpu-test bash -c "cd tests;
+docker exec cpu-test bash -c "cd vllm/tests;
   pip install pytest Pillow
   rm -f __init__.py
   bash ../.buildkite/download-images.sh

From 774eba0e7c378a674581d7e0363ab747845930b6 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Tue, 28 May 2024 07:33:48 +0800
Subject: [PATCH 14/25] fix CUDA-only tests on machines without CUDA

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh          | 10 ++++------
 tests/models/test_aqlm.py           | 11 +++++++----
 tests/models/test_fp8.py            | 11 +++++++----
 tests/models/test_gptq_marlin.py    | 11 +++++++----
 tests/models/test_gptq_marlin_24.py | 11 +++++++----
 tests/models/test_marlin.py         | 11 +++++++----
 6 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index e755bba8cea8..45b4bf2e6bd9 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -10,23 +10,21 @@ remove_docker_container() { docker rm -f cpu-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
-# Run the image and launch offline inference
+# Run the image
 docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
 
 # offline inference
 docker exec cpu-test bash -c "python3 vllm/examples/offline_inference.py"
 
-# async engine test
+# async engine test, not passing due to distributed inference support missing
 #docker exec cpu-test bash -c "cd tests; pytest -v -s async_engine"
 
 # Run basic model test
 docker exec cpu-test bash -c "cd vllm/tests;
   pip install pytest Pillow
-  rm -f __init__.py
   bash ../.buildkite/download-images.sh
-  pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py \
-    --ignore=models/test_big_models.py --ignore=models/test_aqlm.py --ignore=models/test_fp8.py --ignore=models/test_gptq_marlin.py \
-    --ignore=models/test_gptq_marlin_24.py --ignore=models/test_embedding.py"
+  pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py \
+    --ignore=models/test_big_models.py --ignore=models/test_embedding.py"
 
 # Run big model test
 #docker exec cpu-test bash -c "cd tests;
diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py
index a7abc011f57d..31cf30de6eca 100644
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -8,10 +8,13 @@
 
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
-aqlm_not_supported = (capability <
-                      QUANTIZATION_METHODS["aqlm"].get_min_capability())
+aqlm_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    aqlm_not_supported = (capability
+                          < QUANTIZATION_METHODS["aqlm"].get_min_capability())
 
 # In this test we hardcode prompts and generations for the model so we don't
 # need to require the AQLM package as a dependency
diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py
index 0a5819ea3f05..b52b57cb5cd5 100644
--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -67,10 +67,13 @@
     },
 }
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
-fp8_not_supported = (capability <
-                     QUANTIZATION_METHODS["fp8"].get_min_capability())
+fp8_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    fp8_not_supported = (capability
+                         < QUANTIZATION_METHODS["fp8"].get_min_capability())
 
 
 @pytest.mark.skipif(fp8_not_supported,
diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
index 1fc0b3f23912..814471b47763 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -22,10 +22,13 @@
 
 MAX_MODEL_LEN = 1024
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
-gptq_marlin_not_supported = (
-    capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
+gptq_marlin_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    gptq_marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
 
 MODELS = [
     # act_order==False, group_size=channelwise
diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py
index 3e6ffb7f90fc..cc35ee803ff0 100644
--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/test_gptq_marlin_24.py
@@ -14,10 +14,13 @@
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
-marlin_not_supported = (capability <
-                        QUANTIZATION_METHODS["marlin"].get_min_capability())
+marlin_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
 
 
 @dataclass
diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py
index 37c1664afec5..8520b26718bf 100644
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -19,10 +19,13 @@
 
 from .utils import check_logprobs_close
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
-marlin_not_supported = (capability <
-                        QUANTIZATION_METHODS["marlin"].get_min_capability())
+marlin_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
 
 
 @dataclass

From 913eb24c6e4b51154999a38943066a9e65b882b6 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Tue, 28 May 2024 08:07:09 +0800
Subject: [PATCH 15/25] add local develop

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 Dockerfile.cpu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index aec79824213f..12fa84a99561 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -15,6 +15,7 @@ WORKDIR /workspace/vllm
 
 RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
 
+RUN VLLM_TARGET_DEVICE=cpu python3 setup.py develop
 RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
 WORKDIR /workspace/

From 9f299d5a7f8df405048031413fb4b7c1bfe3470a Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Tue, 28 May 2024 08:23:08 +0800
Subject: [PATCH 16/25] fix format

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 tests/models/test_aqlm.py | 4 ++--
 tests/models/test_fp8.py  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py
index 31cf30de6eca..85d74f7f5b03 100644
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -13,8 +13,8 @@
 if torch.cuda.is_available():
     capability = torch.cuda.get_device_capability()
     capability = capability[0] * 10 + capability[1]
-    aqlm_not_supported = (capability
-                          < QUANTIZATION_METHODS["aqlm"].get_min_capability())
+    aqlm_not_supported = (capability <
+                          QUANTIZATION_METHODS["aqlm"].get_min_capability())
 
 # In this test we hardcode prompts and generations for the model so we don't
 # need to require the AQLM package as a dependency
diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py
index b52b57cb5cd5..61aee0d0a6e9 100644
--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -72,8 +72,8 @@
 if torch.cuda.is_available():
     capability = torch.cuda.get_device_capability()
     capability = capability[0] * 10 + capability[1]
-    fp8_not_supported = (capability
-                         < QUANTIZATION_METHODS["fp8"].get_min_capability())
+    fp8_not_supported = (capability <
+                         QUANTIZATION_METHODS["fp8"].get_min_capability())
 
 
 @pytest.mark.skipif(fp8_not_supported,

From 22d5228b739f097acb389e4d84380102e061242c Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Tue, 28 May 2024 15:43:31 +0800
Subject: [PATCH 17/25] soft link tests folder

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 13 +++++++------
 Dockerfile.cpu             |  3 ++-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 45b4bf2e6bd9..47f9d3c40584 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -14,19 +14,20 @@ remove_docker_container
 docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
 
 # offline inference
-docker exec cpu-test bash -c "python3 vllm/examples/offline_inference.py"
+docker exec cpu-test bash -c "python3 examples/offline_inference.py"
 
 # async engine test, not passing due to distributed inference support missing
 #docker exec cpu-test bash -c "cd tests; pytest -v -s async_engine"
 
 # Run basic model test
-docker exec cpu-test bash -c "cd vllm/tests;
+docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow
   bash ../.buildkite/download-images.sh
-  pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py \
-    --ignore=models/test_big_models.py --ignore=models/test_embedding.py"
+  cd ../
+  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_mistral.py \
+    --ignore=tests/models/test_big_models.py --ignore=tests/models/test_embedding.py"
 
 # Run big model test
 #docker exec cpu-test bash -c "cd tests;
-#  sed -i 's/half/float/g' models/test_big_models.py
-#  pytest -v -s models/test_big_models.py"
+#  sed -i 's/half/float/g' tests/models/test_big_tests/models.py
+#  pytest -v -s tests/models/test_big_tests/models.py"
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 12fa84a99561..cd39221fc927 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -15,9 +15,10 @@ WORKDIR /workspace/vllm
 
 RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
 
-RUN VLLM_TARGET_DEVICE=cpu python3 setup.py develop
 RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
 WORKDIR /workspace/
 
+RUN ln -s /workspace/vllm/tests  && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
 CMD ["/bin/bash"]

From 3208d4ee837819801a17629259a497a3db667203 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Tue, 28 May 2024 16:32:57 +0800
Subject: [PATCH 18/25] enable big model test

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 47f9d3c40584..7a711d7d7839 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -16,9 +16,6 @@ docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host
 # offline inference
 docker exec cpu-test bash -c "python3 examples/offline_inference.py"
 
-# async engine test, not passing due to distributed inference support missing
-#docker exec cpu-test bash -c "cd tests; pytest -v -s async_engine"
-
 # Run basic model test
 docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow
@@ -28,6 +25,6 @@ docker exec cpu-test bash -c "cd tests;
     --ignore=tests/models/test_big_models.py --ignore=tests/models/test_embedding.py"
 
 # Run big model test
-#docker exec cpu-test bash -c "cd tests;
-#  sed -i 's/half/float/g' tests/models/test_big_tests/models.py
-#  pytest -v -s tests/models/test_big_tests/models.py"
+docker exec cpu-test bash -c "
+  sed -i 's/half/float/g' tests/models/test_big_models.py
+  pytest -v -s tests/models/test_big_models.py"

From 1be1f1f86a4b176f1f6ac41c6626274decd15b5e Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Tue, 28 May 2024 17:23:40 +0800
Subject: [PATCH 19/25] adding hf token

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 7a711d7d7839..80003666e57c 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -11,7 +11,7 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image
-docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
 
 # offline inference
 docker exec cpu-test bash -c "python3 examples/offline_inference.py"

From ffb647f387cbcd8b904beafbdf37447b935438f7 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Tue, 28 May 2024 22:05:53 +0800
Subject: [PATCH 20/25] enable test mistral

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 80003666e57c..96f8a7a4fef2 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -18,13 +18,14 @@ docker exec cpu-test bash -c "python3 examples/offline_inference.py"
 
 # Run basic model test
 docker exec cpu-test bash -c "cd tests;
-  pip install pytest Pillow
+  pip install pytest Pillow protobuf
   bash ../.buildkite/download-images.sh
   cd ../
-  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_mistral.py \
+  pytest -v -s tests/models --ignore=tests/models/test_llava.py \
     --ignore=tests/models/test_big_models.py --ignore=tests/models/test_embedding.py"
 
 # Run big model test
 docker exec cpu-test bash -c "
+  #TODO: remove this after CPU float16 support ready
   sed -i 's/half/float/g' tests/models/test_big_models.py
   pytest -v -s tests/models/test_big_models.py"

From e019d215be8f062e00980dbd1e787657f36663f8 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Wed, 29 May 2024 08:30:09 +0800
Subject: [PATCH 21/25] use float dtype for big model tests with CPU backend

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh      |  9 +--------
 tests/models/test_big_models.py | 10 ++++++++--
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 96f8a7a4fef2..64b52952360c 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -21,11 +21,4 @@ docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow protobuf
   bash ../.buildkite/download-images.sh
   cd ../
-  pytest -v -s tests/models --ignore=tests/models/test_llava.py \
-    --ignore=tests/models/test_big_models.py --ignore=tests/models/test_embedding.py"
-
-# Run big model test
-docker exec cpu-test bash -c "
-  #TODO: remove this after CPU float16 support ready
-  sed -i 's/half/float/g' tests/models/test_big_models.py
-  pytest -v -s tests/models/test_big_models.py"
+  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py"
diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py
index 10e7c64e34e7..ea95e6a49f03 100644
--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -5,6 +5,7 @@
 Run `pytest tests/models/test_big_models.py`.
 """
 import pytest
+import torch
 
 MODELS = [
     "meta-llama/Llama-2-7b-hf",
@@ -16,9 +17,14 @@
     # "Qwen/Qwen1.5-0.5B"  # Broken,
 ]
 
+#TODO: remove this after CPU float16 support ready
+target_dtype = "float"
+if torch.cuda.is_available():
+    target_dtype = "half"
+
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [32])
 def test_models(
     hf_runner,
@@ -46,7 +52,7 @@ def test_models(
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", [target_dtype])
 def test_model_print(
     vllm_runner,
     model: str,

From a7e25a36bc6309d32517c1b12f1b5b2fbb4b0536 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Thu, 30 May 2024 11:55:08 +0800
Subject: [PATCH 22/25] fix failed CI on CUDA spawn issue

torch.cuda.is_available() does not work well in the multiprocessing case,
so we switch to is_cpu() to check the device

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 tests/conftest.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index e09034d4ffeb..3731f85e4dd3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,6 +18,7 @@
 from vllm.multimodal import MultiModalData
 from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 from vllm.sequence import SampleLogprobs
+from vllm.utils import is_cpu
 
 logger = init_logger(__name__)
 
@@ -58,7 +59,7 @@ def cleanup():
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
     gc.collect()
-    if torch.cuda.is_available():
+    if not is_cpu():
         torch.cuda.empty_cache()
 
 
@@ -153,7 +154,7 @@ def example_long_prompts() -> List[str]:
 class HfRunner:
 
     def wrap_device(self, input: any):
-        if torch.cuda.is_available():
+        if not is_cpu():
             return input.cuda()
         else:
             return input.cpu()

From eca1bafd0feb0d61e3b8781aa4419256f36419c6 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Mon, 3 Jun 2024 08:22:25 +0800
Subject: [PATCH 23/25] fix rebase issue

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 tests/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 3731f85e4dd3..795ff70e60a8 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -224,7 +224,7 @@ def generate(
             inputs = self.processor(**processor_kwargs)
 
             output_ids = self.model.generate(
-                self.wrap_device(**inputs),
+                **self.wrap_device(inputs),
                 use_cache=True,
                 **kwargs,
             )

From 7f0a34420a4dfabf3d3e66e4479f32215ee24dbc Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Mon, 3 Jun 2024 12:55:51 +0800
Subject: [PATCH 24/25] fix rebase

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
---
 .buildkite/run-cpu-test.sh | 2 +-
 Dockerfile.cpu             | 4 +++-
 tests/conftest.py          | 4 ++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 64b52952360c..d1200ee84dfe 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -21,4 +21,4 @@ docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow protobuf
   bash ../.buildkite/download-images.sh
   cd ../
-  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py"
+  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index cd39221fc927..ae23e27b413b 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -1,6 +1,6 @@
 # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
 
-FROM ubuntu:22.04
+FROM ubuntu:22.04 AS cpu-test-1
 
 RUN apt-get update  -y \
     && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
@@ -9,6 +9,8 @@ RUN apt-get update  -y \
 RUN pip install --upgrade pip \
     && pip install wheel packaging ninja setuptools>=49.4.0 numpy
 
+FROM cpu-test-1 AS build
+
 COPY ./ /workspace/vllm
 
 WORKDIR /workspace/vllm
diff --git a/tests/conftest.py b/tests/conftest.py
index 795ff70e60a8..764374a779d9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -155,9 +155,9 @@ class HfRunner:
 
     def wrap_device(self, input: any):
         if not is_cpu():
-            return input.cuda()
+            return input.to("cuda")
         else:
-            return input.cpu()
+            return input.to("cpu")
 
     def __init__(
         self,

From 1eeb09d00c51786ed0f292ad93cb4f7621bb91a7 Mon Sep 17 00:00:00 2001
From: Yuan Zhou <yuan.zhou@intel.com>
Date: Mon, 3 Jun 2024 20:30:00 +0800
Subject: [PATCH 25/25] Trigger CI