From ed82c25f76fb587b3b84ad69b037f047b0c40a72 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Tue, 16 Apr 2024 17:47:26 +0800 Subject: [PATCH 01/25] [CI] enable intel queue for longer CPU tests Signed-off-by: Yuan Zhou --- .buildkite/test-template.j2 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 265833e2ccf6..7e986c988407 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -40,6 +40,8 @@ steps: - label: "Intel Test" depends_on: ~ + agents: + queue: intel command: bash .buildkite/run-cpu-test.sh {% for step in steps %} From e6d9507099be7e2391601938327b068d3c502c2d Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Wed, 17 Apr 2024 20:50:04 +0800 Subject: [PATCH 02/25] enable more tests Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 414045fe163e..6f3be0b34701 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -11,4 +11,24 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference +<<<<<<< HEAD docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py +======= +docker run -itd --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test + +# offline inference +docker exec cpu-test bash -c "python3 examples/offline_inference.py" + +# async engine test +#docker exec cpu-test bash -c "cd tests; pytest -v -s async_engine" + +# Run basic model test +docker exec cpu-test bash -c "cd tests; + pip install pytest Pillow + rm -f __init__.py + sed -i '/*stablelm-3b-4e1t/d' models/test_models.py + sed -i '/torch.cuda.empty_cache/d' conftest.py + sed -i 's/cuda/cpu/g' conftest.py + bash ../.buildkite/download-images.sh + pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py 
--ignore=models/test_marlin.py" +>>>>>>> ca0870d3... enable more tests From b04c6b1c1b63abaf40f506a58e189bebe1b1bff6 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Thu, 18 Apr 2024 08:10:58 +0800 Subject: [PATCH 03/25] ignore models Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 10 ++++------ tests/models/test_models.py | 1 - 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 6f3be0b34701..ad4ec6ddca12 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -11,9 +11,7 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -<<<<<<< HEAD -docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py -======= + docker run -itd --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test # offline inference @@ -26,9 +24,9 @@ docker exec cpu-test bash -c "python3 examples/offline_inference.py" docker exec cpu-test bash -c "cd tests; pip install pytest Pillow rm -f __init__.py - sed -i '/*stablelm-3b-4e1t/d' models/test_models.py + sed -i '/\"stabilityai/d' models/test_models.py sed -i '/torch.cuda.empty_cache/d' conftest.py sed -i 's/cuda/cpu/g' conftest.py bash ../.buildkite/download-images.sh - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py" ->>>>>>> ca0870d3... enable more tests + pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py --ignore=models/test_big_models.py" + diff --git a/tests/models/test_models.py b/tests/models/test_models.py index e4609620387f..539bddc23f19 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -14,7 +14,6 @@ "EleutherAI/pythia-70m", "bigscience/bloom-560m", # Testing alibi slopes. 
"microsoft/phi-2", - "stabilityai/stablelm-3b-4e1t", # "allenai/OLMo-1B", # Broken "bigcode/starcoder2-3b", ] From 2520dbff725be43fe78dea68e189920cde86b6e3 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Thu, 18 Apr 2024 13:34:25 +0800 Subject: [PATCH 04/25] adding big model test Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index ad4ec6ddca12..3ca9160e5daf 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -30,3 +30,7 @@ docker exec cpu-test bash -c "cd tests; bash ../.buildkite/download-images.sh pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py --ignore=models/test_big_models.py" +# Run big model test +docker exec cpu-test bash -c "cd tests; + sed -i 's/half/float/g' models/test_big_models.py + pytest -v -s models/test_big_models.py" From 06a5eae867889f1c42521599d5bba0974203b593 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Thu, 18 Apr 2024 14:01:55 +0800 Subject: [PATCH 05/25] disable big model Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 3ca9160e5daf..014033a280b0 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -31,6 +31,6 @@ docker exec cpu-test bash -c "cd tests; pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py --ignore=models/test_big_models.py" # Run big model test -docker exec cpu-test bash -c "cd tests; - sed -i 's/half/float/g' models/test_big_models.py - pytest -v -s models/test_big_models.py" +#docker exec cpu-test bash -c "cd tests; +# sed -i 's/half/float/g' models/test_big_models.py +# pytest -v -s models/test_big_models.py" From 997f32b10071e5d31f86691dce077b4ebf880884 Mon Sep 17 00:00:00 2001 From: Yuan Zhou 
Date: Thu, 18 Apr 2024 15:03:28 +0800 Subject: [PATCH 06/25] add device check in conftest Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 5 +---- tests/conftest.py | 25 +++++++++++++++++-------- tests/models/test_models.py | 1 + 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 014033a280b0..f761ab8a365a 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -11,8 +11,7 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference - -docker run -itd --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test +docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test # offline inference docker exec cpu-test bash -c "python3 examples/offline_inference.py" @@ -25,8 +24,6 @@ docker exec cpu-test bash -c "cd tests; pip install pytest Pillow rm -f __init__.py sed -i '/\"stabilityai/d' models/test_models.py - sed -i '/torch.cuda.empty_cache/d' conftest.py - sed -i 's/cuda/cpu/g' conftest.py bash ../.buildkite/download-images.sh pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py --ignore=models/test_big_models.py" diff --git a/tests/conftest.py b/tests/conftest.py index e749338e1095..c25d5038bf1c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,6 +20,8 @@ from vllm.sequence import SampleLogprobs logger = init_logger(__name__) +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.utils import is_cpu _TEST_DIR = os.path.dirname(__file__) _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] @@ -58,7 +60,8 @@ def cleanup(): with contextlib.suppress(AssertionError): torch.distributed.destroy_process_group() gc.collect() - torch.cuda.empty_cache() + if not is_cpu(): + torch.cuda.empty_cache() @pytest.fixture() @@ -151,6 +154,12 @@ def 
example_long_prompts() -> List[str]: class HfRunner: + def wrap_device(self, input: any): + if is_cpu(): + return input.cpu() + else: + return input.cuda() + def __init__( self, model_name: str, @@ -164,16 +173,16 @@ def __init__( if model_name in _EMBEDDING_MODELS: # Lazy init required for AMD CI from sentence_transformers import SentenceTransformer - self.model = SentenceTransformer( + self.model = self.wrap_device(SentenceTransformer( model_name, device="cpu", - ).to(dtype=torch_dtype).cuda() + ).to(dtype=torch_dtype)) else: - self.model = AutoModelForCausalLM.from_pretrained( + self.model = self.wrap_device(AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch_dtype, trust_remote_code=True, - ).cuda() + )) self.tokenizer = AutoTokenizer.from_pretrained( model_name, @@ -214,7 +223,7 @@ def generate( inputs = self.processor(**processor_kwargs) output_ids = self.model.generate( - **inputs.to("cuda"), + self.wrap_device(**inputs), use_cache=True, **kwargs, ) @@ -271,7 +280,7 @@ def generate_greedy_logprobs( for prompt in prompts: input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids output = self.model.generate( - input_ids.cuda(), + self.wrap_device(input_ids), use_cache=True, do_sample=False, max_new_tokens=max_tokens, @@ -306,7 +315,7 @@ def generate_greedy_logprobs_limit( for prompt in prompts: input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids output = self.model.generate( - input_ids.cuda(), + self.wrap_device(input_ids), use_cache=True, do_sample=False, max_new_tokens=max_tokens, diff --git a/tests/models/test_models.py b/tests/models/test_models.py index 539bddc23f19..e4609620387f 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -14,6 +14,7 @@ "EleutherAI/pythia-70m", "bigscience/bloom-560m", # Testing alibi slopes. 
"microsoft/phi-2", + "stabilityai/stablelm-3b-4e1t", # "allenai/OLMo-1B", # Broken "bigcode/starcoder2-3b", ] From e9fd4cb0d49af8f6f02e56a8bd2a31ae347d5769 Mon Sep 17 00:00:00 2001 From: "jiang1.li" Date: Thu, 18 Apr 2024 15:07:22 +0000 Subject: [PATCH 07/25] Fix corner case in pos_encoding. --- csrc/cpu/pos_encoding.cpp | 107 +++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 53 deletions(-) diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp index 73bf77e46f53..dad1c6055b31 100644 --- a/csrc/cpu/pos_encoding.cpp +++ b/csrc/cpu/pos_encoding.cpp @@ -21,73 +21,74 @@ void rotary_embedding_impl( constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); const int embed_dim = rot_dim / 2; - TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0); + bool flag = (embed_dim % VEC_ELEM_NUM == 0); + const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM; -#pragma omp parallel for - for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { - int64_t pos = positions[token_idx]; - const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; - - for (int i = 0; i < num_heads; ++i) { - const int head_idx = i; - const int64_t token_head = - token_idx * query_stride + head_idx * head_size; - for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) { - const int rot_offset = j; - const int x_index = rot_offset; - const int y_index = embed_dim + rot_offset; + auto compute_loop = [&](const int64_t token_head, const scalar_t *cache_ptr, + scalar_t *qk) { + int j = 0; + for (; j < loop_upper; j += VEC_ELEM_NUM) { + const int rot_offset = j; + const int x_index = rot_offset; + const int y_index = embed_dim + rot_offset; - const int64_t out_x = token_head + x_index; - const int64_t out_y = token_head + y_index; + const int64_t out_x = token_head + x_index; + const int64_t out_y = token_head + y_index; - const scalar_vec_t cos(cache_ptr + x_index); - const scalar_vec_t sin(cache_ptr + y_index); + const scalar_vec_t cos(cache_ptr + x_index); + const 
scalar_vec_t sin(cache_ptr + y_index); - const scalar_vec_t q_x(query + out_x); - const scalar_vec_t q_y(query + out_y); + const scalar_vec_t q_x(qk + out_x); + const scalar_vec_t q_y(qk + out_y); - vec_op::FP32Vec8 fp32_cos(cos); - vec_op::FP32Vec8 fp32_sin(sin); + vec_op::FP32Vec8 fp32_cos(cos); + vec_op::FP32Vec8 fp32_sin(sin); - vec_op::FP32Vec8 fp32_q_x(q_x); - vec_op::FP32Vec8 fp32_q_y(q_y); + vec_op::FP32Vec8 fp32_q_x(q_x); + vec_op::FP32Vec8 fp32_q_y(q_y); - auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin; - scalar_vec_t(out1).save(query + out_x); + auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin; + scalar_vec_t(out1).save(qk + out_x); - auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin; - scalar_vec_t(out2).save(query + out_y); - } + auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin; + scalar_vec_t(out2).save(qk + out_y); } - - for (int i = 0; i < num_kv_heads; ++i) { - const int head_idx = i; - const int64_t token_head = token_idx * key_stride + head_idx * head_size; - for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) { - const int rot_offset = j; - const int x_index = rot_offset; - const int y_index = embed_dim + rot_offset; + if (!flag) { + for (; j < embed_dim; ++j) { + const int x_index = j; + const int y_index = embed_dim + j; const int64_t out_x = token_head + x_index; const int64_t out_y = token_head + y_index; - const scalar_vec_t cos(cache_ptr + x_index); - const scalar_vec_t sin(cache_ptr + y_index); + const float fp32_cos = cache_ptr[x_index]; + const float fp32_sin = cache_ptr[y_index]; - const scalar_vec_t k_x(key + out_x); - const scalar_vec_t k_y(key + out_y); + const float fp32_q_x = qk[out_x]; + const float fp32_q_y = qk[out_y]; - vec_op::FP32Vec8 fp32_cos(cos); - vec_op::FP32Vec8 fp32_sin(sin); + qk[out_x] = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin; + qk[out_y] = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin; + } + } + }; - vec_op::FP32Vec8 fp32_k_x(k_x); - vec_op::FP32Vec8 fp32_k_y(k_y); +#pragma omp parallel for + for 
(int token_idx = 0; token_idx < num_tokens; ++token_idx) { + int64_t pos = positions[token_idx]; + const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim; - auto out1 = fp32_k_x * fp32_cos - fp32_k_y * fp32_sin; - scalar_vec_t(out1).save(key + out_x); - auto out2 = fp32_k_y * fp32_cos + fp32_k_x * fp32_sin; - scalar_vec_t(out2).save(key + out_y); - } + for (int i = 0; i < num_heads; ++i) { + const int head_idx = i; + const int64_t token_head = + token_idx * query_stride + head_idx * head_size; + compute_loop(token_head, cache_ptr, query); + } + + for (int i = 0; i < num_kv_heads; ++i) { + const int head_idx = i; + const int64_t token_head = token_idx * key_stride + head_idx * head_size; + compute_loop(token_head, cache_ptr, key); } } } @@ -166,9 +167,9 @@ void rotary_embedding_gptj_impl( } }; // namespace -void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, - torch::Tensor& key, int head_size, - torch::Tensor& cos_sin_cache, bool is_neox) { +void rotary_embedding(torch::Tensor &positions, torch::Tensor &query, + torch::Tensor &key, int head_size, + torch::Tensor &cos_sin_cache, bool is_neox) { int num_tokens = query.numel() / query.size(-1); int rot_dim = cos_sin_cache.size(1); int num_heads = query.size(-1) / head_size; From fb00ea2267a67e8b0a4c3978ca3a6355f8a24f20 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Fri, 19 Apr 2024 07:54:30 +0800 Subject: [PATCH 08/25] enable stabilityai model Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index f761ab8a365a..425220df57c2 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,7 +23,6 @@ docker exec cpu-test bash -c "python3 examples/offline_inference.py" docker exec cpu-test bash -c "cd tests; pip install pytest Pillow rm -f __init__.py - sed -i '/\"stabilityai/d' models/test_models.py bash ../.buildkite/download-images.sh pytest -v -s models 
--ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py --ignore=models/test_big_models.py" From 3ed0edeaad6c6d7afc354e0fb15bf5035a9d6b1d Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Sat, 20 Apr 2024 08:33:21 +0800 Subject: [PATCH 09/25] using torch.cuda.is_available() to check the device type Signed-off-by: Yuan Zhou --- tests/conftest.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index c25d5038bf1c..be555ba6cc03 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,7 +21,6 @@ logger = init_logger(__name__) from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import is_cpu _TEST_DIR = os.path.dirname(__file__) _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] @@ -60,7 +59,7 @@ def cleanup(): with contextlib.suppress(AssertionError): torch.distributed.destroy_process_group() gc.collect() - if not is_cpu(): + if torch.cuda.is_available(): torch.cuda.empty_cache() @@ -155,10 +154,10 @@ def example_long_prompts() -> List[str]: class HfRunner: def wrap_device(self, input: any): - if is_cpu(): - return input.cpu() - else: + if torch.cuda.is_available(): return input.cuda() + else: + return input.cpu() def __init__( self, From 0ec2b79008ccab4e31c933aaedc2713d554a0b66 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Thu, 23 May 2024 13:35:15 +0800 Subject: [PATCH 10/25] fix format Signed-off-by: Yuan Zhou --- csrc/cpu/pos_encoding.cpp | 12 ++++++------ tests/conftest.py | 21 +++++++++++---------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp index dad1c6055b31..e8aead17ae5a 100644 --- a/csrc/cpu/pos_encoding.cpp +++ b/csrc/cpu/pos_encoding.cpp @@ -24,8 +24,8 @@ void rotary_embedding_impl( bool flag = (embed_dim % VEC_ELEM_NUM == 0); const int loop_upper = flag ? 
embed_dim : embed_dim - VEC_ELEM_NUM; - auto compute_loop = [&](const int64_t token_head, const scalar_t *cache_ptr, - scalar_t *qk) { + auto compute_loop = [&](const int64_t token_head, const scalar_t* cache_ptr, + scalar_t* qk) { int j = 0; for (; j < loop_upper; j += VEC_ELEM_NUM) { const int rot_offset = j; @@ -76,7 +76,7 @@ void rotary_embedding_impl( #pragma omp parallel for for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { int64_t pos = positions[token_idx]; - const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; for (int i = 0; i < num_heads; ++i) { const int head_idx = i; @@ -167,9 +167,9 @@ void rotary_embedding_gptj_impl( } }; // namespace -void rotary_embedding(torch::Tensor &positions, torch::Tensor &query, - torch::Tensor &key, int head_size, - torch::Tensor &cos_sin_cache, bool is_neox) { +void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, + torch::Tensor& key, int head_size, + torch::Tensor& cos_sin_cache, bool is_neox) { int num_tokens = query.numel() / query.size(-1); int rot_dim = cos_sin_cache.size(1); int num_heads = query.size(-1) / head_size; diff --git a/tests/conftest.py b/tests/conftest.py index be555ba6cc03..e09034d4ffeb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,7 +20,6 @@ from vllm.sequence import SampleLogprobs logger = init_logger(__name__) -from vllm.transformers_utils.tokenizer import get_tokenizer _TEST_DIR = os.path.dirname(__file__) _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] @@ -172,16 +171,18 @@ def __init__( if model_name in _EMBEDDING_MODELS: # Lazy init required for AMD CI from sentence_transformers import SentenceTransformer - self.model = self.wrap_device(SentenceTransformer( - model_name, - device="cpu", - ).to(dtype=torch_dtype)) + self.model = self.wrap_device( + SentenceTransformer( + model_name, + device="cpu", + ).to(dtype=torch_dtype)) else: - self.model = 
self.wrap_device(AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=torch_dtype, - trust_remote_code=True, - )) + self.model = self.wrap_device( + AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + )) self.tokenizer = AutoTokenizer.from_pretrained( model_name, From 9de3c52236f491038890650dae0985a40d3bc564 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Thu, 23 May 2024 14:12:50 +0800 Subject: [PATCH 11/25] ignore failed tests firstly Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 425220df57c2..e6e00bb1bb20 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -24,7 +24,9 @@ docker exec cpu-test bash -c "cd tests; pip install pytest Pillow rm -f __init__.py bash ../.buildkite/download-images.sh - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py --ignore=models/test_big_models.py" + pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py \ + --ignore=models/test_big_models.py --ignore=models/test_aqlm.py --ignore=models/test_fp8.py --ignore=models/test_gptq_marlin.py \ + --ignore=models/test_gptq_marlin_24.py" # Run big model test #docker exec cpu-test bash -c "cd tests; From 0bda6d91e282a3fa86f25cc204d575c0f16dbd5b Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Thu, 23 May 2024 14:45:06 +0800 Subject: [PATCH 12/25] ignore embedding test Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index e6e00bb1bb20..ccdec1203000 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -26,7 +26,7 @@ docker exec cpu-test bash -c "cd tests; bash ../.buildkite/download-images.sh pytest -v 
-s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py \ --ignore=models/test_big_models.py --ignore=models/test_aqlm.py --ignore=models/test_fp8.py --ignore=models/test_gptq_marlin.py \ - --ignore=models/test_gptq_marlin_24.py" + --ignore=models/test_gptq_marlin_24.py --ignore=models/test_embedding.py" # Run big model test #docker exec cpu-test bash -c "cd tests; From a93ad948b38d1ad3f20c34ef93fc2f2b41ba32aa Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Mon, 27 May 2024 08:33:37 +0800 Subject: [PATCH 13/25] fix test dir Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index ccdec1203000..e755bba8cea8 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -14,13 +14,13 @@ remove_docker_container docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test # offline inference -docker exec cpu-test bash -c "python3 examples/offline_inference.py" +docker exec cpu-test bash -c "python3 vllm/examples/offline_inference.py" # async engine test #docker exec cpu-test bash -c "cd tests; pytest -v -s async_engine" # Run basic model test -docker exec cpu-test bash -c "cd tests; +docker exec cpu-test bash -c "cd vllm/tests; pip install pytest Pillow rm -f __init__.py bash ../.buildkite/download-images.sh From 774eba0e7c378a674581d7e0363ab747845930b6 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Tue, 28 May 2024 07:33:48 +0800 Subject: [PATCH 14/25] fix tests with cuda device only Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 10 ++++------ tests/models/test_aqlm.py | 11 +++++++---- tests/models/test_fp8.py | 11 +++++++---- tests/models/test_gptq_marlin.py | 11 +++++++---- tests/models/test_gptq_marlin_24.py | 11 +++++++---- tests/models/test_marlin.py | 11 +++++++---- 6 files changed, 39 
insertions(+), 26 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index e755bba8cea8..45b4bf2e6bd9 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -10,23 +10,21 @@ remove_docker_container() { docker rm -f cpu-test || true; } trap remove_docker_container EXIT remove_docker_container -# Run the image and launch offline inference +# Run the image docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test # offline inference docker exec cpu-test bash -c "python3 vllm/examples/offline_inference.py" -# async engine test +# async engine test, not passing due to distributed inference support missing #docker exec cpu-test bash -c "cd tests; pytest -v -s async_engine" # Run basic model test docker exec cpu-test bash -c "cd vllm/tests; pip install pytest Pillow - rm -f __init__.py bash ../.buildkite/download-images.sh - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py --ignore=models/test_marlin.py \ - --ignore=models/test_big_models.py --ignore=models/test_aqlm.py --ignore=models/test_fp8.py --ignore=models/test_gptq_marlin.py \ - --ignore=models/test_gptq_marlin_24.py --ignore=models/test_embedding.py" + pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py \ + --ignore=models/test_big_models.py --ignore=models/test_embedding.py" # Run big model test #docker exec cpu-test bash -c "cd tests; diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index a7abc011f57d..31cf30de6eca 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -8,10 +8,13 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -aqlm_not_supported = (capability < - QUANTIZATION_METHODS["aqlm"].get_min_capability()) +aqlm_not_supported = True + +if 
torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + aqlm_not_supported = (capability + < QUANTIZATION_METHODS["aqlm"].get_min_capability()) # In this test we hardcode prompts and generations for the model so we don't # need to require the AQLM package as a dependency diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 0a5819ea3f05..b52b57cb5cd5 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -67,10 +67,13 @@ }, } -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -fp8_not_supported = (capability < - QUANTIZATION_METHODS["fp8"].get_min_capability()) +fp8_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + fp8_not_supported = (capability + < QUANTIZATION_METHODS["fp8"].get_min_capability()) @pytest.mark.skipif(fp8_not_supported, diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 1fc0b3f23912..814471b47763 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -22,10 +22,13 @@ MAX_MODEL_LEN = 1024 -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -gptq_marlin_not_supported = ( - capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability()) +gptq_marlin_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + gptq_marlin_not_supported = ( + capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability()) MODELS = [ # act_order==False, group_size=channelwise diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py index 3e6ffb7f90fc..cc35ee803ff0 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/test_gptq_marlin_24.py @@ -14,10 
+14,13 @@ from tests.models.utils import check_logprobs_close from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -marlin_not_supported = (capability < - QUANTIZATION_METHODS["marlin"].get_min_capability()) +marlin_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + marlin_not_supported = ( + capability < QUANTIZATION_METHODS["marlin"].get_min_capability()) @dataclass diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 37c1664afec5..8520b26718bf 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -19,10 +19,13 @@ from .utils import check_logprobs_close -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -marlin_not_supported = (capability < - QUANTIZATION_METHODS["marlin"].get_min_capability()) +marlin_not_supported = True + +if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + marlin_not_supported = ( + capability < QUANTIZATION_METHODS["marlin"].get_min_capability()) @dataclass From 913eb24c6e4b51154999a38943066a9e65b882b6 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Tue, 28 May 2024 08:07:09 +0800 Subject: [PATCH 15/25] add local develop Signed-off-by: Yuan Zhou --- Dockerfile.cpu | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.cpu b/Dockerfile.cpu index aec79824213f..12fa84a99561 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -15,6 +15,7 @@ WORKDIR /workspace/vllm RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +RUN VLLM_TARGET_DEVICE=cpu python3 setup.py develop RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install WORKDIR /workspace/ From 9f299d5a7f8df405048031413fb4b7c1bfe3470a Mon Sep 17 00:00:00 
2001 From: Yuan Zhou Date: Tue, 28 May 2024 08:23:08 +0800 Subject: [PATCH 16/25] fix format Signed-off-by: Yuan Zhou --- tests/models/test_aqlm.py | 4 ++-- tests/models/test_fp8.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index 31cf30de6eca..85d74f7f5b03 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -13,8 +13,8 @@ if torch.cuda.is_available(): capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] - aqlm_not_supported = (capability - < QUANTIZATION_METHODS["aqlm"].get_min_capability()) + aqlm_not_supported = (capability < + QUANTIZATION_METHODS["aqlm"].get_min_capability()) # In this test we hardcode prompts and generations for the model so we don't # need to require the AQLM package as a dependency diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index b52b57cb5cd5..61aee0d0a6e9 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -72,8 +72,8 @@ if torch.cuda.is_available(): capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] - fp8_not_supported = (capability - < QUANTIZATION_METHODS["fp8"].get_min_capability()) + fp8_not_supported = (capability < + QUANTIZATION_METHODS["fp8"].get_min_capability()) @pytest.mark.skipif(fp8_not_supported, From 22d5228b739f097acb389e4d84380102e061242c Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Tue, 28 May 2024 15:43:31 +0800 Subject: [PATCH 17/25] soft link tests folder Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 13 +++++++------ Dockerfile.cpu | 3 ++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 45b4bf2e6bd9..47f9d3c40584 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -14,19 +14,20 @@ remove_docker_container docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface 
--network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test # offline inference -docker exec cpu-test bash -c "python3 vllm/examples/offline_inference.py" +docker exec cpu-test bash -c "python3 examples/offline_inference.py" # async engine test, not passing due to distributed inference support missing #docker exec cpu-test bash -c "cd tests; pytest -v -s async_engine" # Run basic model test -docker exec cpu-test bash -c "cd vllm/tests; +docker exec cpu-test bash -c "cd tests; pip install pytest Pillow bash ../.buildkite/download-images.sh - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py \ - --ignore=models/test_big_models.py --ignore=models/test_embedding.py" + cd ../ + pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_mistral.py \ + --ignore=tests/models/test_big_models.py --ignore=tests/models/test_embedding.py" # Run big model test #docker exec cpu-test bash -c "cd tests; -# sed -i 's/half/float/g' models/test_big_models.py -# pytest -v -s models/test_big_models.py" +# sed -i 's/half/float/g' tests/models/test_big_tests/models.py +# pytest -v -s tests/models/test_big_tests/models.py" diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 12fa84a99561..cd39221fc927 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -15,9 +15,10 @@ WORKDIR /workspace/vllm RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu -RUN VLLM_TARGET_DEVICE=cpu python3 setup.py develop RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install WORKDIR /workspace/ +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + CMD ["/bin/bash"] From 3208d4ee837819801a17629259a497a3db667203 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Tue, 28 May 2024 16:32:57 +0800 Subject: [PATCH 18/25] enable big model test Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff 
--git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 47f9d3c40584..7a711d7d7839 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -16,9 +16,6 @@ docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host # offline inference docker exec cpu-test bash -c "python3 examples/offline_inference.py" -# async engine test, not passing due to distributed inference support missing -#docker exec cpu-test bash -c "cd tests; pytest -v -s async_engine" - # Run basic model test docker exec cpu-test bash -c "cd tests; pip install pytest Pillow @@ -28,6 +25,6 @@ docker exec cpu-test bash -c "cd tests; --ignore=tests/models/test_big_models.py --ignore=tests/models/test_embedding.py" # Run big model test -#docker exec cpu-test bash -c "cd tests; -# sed -i 's/half/float/g' tests/models/test_big_tests/models.py -# pytest -v -s tests/models/test_big_tests/models.py" +docker exec cpu-test bash -c " + sed -i 's/half/float/g' tests/models/test_big_models.py + pytest -v -s tests/models/test_big_models.py" From 1be1f1f86a4b176f1f6ac41c6626274decd15b5e Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Tue, 28 May 2024 17:23:40 +0800 Subject: [PATCH 19/25] adding hf token Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 7a711d7d7839..80003666e57c 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -11,7 +11,7 @@ trap remove_docker_container EXIT remove_docker_container # Run the image -docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test +docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test # offline inference docker exec cpu-test bash -c "python3 examples/offline_inference.py" From
ffb647f387cbcd8b904beafbdf37447b935438f7 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Tue, 28 May 2024 22:05:53 +0800 Subject: [PATCH 20/25] enable test mistral Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 80003666e57c..96f8a7a4fef2 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -18,13 +18,14 @@ docker exec cpu-test bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c "cd tests; - pip install pytest Pillow + pip install pytest Pillow protobuf bash ../.buildkite/download-images.sh cd ../ - pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_mistral.py \ + pytest -v -s tests/models --ignore=tests/models/test_llava.py \ --ignore=tests/models/test_big_models.py --ignore=tests/models/test_embedding.py" # Run big model test docker exec cpu-test bash -c " + #TODO: remove this after CPU float16 support ready sed -i 's/half/float/g' tests/models/test_big_models.py pytest -v -s tests/models/test_big_models.py" From e019d215be8f062e00980dbd1e787657f36663f8 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Wed, 29 May 2024 08:30:09 +0800 Subject: [PATCH 21/25] use float dtype for big model tests with CPU backend Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 9 +-------- tests/models/test_big_models.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 96f8a7a4fef2..64b52952360c 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -21,11 +21,4 @@ docker exec cpu-test bash -c "cd tests; pip install pytest Pillow protobuf bash ../.buildkite/download-images.sh cd ../ - pytest -v -s tests/models --ignore=tests/models/test_llava.py \ - --ignore=tests/models/test_big_models.py 
--ignore=tests/models/test_embedding.py" - -# Run big model test -docker exec cpu-test bash -c " - #TODO: remove this after CPU float16 support ready - sed -i 's/half/float/g' tests/models/test_big_models.py - pytest -v -s tests/models/test_big_models.py" + pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py" diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 10e7c64e34e7..ea95e6a49f03 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -5,6 +5,7 @@ Run `pytest tests/models/test_big_models.py`. """ import pytest +import torch MODELS = [ "meta-llama/Llama-2-7b-hf", @@ -16,9 +17,14 @@ # "Qwen/Qwen1.5-0.5B" # Broken, ] +#TODO: remove this after CPU float16 support ready +target_dtype = "float" +if torch.cuda.is_available(): + target_dtype = "half" + @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [32]) def test_models( hf_runner, @@ -46,7 +52,7 @@ def test_models( @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("dtype", [target_dtype]) def test_model_print( vllm_runner, model: str, From a7e25a36bc6309d32517c1b12f1b5b2fbb4b0536 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Thu, 30 May 2024 11:55:08 +0800 Subject: [PATCH 22/25] fix failed CI on CUDA spawn issue torch.cuda.is_available() is not working well with multiprocessing case so we switch to use is_cpu() to check the device Signed-off-by: Yuan Zhou --- tests/conftest.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e09034d4ffeb..3731f85e4dd3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,6 +18,7 @@ from vllm.multimodal import MultiModalData from vllm.multimodal.image import ImageFeatureData, ImagePixelData from vllm.sequence 
import SampleLogprobs +from vllm.utils import is_cpu logger = init_logger(__name__) @@ -58,7 +59,7 @@ def cleanup(): with contextlib.suppress(AssertionError): torch.distributed.destroy_process_group() gc.collect() - if torch.cuda.is_available(): + if not is_cpu(): torch.cuda.empty_cache() @@ -153,7 +154,7 @@ def example_long_prompts() -> List[str]: class HfRunner: def wrap_device(self, input: any): - if torch.cuda.is_available(): + if not is_cpu(): return input.cuda() else: return input.cpu() From eca1bafd0feb0d61e3b8781aa4419256f36419c6 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Mon, 3 Jun 2024 08:22:25 +0800 Subject: [PATCH 23/25] fix rebase issue Signed-off-by: Yuan Zhou --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3731f85e4dd3..795ff70e60a8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -224,7 +224,7 @@ def generate( inputs = self.processor(**processor_kwargs) output_ids = self.model.generate( - self.wrap_device(**inputs), + **self.wrap_device(inputs), use_cache=True, **kwargs, ) From 7f0a34420a4dfabf3d3e66e4479f32215ee24dbc Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Mon, 3 Jun 2024 12:55:51 +0800 Subject: [PATCH 24/25] fix rebase Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 2 +- Dockerfile.cpu | 4 +++- tests/conftest.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 64b52952360c..d1200ee84dfe 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -21,4 +21,4 @@ docker exec cpu-test bash -c "cd tests; pip install pytest Pillow protobuf bash ../.buildkite/download-images.sh cd ../ - pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py" + pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" diff --git 
a/Dockerfile.cpu b/Dockerfile.cpu index cd39221fc927..ae23e27b413b 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -1,6 +1,6 @@ # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform. -FROM ubuntu:22.04 +FROM ubuntu:22.04 AS cpu-test-1 RUN apt-get update -y \ && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \ @@ -9,6 +9,8 @@ RUN apt-get update -y \ RUN pip install --upgrade pip \ && pip install wheel packaging ninja setuptools>=49.4.0 numpy +FROM cpu-test-1 AS build + COPY ./ /workspace/vllm WORKDIR /workspace/vllm diff --git a/tests/conftest.py b/tests/conftest.py index 795ff70e60a8..764374a779d9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -155,9 +155,9 @@ class HfRunner: def wrap_device(self, input: any): if not is_cpu(): - return input.cuda() + return input.to("cuda") else: - return input.cpu() + return input.to("cpu") def __init__( self, From 1eeb09d00c51786ed0f292ad93cb4f7621bb91a7 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Mon, 3 Jun 2024 20:30:00 +0800 Subject: [PATCH 25/25] Trigger CI