# Launch the AVX2 CPU test container in the background (-d) with an
# interactive bash entrypoint so later `docker exec` calls can run tests in it.
#   -v ~/.cache/huggingface:...  share the host HF model cache (avoids re-downloads)
#   --cpuset-cpus=48-95 / --cpuset-mems=1  pin the container to NUMA node 1
#   -e HF_TOKEN                  forward the HuggingFace auth token from the host env
#   VLLM_CPU_KVCACHE_SPACE=4     cap the vLLM CPU KV-cache at 4 GiB
# NOTE: the mount path must be `~/.cache/...` with no space after `~`; a space
# would split it into two arguments and break the volume mount.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
  --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
22
# Run the full CPU test suite inside the already-running test containers.
# Defined as a function so it can be exported and wrapped in a single
# `timeout` at the bottom of the script.
# Globals:   none written; relies on containers `cpu-test-avx2` and `cpu-test`
#            having been started earlier in this script.
# Arguments: none
# Returns:   non-zero if any docker exec'd test command fails (each inner
#            shell uses `set -e` so the first failure propagates out).
function cpu_tests() {
  # offline inference
  docker exec cpu-test-avx2 bash -c "
    set -e
    python3 examples/offline_inference.py"

  # Run basic model test
  docker exec cpu-test bash -c "
    set -e
    pip install pytest pytest-asyncio \
      decord einops librosa peft Pillow sentence-transformers soundfile \
      transformers_stream_generator matplotlib datamodel_code_generator
    pip install torchvision --index-url https://download.pytorch.org/whl/cpu
    # Embedding models are not supported for CPU yet
    # pytest -v -s tests/models/embedding/language
    pytest -v -s tests/models/encoder_decoder/language
    pytest -v -s tests/models/decoder_only/language/test_models.py
    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

  # Run compressed-tensor test
  docker exec cpu-test bash -c "
    set -e
    pytest -s -v \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

  # Run AWQ test
  docker exec cpu-test bash -c "
    set -e
    pytest -s -v \
      tests/quantization/test_ipex_quant.py"

  # online inference: start the OpenAI-compatible server in the background,
  # poll until it answers (up to 600 s), then run the serving benchmark.
  docker exec cpu-test bash -c "
    set -e
    export VLLM_CPU_KVCACHE_SPACE=10
    export VLLM_CPU_OMP_THREADS_BIND=48-92
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \
      --backend vllm \
      --dataset-name random \
      --model facebook/opt-125m \
      --num-prompts 20 \
      --endpoint /v1/completions \
      --tokenizer facebook/opt-125m"
}
70
+
71
# All of CPU tests are expected to be finished less than 25 mins.
# `export -f` makes the function visible to the child `bash -c` shell so the
# whole suite can run under a single coreutils `timeout`.
export -f cpu_tests
timeout 25m bash -c " cpu_tests"
0 commit comments