Commit 6787c00

🚧 CONFLICTS: Cherry-pick from features-based-on-v0.9.2 to features-based-on-v0.10.0
⚠️ This commit contains unresolved merge conflicts in files:
- requirements/common.txt
- tests/test_utils.py
- vllm/attention/backends/torch_sdpa.py

Commits being cherry-picked:
- aefc22d Merge pull request #3 from moirai-internal/dev_container
- 50dadb7 pick [Bugfix] Improve JSON extraction in LlamaToolParser
- 4373a86 pick opc-request-id middleware
- 881ad1a cherry pick the fix for non english token in logprobs (#7)

📝 Edit files to remove conflict markers
2 parents 826a7d7 + 881ad1a commit 6787c00
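
A minimal cleanup sketch for a follow-up commit (assuming a standard git workflow; these commands are not part of this commit, and the target paths are taken from the message above):

# locate the leftover markers in the three conflicted files
grep -n '<<<<<<<' requirements/common.txt tests/test_utils.py vllm/attention/backends/torch_sdpa.py
# edit each file to keep the intended side of each conflict, then record the fix
git add requirements/common.txt tests/test_utils.py vllm/attention/backends/torch_sdpa.py
git commit -m "Remove leftover conflict markers from cherry-pick"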

File tree: 22 files changed (+5584, −43 lines)


.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 4 additions & 4 deletions
@@ -73,10 +73,10 @@ function cpu_tests() {
 
   # Note: disable it until supports V1
   # Run AWQ test
-  # docker exec cpu-test-"$NUMA_NODE" bash -c "
-  #   set -e
-  #   VLLM_USE_V1=0 pytest -s -v \
-  #   tests/quantization/test_ipex_quant.py"
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    VLLM_USE_V1=0 pytest -s -v \
+    tests/quantization/test_ipex_quant.py"
 
   # online serving
   docker exec cpu-test-"$NUMA_NODE" bash -c '

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,8 @@ docker run \
   --name "${container_name}" \
   "${image_name}" \
   sh -c '
+  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp

examples/online_serving/chart-helm/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ image:
   # -- Image tag
   tag: "latest"
   # -- Container launch command
-  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
+  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
 
 # -- Container port
 containerPort: 8000

requirements/common.txt

Lines changed: 4 additions & 0 deletions
@@ -7,7 +7,11 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
+<<<<<<< HEAD
 transformers >= 4.53.2
+=======
+transformers >= 4.51.1, <4.54.0
+>>>>>>> origin/features-based-on-v0.9.2
 huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads.
 tokenizers >= 0.21.1 # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
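
One plausible resolution of this hunk (an assumption, not part of this commit) is to keep the intersection of the two ranges, which satisfies both branches, and drop the marker lines:

py-cpuinfo
transformers >= 4.53.2, <4.54.0  # assumed resolution: intersection of the two conflicting constraints
huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads.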

tests/kernels/attention/test_attention_selector.py

Lines changed: 7 additions & 10 deletions
@@ -36,8 +36,7 @@ def clear_cache():
 DEVICE_MLA_BLOCK_SIZES = {
     "cuda": [16, 64], # CUDA supports both standard and extended block sizes
     "hip": [16, 1], # HIP requires special handling for block_size=1
-    # "cpu": [16] # CPU uses fixed block size from test cases
-    "cpu": [] # FIXME(woosuk): Temporarily disable CPU tests
+    "cpu": [16] # CPU uses fixed block size from test cases
 }
 
 
@@ -82,14 +81,14 @@ def test_env(
         m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
 
         if device == "cpu":
-            if not use_v1:
-                pytest.skip("CPU backend only supports V1")
-
             with patch("vllm.attention.selector.current_platform",
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float16, torch.float16,
                                            block_size, False)
-            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
+            if use_v1:
+                assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
+            else:
+                assert backend.get_name() == "TORCH_SDPA"
 
         elif device == "hip":
             with patch("vllm.attention.selector.current_platform",
@@ -205,14 +204,12 @@ def test_fp32_fallback(
         m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
 
         if device == "cpu":
-            if not use_v1:
-                pytest.skip("CPU backend only supports V1")
-
             with patch("vllm.attention.selector.current_platform",
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float32, torch.float32,
                                            16, False)
-            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
+            assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
+                    if use_v1 else "TORCH_SDPA")
 
         elif device == "cuda":
             with patch("vllm.attention.selector.current_platform",

tests/test_utils.py

Lines changed: 3 additions & 0 deletions
@@ -957,6 +957,7 @@ def test_convert_ids_list_to_tokens():
     ]
     tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
     assert tokens == ['Hello', ',', ' world', '!']
+<<<<<<< HEAD
 
 
 def test_current_stream_multithread():
@@ -995,3 +996,5 @@ def child_thread_func():
     child_thread.join(timeout=5)
     if child_thread.is_alive():
         pytest.fail("Child thread failed to exit properly")
+=======
+>>>>>>> origin/features-based-on-v0.9.2
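
The origin/features-based-on-v0.9.2 side of this conflict adds nothing between ======= and >>>>>>>, so a likely resolution (again an assumption; this commit leaves the markers in place) is simply to delete the three marker lines and keep the HEAD content, leaving the end of the file as:

    child_thread.join(timeout=5)
    if child_thread.is_alive():
        pytest.fail("Child thread failed to exit properly")  # unchanged; only the conflict markers are deleted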
