Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.4
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

usage() {
echo``
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.4
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

usage() {
echo``
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.rocm
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
RUN cd /vllm-workspace \
&& rm -rf vllm \
&& python3 -m pip install -e tests/vllm_test_utils \
&& python3 -m pip install lm-eval[api]==0.4.4 \
&& python3 -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \
&& python3 -m pip install pytest-shard

# -----------------------
Expand Down
2 changes: 1 addition & 1 deletion docs/features/quantization/fp8.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ Since simple RTN does not require data for weight quantization and the activatio
Install `vllm` and `lm-evaluation-harness` for evaluation:

```bash
pip install vllm lm-eval==0.4.4
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
```

Load and run the model in `vllm`:
Expand Down
2 changes: 1 addition & 1 deletion docs/features/quantization/int4.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ pip install llmcompressor
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:

```bash
pip install vllm lm-eval==0.4.4
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
```

## Quantization Process
Expand Down
2 changes: 1 addition & 1 deletion docs/features/quantization/int8.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pip install llmcompressor
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:

```bash
pip install vllm lm-eval==0.4.4
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
```

## Quantization Process
Expand Down
2 changes: 1 addition & 1 deletion docs/features/quantization/quark.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ for more installation details.
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:

```bash
pip install vllm lm-eval==0.4.4
pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
```

## Quantization Process
Expand Down
4 changes: 3 additions & 1 deletion examples/offline_inference/spec_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from vllm import LLM, SamplingParams
from vllm.benchmarks.datasets import add_dataset_parser, get_samples
from vllm.inputs import TokensPrompt
from vllm.v1.metrics.reader import Counter, Vector

try:
Expand Down Expand Up @@ -137,7 +138,8 @@ def main():
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
if not args.custom_mm_prompts:
outputs = llm.generate(
prompt_token_ids=prompt_ids, sampling_params=sampling_params
TokensPrompt(prompt_token_ids=prompt_ids),
sampling_params=sampling_params,
)
else:
outputs = llm.chat(prompts, sampling_params=sampling_params)
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/structured_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def format_output(title: str, output: str):


def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM):
outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
outputs = llm.generate(prompt, sampling_params=sampling_params)
return outputs[0].outputs[0].text


Expand Down
2 changes: 1 addition & 1 deletion requirements/nightly_torch_test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.2 # required for voxtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
transformers==4.52.4
tokenizers==0.21.1
Expand Down
3 changes: 2 additions & 1 deletion requirements/test.in
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
# TODO: Use lm-eval[api]==0.4.10 once released
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb[bm25s]>=1.38.11, <2 # required for mteb test
transformers==4.55.2
tokenizers==0.21.1
Expand Down
2 changes: 1 addition & 1 deletion requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ lightning-utilities==0.14.3
# torchmetrics
llvmlite==0.44.0
# via numba
lm-eval==0.4.8
lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
# via -r requirements/test.in
lxml==5.3.0
# via
Expand Down
15 changes: 6 additions & 9 deletions tests/entrypoints/llm/test_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,9 @@ def text_llm():
enforce_eager=True,
seed=0)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)

del llm
del llm

cleanup_dist_env_and_memory()

Expand Down Expand Up @@ -88,10 +87,9 @@ def vision_llm():
seed=0,
)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)

del llm
del llm

cleanup_dist_env_and_memory()

Expand Down Expand Up @@ -158,10 +156,9 @@ def thinking_llm():
seed=0,
)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)

del llm
del llm

cleanup_dist_env_and_memory()

Expand Down
5 changes: 2 additions & 3 deletions tests/entrypoints/llm/test_classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,9 @@ def llm():
enforce_eager=True,
seed=0)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)

del llm
del llm

cleanup_dist_env_and_memory()

Expand Down
5 changes: 2 additions & 3 deletions tests/entrypoints/llm/test_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,9 @@ def llm():
enforce_eager=True,
seed=0)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)

del llm
del llm

cleanup_dist_env_and_memory()

Expand Down
52 changes: 3 additions & 49 deletions tests/entrypoints/llm/test_encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,9 @@

import pytest

from vllm import LLM, PoolingParams, PoolingRequestOutput
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory

from ...models.utils import check_embeddings_close

MODEL_NAME = "intfloat/multilingual-e5-small"

PROMPTS = [
Expand Down Expand Up @@ -48,57 +46,13 @@ def llm():
enforce_eager=True,
seed=0)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)

del llm
del llm

cleanup_dist_env_and_memory()


def assert_outputs_match(o1: list[PoolingRequestOutput],
o2: list[PoolingRequestOutput]):
check_embeddings_close(
embeddings_0_lst=[o.outputs.data for o in o1],
embeddings_1_lst=[o.outputs.data for o in o2],
name_0="hf",
name_1="vllm",
tol=1e-2,
)


@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
prompt_token_ids):
pooling_params = PoolingParams()

with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.encode(prompt_token_ids=prompt_token_ids,
pooling_params=pooling_params)

v2_output = llm.encode({"prompt_token_ids": prompt_token_ids},
pooling_params=pooling_params)
assert_outputs_match(v1_output, v2_output)


@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
pooling_params = PoolingParams()

with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.encode(prompt_token_ids=TOKEN_IDS,
pooling_params=pooling_params)

v2_output = llm.encode(
[{
"prompt_token_ids": p
} for p in TOKEN_IDS],
pooling_params=pooling_params,
)
assert_outputs_match(v1_output, v2_output)


@pytest.mark.skip_global_cleanup
def test_multiple_pooling_params(llm: LLM):
pooling_params = [
Expand Down
43 changes: 3 additions & 40 deletions tests/entrypoints/llm/test_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import pytest

from vllm import LLM, RequestOutput, SamplingParams
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory

MODEL_NAME = "distilbert/distilgpt2"
Expand Down Expand Up @@ -41,50 +41,13 @@ def llm():
gpu_memory_utilization=0.10,
enforce_eager=True)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)

del llm
del llm

cleanup_dist_env_and_memory()


def assert_outputs_equal(o1: list[RequestOutput], o2: list[RequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2]


@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
prompt_token_ids):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)

with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.generate(prompt_token_ids=prompt_token_ids,
sampling_params=sampling_params)

v2_output = llm.generate({"prompt_token_ids": prompt_token_ids},
sampling_params=sampling_params)
assert_outputs_equal(v1_output, v2_output)


@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)

with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.generate(prompt_token_ids=TOKEN_IDS,
sampling_params=sampling_params)

v2_output = llm.generate(
[{
"prompt_token_ids": p
} for p in TOKEN_IDS],
sampling_params=sampling_params,
)
assert_outputs_equal(v1_output, v2_output)


@pytest.mark.skip_global_cleanup
def test_multiple_sampling_params(llm: LLM):
sampling_params = [
Expand Down
5 changes: 2 additions & 3 deletions tests/entrypoints/llm/test_generate_multiple_loras.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,9 @@ def llm(request, monkeypatch_module):
max_num_seqs=128,
enforce_eager=True)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)

del llm
del llm

cleanup_dist_env_and_memory()

Expand Down
5 changes: 2 additions & 3 deletions tests/entrypoints/llm/test_reward.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,9 @@ def llm():
trust_remote_code=True,
seed=0)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)

del llm
del llm

cleanup_dist_env_and_memory()

Expand Down
5 changes: 2 additions & 3 deletions tests/entrypoints/llm/test_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,9 @@ def llm():
enforce_eager=True,
seed=0)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)

del llm
del llm

cleanup_dist_env_and_memory()

Expand Down
6 changes: 2 additions & 4 deletions tests/quantization/test_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,7 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
with vllm_runner(model_id) as llm:
# note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy
outputs = llm.generate_greedy(prompts=["Hello my name is"],
max_tokens=10)
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
print(outputs[0][1])


Expand Down Expand Up @@ -90,8 +89,7 @@ def check_model(model):

# note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy
outputs = llm.generate_greedy(prompts=["Hello my name is"],
max_tokens=10)
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=10)
print(outputs[0][1])


Expand Down
2 changes: 1 addition & 1 deletion tests/quantization/test_lm_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,5 @@ def check_model(model):
vllm_model.apply_model(check_model)

print(
vllm_model.generate_greedy(prompts=["Hello my name is"],
vllm_model.generate_greedy(["Hello my name is"],
max_tokens=10)[0][1])
Loading