Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -395,10 +395,8 @@ steps:
- csrc/
- vllm/model_executor/layers/quantization
- tests/quantization
- tests/models/quantization
commands:
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- pytest -v -s models/quantization

- label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
Expand Down Expand Up @@ -509,6 +507,14 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

- label: Quantized Models Test
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers/quantization
- tests/models/quantization
commands:
- pytest -v -s models/quantization

# This test is used only during the PR development phase to test individual models and should never run on main.
- label: Custom Models Test
mirror_hardwares: [amd]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM when using greedy sampling.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these docstrings are pretty useless, and the `pytest` command they quote risks drifting out of sync with the file's actual path, so I removed them.


Run `pytest tests/models/test_models.py`.
"""

import pytest
import torch

Expand Down
4 changes: 0 additions & 4 deletions tests/models/language/generation/test_granite.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Granite models using greedy sampling.

Run `pytest tests/models/test_granite.py`.
"""
import pytest

from ...utils import check_logprobs_close
Expand Down
4 changes: 0 additions & 4 deletions tests/models/language/generation/test_mistral.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.

Run `pytest tests/models/test_mistral.py`.
"""
import copy
import json

Expand Down
4 changes: 0 additions & 4 deletions tests/models/language/generation/test_phimoe.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for moe models using greedy sampling.

Run `pytest tests/models/test_phimoe.py`.
"""
import pytest
import torch

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the classification outputs of HF and vLLM models.

Run `pytest tests/models/test_cls_models.py`.
"""
import pytest
import torch
from transformers import AutoModelForSequenceClassification
Expand All @@ -19,7 +15,7 @@
)
@pytest.mark.parametrize("dtype",
["half"] if current_platform.is_rocm() else ["float"])
def test_classification_models(
def test_models(
hf_runner,
vllm_runner,
example_prompts,
Expand Down
4 changes: 0 additions & 4 deletions tests/models/language/pooling/test_embedding.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the embedding outputs of HF and vLLM models.

Run `pytest tests/models/embedding/language/test_embedding.py`.
"""
import pytest

from vllm.config import PoolerConfig
Expand Down
11 changes: 3 additions & 8 deletions tests/models/language/pooling/test_jina.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: E501
"""Compare the scoring outputs of HF and vLLM models.

Run `pytest tests/models/embedding/language/test_jina.py`.
"""
import math

import pytest
Expand All @@ -22,9 +17,9 @@
"Organic skincare for sensitive skin with aloe vera and chamomile.",
"New makeup trends focus on bold colors and innovative techniques",
"Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille",
"Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken",
"Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla",
"Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras",
"Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken", # noqa: E501
"Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla", # noqa: E501
"Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras", # noqa: E501
"针对敏感肌专门设计的天然有机护肤产品",
"新的化妆趋势注重鲜艳的颜色和创新的技巧",
"敏感肌のために特別に設計された天然有機スキンケア製品",
Expand Down
61 changes: 22 additions & 39 deletions tests/models/language/pooling/test_scoring.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the scoring outputs of HF and vLLM models.

Run `pytest tests/models/embedding/language/test_scoring.py`.
"""
import math

import pytest
import torch
import torch.nn.functional as F

MODELS = [
CROSS_ENCODER_MODELS = [
"cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert
"BAAI/bge-reranker-v2-m3", # Roberta
]
Expand All @@ -28,21 +24,21 @@
"The capital of Germany is Berlin.",
]

DTYPE = "half"


@pytest.fixture(scope="module", params=MODELS)
@pytest.fixture(scope="module", params=CROSS_ENCODER_MODELS)
def model_name(request):
yield request.param


@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):

def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
text_pair = [TEXTS_1[0], TEXTS_2[0]]

with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model:
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict([text_pair]).tolist()

with vllm_runner(model_name, task="score", dtype=dtype,
with vllm_runner(model_name, task="score", dtype=DTYPE,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])

Expand All @@ -52,18 +48,16 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)


@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):

def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]

with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model:
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist()

with vllm_runner(model_name, task="score", dtype=dtype,
with vllm_runner(model_name, task="score", dtype=DTYPE,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)

Expand All @@ -74,18 +68,16 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)


@pytest.mark.parametrize("dtype", ["half"])
def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):

def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]

with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model:
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist()

with vllm_runner(model_name, task="score", dtype=dtype,
with vllm_runner(model_name, task="score", dtype=DTYPE,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)

Expand All @@ -101,13 +93,10 @@ def emb_model_name(request):
yield request.param


@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
dtype: str):

def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
text_pair = [TEXTS_1[0], TEXTS_2[0]]

with hf_runner(emb_model_name, dtype=dtype,
with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
hf_embeddings = hf_model.encode(text_pair)
hf_outputs = [
Expand All @@ -116,7 +105,7 @@ def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,

with vllm_runner(emb_model_name,
task="embed",
dtype=dtype,
dtype=DTYPE,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])

Expand All @@ -126,16 +115,13 @@ def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)


@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
dtype: str):

def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]

with hf_runner(emb_model_name, dtype=dtype,
with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
Expand All @@ -147,7 +133,7 @@ def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,

with vllm_runner(emb_model_name,
task="embed",
dtype=dtype,
dtype=DTYPE,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)

Expand All @@ -158,16 +144,13 @@ def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)


@pytest.mark.parametrize("dtype", ["half"])
def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
dtype: str):

def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]

with hf_runner(emb_model_name, dtype=dtype,
with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
Expand All @@ -179,7 +162,7 @@ def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,

with vllm_runner(emb_model_name,
task="embed",
dtype=dtype,
dtype=DTYPE,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the embedding outputs of HF and vLLM models.

Run `pytest tests/models/embedding/language/test_snowflake_arctic_embed.py`.
"""
import pytest

from ...utils import EmbedModelInfo, check_embeddings_close
Expand Down
24 changes: 12 additions & 12 deletions tests/models/language/pooling/test_truncation_control.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,18 @@
max_model_len = 128

input_str = """Immerse yourself in the enchanting chronicle of calculus, a
mathematical domain that has radically transformed our comprehension of
change and motion. Despite its roots in ancient civilizations, the
formal birth of calculus predominantly occurred in the 17th century,
primarily under the influential guidance of Sir Isaac Newton and Gottfried
Wilhelm Leibniz. The earliest traces of calculus concepts are found in
ancient Greek mathematics,most notably in the works of Eudoxus and
Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a
technique for computing areas and volumes through the use of finite sums.
This methodology laid crucial foundational work for integral calculus.
In the 17th century, both Newton and Leibniz independently pioneered
calculus, each contributing unique perspectives that would shape this new
field."""
mathematical domain that has radically transformed our comprehension of
change and motion. Despite its roots in ancient civilizations, the
formal birth of calculus predominantly occurred in the 17th century,
primarily under the influential guidance of Sir Isaac Newton and Gottfried
Wilhelm Leibniz. The earliest traces of calculus concepts are found in
ancient Greek mathematics,most notably in the works of Eudoxus and
Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a
technique for computing areas and volumes through the use of finite sums.
This methodology laid crucial foundational work for integral calculus.
In the 17th century, both Newton and Leibniz independently pioneered
calculus, each contributing unique perspectives that would shape this new
field."""


def test_smaller_truncation_size(vllm_runner,
Expand Down
4 changes: 0 additions & 4 deletions tests/models/multimodal/generation/test_pixtral.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.

Run `pytest tests/models/test_mistral.py`.
"""
import json
from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Optional
Expand Down
4 changes: 2 additions & 2 deletions tests/models/multimodal/generation/test_whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,10 @@ def run_test(
assert output.outputs[0].text == expected


@create_new_process_for_each_test("spawn")
@pytest.mark.core_model
@pytest.mark.parametrize(
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
@create_new_process_for_each_test()
def test_models(vllm_runner, model) -> None:
run_test(
vllm_runner,
Expand All @@ -131,11 +131,11 @@ def test_models(vllm_runner, model) -> None:
)


@create_new_process_for_each_test("spawn")
@multi_gpu_test(num_gpus=2)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@create_new_process_for_each_test()
def test_models_distributed(
vllm_runner,
model,
Expand Down
5 changes: 0 additions & 5 deletions tests/models/quantization/test_aqlm.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of a AQLM model between vLLM and HF Transformers

Run `pytest tests/models/test_aqlm.py`.
"""

import pytest

from tests.quantization.utils import is_quant_method_supported
Expand Down
2 changes: 0 additions & 2 deletions tests/models/quantization/test_bitblas.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.

Run `pytest tests/models/test_bitblas.py`.
"""
from dataclasses import dataclass

Expand Down
2 changes: 0 additions & 2 deletions tests/models/quantization/test_gptq_bitblas.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
Note: bitblas internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for bitblas. As a result, we re-run the
test up to 3 times to see if we pass.

Run `pytest tests/models/test_bitblas.py`.
"""
from dataclasses import dataclass

Expand Down
Loading