Commits (33)
5dee54d  Add support for encoder embedding models (maxdebayser, Jun 23, 2025)
b430dba  Use multi-modal support to pass token_type_ids to the model (maxdebayser, Jun 23, 2025)
aad1052  reduce diff (maxdebayser, Jun 24, 2025)
3ca7ced  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 1, 2025)
7006e8a  Fix cuda graphs for BERT models (maxdebayser, Jul 1, 2025)
c99df96  Add token_type_ids multi-modal to LLM._cross_encoding_score (maxdebayser, Jul 2, 2025)
bbe0ea7  fix merge problem (maxdebayser, Jul 2, 2025)
019496a  fix editing mistake (maxdebayser, Jul 2, 2025)
6558bdd  fix missing input ids (maxdebayser, Jul 2, 2025)
33bcc88  fix mistake (maxdebayser, Jul 2, 2025)
a743268  fix tensor not boolean error (maxdebayser, Jul 2, 2025)
6310f4d  appease mypy (maxdebayser, Jul 2, 2025)
1d79887  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 2, 2025)
611217a  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 2, 2025)
024198b  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 8, 2025)
c4dc1a8  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 9, 2025)
3f79324  Fix missing args (maxdebayser, Jul 9, 2025)
f3f075a  fix mm flag in registry test (maxdebayser, Jul 9, 2025)
268099b  remove model from unsupported list (maxdebayser, Jul 10, 2025)
5470c4e  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 10, 2025)
d19dcd4  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 11, 2025)
60696b4  appease linter (maxdebayser, Jul 11, 2025)
0ce2a36  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 11, 2025)
00bfc79  lazy import (maxdebayser, Jul 11, 2025)
2501649  appease linter (maxdebayser, Jul 11, 2025)
28fb913  appease linter (maxdebayser, Jul 11, 2025)
43ddd37  fixes (maxdebayser, Jul 28, 2025)
9a26982  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 28, 2025)
2fd6143  fix argument name (maxdebayser, Jul 28, 2025)
6fde956  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 28, 2025)
a089065  fix merge problem (maxdebayser, Jul 29, 2025)
70cc584  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 29, 2025)
4590707  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 29, 2025)
Files changed
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_score.py
@@ -177,7 +177,7 @@ def test_score_max_model_len(self, server: RemoteOpenAIServer,
})
assert score_response.status_code == 400
# Assert just a small fragments of the response
assert "Please reduce the length of the input." in \
assert "is longer than the maximum model length" in \
score_response.text

# Test truncation
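The updated assertion tracks the new wording of the over-length error returned by the score endpoint. Below is a minimal client-side sketch of the same check; it assumes a locally running vllm serve instance on port 8000 with a cross-encoder model, and the text_1/text_2 request fields used by the score API, so treat the names as illustrative rather than taken from this diff.

import requests

# Hedged sketch: reproduce the 400 response asserted above. Assumes a server
# started with `vllm serve` for a cross-encoder model on localhost:8000;
# the model name below is only an example.
resp = requests.post(
    "http://localhost:8000/score",
    json={
        "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
        "text_1": "a query",
        "text_2": "a document " * 100_000,  # deliberately exceeds max_model_len
    },
)
assert resp.status_code == 400
# The error text is expected to contain "is longer than the maximum model length".
print(resp.text)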
5 changes: 2 additions & 3 deletions tests/models/language/pooling/test_embedding.py
@@ -43,10 +43,9 @@ def v1(run_with_both_engines):
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
marks=[pytest.mark.skip_v1]),
marks=[pytest.mark.skip_v0]),
# [Cross-Encoder]
pytest.param("sentence-transformers/stsb-roberta-base-v2",
marks=[pytest.mark.skip_v1]),
pytest.param("sentence-transformers/stsb-roberta-base-v2"),
],
)
def test_models(
9 changes: 9 additions & 0 deletions tests/models/language/pooling/test_scoring.py
@@ -23,6 +23,15 @@
"The capital of Germany is Berlin.",
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


DTYPE = "half"


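The run_with_both_engines fixture added above comes from the shared test plumbing; together with the skip_v0/skip_v1 marks used in test_embedding.py it runs each test once per engine. A rough sketch of how such a fixture can be written, as an illustration of the pattern rather than the exact conftest.py implementation in vLLM:

import pytest

# Illustrative conftest-style fixture: parametrize every test over the V0 and
# V1 engines and honor skip_v0 / skip_v1 marks. The real vLLM fixture may differ.
@pytest.fixture(params=[True, False], ids=["v1", "v0"])
def run_with_both_engines(request, monkeypatch):
    use_v1 = request.param
    marker = "skip_v1" if use_v1 else "skip_v0"
    if request.node.get_closest_marker(marker):
        pytest.skip(f"test is marked {marker}")
    # vLLM selects the engine through the VLLM_USE_V1 environment variable.
    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
    yield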
6 changes: 3 additions & 3 deletions tests/models/test_registry.py
@@ -49,9 +49,9 @@ def test_registry_imports(model_arch):
("LlamaForCausalLM", False, False, False),
("MllamaForConditionalGeneration", True, False, False),
("LlavaForConditionalGeneration", True, True, False),
("BertForSequenceClassification", False, False, True),
("RobertaForSequenceClassification", False, False, True),
("XLMRobertaForSequenceClassification", False, False, True),
("BertForSequenceClassification", True, False, True),
("RobertaForSequenceClassification", True, False, True),
("XLMRobertaForSequenceClassification", True, False, True),
])
def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
model_info = ModelRegistry._try_inspect_model_cls(model_arch)
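The first flag in each tuple (is_mm) flips to True because these BERT-style classifiers now receive token_type_ids through the multi-modal input path, so the registry reports them as multi-modal while keeping their cross-encoding flag (is_ce). The same properties can be probed outside the test; the method names below are assumed from vLLM's public ModelRegistry and may differ between versions.

from vllm import ModelRegistry

# Hedged sketch: query the registry flags exercised by the test above.
arch = ["BertForSequenceClassification"]
print(ModelRegistry.is_multimodal_model(arch))     # expected True after this change
print(ModelRegistry.is_cross_encoder_model(arch))  # expected True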
38 changes: 11 additions & 27 deletions vllm/entrypoints/llm.py
@@ -1269,34 +1269,18 @@ def _cross_encoding_score(

input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

if model_config.is_multimodal_model:
for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)
model_config = self.llm_engine.model_config

parsed_prompts.append(engine_prompt)
else:
for q, t in input_pairs:
if model_config.use_pad_token:
# cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer(
text=q, # type: ignore[arg-type]
text_pair=t, # type: ignore[arg-type]
**tokenization_kwargs)
else:
# `llm as reranker` models defaults to not using pad_token.
prompt_inputs = tokenizer(
text=q + t, # type: ignore[operator]
**tokenization_kwargs)
engine_prompt = TokensPrompt(
prompt_token_ids=prompt_inputs["input_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))
parsed_prompts.append(engine_prompt)
for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)

parsed_prompts.append(engine_prompt)

self._validate_and_add_requests(
prompts=parsed_prompts,
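With the if/else removed, LLM._cross_encoding_score builds every engine prompt through get_score_prompt, so pad-token cross-encoders and "LLM as reranker" models share one code path. A minimal offline usage sketch that exercises this method; the model name is only an example, and the task argument and output attributes follow vLLM's documented scoring usage, so verify them against your version.

from vllm import LLM

# Hedged sketch: offline scoring flows through _cross_encoding_score above.
llm = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2", task="score")
outputs = llm.score(
    "What is the capital of France?",
    ["The capital of France is Paris.", "The capital of Germany is Berlin."],
)
for output in outputs:
    print(output.outputs.score)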
63 changes: 13 additions & 50 deletions vllm/entrypoints/openai/serving_score.py
@@ -188,56 +188,19 @@ async def _cross_encoding_score(

input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

if self.model_config.is_multimodal_model:

preprocess_async = make_async(self._preprocess_score,
executor=self._tokenizer_executor)

preprocessed_prompts = await asyncio.gather(
*(preprocess_async(request=request,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
data_1=t1,
data_2=t2) for t1, t2 in input_pairs))

for full_prompt, engine_prompt in preprocessed_prompts:
request_prompts.append(full_prompt)
engine_prompts.append(engine_prompt)

else:
tokenize_async = make_async(tokenizer.__call__,
executor=self._tokenizer_executor)
use_pad_token = self.model_config.use_pad_token

if use_pad_token:
# cross_encoder models defaults to using pad_token.
tokenized_prompts = await asyncio.gather(*(
tokenize_async(
text=t1, # type: ignore[arg-type]
text_pair=t2, # type: ignore[arg-type]
**tokenization_kwargs) for t1, t2 in input_pairs))
else:
# `llm as reranker` models defaults to not using pad_token.
tokenized_prompts = await asyncio.gather(*(
tokenize_async(
text=t1 + # type: ignore[operator]
t2,
**tokenization_kwargs) for t1, t2 in input_pairs))

for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs):
sep_token = tokenizer.sep_token if (tokenizer.sep_token
and use_pad_token) else ''
request_prompt = f"{t1}{sep_token}{t2}"

input_ids = prompt_inputs["input_ids"]
text_token_prompt = \
self._validate_input(request, input_ids, request_prompt)
engine_prompt = TokensPrompt(
prompt_token_ids=text_token_prompt["prompt_token_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))

request_prompts.append(request_prompt)
engine_prompts.append(engine_prompt)
preprocess_async = make_async(self._preprocess_score,
executor=self._tokenizer_executor)

preprocessed_prompts = await asyncio.gather(
*(preprocess_async(request=request,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
data_1=t1,
data_2=t2) for t1, t2 in input_pairs))

for full_prompt, engine_prompt in preprocessed_prompts:
request_prompts.append(full_prompt)
engine_prompts.append(engine_prompt)

# Schedule the request and get the result generator.
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
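The server-side scorer now always funnels pairs through self._preprocess_score, wrapped in make_async and dispatched with asyncio.gather, so blocking tokenization runs in the tokenizer thread pool instead of on the event loop. A self-contained sketch of that pattern follows; the helpers below only mimic what a make_async-style wrapper does and are not the vLLM implementation.

import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial

def make_async(fn, executor):
    """Wrap a blocking function so that awaiting it runs fn in the executor."""
    async def wrapper(*args, **kwargs):
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(executor, partial(fn, *args, **kwargs))
    return wrapper

def preprocess_score(t1: str, t2: str) -> str:
    # Stand-in for self._preprocess_score: blocking tokenizer work goes here.
    return f"{t1} [SEP] {t2}"

async def main() -> None:
    executor = ThreadPoolExecutor(max_workers=4)
    preprocess_async = make_async(preprocess_score, executor)
    pairs = [("what is vLLM?", "vLLM is an inference engine."),
             ("what is vLLM?", "Berlin is the capital of Germany.")]
    prompts = await asyncio.gather(*(preprocess_async(t1, t2) for t1, t2 in pairs))
    print(prompts)

asyncio.run(main())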
25 changes: 22 additions & 3 deletions vllm/entrypoints/score_utils.py
@@ -5,6 +5,7 @@
from torch.nn import CosineSimilarity
from typing_extensions import Required, TypeAlias, TypedDict

import vllm.envs as envs
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (
BaseMultiModalItemTracker, ChatCompletionContentPartImageEmbedsParam,
@@ -184,13 +185,31 @@ def get_score_prompt(
model_config,
tokenizer,
)
from vllm.model_executor.model_loader import get_model_cls

full_prompt = apply_score_template(model_config, prompt_1, prompt_2)

prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
model = get_model_cls(model_config)
if supports_score_template(model):
full_prompt = apply_score_template(model_config, prompt_1, prompt_2)
prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
elif model_config.use_pad_token:
# cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer(text=prompt_1,
text_pair=prompt_2,
**tokenization_kwargs)
full_prompt = tokenizer.decode(prompt_inputs["input_ids"])
else:
# `llm as reranker` models defaults to not using pad_token.
full_prompt = prompt_1 + prompt_2
prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs)

engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"])

if (token_type_ids := prompt_inputs.get("token_type_ids")) is not None:
if envs.VLLM_USE_V1:
mm_data = {"token_type_ids": token_type_ids, **(mm_data or {})}
else:
engine_prompt["token_type_ids"] = token_type_ids

post_process_tokens(model_config, engine_prompt)

if mm_data is not None:
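The reworked get_score_prompt chooses between a model-provided score template, pad-token pair encoding for cross-encoders, and plain concatenation for "LLM as reranker" models, and under V1 it moves token_type_ids into the multi-modal data instead of a dedicated prompt field. Below is a standalone sketch of the pad-token/concatenation split and the token_type_ids routing using a plain Hugging Face tokenizer; the flags are stand-ins for model_config.use_pad_token and envs.VLLM_USE_V1, and this is not the vLLM API.

from transformers import AutoTokenizer

def build_score_prompt(q: str, d: str, use_pad_token: bool, use_v1: bool) -> dict:
    # Simplified, standalone version of the routing above.
    tok = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-6-v2")
    if use_pad_token:
        # Cross-encoder path: pair encoding yields [CLS] q [SEP] d [SEP]
        # plus token_type_ids that distinguish the two segments.
        enc = tok(text=q, text_pair=d)
    else:
        # "LLM as reranker" path: plain concatenation, no pair encoding.
        enc = tok(text=q + d)

    prompt: dict = {"prompt_token_ids": enc["input_ids"]}
    if (token_type_ids := enc.get("token_type_ids")) is not None:
        if use_v1:
            # V1: token_type_ids travel with the prompt as multi-modal data.
            prompt["multi_modal_data"] = {"token_type_ids": token_type_ids}
        else:
            # V0: they remain a first-class field of the tokens prompt.
            prompt["token_type_ids"] = token_type_ids
    return prompt

print(build_score_prompt("a query", "a document", use_pad_token=True, use_v1=True))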