Commits (33)
5dee54d  Add support for encoder embedding models (maxdebayser, Jun 23, 2025)
b430dba  Use multi-modal support to pass token_type_ids to the model (maxdebayser, Jun 23, 2025)
aad1052  reduce diff (maxdebayser, Jun 24, 2025)
3ca7ced  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 1, 2025)
7006e8a  Fix cuda graphs for BERT models (maxdebayser, Jul 1, 2025)
c99df96  Add token_type_ids multi-modal to LLM._cross_encoding_score (maxdebayser, Jul 2, 2025)
bbe0ea7  fix merge problem (maxdebayser, Jul 2, 2025)
019496a  fix editing mistake (maxdebayser, Jul 2, 2025)
6558bdd  fix missing input ids (maxdebayser, Jul 2, 2025)
33bcc88  fix mistake (maxdebayser, Jul 2, 2025)
a743268  fix tensor not boolean error (maxdebayser, Jul 2, 2025)
6310f4d  appease mypy (maxdebayser, Jul 2, 2025)
1d79887  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 2, 2025)
611217a  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 2, 2025)
024198b  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 8, 2025)
c4dc1a8  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 9, 2025)
3f79324  Fix missing args (maxdebayser, Jul 9, 2025)
f3f075a  fix mm flag in registry test (maxdebayser, Jul 9, 2025)
268099b  remove model from unsupported list (maxdebayser, Jul 10, 2025)
5470c4e  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 10, 2025)
d19dcd4  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 11, 2025)
60696b4  appease linter (maxdebayser, Jul 11, 2025)
0ce2a36  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 11, 2025)
00bfc79  lazy import (maxdebayser, Jul 11, 2025)
2501649  appease linter (maxdebayser, Jul 11, 2025)
28fb913  appease linter (maxdebayser, Jul 11, 2025)
43ddd37  fixes (maxdebayser, Jul 28, 2025)
9a26982  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 28, 2025)
2fd6143  fix argument name (maxdebayser, Jul 28, 2025)
6fde956  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 28, 2025)
a089065  fix merge problem (maxdebayser, Jul 29, 2025)
70cc584  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 29, 2025)
4590707  Merge branch 'upstream_main' into v1_embeddings_full_mm (maxdebayser, Jul 29, 2025)
Files changed
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_score.py
@@ -177,7 +177,7 @@ def test_score_max_model_len(self, server: RemoteOpenAIServer,
})
assert score_response.status_code == 400
# Assert just a small fragments of the response
assert "Please reduce the length of the input." in \
assert "is longer than the maximum model length" in \
score_response.text

# Test truncation
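The updated assertion tracks the new wording of the over-length error returned by the score endpoint. Below is a minimal client-side sketch of the same check; it assumes a locally running vllm serve instance on port 8000 with a cross-encoder model, and the text_1/text_2 request fields used by the score API, so treat the names as illustrative rather than taken from this diff.

import requests

# Hedged sketch: reproduce the 400 response asserted above. Assumes a server
# started with `vllm serve` for a cross-encoder model on localhost:8000;
# the model name below is only an example.
resp = requests.post(
    "http://localhost:8000/score",
    json={
        "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
        "text_1": "a query",
        "text_2": "a document " * 100_000,  # deliberately exceeds max_model_len
    },
)
assert resp.status_code == 400
# The error text is expected to contain "is longer than the maximum model length".
print(resp.text)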
5 changes: 2 additions & 3 deletions tests/models/language/pooling/test_embedding.py
@@ -43,10 +43,9 @@ def v1(run_with_both_engines):
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
marks=[pytest.mark.skip_v1]),
marks=[pytest.mark.skip_v0]),
# [Cross-Encoder]
pytest.param("sentence-transformers/stsb-roberta-base-v2",
marks=[pytest.mark.skip_v1]),
pytest.param("sentence-transformers/stsb-roberta-base-v2"),
],
)
def test_models(
9 changes: 9 additions & 0 deletions tests/models/language/pooling/test_scoring.py
@@ -23,6 +23,15 @@
"The capital of Germany is Berlin.",
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


DTYPE = "half"


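The run_with_both_engines fixture added above comes from the shared test plumbing; together with the skip_v0/skip_v1 marks used in test_embedding.py it runs each test once per engine. A rough sketch of how such a fixture can be written, as an illustration of the pattern rather than the exact conftest.py implementation in vLLM:

import pytest

# Illustrative conftest-style fixture: parametrize every test over the V0 and
# V1 engines and honor skip_v0 / skip_v1 marks. The real vLLM fixture may differ.
@pytest.fixture(params=[True, False], ids=["v1", "v0"])
def run_with_both_engines(request, monkeypatch):
    use_v1 = request.param
    marker = "skip_v1" if use_v1 else "skip_v0"
    if request.node.get_closest_marker(marker):
        pytest.skip(f"test is marked {marker}")
    # vLLM selects the engine through the VLLM_USE_V1 environment variable.
    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
    yield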
6 changes: 3 additions & 3 deletions tests/models/test_registry.py
@@ -49,9 +49,9 @@ def test_registry_imports(model_arch):
("LlamaForCausalLM", False, False, False),
("MllamaForConditionalGeneration", True, False, False),
("LlavaForConditionalGeneration", True, True, False),
("BertForSequenceClassification", False, False, True),
("RobertaForSequenceClassification", False, False, True),
("XLMRobertaForSequenceClassification", False, False, True),
("BertForSequenceClassification", True, False, True),
("RobertaForSequenceClassification", True, False, True),
("XLMRobertaForSequenceClassification", True, False, True),
])
def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
model_info = ModelRegistry._try_inspect_model_cls(model_arch)
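The first flag in each tuple (is_mm) flips to True because these BERT-style classifiers now receive token_type_ids through the multi-modal input path, so the registry reports them as multi-modal while keeping their cross-encoding flag (is_ce). The same properties can be probed outside the test; the method names below are assumed from vLLM's public ModelRegistry and may differ between versions.

from vllm import ModelRegistry

# Hedged sketch: query the registry flags exercised by the test above.
arch = ["BertForSequenceClassification"]
print(ModelRegistry.is_multimodal_model(arch))     # expected True after this change
print(ModelRegistry.is_cross_encoder_model(arch))  # expected True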
38 changes: 11 additions & 27 deletions vllm/entrypoints/llm.py
@@ -1269,34 +1269,18 @@ def _cross_encoding_score(

input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

if model_config.is_multimodal_model:
for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)
model_config = self.llm_engine.model_config

parsed_prompts.append(engine_prompt)
else:
for q, t in input_pairs:
if model_config.use_pad_token:
# cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer(
text=q, # type: ignore[arg-type]
text_pair=t, # type: ignore[arg-type]
**tokenization_kwargs)
else:
# `llm as reranker` models defaults to not using pad_token.
prompt_inputs = tokenizer(
text=q + t, # type: ignore[operator]
**tokenization_kwargs)
engine_prompt = TokensPrompt(
prompt_token_ids=prompt_inputs["input_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))
parsed_prompts.append(engine_prompt)
for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)

parsed_prompts.append(engine_prompt)

self._validate_and_add_requests(
prompts=parsed_prompts,
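With the if/else removed, LLM._cross_encoding_score builds every engine prompt through get_score_prompt, so pad-token cross-encoders and "LLM as reranker" models share one code path. A minimal offline usage sketch that exercises this method; the model name is only an example, and the task argument and output attributes follow vLLM's documented scoring usage, so verify them against your version.

from vllm import LLM

# Hedged sketch: offline scoring flows through _cross_encoding_score above.
llm = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2", task="score")
outputs = llm.score(
    "What is the capital of France?",
    ["The capital of France is Paris.", "The capital of Germany is Berlin."],
)
for output in outputs:
    print(output.outputs.score)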
63 changes: 13 additions & 50 deletions vllm/entrypoints/openai/serving_score.py
@@ -188,56 +188,19 @@ async def _cross_encoding_score(

input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

if self.model_config.is_multimodal_model:

preprocess_async = make_async(self._preprocess_score,
executor=self._tokenizer_executor)

preprocessed_prompts = await asyncio.gather(
*(preprocess_async(request=request,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
data_1=t1,
data_2=t2) for t1, t2 in input_pairs))

for full_prompt, engine_prompt in preprocessed_prompts:
request_prompts.append(full_prompt)
engine_prompts.append(engine_prompt)

else:
tokenize_async = make_async(tokenizer.__call__,
executor=self._tokenizer_executor)
use_pad_token = self.model_config.use_pad_token

if use_pad_token:
# cross_encoder models defaults to using pad_token.
tokenized_prompts = await asyncio.gather(*(
tokenize_async(
text=t1, # type: ignore[arg-type]
text_pair=t2, # type: ignore[arg-type]
**tokenization_kwargs) for t1, t2 in input_pairs))
else:
# `llm as reranker` models defaults to not using pad_token.
tokenized_prompts = await asyncio.gather(*(
tokenize_async(
text=t1 + # type: ignore[operator]
t2,
**tokenization_kwargs) for t1, t2 in input_pairs))

for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs):
sep_token = tokenizer.sep_token if (tokenizer.sep_token
and use_pad_token) else ''
request_prompt = f"{t1}{sep_token}{t2}"

input_ids = prompt_inputs["input_ids"]
text_token_prompt = \
self._validate_input(request, input_ids, request_prompt)
engine_prompt = TokensPrompt(
prompt_token_ids=text_token_prompt["prompt_token_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))

request_prompts.append(request_prompt)
engine_prompts.append(engine_prompt)
preprocess_async = make_async(self._preprocess_score,
executor=self._tokenizer_executor)

preprocessed_prompts = await asyncio.gather(
*(preprocess_async(request=request,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
data_1=t1,
data_2=t2) for t1, t2 in input_pairs))

for full_prompt, engine_prompt in preprocessed_prompts:
request_prompts.append(full_prompt)
engine_prompts.append(engine_prompt)

# Schedule the request and get the result generator.
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
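The server-side scorer now always funnels pairs through self._preprocess_score, wrapped in make_async and dispatched with asyncio.gather, so blocking tokenization runs in the tokenizer thread pool instead of on the event loop. A self-contained sketch of that pattern follows; the helpers below only mimic what a make_async-style wrapper does and are not the vLLM implementation.

import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial

def make_async(fn, executor):
    """Wrap a blocking function so that awaiting it runs fn in the executor."""
    async def wrapper(*args, **kwargs):
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(executor, partial(fn, *args, **kwargs))
    return wrapper

def preprocess_score(t1: str, t2: str) -> str:
    # Stand-in for self._preprocess_score: blocking tokenizer work goes here.
    return f"{t1} [SEP] {t2}"

async def main() -> None:
    executor = ThreadPoolExecutor(max_workers=4)
    preprocess_async = make_async(preprocess_score, executor)
    pairs = [("what is vLLM?", "vLLM is an inference engine."),
             ("what is vLLM?", "Berlin is the capital of Germany.")]
    prompts = await asyncio.gather(*(preprocess_async(t1, t2) for t1, t2 in pairs))
    print(prompts)

asyncio.run(main())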
25 changes: 22 additions & 3 deletions vllm/entrypoints/score_utils.py
@@ -5,6 +5,7 @@
from torch.nn import CosineSimilarity
from typing_extensions import Required, TypeAlias, TypedDict

import vllm.envs as envs
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (
BaseMultiModalItemTracker, ChatCompletionContentPartImageEmbedsParam,
@@ -184,13 +185,31 @@ def get_score_prompt(
model_config,
tokenizer,
)
from vllm.model_executor.model_loader import get_model_cls

full_prompt = apply_score_template(model_config, prompt_1, prompt_2)

prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
model = get_model_cls(model_config)
if supports_score_template(model):
full_prompt = apply_score_template(model_config, prompt_1, prompt_2)
prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
elif model_config.use_pad_token:
# cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer(text=prompt_1,
text_pair=prompt_2,
**tokenization_kwargs)
full_prompt = tokenizer.decode(prompt_inputs["input_ids"])
else:
# `llm as reranker` models defaults to not using pad_token.
full_prompt = prompt_1 + prompt_2
prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs)

engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"])

if (token_type_ids := prompt_inputs.get("token_type_ids")) is not None:
if envs.VLLM_USE_V1:
mm_data = {"token_type_ids": token_type_ids, **(mm_data or {})}
else:
engine_prompt["token_type_ids"] = token_type_ids

post_process_tokens(model_config, engine_prompt)

if mm_data is not None:
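The reworked get_score_prompt chooses between a model-provided score template, pad-token pair encoding for cross-encoders, and plain concatenation for "LLM as reranker" models, and under V1 it moves token_type_ids into the multi-modal data instead of a dedicated prompt field. Below is a standalone sketch of the pad-token/concatenation split and the token_type_ids routing using a plain Hugging Face tokenizer; the flags are stand-ins for model_config.use_pad_token and envs.VLLM_USE_V1, and this is not the vLLM API.

from transformers import AutoTokenizer

def build_score_prompt(q: str, d: str, use_pad_token: bool, use_v1: bool) -> dict:
    # Simplified, standalone version of the routing above.
    tok = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-6-v2")
    if use_pad_token:
        # Cross-encoder path: pair encoding yields [CLS] q [SEP] d [SEP]
        # plus token_type_ids that distinguish the two segments.
        enc = tok(text=q, text_pair=d)
    else:
        # "LLM as reranker" path: plain concatenation, no pair encoding.
        enc = tok(text=q + d)

    prompt: dict = {"prompt_token_ids": enc["input_ids"]}
    if (token_type_ids := enc.get("token_type_ids")) is not None:
        if use_v1:
            # V1: token_type_ids travel with the prompt as multi-modal data.
            prompt["multi_modal_data"] = {"token_type_ids": token_type_ids}
        else:
            # V0: they remain a first-class field of the tokens prompt.
            prompt["token_type_ids"] = token_type_ids
    return prompt

print(build_score_prompt("a query", "a document", use_pad_token=True, use_v1=True))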