From 5610992259eee7cf121a5b8c16e7b7cd01a50142 Mon Sep 17 00:00:00 2001
From: piood <2477084691@qq.com>
Date: Mon, 1 Dec 2025 16:29:58 +0000
Subject: [PATCH 1/7] Support tokenization_kwargs override

Signed-off-by: piood <2477084691@qq.com>
---
 tests/conftest.py                             | 14 ++++++++----
 .../models/multimodal/pooling/test_siglip.py  | 15 +++++++++++--
 vllm/entrypoints/llm.py                       | 22 +++++++++++++++++--
 3 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 317b36ba6cb8..e231a4d2a125 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -397,6 +397,7 @@ def get_inputs(
         images: PromptImageInput | None = None,
         videos: PromptVideoInput | None = None,
         audios: PromptAudioInput | None = None,
+        processor_kwargs: dict[str, Any] | None = None,
     ) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]:
         if images is not None:
             assert len(prompts) == len(images)
@@ -410,10 +411,15 @@ def get_inputs(
         all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = []
         for i, prompt in enumerate(prompts):
             if isinstance(prompt, str):
-                processor_kwargs: dict[str, Any] = {
-                    "text": prompt,
-                    "return_tensors": "pt",
-                }
+                processor_kwargs = (
+                    processor_kwargs if processor_kwargs is not None else {}
+                )
+                processor_kwargs.update(
+                    {
+                        "text": prompt,
+                        "return_tensors": "pt",
+                    }
+                )
                 if images is not None and (image := images[i]) is not None:
                     processor_kwargs["images"] = image
                 if videos is not None and (video := videos[i]) is not None:
diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py
index 92ae115a1983..fb6ae71f9f5c 100644
--- a/tests/models/multimodal/pooling/test_siglip.py
+++ b/tests/models/multimodal/pooling/test_siglip.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from typing import Any
+
 import pytest
 from transformers import SiglipModel
 
@@ -35,6 +37,7 @@ def _run_test(
     model: str,
     *,
     dtype: str,
+    tokenization_kwargs: dict[str, Any],
 ) -> None:
     with vllm_runner(
         model,
@@ -44,10 +47,14 @@ def _run_test(
         max_model_len=64,
         gpu_memory_utilization=0.7,
     ) as vllm_model:
-        vllm_outputs = vllm_model.embed(input_texts, images=input_images)
+        vllm_outputs = vllm_model.embed(
+            input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
+        )
 
     with hf_runner(model, dtype=dtype, auto_cls=SiglipModel) as hf_model:
-        all_inputs = hf_model.get_inputs(input_texts, images=input_images)
+        all_inputs = hf_model.get_inputs(
+            input_texts, images=input_images, processor_kwargs=tokenization_kwargs
+        )
 
         all_outputs = []
         for inputs in all_inputs:
@@ -87,6 +94,7 @@ def test_models_text(
     input_texts = [text for text, _ in input_texts_images]
     input_images = [image for _, image in input_texts_images]
 
+    tokenization_kwargs = {"padding": "max_length", "max_length": 64}
     _run_test(
         hf_runner,
         vllm_runner,
@@ -94,6 +102,7 @@ def test_models_text(
         input_images,  # type: ignore
         model,
         dtype=dtype,
+        tokenization_kwargs=tokenization_kwargs,
     )
 
 
@@ -112,6 +121,7 @@ def test_models_image(
     input_texts = [text for text, _ in input_texts_images]
     input_images = [image for _, image in input_texts_images]
 
+    tokenization_kwargs: dict[str, Any] = {}
     _run_test(
         hf_runner,
         vllm_runner,
@@ -119,6 +129,7 @@ def test_models_image(
         input_images,
         model,
         dtype=dtype,
+        tokenization_kwargs=tokenization_kwargs,
     )
 
 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index f005605c08d7..921bb5568b3d 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1077,6 +1077,7 @@ def encode(
             params=pooling_params,
             use_tqdm=use_tqdm,
             lora_request=lora_request,
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         outputs = self._run_engine(use_tqdm=use_tqdm)
@@ -1114,6 +1115,7 @@ def embed(
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[EmbeddingRequestOutput]:
         """
         Generate an embedding vector for each prompt.
@@ -1151,6 +1153,7 @@ def embed(
             pooling_params=pooling_params,
             lora_request=lora_request,
             pooling_task="embed",
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         return [EmbeddingRequestOutput.from_base(item) for item in items]
@@ -1162,6 +1165,7 @@ def classify(
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[ClassificationRequestOutput]:
         """
         Generate class logits for each prompt.
@@ -1197,6 +1201,7 @@ def classify(
             pooling_params=pooling_params,
             lora_request=lora_request,
             pooling_task="classify",
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         return [ClassificationRequestOutput.from_base(item) for item in items]
@@ -1210,6 +1215,7 @@ def reward(
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[PoolingRequestOutput]:
         """
         Generate rewards for each prompt.
@@ -1237,6 +1243,7 @@ def reward(
             pooling_params=pooling_params,
             truncate_prompt_tokens=truncate_prompt_tokens,
             pooling_task="token_classify",
+            tokenization_kwargs=tokenization_kwargs,
         )
 
     def _embedding_score(
@@ -1248,6 +1255,7 @@ def _embedding_score(
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[ScoringRequestOutput]:
         encoded_output: list[PoolingRequestOutput] = self.encode(
             text_1 + text_2,
@@ -1256,6 +1264,7 @@ def _embedding_score(
             lora_request=lora_request,
             pooling_params=pooling_params,
             pooling_task="embed",
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         encoded_output_1: list[PoolingRequestOutput] = encoded_output[0 : len(text_1)]
@@ -1280,6 +1289,7 @@ def _cross_encoding_score(
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[ScoringRequestOutput]:
         model_config = self.model_config
 
@@ -1295,7 +1305,8 @@ def _cross_encoding_score(
         pooling_params.verify("score", model_config)
         pooling_params_list = list[PoolingParams]()
 
-        tokenization_kwargs: dict[str, Any] = {}
+        if tokenization_kwargs is None:
+            tokenization_kwargs = {}
 
         _validate_truncation_size(
             model_config.max_model_len, truncate_prompt_tokens, tokenization_kwargs
@@ -1329,6 +1340,7 @@ def _cross_encoding_score(
             params=pooling_params_list,
             use_tqdm=use_tqdm,
             lora_request=lora_request,
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         outputs = self._run_engine(use_tqdm=use_tqdm)
@@ -1554,6 +1566,7 @@ def _validate_and_add_requests(
         use_tqdm: bool | Callable[..., tqdm] = True,
         lora_request: Sequence[LoRARequest] | LoRARequest | None,
         priority: list[int] | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> None:
         if isinstance(prompts, (str, dict)):
             # Convert a single prompt to a list.
@@ -1599,6 +1612,7 @@ def _validate_and_add_requests(
                     if isinstance(lora_request, Sequence)
                     else lora_request,
                     priority=priority[i] if priority else 0,
+                    tokenization_kwargs=tokenization_kwargs,
                 )
                 added_request_ids.append(request_id)
         except Exception as e:
@@ -1662,9 +1676,11 @@ def _process_inputs(
         *,
         lora_request: LoRARequest | None,
         priority: int,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> tuple[EngineCoreRequest, dict[str, Any]]:
         """Use the Processor to process inputs for LLMEngine."""
-        tokenization_kwargs: dict[str, Any] = {}
+        if tokenization_kwargs is None:
+            tokenization_kwargs = {}
         _validate_truncation_size(
             self.model_config.max_model_len,
             params.truncate_prompt_tokens,
@@ -1687,6 +1703,7 @@ def _add_request(
         params: SamplingParams | PoolingParams,
         lora_request: LoRARequest | None = None,
         priority: int = 0,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> str:
         prompt_text, _, _ = get_prompt_components(prompt)
         request_id = str(next(self.request_counter))
@@ -1697,6 +1714,7 @@ def _add_request(
             params,
             lora_request=lora_request,
             priority=priority,
+            tokenization_kwargs=tokenization_kwargs,
         )
 
         self.llm_engine.add_request(

From 67b40f538408e074b3cce5b286535cc902666729 Mon Sep 17 00:00:00 2001
From: piood <2477084691@qq.com>
Date: Tue, 2 Dec 2025 04:03:56 +0000
Subject: [PATCH 2/7] Copy tokenization_kwargs in get_inputs; make it optional in _run_test

Signed-off-by: piood <2477084691@qq.com>
---
 tests/conftest.py                              |  7 +++++--
 tests/models/multimodal/pooling/test_siglip.py | 12 ++++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index e231a4d2a125..e0bc90391da6 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -397,7 +397,7 @@ def get_inputs(
         images: PromptImageInput | None = None,
         videos: PromptVideoInput | None = None,
         audios: PromptAudioInput | None = None,
-        processor_kwargs: dict[str, Any] | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]:
         if images is not None:
             assert len(prompts) == len(images)
@@ -411,8 +411,11 @@ def get_inputs(
         all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = []
         for i, prompt in enumerate(prompts):
             if isinstance(prompt, str):
+                # Create a copy to avoid modifying the original dict
                 processor_kwargs = (
-                    processor_kwargs if processor_kwargs is not None else {}
+                    tokenization_kwargs.copy()
+                    if tokenization_kwargs is not None
+                    else {}
                 )
                 processor_kwargs.update(
                     {
diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py
index fb6ae71f9f5c..c77af56c6a09 100644
--- a/tests/models/multimodal/pooling/test_siglip.py
+++ b/tests/models/multimodal/pooling/test_siglip.py
@@ -37,8 +37,11 @@ def _run_test(
     model: str,
     *,
     dtype: str,
-    tokenization_kwargs: dict[str, Any],
+    tokenization_kwargs: dict[str, Any] | None = None,
 ) -> None:
+    if tokenization_kwargs is None:
+        tokenization_kwargs = {}
+
     with vllm_runner(
         model,
         runner="pooling",
@@ -53,7 +56,7 @@ def _run_test(
 
     with hf_runner(model, dtype=dtype, auto_cls=SiglipModel) as hf_model:
         all_inputs = hf_model.get_inputs(
-            input_texts, images=input_images, processor_kwargs=tokenization_kwargs
+            input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
         )
 
         all_outputs = []
@@ -94,7 +97,6 @@ def test_models_text(
     input_texts = [text for text, _ in input_texts_images]
     input_images = [image for _, image in input_texts_images]
 
-    tokenization_kwargs = {"padding": "max_length", "max_length": 64}
     _run_test(
         hf_runner,
         vllm_runner,
@@ -102,7 +104,7 @@ def test_models_text(
         input_images,  # type: ignore
         model,
         dtype=dtype,
-        tokenization_kwargs=tokenization_kwargs,
+        tokenization_kwargs={"padding": "max_length", "max_length": 64},
     )
 
 
@@ -121,7 +123,6 @@ def test_models_image(
     input_texts = [text for text, _ in input_texts_images]
     input_images = [image for _, image in input_texts_images]
 
-    tokenization_kwargs: dict[str, Any] = {}
     _run_test(
         hf_runner,
         vllm_runner,
@@ -129,7 +130,6 @@ def test_models_image(
         input_images,
         model,
         dtype=dtype,
-        tokenization_kwargs=tokenization_kwargs,
     )
 
 

From 9597c5cf83090df245ba75df4a3474f924e76aaf Mon Sep 17 00:00:00 2001
From: piood <2477084691@qq.com>
Date: Tue, 2 Dec 2025 08:11:40 +0000
Subject: [PATCH 3/7] Stop forwarding tokenization_kwargs in _cross_encoding_score

Signed-off-by: piood <2477084691@qq.com>
---
 vllm/entrypoints/llm.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 921bb5568b3d..3d7aa711d858 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1335,12 +1335,14 @@ def _cross_encoding_score(
 
             prompts.append(engine_prompt)
 
+        # tokenization_kwargs has already been consumed by get_score_prompt()
+        # above to tokenize and truncate the prompts, should not pass the same
+        # tokenization_kwargs again to avoid potential duplicate truncation.
         self._validate_and_add_requests(
             prompts=prompts,
             params=pooling_params_list,
             use_tqdm=use_tqdm,
             lora_request=lora_request,
-            tokenization_kwargs=tokenization_kwargs,
         )
 
         outputs = self._run_engine(use_tqdm=use_tqdm)

From d455b2909d3d1255a339477bcb058a4723d0ea4a Mon Sep 17 00:00:00 2001
From: piood <2477084691@qq.com>
Date: Tue, 2 Dec 2025 08:29:31 +0000
Subject: [PATCH 4/7] Copy tokenization_kwargs before validating truncation size

Signed-off-by: piood <2477084691@qq.com>
---
 vllm/entrypoints/llm.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 3d7aa711d858..886c5710a2be 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1305,8 +1305,8 @@ def _cross_encoding_score(
         pooling_params.verify("score", model_config)
         pooling_params_list = list[PoolingParams]()
 
-        if tokenization_kwargs is None:
-            tokenization_kwargs = {}
+        local_kwargs = tokenization_kwargs or {}
+        tokenization_kwargs = local_kwargs.copy()
 
         _validate_truncation_size(
             model_config.max_model_len, truncate_prompt_tokens, tokenization_kwargs
@@ -1681,8 +1681,9 @@ def _process_inputs(
         tokenization_kwargs: dict[str, Any] | None = None,
     ) -> tuple[EngineCoreRequest, dict[str, Any]]:
         """Use the Processor to process inputs for LLMEngine."""
-        if tokenization_kwargs is None:
-            tokenization_kwargs = {}
+
+        local_kwargs = tokenization_kwargs or {}
+        tokenization_kwargs = local_kwargs.copy()
         _validate_truncation_size(
             self.model_config.max_model_len,
             params.truncate_prompt_tokens,

From 7d79e49bd078b91c865e0bc3907b9ea1a141cb4a Mon Sep 17 00:00:00 2001
From: piood <2477084691@qq.com>
Date: Tue, 2 Dec 2025 08:53:37 +0000
Subject: [PATCH 5/7] remove redundant code and comment

Signed-off-by: piood <2477084691@qq.com>
---
 vllm/entrypoints/llm.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 886c5710a2be..127d0bc7f12b 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1335,9 +1335,6 @@ def _cross_encoding_score(
 
             prompts.append(engine_prompt)
 
-        # tokenization_kwargs has already been consumed by get_score_prompt()
-        # above to tokenize and truncate the prompts, should not pass the same
-        # tokenization_kwargs again to avoid potential duplicate truncation.
         self._validate_and_add_requests(
             prompts=prompts,
             params=pooling_params_list,

From 5610239fc806962fed3d12c522a490c5bcad6d19 Mon Sep 17 00:00:00 2001
From: piood <2477084691@qq.com>
Date: Tue, 2 Dec 2025 09:11:15 +0000
Subject: [PATCH 6/7] Add siglip2 padding comment; use local model paths in test

Signed-off-by: piood <2477084691@qq.com>
---
 tests/models/multimodal/pooling/test_siglip.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py
index c77af56c6a09..8cc2f3c7fdb6 100644
--- a/tests/models/multimodal/pooling/test_siglip.py
+++ b/tests/models/multimodal/pooling/test_siglip.py
@@ -22,10 +22,8 @@
 )
 
 MODELS = [
-    "google/siglip-base-patch16-224",
-    "google/siglip2-base-patch16-224",
-    # Different image embedding dim than text_config.hidden_size
-    "google/siglip2-giant-opt-patch16-384",
+    "/workspace/hf_model/siglip-base-patch16-224",
+    "/workspace/hf_model/siglip2-base-patch16-224",
 ]
 
 
@@ -104,7 +102,10 @@ def test_models_text(
         input_images,  # type: ignore
         model,
         dtype=dtype,
-        tokenization_kwargs={"padding": "max_length", "max_length": 64},
+        tokenization_kwargs={
+            "padding": "max_length",
+            "max_length": 64,
+        },  # siglip2 was trained with this padding setting.
     )
 
 

From bdfb80fabba8be117f2441d13264b19610219c0c Mon Sep 17 00:00:00 2001
From: piood <2477084691@qq.com>
Date: Sat, 6 Dec 2025 02:52:05 +0000
Subject: [PATCH 7/7] Restore Hugging Face Hub model IDs in siglip test

Signed-off-by: piood <2477084691@qq.com>
---
 tests/models/multimodal/pooling/test_siglip.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py
index 8cc2f3c7fdb6..72886cbf7f32 100644
--- a/tests/models/multimodal/pooling/test_siglip.py
+++ b/tests/models/multimodal/pooling/test_siglip.py
@@ -22,8 +22,10 @@
 )
 
 MODELS = [
-    "/workspace/hf_model/siglip-base-patch16-224",
-    "/workspace/hf_model/siglip2-base-patch16-224",
+    "google/siglip-base-patch16-224",
+    "google/siglip2-base-patch16-224",
+    # Different image embedding dim than text_config.hidden_size
+    "google/siglip2-giant-opt-patch16-384",
 ]