From 5610992259eee7cf121a5b8c16e7b7cd01a50142 Mon Sep 17 00:00:00 2001 From: piood <2477084691@qq.com> Date: Mon, 1 Dec 2025 16:29:58 +0000 Subject: [PATCH 1/7] Support tokenization_kwargs override Signed-off-by: piood <2477084691@qq.com> --- tests/conftest.py | 14 ++++++++---- .../models/multimodal/pooling/test_siglip.py | 15 +++++++++++-- vllm/entrypoints/llm.py | 22 +++++++++++++++++-- 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 317b36ba6cb8..e231a4d2a125 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -397,6 +397,7 @@ def get_inputs( images: PromptImageInput | None = None, videos: PromptVideoInput | None = None, audios: PromptAudioInput | None = None, + processor_kwargs: dict[str, Any] | None = None, ) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]: if images is not None: assert len(prompts) == len(images) @@ -410,10 +411,15 @@ def get_inputs( all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = [] for i, prompt in enumerate(prompts): if isinstance(prompt, str): - processor_kwargs: dict[str, Any] = { - "text": prompt, - "return_tensors": "pt", - } + processor_kwargs = ( + processor_kwargs if processor_kwargs is not None else {} + ) + processor_kwargs.update( + { + "text": prompt, + "return_tensors": "pt", + } + ) if images is not None and (image := images[i]) is not None: processor_kwargs["images"] = image if videos is not None and (video := videos[i]) is not None: diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py index 92ae115a1983..fb6ae71f9f5c 100644 --- a/tests/models/multimodal/pooling/test_siglip.py +++ b/tests/models/multimodal/pooling/test_siglip.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + import pytest from transformers import SiglipModel @@ -35,6 +37,7 @@ def _run_test( model: 
str, *, dtype: str, + tokenization_kwargs: dict[str, Any], ) -> None: with vllm_runner( model, @@ -44,10 +47,14 @@ def _run_test( max_model_len=64, gpu_memory_utilization=0.7, ) as vllm_model: - vllm_outputs = vllm_model.embed(input_texts, images=input_images) + vllm_outputs = vllm_model.embed( + input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs + ) with hf_runner(model, dtype=dtype, auto_cls=SiglipModel) as hf_model: - all_inputs = hf_model.get_inputs(input_texts, images=input_images) + all_inputs = hf_model.get_inputs( + input_texts, images=input_images, processor_kwargs=tokenization_kwargs + ) all_outputs = [] for inputs in all_inputs: @@ -87,6 +94,7 @@ def test_models_text( input_texts = [text for text, _ in input_texts_images] input_images = [image for _, image in input_texts_images] + tokenization_kwargs = {"padding": "max_length", "max_length": 64} _run_test( hf_runner, vllm_runner, @@ -94,6 +102,7 @@ def test_models_text( input_images, # type: ignore model, dtype=dtype, + tokenization_kwargs=tokenization_kwargs, ) @@ -112,6 +121,7 @@ def test_models_image( input_texts = [text for text, _ in input_texts_images] input_images = [image for _, image in input_texts_images] + tokenization_kwargs: dict[str, Any] = {} _run_test( hf_runner, vllm_runner, @@ -119,6 +129,7 @@ def test_models_image( input_images, model, dtype=dtype, + tokenization_kwargs=tokenization_kwargs, ) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f005605c08d7..921bb5568b3d 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1077,6 +1077,7 @@ def encode( params=pooling_params, use_tqdm=use_tqdm, lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, ) outputs = self._run_engine(use_tqdm=use_tqdm) @@ -1114,6 +1115,7 @@ def embed( use_tqdm: bool | Callable[..., tqdm] = True, pooling_params: PoolingParams | Sequence[PoolingParams] | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, + 
tokenization_kwargs: dict[str, Any] | None = None, ) -> list[EmbeddingRequestOutput]: """ Generate an embedding vector for each prompt. @@ -1151,6 +1153,7 @@ def embed( pooling_params=pooling_params, lora_request=lora_request, pooling_task="embed", + tokenization_kwargs=tokenization_kwargs, ) return [EmbeddingRequestOutput.from_base(item) for item in items] @@ -1162,6 +1165,7 @@ def classify( use_tqdm: bool | Callable[..., tqdm] = True, pooling_params: PoolingParams | Sequence[PoolingParams] | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> list[ClassificationRequestOutput]: """ Generate class logits for each prompt. @@ -1197,6 +1201,7 @@ def classify( pooling_params=pooling_params, lora_request=lora_request, pooling_task="classify", + tokenization_kwargs=tokenization_kwargs, ) return [ClassificationRequestOutput.from_base(item) for item in items] @@ -1210,6 +1215,7 @@ def reward( use_tqdm: bool | Callable[..., tqdm] = True, pooling_params: PoolingParams | Sequence[PoolingParams] | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> list[PoolingRequestOutput]: """ Generate rewards for each prompt. 
@@ -1237,6 +1243,7 @@ def reward( pooling_params=pooling_params, truncate_prompt_tokens=truncate_prompt_tokens, pooling_task="token_classify", + tokenization_kwargs=tokenization_kwargs, ) def _embedding_score( @@ -1248,6 +1255,7 @@ def _embedding_score( use_tqdm: bool | Callable[..., tqdm] = True, pooling_params: PoolingParams | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> list[ScoringRequestOutput]: encoded_output: list[PoolingRequestOutput] = self.encode( text_1 + text_2, @@ -1256,6 +1264,7 @@ def _embedding_score( lora_request=lora_request, pooling_params=pooling_params, pooling_task="embed", + tokenization_kwargs=tokenization_kwargs, ) encoded_output_1: list[PoolingRequestOutput] = encoded_output[0 : len(text_1)] @@ -1280,6 +1289,7 @@ def _cross_encoding_score( use_tqdm: bool | Callable[..., tqdm] = True, pooling_params: PoolingParams | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> list[ScoringRequestOutput]: model_config = self.model_config @@ -1295,7 +1305,8 @@ def _cross_encoding_score( pooling_params.verify("score", model_config) pooling_params_list = list[PoolingParams]() - tokenization_kwargs: dict[str, Any] = {} + if tokenization_kwargs is None: + tokenization_kwargs = {} _validate_truncation_size( model_config.max_model_len, truncate_prompt_tokens, tokenization_kwargs @@ -1329,6 +1340,7 @@ def _cross_encoding_score( params=pooling_params_list, use_tqdm=use_tqdm, lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, ) outputs = self._run_engine(use_tqdm=use_tqdm) @@ -1554,6 +1566,7 @@ def _validate_and_add_requests( use_tqdm: bool | Callable[..., tqdm] = True, lora_request: Sequence[LoRARequest] | LoRARequest | None, priority: list[int] | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> None: if isinstance(prompts, (str, dict)): # Convert a single 
prompt to a list. @@ -1599,6 +1612,7 @@ def _validate_and_add_requests( if isinstance(lora_request, Sequence) else lora_request, priority=priority[i] if priority else 0, + tokenization_kwargs=tokenization_kwargs, ) added_request_ids.append(request_id) except Exception as e: @@ -1662,9 +1676,11 @@ def _process_inputs( *, lora_request: LoRARequest | None, priority: int, + tokenization_kwargs: dict[str, Any] | None = None, ) -> tuple[EngineCoreRequest, dict[str, Any]]: """Use the Processor to process inputs for LLMEngine.""" - tokenization_kwargs: dict[str, Any] = {} + if tokenization_kwargs is None: + tokenization_kwargs = {} _validate_truncation_size( self.model_config.max_model_len, params.truncate_prompt_tokens, @@ -1687,6 +1703,7 @@ def _add_request( params: SamplingParams | PoolingParams, lora_request: LoRARequest | None = None, priority: int = 0, + tokenization_kwargs: dict[str, Any] | None = None, ) -> str: prompt_text, _, _ = get_prompt_components(prompt) request_id = str(next(self.request_counter)) @@ -1697,6 +1714,7 @@ def _add_request( params, lora_request=lora_request, priority=priority, + tokenization_kwargs=tokenization_kwargs, ) self.llm_engine.add_request( From 67b40f538408e074b3cce5b286535cc902666729 Mon Sep 17 00:00:00 2001 From: piood <2477084691@qq.com> Date: Tue, 2 Dec 2025 04:03:56 +0000 Subject: [PATCH 2/7] fix Signed-off-by: piood <2477084691@qq.com> --- tests/conftest.py | 7 +++++-- tests/models/multimodal/pooling/test_siglip.py | 12 ++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e231a4d2a125..e0bc90391da6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -397,7 +397,7 @@ def get_inputs( images: PromptImageInput | None = None, videos: PromptVideoInput | None = None, audios: PromptAudioInput | None = None, - processor_kwargs: dict[str, Any] | None = None, + tokenization_kwargs: dict[str, Any] | None = None, ) -> list[BatchFeature | BatchEncoding | dict[str, 
torch.Tensor]]: if images is not None: assert len(prompts) == len(images) @@ -411,8 +411,11 @@ def get_inputs( all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = [] for i, prompt in enumerate(prompts): if isinstance(prompt, str): + # Create a copy to avoid modifying the original dict processor_kwargs = ( - processor_kwargs if processor_kwargs is not None else {} + tokenization_kwargs.copy() + if tokenization_kwargs is not None + else {} ) processor_kwargs.update( { diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py index fb6ae71f9f5c..c77af56c6a09 100644 --- a/tests/models/multimodal/pooling/test_siglip.py +++ b/tests/models/multimodal/pooling/test_siglip.py @@ -37,8 +37,11 @@ def _run_test( model: str, *, dtype: str, - tokenization_kwargs: dict[str, Any], + tokenization_kwargs: dict[str, Any] | None = None, ) -> None: + if tokenization_kwargs is None: + tokenization_kwargs = {} + with vllm_runner( model, runner="pooling", @@ -53,7 +56,7 @@ def _run_test( with hf_runner(model, dtype=dtype, auto_cls=SiglipModel) as hf_model: all_inputs = hf_model.get_inputs( - input_texts, images=input_images, processor_kwargs=tokenization_kwargs + input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs ) all_outputs = [] @@ -94,7 +97,6 @@ def test_models_text( input_texts = [text for text, _ in input_texts_images] input_images = [image for _, image in input_texts_images] - tokenization_kwargs = {"padding": "max_length", "max_length": 64} _run_test( hf_runner, vllm_runner, @@ -102,7 +104,7 @@ def test_models_text( input_images, # type: ignore model, dtype=dtype, - tokenization_kwargs=tokenization_kwargs, + tokenization_kwargs={"padding": "max_length", "max_length": 64}, ) @@ -121,7 +123,6 @@ def test_models_image( input_texts = [text for text, _ in input_texts_images] input_images = [image for _, image in input_texts_images] - tokenization_kwargs: dict[str, Any] = {} _run_test( 
hf_runner, vllm_runner, @@ -129,7 +130,6 @@ def test_models_image( input_images, model, dtype=dtype, - tokenization_kwargs=tokenization_kwargs, ) From 9597c5cf83090df245ba75df4a3474f924e76aaf Mon Sep 17 00:00:00 2001 From: piood <2477084691@qq.com> Date: Tue, 2 Dec 2025 08:11:40 +0000 Subject: [PATCH 3/7] fix Signed-off-by: piood <2477084691@qq.com> --- vllm/entrypoints/llm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 921bb5568b3d..3d7aa711d858 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1335,12 +1335,14 @@ def _cross_encoding_score( prompts.append(engine_prompt) + # tokenization_kwargs has already been consumed by get_score_prompt() + # above to tokenize and truncate the prompts, should not pass the same + # tokenization_kwargs again to avoid potential duplicate truncation. self._validate_and_add_requests( prompts=prompts, params=pooling_params_list, use_tqdm=use_tqdm, lora_request=lora_request, - tokenization_kwargs=tokenization_kwargs, ) outputs = self._run_engine(use_tqdm=use_tqdm) From d455b2909d3d1255a339477bcb058a4723d0ea4a Mon Sep 17 00:00:00 2001 From: piood <2477084691@qq.com> Date: Tue, 2 Dec 2025 08:29:31 +0000 Subject: [PATCH 4/7] fix Signed-off-by: piood <2477084691@qq.com> --- vllm/entrypoints/llm.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 3d7aa711d858..886c5710a2be 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1305,8 +1305,8 @@ def _cross_encoding_score( pooling_params.verify("score", model_config) pooling_params_list = list[PoolingParams]() - if tokenization_kwargs is None: - tokenization_kwargs = {} + local_kwargs = tokenization_kwargs or {} + tokenization_kwargs = local_kwargs.copy() _validate_truncation_size( model_config.max_model_len, truncate_prompt_tokens, tokenization_kwargs @@ -1681,8 +1681,9 @@ def _process_inputs( 
tokenization_kwargs: dict[str, Any] | None = None, ) -> tuple[EngineCoreRequest, dict[str, Any]]: """Use the Processor to process inputs for LLMEngine.""" - if tokenization_kwargs is None: - tokenization_kwargs = {} + + local_kwargs = tokenization_kwargs or {} + tokenization_kwargs = local_kwargs.copy() _validate_truncation_size( self.model_config.max_model_len, params.truncate_prompt_tokens, From 7d79e49bd078b91c865e0bc3907b9ea1a141cb4a Mon Sep 17 00:00:00 2001 From: piood <2477084691@qq.com> Date: Tue, 2 Dec 2025 08:53:37 +0000 Subject: [PATCH 5/7] remove redundant code and comment Signed-off-by: piood <2477084691@qq.com> --- vllm/entrypoints/llm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 886c5710a2be..127d0bc7f12b 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1335,9 +1335,6 @@ def _cross_encoding_score( prompts.append(engine_prompt) - # tokenization_kwargs has already been consumed by get_score_prompt() - # above to tokenize and truncate the prompts, should not pass the same - # tokenization_kwargs again to avoid potential duplicate truncation. 
self._validate_and_add_requests( prompts=prompts, params=pooling_params_list, From 5610239fc806962fed3d12c522a490c5bcad6d19 Mon Sep 17 00:00:00 2001 From: piood <2477084691@qq.com> Date: Tue, 2 Dec 2025 09:11:15 +0000 Subject: [PATCH 6/7] add comment Signed-off-by: piood <2477084691@qq.com> --- tests/models/multimodal/pooling/test_siglip.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py index c77af56c6a09..8cc2f3c7fdb6 100644 --- a/tests/models/multimodal/pooling/test_siglip.py +++ b/tests/models/multimodal/pooling/test_siglip.py @@ -22,10 +22,8 @@ ) MODELS = [ - "google/siglip-base-patch16-224", - "google/siglip2-base-patch16-224", - # Different image embedding dim than text_config.hidden_size - "google/siglip2-giant-opt-patch16-384", + "/workspace/hf_model/siglip-base-patch16-224", + "/workspace/hf_model/siglip2-base-patch16-224", ] @@ -104,7 +102,10 @@ def test_models_text( input_images, # type: ignore model, dtype=dtype, - tokenization_kwargs={"padding": "max_length", "max_length": 64}, + tokenization_kwargs={ + "padding": "max_length", + "max_length": 64, + }, # siglip2 was trained with this padding setting. 
) From bdfb80fabba8be117f2441d13264b19610219c0c Mon Sep 17 00:00:00 2001 From: piood <2477084691@qq.com> Date: Sat, 6 Dec 2025 02:52:05 +0000 Subject: [PATCH 7/7] Restore original model IDs in test_siglip MODELS (replace local /workspace paths) Signed-off-by: piood <2477084691@qq.com> --- tests/models/multimodal/pooling/test_siglip.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/multimodal/pooling/test_siglip.py b/tests/models/multimodal/pooling/test_siglip.py index 8cc2f3c7fdb6..72886cbf7f32 100644 --- a/tests/models/multimodal/pooling/test_siglip.py +++ b/tests/models/multimodal/pooling/test_siglip.py @@ -22,8 +22,10 @@ ) MODELS = [ - "/workspace/hf_model/siglip-base-patch16-224", - "/workspace/hf_model/siglip2-base-patch16-224", + "google/siglip-base-patch16-224", + "google/siglip2-base-patch16-224", + # Different image embedding dim than text_config.hidden_size + "google/siglip2-giant-opt-patch16-384", ]