From 826ac0bd6e571dec475ff2fcc8b0a3825dc6e27d Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Mon, 9 Feb 2026 14:26:51 +0800 Subject: [PATCH 01/44] wip: add io_process_plugin for sparse embedding Signed-off-by: augusto.yjh --- .../bge_m3_sparse_processor/__init__.py | 6 + .../sparse_embeddings_processor.py | 152 ++++++++++++++++++ .../bge_m3_sparse_processor/types.py | 24 +++ tests/plugins/bge_m3_sparse_plugin/setup.py | 15 ++ vllm/entrypoints/pooling/pooling/serving.py | 4 +- 5 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/__init__.py create mode 100644 tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py create mode 100644 tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py create mode 100644 tests/plugins/bge_m3_sparse_plugin/setup.py diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/__init__.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/__init__.py new file mode 100644 index 000000000000..a428be6fc0ec --- /dev/null +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +def register_bge_m3_sparse_embeddings_processor(): + return "bge_m3_sparse_processor.sparse_embeddings_processor.BgeM3SparseEmbeddingsProcessor" # noqa: E501 diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py new file mode 100644 index 000000000000..e2172eceeacb --- /dev/null +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Sequence 
+from typing import Any + +from vllm.config import VllmConfig +from vllm.entrypoints.openai.engine.protocol import UsageInfo +from vllm.entrypoints.pooling.base.protocol import CompletionRequestMixin +from vllm.entrypoints.pooling.pooling.protocol import ( + IOProcessorRequest, + IOProcessorResponse, +) +from vllm.inputs.data import PromptType +from vllm.logger import init_logger +from vllm.outputs import PoolingRequestOutput +from vllm.plugins.io_processors.interface import ( + IOProcessor, + IOProcessorInput, + IOProcessorOutput, +) +from vllm.pooling_params import PoolingParams +from vllm.renderers import TokenizeParams +from vllm.renderers.inputs.preprocess import parse_model_prompt, prompt_to_seq +from vllm.sampling_params import SamplingParams + +from .types import ( + SparseEmbeddingCompletionRequestMixin, + SparseEmbeddingResponse, + SparseEmbeddingResponseData, +) + +logger = init_logger(__name__) + + +class BgeM3SparseEmbeddingsProcessor(IOProcessor): + def __init__(self, vllm_config: VllmConfig): + super().__init__(vllm_config) + self.max_model_len = vllm_config.model_config.max_model_len + assert self.max_model_len, f"max_model_len is not configured, {vllm_config=}" + + def validate_or_generate_params( + self, params: SamplingParams | PoolingParams | None = None, request: Any = None + ) -> SamplingParams | PoolingParams: + if request: + params = PoolingParams( + task=request.task, + truncate_prompt_tokens=request.truncate_prompt_tokens + if isinstance(request, IOProcessorRequest) + else None, + ) + return params + + def parse_request(self, request: Any) -> IOProcessorInput: + # for vllm.entrypoints.llm.LLM, offline mode, calls `encode` directly. 
+ if type(request) is dict and "data" in request: + return SparseEmbeddingCompletionRequestMixin(input=[request["data"]]) + + # for online serving `pooling` endpoint + if isinstance(request, IOProcessorRequest): + if not hasattr(request, "data"): + raise ValueError("missing 'data' field in OpenAIBaseModel Request") + request_data = request.data + kwargs = {"truncate_prompt_tokens": request.truncate_prompt_tokens} + if type(request_data) is list: + kwargs["input"] = request_data + return SparseEmbeddingCompletionRequestMixin(**kwargs) + if type(request_data) is str: + kwargs["input"] = [request_data] + return SparseEmbeddingCompletionRequestMixin(**kwargs) + if type(request_data) is dict: + kwargs.update(request_data) + return SparseEmbeddingCompletionRequestMixin(**kwargs) + raise ValueError("Unable to parse request") + + def pre_process( + self, + prompt: IOProcessorInput, + request_id: str | None = None, + **kwargs, + ) -> PromptType | Sequence[PromptType]: + prompts = prompt_to_seq(prompt.input) + if "renderer" not in kwargs: + return prompts + renderer = kwargs["renderer"] + parsed_prompts = [ + parse_model_prompt(self.vllm_config.model_config, prompt) + for prompt in prompts + ] + engine_prompts = renderer.render_cmpl( + parsed_prompts, + self._build_render_params(prompt), + ) + return engine_prompts + + def post_process( + self, + model_output: Sequence[PoolingRequestOutput], + request_id: str | None = None, + **kwargs, + ) -> IOProcessorOutput: + num_prompt_tokens = 0 + response_data = [] + for idx in range(len(model_output)): + mo = model_output[idx] + sparse_embedding = {} + num_prompt_tokens += len(mo.prompt_token_ids) + if len(mo.prompt_token_ids) != len(mo.outputs.data): + # this is the case that add_special_tokens is True, + # which means first token and last token are special tokens + mo.prompt_token_ids = mo.prompt_token_ids[1:] + for token_id, weight in zip(mo.prompt_token_ids, mo.outputs.data): + if token_id not in sparse_embedding: + 
sparse_embedding[token_id] = weight + continue + if weight > sparse_embedding[token_id]: + sparse_embedding[token_id] = weight + response_data.append( + SparseEmbeddingResponseData( + index=idx, sparse_embedding=sparse_embedding + ) + ) + usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + total_tokens=num_prompt_tokens, + ) + resp = SparseEmbeddingResponse( + request_id=request_id, + data=response_data, + usage=usage, + ) + + return resp + + def output_to_response( + self, plugin_output: IOProcessorOutput + ) -> IOProcessorResponse: + return IOProcessorResponse( + request_id=plugin_output.request_id, + data=plugin_output, + ) + + def _build_render_params(self, request: CompletionRequestMixin): + encoder_config = self.vllm_config.model_config.encoder_config or {} + return TokenizeParams( + max_total_tokens=self.max_model_len, + max_output_tokens=0, + truncate_prompt_tokens=request.truncate_prompt_tokens, + do_lower_case=encoder_config.get("do_lower_case", False), + add_special_tokens=request.add_special_tokens, + max_total_tokens_param="max_model_len", + ) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py new file mode 100644 index 000000000000..63c3f9abaca1 --- /dev/null +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Annotated + +from pydantic import BaseModel, Field + +from vllm.entrypoints.openai.engine.protocol import UsageInfo +from vllm.entrypoints.pooling.base.protocol import CompletionRequestMixin + + +class SparseEmbeddingCompletionRequestMixin(CompletionRequestMixin): + truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None + + +class SparseEmbeddingResponseData(BaseModel): + index: int + object: str = "sparse-embedding" + sparse_embedding: dict[int, float] + + +class 
SparseEmbeddingResponse(BaseModel): + request_id: str | None + data: list[SparseEmbeddingResponseData] + usage: UsageInfo diff --git a/tests/plugins/bge_m3_sparse_plugin/setup.py b/tests/plugins/bge_m3_sparse_plugin/setup.py new file mode 100644 index 000000000000..7bc01399f73b --- /dev/null +++ b/tests/plugins/bge_m3_sparse_plugin/setup.py @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from setuptools import setup + +setup( + name="bge-m3-sparse-plugin", + version="0.1", + packages=["bge_m3_sparse_processor"], + entry_points={ + "vllm.io_processor_plugins": [ + "bge_m3_sparse_plugin = bge_m3_sparse_processor:register_bge_m3_sparse_embeddings_processor", # noqa: E501 + ] + }, +) diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index f27a27191f99..6ee928eec0bc 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -105,7 +105,9 @@ async def create_pooling( validated_prompt = self.io_processor.parse_data(request.data) raw_prompts = await self.io_processor.pre_process_async( - prompt=validated_prompt, request_id=request_id + prompt=validated_prompt, + request_id=request_id, + renderer=self.renderer, ) engine_prompts = await self._preprocess_cmpl( request, From 1687d0cc9a83dcab86535692a8152ffcaf918050 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Tue, 10 Feb 2026 15:35:22 +0800 Subject: [PATCH 02/44] fix bugs for offline mode with array Signed-off-by: augusto.yjh --- .../bge_m3_sparse_processor/sparse_embeddings_processor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index e2172eceeacb..720b459bf2f0 100644 --- 
a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -54,7 +54,9 @@ def validate_or_generate_params( def parse_request(self, request: Any) -> IOProcessorInput: # for vllm.entrypoints.llm.LLM, offline mode, calls `encode` directly. if type(request) is dict and "data" in request: - return SparseEmbeddingCompletionRequestMixin(input=[request["data"]]) + if isinstance(request["data"], str): + return SparseEmbeddingCompletionRequestMixin(input=[request["data"]]) + return SparseEmbeddingCompletionRequestMixin(input=request["data"]) # for online serving `pooling` endpoint if isinstance(request, IOProcessorRequest): From 2e4dc2824ec7ec862a219f24aa15fe4723603290 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Tue, 10 Feb 2026 16:07:04 +0800 Subject: [PATCH 03/44] update code with Gemini suggestions Signed-off-by: augusto.yjh --- .../sparse_embeddings_processor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index 720b459bf2f0..6785d77e70ad 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -49,11 +49,11 @@ def validate_or_generate_params( if isinstance(request, IOProcessorRequest) else None, ) - return params + return params or PoolingParams() def parse_request(self, request: Any) -> IOProcessorInput: # for vllm.entrypoints.llm.LLM, offline mode, calls `encode` directly. 
- if type(request) is dict and "data" in request: + if isinstance(request, dict) and "data" in request: if isinstance(request["data"], str): return SparseEmbeddingCompletionRequestMixin(input=[request["data"]]) return SparseEmbeddingCompletionRequestMixin(input=request["data"]) @@ -64,13 +64,13 @@ def parse_request(self, request: Any) -> IOProcessorInput: raise ValueError("missing 'data' field in OpenAIBaseModel Request") request_data = request.data kwargs = {"truncate_prompt_tokens": request.truncate_prompt_tokens} - if type(request_data) is list: + if isinstance(request_data, list): kwargs["input"] = request_data return SparseEmbeddingCompletionRequestMixin(**kwargs) - if type(request_data) is str: + if isinstance(request_data, str): kwargs["input"] = [request_data] return SparseEmbeddingCompletionRequestMixin(**kwargs) - if type(request_data) is dict: + if isinstance(request_data, dict): kwargs.update(request_data) return SparseEmbeddingCompletionRequestMixin(**kwargs) raise ValueError("Unable to parse request") From 4589b09c25c0cbf70b03cac0769435a1bc16ee30 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Wed, 11 Feb 2026 10:50:02 +0800 Subject: [PATCH 04/44] update bge_m3_sparse_plugin with simple code to construct sparse embedding dict Signed-off-by: augusto.yjh --- .../sparse_embeddings_processor.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index 6785d77e70ad..faf32d695de9 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -105,18 +105,16 @@ def post_process( response_data = [] for idx in range(len(model_output)): mo = model_output[idx] - sparse_embedding = {} + sparse_embedding: dict[int, float] 
= {} num_prompt_tokens += len(mo.prompt_token_ids) if len(mo.prompt_token_ids) != len(mo.outputs.data): # this is the case that add_special_tokens is True, # which means first token and last token are special tokens mo.prompt_token_ids = mo.prompt_token_ids[1:] for token_id, weight in zip(mo.prompt_token_ids, mo.outputs.data): - if token_id not in sparse_embedding: - sparse_embedding[token_id] = weight - continue - if weight > sparse_embedding[token_id]: - sparse_embedding[token_id] = weight + sparse_embedding[token_id] = max( + weight, sparse_embedding.get(token_id, 0.0) + ) response_data.append( SparseEmbeddingResponseData( index=idx, sparse_embedding=sparse_embedding From b2e15fe49c52ccf7944743cc39ac0d331f10e8da Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Wed, 11 Feb 2026 15:13:32 +0800 Subject: [PATCH 05/44] add params to determine whether output token_id to token text mapping Signed-off-by: augusto.yjh --- .../bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py index 63c3f9abaca1..095fcbb83c71 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py @@ -10,6 +10,11 @@ class SparseEmbeddingCompletionRequestMixin(CompletionRequestMixin): truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None + return_token_id_texts_map: bool | None = Field( + default=None, + description="Whether to return dict shows the mapping of token_id to text." 
+ "`None` or False means not return.", + ) class SparseEmbeddingResponseData(BaseModel): From 269a8b74fab1984c6d23df6f0044e487219dfd76 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Fri, 13 Feb 2026 18:30:01 +0800 Subject: [PATCH 06/44] update bge_m3_sparse_plugin Signed-off-by: augusto.yjh --- .../sparse_embeddings_processor.py | 113 ++++++++---------- .../bge_m3_sparse_processor/types.py | 11 +- vllm/entrypoints/llm.py | 4 +- 3 files changed, 60 insertions(+), 68 deletions(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index faf32d695de9..9ac607f621ca 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -6,9 +6,7 @@ from vllm.config import VllmConfig from vllm.entrypoints.openai.engine.protocol import UsageInfo -from vllm.entrypoints.pooling.base.protocol import CompletionRequestMixin from vllm.entrypoints.pooling.pooling.protocol import ( - IOProcessorRequest, IOProcessorResponse, ) from vllm.inputs.data import PromptType @@ -20,14 +18,14 @@ IOProcessorOutput, ) from vllm.pooling_params import PoolingParams -from vllm.renderers import TokenizeParams -from vllm.renderers.inputs.preprocess import parse_model_prompt, prompt_to_seq +from vllm.renderers import BaseRenderer from vllm.sampling_params import SamplingParams from .types import ( SparseEmbeddingCompletionRequestMixin, SparseEmbeddingResponse, SparseEmbeddingResponseData, + SparseEmbeddingTokenWeight, ) logger = init_logger(__name__) @@ -36,64 +34,60 @@ class BgeM3SparseEmbeddingsProcessor(IOProcessor): def __init__(self, vllm_config: VllmConfig): super().__init__(vllm_config) - self.max_model_len = vllm_config.model_config.max_model_len - assert self.max_model_len, f"max_model_len is not configured, 
{vllm_config=}" + self.offline_requests: list[SparseEmbeddingCompletionRequestMixin] = [] + self.online_requests: dict[str, SparseEmbeddingCompletionRequestMixin] = {} + self.renderer: BaseRenderer = None def validate_or_generate_params( - self, params: SamplingParams | PoolingParams | None = None, request: Any = None + self, params: SamplingParams | PoolingParams | None = None ) -> SamplingParams | PoolingParams: - if request: - params = PoolingParams( - task=request.task, - truncate_prompt_tokens=request.truncate_prompt_tokens - if isinstance(request, IOProcessorRequest) - else None, - ) - return params or PoolingParams() + return params - def parse_request(self, request: Any) -> IOProcessorInput: + def parse_request(self, request_data: Any) -> IOProcessorInput: # for vllm.entrypoints.llm.LLM, offline mode, calls `encode` directly. - if isinstance(request, dict) and "data" in request: - if isinstance(request["data"], str): - return SparseEmbeddingCompletionRequestMixin(input=[request["data"]]) - return SparseEmbeddingCompletionRequestMixin(input=request["data"]) - - # for online serving `pooling` endpoint - if isinstance(request, IOProcessorRequest): - if not hasattr(request, "data"): - raise ValueError("missing 'data' field in OpenAIBaseModel Request") - request_data = request.data - kwargs = {"truncate_prompt_tokens": request.truncate_prompt_tokens} - if isinstance(request_data, list): - kwargs["input"] = request_data - return SparseEmbeddingCompletionRequestMixin(**kwargs) - if isinstance(request_data, str): - kwargs["input"] = [request_data] - return SparseEmbeddingCompletionRequestMixin(**kwargs) - if isinstance(request_data, dict): - kwargs.update(request_data) - return SparseEmbeddingCompletionRequestMixin(**kwargs) - raise ValueError("Unable to parse request") + if isinstance(request_data, dict): + return SparseEmbeddingCompletionRequestMixin(**request_data) + raise ValueError("Unable to parse request_data") def pre_process( self, - prompt: 
IOProcessorInput, + prompt: SparseEmbeddingCompletionRequestMixin, request_id: str | None = None, **kwargs, ) -> PromptType | Sequence[PromptType]: - prompts = prompt_to_seq(prompt.input) - if "renderer" not in kwargs: - return prompts - renderer = kwargs["renderer"] - parsed_prompts = [ - parse_model_prompt(self.vllm_config.model_config, prompt) - for prompt in prompts - ] - engine_prompts = renderer.render_cmpl( - parsed_prompts, - self._build_render_params(prompt), - ) - return engine_prompts + if request_id is not None: + assert request_id not in self.online_requests, "request_id duplicated" + self.online_requests[request_id] = prompt + else: + self.offline_requests.append(prompt) + if self.renderer is None and "renderer" in kwargs: + self.renderer = kwargs["renderer"] + return prompt + + def _get_sparse_embedding_request(self, request_id: str | None = None): + if request_id: + return self.online_requests.pop(request_id) + return self.offline_requests.pop() + + def _build_sparse_embedding_token_weights( + self, request_id: str | None, sparse_embedding: dict[int, float] + ) -> list[SparseEmbeddingTokenWeight]: + request = self._get_sparse_embedding_request(request_id) + assert request, "illegal request" + token_ids = sparse_embedding.keys() + token_weights = sparse_embedding.values() + tokens = [None] * len(token_ids) + + if request.return_token_id_texts_map: + tokens = self.renderer.get_tokenizer().convert_ids_to_tokens(token_ids) + sparse_embedding_output: list[SparseEmbeddingTokenWeight] = [] + for token_id, weight, token in zip(token_ids, token_weights, tokens): + sparse_embedding_output.append( + SparseEmbeddingTokenWeight( + token_id=token_id, weight=weight, token=token + ) + ) + return sparse_embedding_output def post_process( self, @@ -117,9 +111,13 @@ def post_process( ) response_data.append( SparseEmbeddingResponseData( - index=idx, sparse_embedding=sparse_embedding + index=idx, + sparse_embedding=self._build_sparse_embedding_token_weights( + 
request_id, sparse_embedding + ), ) ) + usage = UsageInfo( prompt_tokens=num_prompt_tokens, total_tokens=num_prompt_tokens, @@ -139,14 +137,3 @@ def output_to_response( request_id=plugin_output.request_id, data=plugin_output, ) - - def _build_render_params(self, request: CompletionRequestMixin): - encoder_config = self.vllm_config.model_config.encoder_config or {} - return TokenizeParams( - max_total_tokens=self.max_model_len, - max_output_tokens=0, - truncate_prompt_tokens=request.truncate_prompt_tokens, - do_lower_case=encoder_config.get("do_lower_case", False), - add_special_tokens=request.add_special_tokens, - max_total_tokens_param="max_model_len", - ) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py index 095fcbb83c71..41efbac289a4 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Annotated from pydantic import BaseModel, Field @@ -9,7 +8,6 @@ class SparseEmbeddingCompletionRequestMixin(CompletionRequestMixin): - truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None return_token_id_texts_map: bool | None = Field( default=None, description="Whether to return dict shows the mapping of token_id to text." 
@@ -17,13 +15,18 @@ class SparseEmbeddingCompletionRequestMixin(CompletionRequestMixin): ) +class SparseEmbeddingTokenWeight(BaseModel): + token_id: int + weight: float + token: str | None + + class SparseEmbeddingResponseData(BaseModel): index: int object: str = "sparse-embedding" - sparse_embedding: dict[int, float] + sparse_embedding: list[SparseEmbeddingTokenWeight] class SparseEmbeddingResponse(BaseModel): - request_id: str | None data: list[SparseEmbeddingResponseData] usage: UsageInfo diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2d925d0a992b..e0af9b0f20d5 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1102,7 +1102,9 @@ def encode( validated_prompt = self.io_processor.parse_data(prompt_data) # obtain the actual model prompts from the pre-processor - prompts = self.io_processor.pre_process(prompt=validated_prompt) + prompts = self.io_processor.pre_process( + prompt=validated_prompt, renderer=self.renderer + ) prompts_seq = prompt_to_seq(prompts) params_seq: Sequence[PoolingParams] = [ From 75027a7477bb73458b686d0be0814f8819e399de Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Sat, 14 Feb 2026 09:58:05 +0800 Subject: [PATCH 07/44] add input param for sparse embedding Signed-off-by: augusto.yjh --- .../bge_m3_sparse_processor/sparse_embeddings_processor.py | 2 +- .../bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index 9ac607f621ca..dc94d76cf656 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -62,7 +62,7 @@ def pre_process( self.offline_requests.append(prompt) if self.renderer is None and "renderer" in 
kwargs: self.renderer = kwargs["renderer"] - return prompt + return prompt.input def _get_sparse_embedding_request(self, request_id: str | None = None): if request_id: diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py index 41efbac289a4..8d3741315f5b 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py @@ -8,6 +8,7 @@ class SparseEmbeddingCompletionRequestMixin(CompletionRequestMixin): + input: list[int] | list[list[int]] | str | list[str] return_token_id_texts_map: bool | None = Field( default=None, description="Whether to return dict shows the mapping of token_id to text." From 3901619f09f247f6ef5f567fefa7828c68b03e6b Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Sat, 14 Feb 2026 10:31:57 +0800 Subject: [PATCH 08/44] update interface for io_processor_plugin Signed-off-by: augusto.yjh --- .../sparse_embeddings_processor.py | 16 +++++++++++----- vllm/entrypoints/pooling/pooling/serving.py | 4 +++- vllm/plugins/io_processors/interface.py | 3 ++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index dc94d76cf656..d36c78d7227e 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -19,7 +19,6 @@ ) from vllm.pooling_params import PoolingParams from vllm.renderers import BaseRenderer -from vllm.sampling_params import SamplingParams from .types import ( SparseEmbeddingCompletionRequestMixin, @@ -38,10 +37,17 @@ def __init__(self, vllm_config: VllmConfig): self.online_requests: dict[str, SparseEmbeddingCompletionRequestMixin] = {} 
self.renderer: BaseRenderer = None - def validate_or_generate_params( - self, params: SamplingParams | PoolingParams | None = None - ) -> SamplingParams | PoolingParams: - return params + def merge_pooling_params( + self, + params: PoolingParams | None = None, + request: Any = None, + ) -> PoolingParams: + if params is None: + params = PoolingParams() + # refer to PoolingCompletionRequest.to_pooling_params + if request is not None: + params.task = request.task + params.truncate_prompt_tokens = request.truncate_prompt_tokens def parse_request(self, request_data: Any) -> IOProcessorInput: # for vllm.entrypoints.llm.LLM, offline mode, calls `encode` directly. diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index 6ee928eec0bc..a98182f93d68 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -147,7 +147,9 @@ async def create_pooling( if use_io_processor: assert self.io_processor is not None - pooling_params = self.io_processor.merge_pooling_params() + pooling_params = self.io_processor.merge_pooling_params( + params=None, request=request + ) if pooling_params.task is None: pooling_params.task = "plugin" else: diff --git a/vllm/plugins/io_processors/interface.py b/vllm/plugins/io_processors/interface.py index fa71b4ca0995..cd273acb0d43 100644 --- a/vllm/plugins/io_processors/interface.py +++ b/vllm/plugins/io_processors/interface.py @@ -3,7 +3,7 @@ import warnings from abc import ABC, abstractmethod from collections.abc import AsyncGenerator, Sequence -from typing import Generic, TypeVar +from typing import Any, Generic, TypeVar from vllm.config import VllmConfig from vllm.inputs.data import PromptType @@ -62,6 +62,7 @@ def merge_sampling_params( def merge_pooling_params( self, params: PoolingParams | None = None, + request: Any = None, ) -> PoolingParams: if callable( validate_or_generate_params := getattr( From 4850b7ef53abb0008226b9e988856b561d055495 Mon 
Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Sat, 14 Feb 2026 10:33:20 +0800 Subject: [PATCH 09/44] add return Signed-off-by: augusto.yjh --- .../bge_m3_sparse_processor/sparse_embeddings_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index d36c78d7227e..2d63c3176e03 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -48,6 +48,7 @@ def merge_pooling_params( if request is not None: params.task = request.task params.truncate_prompt_tokens = request.truncate_prompt_tokens + return params def parse_request(self, request_data: Any) -> IOProcessorInput: # for vllm.entrypoints.llm.LLM, offline mode, calls `encode` directly. From 79946b8bc48cf92d4afbb09eee6dba7f1371fa1b Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Sat, 14 Feb 2026 11:46:10 +0800 Subject: [PATCH 10/44] fix bugs in post_process Signed-off-by: augusto.yjh --- .../sparse_embeddings_processor.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index 2d63c3176e03..d11ca29aa891 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -73,19 +73,19 @@ def pre_process( def _get_sparse_embedding_request(self, request_id: str | None = None): if request_id: - return self.online_requests.pop(request_id) + return self.online_requests.pop(request_id, None) return self.offline_requests.pop() def 
_build_sparse_embedding_token_weights( - self, request_id: str | None, sparse_embedding: dict[int, float] + self, + sparse_embedding: dict[int, float], + return_tokens: bool = False, ) -> list[SparseEmbeddingTokenWeight]: - request = self._get_sparse_embedding_request(request_id) - assert request, "illegal request" token_ids = sparse_embedding.keys() token_weights = sparse_embedding.values() tokens = [None] * len(token_ids) - if request.return_token_id_texts_map: + if return_tokens: tokens = self.renderer.get_tokenizer().convert_ids_to_tokens(token_ids) sparse_embedding_output: list[SparseEmbeddingTokenWeight] = [] for token_id, weight, token in zip(token_ids, token_weights, tokens): @@ -104,6 +104,9 @@ def post_process( ) -> IOProcessorOutput: num_prompt_tokens = 0 response_data = [] + return_tokens = self._get_sparse_embedding_request( + request_id + ).return_token_id_texts_map for idx in range(len(model_output)): mo = model_output[idx] sparse_embedding: dict[int, float] = {} @@ -112,7 +115,7 @@ def post_process( # this is the case that add_special_tokens is True, # which means first token and last token are special tokens mo.prompt_token_ids = mo.prompt_token_ids[1:] - for token_id, weight in zip(mo.prompt_token_ids, mo.outputs.data): + for token_id, weight in zip(mo.prompt_token_ids, mo.outputs.data.tolist()): sparse_embedding[token_id] = max( weight, sparse_embedding.get(token_id, 0.0) ) @@ -120,7 +123,8 @@ def post_process( SparseEmbeddingResponseData( index=idx, sparse_embedding=self._build_sparse_embedding_token_weights( - request_id, sparse_embedding + sparse_embedding, + return_tokens, ), ) ) From 9c20e6ff030137f01ff41302d52d91b56d314ddd Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Sat, 14 Feb 2026 11:48:22 +0800 Subject: [PATCH 11/44] fix bugs in post_process Signed-off-by: augusto.yjh --- .../bge_m3_sparse_processor/sparse_embeddings_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index d11ca29aa891..026441a97d76 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -134,7 +134,6 @@ def post_process( total_tokens=num_prompt_tokens, ) resp = SparseEmbeddingResponse( - request_id=request_id, data=response_data, usage=usage, ) From d3c2d8ddb1a1c58b0f16509bee11f5ed3a3d36ff Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Sat, 14 Feb 2026 11:51:57 +0800 Subject: [PATCH 12/44] make plugin compatible with main branch Signed-off-by: augusto.yjh --- .../sparse_embeddings_processor.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index 026441a97d76..c07de46275c8 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -6,9 +6,6 @@ from vllm.config import VllmConfig from vllm.entrypoints.openai.engine.protocol import UsageInfo -from vllm.entrypoints.pooling.pooling.protocol import ( - IOProcessorResponse, -) from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.outputs import PoolingRequestOutput @@ -139,11 +136,3 @@ def post_process( ) return resp - - def output_to_response( - self, plugin_output: IOProcessorOutput - ) -> IOProcessorResponse: - return IOProcessorResponse( - request_id=plugin_output.request_id, - data=plugin_output, - ) From 07e263382443d98453fa4247c8979ebc76e95c49 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Sat, 14 Feb 2026 12:04:22 
+0800 Subject: [PATCH 13/44] make plugin compatible with offline mode Signed-off-by: augusto.yjh --- .../bge_m3_sparse_processor/sparse_embeddings_processor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index c07de46275c8..4c6db2d4d738 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -43,8 +43,10 @@ def merge_pooling_params( params = PoolingParams() # refer to PoolingCompletionRequest.to_pooling_params if request is not None: - params.task = request.task - params.truncate_prompt_tokens = request.truncate_prompt_tokens + if params.task is None: + params.task = request.task + if params.truncate_prompt_tokens is None: + params.truncate_prompt_tokens = request.truncate_prompt_tokens return params def parse_request(self, request_data: Any) -> IOProcessorInput: From 32bda353a37dc404bf9baca1013a650ecf0add00 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Sat, 14 Feb 2026 14:12:58 +0800 Subject: [PATCH 14/44] update pooling params for online mode Signed-off-by: augusto.yjh --- .../bge_m3_sparse_processor/sparse_embeddings_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index 4c6db2d4d738..0a2d8ea4fa97 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -6,6 +6,7 @@ from vllm.config import VllmConfig from vllm.entrypoints.openai.engine.protocol import 
UsageInfo +from vllm.entrypoints.pooling.pooling.protocol import IOProcessorRequest from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.outputs import PoolingRequestOutput @@ -42,7 +43,7 @@ def merge_pooling_params( if params is None: params = PoolingParams() # refer to PoolingCompletionRequest.to_pooling_params - if request is not None: + if request is not None and isinstance(request, IOProcessorRequest): if params.task is None: params.task = request.task if params.truncate_prompt_tokens is None: From ac285c30fed798b4584e564b6d8305c3f56bbaf7 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Sat, 14 Feb 2026 14:34:39 +0800 Subject: [PATCH 15/44] make code cleaner Signed-off-by: augusto.yjh --- .../bge_m3_sparse_processor/sparse_embeddings_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index 0a2d8ea4fa97..39ae5e393b2c 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -43,7 +43,7 @@ def merge_pooling_params( if params is None: params = PoolingParams() # refer to PoolingCompletionRequest.to_pooling_params - if request is not None and isinstance(request, IOProcessorRequest): + if isinstance(request, IOProcessorRequest): if params.task is None: params.task = request.task if params.truncate_prompt_tokens is None: From 93d754cd5424c7975d4bf5a842b497ba68e3287a Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Sat, 14 Feb 2026 14:56:47 +0800 Subject: [PATCH 16/44] use convert_ids_list_to_tokens instead of convert_ids_to_tokens Signed-off-by: augusto.yjh --- .../sparse_embeddings_processor.py | 9 +++++---- .../bge_m3_sparse_processor/types.py | 3 +-- 2 files changed, 6 
insertions(+), 6 deletions(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index 39ae5e393b2c..bca4f5c087a9 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -17,6 +17,7 @@ ) from vllm.pooling_params import PoolingParams from vllm.renderers import BaseRenderer +from vllm.tokenizers.detokenizer_utils import convert_ids_list_to_tokens from .types import ( SparseEmbeddingCompletionRequestMixin, @@ -86,7 +87,9 @@ def _build_sparse_embedding_token_weights( tokens = [None] * len(token_ids) if return_tokens: - tokens = self.renderer.get_tokenizer().convert_ids_to_tokens(token_ids) + tokens = convert_ids_list_to_tokens( + self.renderer.get_tokenizer(), token_ids + ) sparse_embedding_output: list[SparseEmbeddingTokenWeight] = [] for token_id, weight, token in zip(token_ids, token_weights, tokens): sparse_embedding_output.append( @@ -104,9 +107,7 @@ def post_process( ) -> IOProcessorOutput: num_prompt_tokens = 0 response_data = [] - return_tokens = self._get_sparse_embedding_request( - request_id - ).return_token_id_texts_map + return_tokens = self._get_sparse_embedding_request(request_id).return_token for idx in range(len(model_output)): mo = model_output[idx] sparse_embedding: dict[int, float] = {} diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py index 8d3741315f5b..1dcf30a058c9 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/types.py @@ -8,8 +8,7 @@ class SparseEmbeddingCompletionRequestMixin(CompletionRequestMixin): - input: list[int] | list[list[int]] | str | list[str] - 
return_token_id_texts_map: bool | None = Field( + return_tokens: bool | None = Field( default=None, description="Whether to return dict shows the mapping of token_id to text." "`None` or False means not return.", From e3dce2114b8ace5f26bdb75514c8797fafd2edc3 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Sat, 14 Feb 2026 14:58:38 +0800 Subject: [PATCH 17/44] use convert_ids_list_to_tokens instead of convert_ids_to_tokens Signed-off-by: augusto.yjh --- .../bge_m3_sparse_processor/sparse_embeddings_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index bca4f5c087a9..0c04f51ee4a2 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -107,7 +107,7 @@ def post_process( ) -> IOProcessorOutput: num_prompt_tokens = 0 response_data = [] - return_tokens = self._get_sparse_embedding_request(request_id).return_token + return_tokens = self._get_sparse_embedding_request(request_id).return_tokens for idx in range(len(model_output)): mo = model_output[idx] sparse_embedding: dict[int, float] = {} From 5c856cb7e913b2260f2f82d09be5dcec8156cce4 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Tue, 24 Feb 2026 17:06:55 +0800 Subject: [PATCH 18/44] pass renderer during io_processor init Signed-off-by: augusto.yjh --- .../sparse_embeddings_processor.py | 18 ++++++++---------- .../prithvi_io_processor/prithvi_processor.py | 3 ++- vllm/entrypoints/llm.py | 4 +--- vllm/entrypoints/pooling/pooling/serving.py | 4 +--- vllm/plugins/io_processors/__init__.py | 7 +++++-- vllm/v1/engine/async_llm.py | 1 + vllm/v1/engine/llm_engine.py | 1 + 7 files changed, 19 insertions(+), 19 deletions(-) diff --git 
a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index 0c04f51ee4a2..fa7aecd4e14c 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -12,8 +12,6 @@ from vllm.outputs import PoolingRequestOutput from vllm.plugins.io_processors.interface import ( IOProcessor, - IOProcessorInput, - IOProcessorOutput, ) from vllm.pooling_params import PoolingParams from vllm.renderers import BaseRenderer @@ -29,12 +27,14 @@ logger = init_logger(__name__) -class BgeM3SparseEmbeddingsProcessor(IOProcessor): - def __init__(self, vllm_config: VllmConfig): +class BgeM3SparseEmbeddingsProcessor( + IOProcessor[SparseEmbeddingCompletionRequestMixin, SparseEmbeddingResponse] +): + def __init__(self, vllm_config: VllmConfig, renderer: BaseRenderer | None = None): super().__init__(vllm_config) self.offline_requests: list[SparseEmbeddingCompletionRequestMixin] = [] self.online_requests: dict[str, SparseEmbeddingCompletionRequestMixin] = {} - self.renderer: BaseRenderer = None + self.renderer: BaseRenderer | None = renderer def merge_pooling_params( self, @@ -51,7 +51,7 @@ def merge_pooling_params( params.truncate_prompt_tokens = request.truncate_prompt_tokens return params - def parse_request(self, request_data: Any) -> IOProcessorInput: + def parse_request(self, request_data: Any) -> SparseEmbeddingCompletionRequestMixin: # for vllm.entrypoints.llm.LLM, offline mode, calls `encode` directly. 
if isinstance(request_data, dict): return SparseEmbeddingCompletionRequestMixin(**request_data) @@ -68,8 +68,6 @@ def pre_process( self.online_requests[request_id] = prompt else: self.offline_requests.append(prompt) - if self.renderer is None and "renderer" in kwargs: - self.renderer = kwargs["renderer"] return prompt.input def _get_sparse_embedding_request(self, request_id: str | None = None): @@ -86,7 +84,7 @@ def _build_sparse_embedding_token_weights( token_weights = sparse_embedding.values() tokens = [None] * len(token_ids) - if return_tokens: + if return_tokens and self.renderer is not None: tokens = convert_ids_list_to_tokens( self.renderer.get_tokenizer(), token_ids ) @@ -104,7 +102,7 @@ def post_process( model_output: Sequence[PoolingRequestOutput], request_id: str | None = None, **kwargs, - ) -> IOProcessorOutput: + ) -> SparseEmbeddingResponse: num_prompt_tokens = 0 response_data = [] return_tokens = self._get_sparse_embedding_request(request_id).return_tokens diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py index f9dfa0848b80..0a34b4042a19 100644 --- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py +++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py @@ -22,6 +22,7 @@ from vllm.logger import init_logger from vllm.outputs import PoolingRequestOutput from vllm.plugins.io_processors.interface import IOProcessor +from vllm.renderers import BaseRenderer from .types import DataModuleConfig, ImagePrompt, ImageRequestOutput @@ -218,7 +219,7 @@ def load_image( class PrithviMultimodalDataProcessor(IOProcessor[ImagePrompt, ImageRequestOutput]): indices = [0, 1, 2, 3, 4, 5] - def __init__(self, vllm_config: VllmConfig): + def __init__(self, vllm_config: VllmConfig, renderer: BaseRenderer | None = None): super().__init__(vllm_config) self.datamodule = 
Sen1Floods11NonGeoDataModule( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e0af9b0f20d5..2d925d0a992b 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1102,9 +1102,7 @@ def encode( validated_prompt = self.io_processor.parse_data(prompt_data) # obtain the actual model prompts from the pre-processor - prompts = self.io_processor.pre_process( - prompt=validated_prompt, renderer=self.renderer - ) + prompts = self.io_processor.pre_process(prompt=validated_prompt) prompts_seq = prompt_to_seq(prompts) params_seq: Sequence[PoolingParams] = [ diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index a98182f93d68..2972398a33e4 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -105,9 +105,7 @@ async def create_pooling( validated_prompt = self.io_processor.parse_data(request.data) raw_prompts = await self.io_processor.pre_process_async( - prompt=validated_prompt, - request_id=request_id, - renderer=self.renderer, + prompt=validated_prompt, request_id=request_id ) engine_prompts = await self._preprocess_cmpl( request, diff --git a/vllm/plugins/io_processors/__init__.py b/vllm/plugins/io_processors/__init__.py index b3a3b548781e..41834be7217a 100644 --- a/vllm/plugins/io_processors/__init__.py +++ b/vllm/plugins/io_processors/__init__.py @@ -6,13 +6,16 @@ from vllm.config import VllmConfig from vllm.plugins import IO_PROCESSOR_PLUGINS_GROUP, load_plugins_by_group from vllm.plugins.io_processors.interface import IOProcessor +from vllm.renderers import BaseRenderer from vllm.utils.import_utils import resolve_obj_by_qualname logger = logging.getLogger(__name__) def get_io_processor( - vllm_config: VllmConfig, plugin_from_init: str | None = None + vllm_config: VllmConfig, + plugin_from_init: str | None = None, + renderer: BaseRenderer | None = None, ) -> IOProcessor | None: # Input.Output processors are loaded as plugins under 
the # 'vllm.io_processor_plugins' group. Similar to platform @@ -65,4 +68,4 @@ def get_io_processor( activated_plugin_cls = loadable_plugins[model_plugin] - return resolve_obj_by_qualname(activated_plugin_cls)(vllm_config) + return resolve_obj_by_qualname(activated_plugin_cls)(vllm_config, renderer) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d86e1b43df59..559956d43890 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -135,6 +135,7 @@ def __init__( self.io_processor = get_io_processor( self.vllm_config, self.model_config.io_processor_plugin, + self.renderer, ) # Convert TokPrompt --> EngineCoreRequest. diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 29a73251faa1..bfa8ddbe7d87 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -93,6 +93,7 @@ def __init__( self.io_processor = get_io_processor( self.vllm_config, self.model_config.io_processor_plugin, + self.renderer, ) # Convert TokPrompt --> EngineCoreRequest. 
From 5eb5a337100fea84a1e9a544b0685fb79a7e4580 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Tue, 24 Feb 2026 18:03:24 +0800 Subject: [PATCH 19/44] let get_io_processor compatible with previous io_process_plugin Signed-off-by: augusto.yjh --- .../bge_m3_sparse_processor/sparse_embeddings_processor.py | 6 +++--- .../prithvi_io_processor/prithvi_processor.py | 4 ++-- vllm/plugins/io_processors/__init__.py | 4 ++++ vllm/plugins/io_processors/interface.py | 3 ++- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index fa7aecd4e14c..59002fbe36c1 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -30,11 +30,11 @@ class BgeM3SparseEmbeddingsProcessor( IOProcessor[SparseEmbeddingCompletionRequestMixin, SparseEmbeddingResponse] ): - def __init__(self, vllm_config: VllmConfig, renderer: BaseRenderer | None = None): - super().__init__(vllm_config) + def __init__(self, vllm_config: VllmConfig, renderer: BaseRenderer): + super().__init__(vllm_config, renderer) self.offline_requests: list[SparseEmbeddingCompletionRequestMixin] = [] self.online_requests: dict[str, SparseEmbeddingCompletionRequestMixin] = {} - self.renderer: BaseRenderer | None = renderer + self.renderer: BaseRenderer = renderer def merge_pooling_params( self, diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py index 0a34b4042a19..b22239fcc267 100644 --- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py +++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py @@ -219,8 
+219,8 @@ def load_image( class PrithviMultimodalDataProcessor(IOProcessor[ImagePrompt, ImageRequestOutput]): indices = [0, 1, 2, 3, 4, 5] - def __init__(self, vllm_config: VllmConfig, renderer: BaseRenderer | None = None): - super().__init__(vllm_config) + def __init__(self, vllm_config: VllmConfig, renderer: BaseRenderer): + super().__init__(vllm_config, renderer) self.datamodule = Sen1Floods11NonGeoDataModule( data_root=datamodule_config["data_root"], diff --git a/vllm/plugins/io_processors/__init__.py b/vllm/plugins/io_processors/__init__.py index 41834be7217a..f2c6f9b44ec6 100644 --- a/vllm/plugins/io_processors/__init__.py +++ b/vllm/plugins/io_processors/__init__.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import inspect import logging from vllm.config import VllmConfig @@ -68,4 +69,7 @@ def get_io_processor( activated_plugin_cls = loadable_plugins[model_plugin] + # for backward compatibility, the plugin does not have a renderer argument + if "renderer" not in inspect.signature(activated_plugin_cls.__init__).parameters: + return resolve_obj_by_qualname(activated_plugin_cls)(vllm_config) return resolve_obj_by_qualname(activated_plugin_cls)(vllm_config, renderer) diff --git a/vllm/plugins/io_processors/interface.py b/vllm/plugins/io_processors/interface.py index cd273acb0d43..84fbaa3899b5 100644 --- a/vllm/plugins/io_processors/interface.py +++ b/vllm/plugins/io_processors/interface.py @@ -9,6 +9,7 @@ from vllm.inputs.data import PromptType from vllm.outputs import PoolingRequestOutput from vllm.pooling_params import PoolingParams +from vllm.renderers import BaseRenderer from vllm.sampling_params import SamplingParams IOProcessorInput = TypeVar("IOProcessorInput") @@ -18,7 +19,7 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): """Abstract interface for pre/post-processing of engine I/O.""" - def __init__(self, vllm_config: VllmConfig): + def 
__init__(self, vllm_config: VllmConfig, renderer: BaseRenderer): super().__init__() self.vllm_config = vllm_config From 670d31c6724b2f770268163e19ff1e798b83a98b Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Tue, 24 Feb 2026 19:44:27 +0800 Subject: [PATCH 20/44] add warning msg for io_processor_plugin.__init__ api change Signed-off-by: augusto.yjh --- .../bge_m3_sparse_processor/sparse_embeddings_processor.py | 7 ++++--- vllm/plugins/io_processors/__init__.py | 7 ++++++- vllm/plugins/io_processors/interface.py | 4 ++-- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/llm_engine.py | 2 +- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index 59002fbe36c1..6f2d1015c125 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence -from typing import Any from vllm.config import VllmConfig from vllm.entrypoints.openai.engine.protocol import UsageInfo @@ -39,7 +38,7 @@ def __init__(self, vllm_config: VllmConfig, renderer: BaseRenderer): def merge_pooling_params( self, params: PoolingParams | None = None, - request: Any = None, + request: object = None, ) -> PoolingParams: if params is None: params = PoolingParams() @@ -51,7 +50,9 @@ def merge_pooling_params( params.truncate_prompt_tokens = request.truncate_prompt_tokens return params - def parse_request(self, request_data: Any) -> SparseEmbeddingCompletionRequestMixin: + def parse_request( + self, request_data: object + ) -> SparseEmbeddingCompletionRequestMixin: # for vllm.entrypoints.llm.LLM, offline mode, calls `encode` directly. 
if isinstance(request_data, dict): return SparseEmbeddingCompletionRequestMixin(**request_data) diff --git a/vllm/plugins/io_processors/__init__.py b/vllm/plugins/io_processors/__init__.py index f2c6f9b44ec6..3a348e4a7342 100644 --- a/vllm/plugins/io_processors/__init__.py +++ b/vllm/plugins/io_processors/__init__.py @@ -15,8 +15,8 @@ def get_io_processor( vllm_config: VllmConfig, + renderer: BaseRenderer, plugin_from_init: str | None = None, - renderer: BaseRenderer | None = None, ) -> IOProcessor | None: # Input.Output processors are loaded as plugins under the # 'vllm.io_processor_plugins' group. Similar to platform @@ -71,5 +71,10 @@ def get_io_processor( # for backward compatibility, the plugin does not have a renderer argument if "renderer" not in inspect.signature(activated_plugin_cls.__init__).parameters: + logger.warning( + "The renderer argument will be required in v0.18, " + "please update your IOProcessor plugin: %s", + activated_plugin_cls, + ) return resolve_obj_by_qualname(activated_plugin_cls)(vllm_config) return resolve_obj_by_qualname(activated_plugin_cls)(vllm_config, renderer) diff --git a/vllm/plugins/io_processors/interface.py b/vllm/plugins/io_processors/interface.py index 84fbaa3899b5..f6d8d76642e1 100644 --- a/vllm/plugins/io_processors/interface.py +++ b/vllm/plugins/io_processors/interface.py @@ -3,7 +3,7 @@ import warnings from abc import ABC, abstractmethod from collections.abc import AsyncGenerator, Sequence -from typing import Any, Generic, TypeVar +from typing import Generic, TypeVar from vllm.config import VllmConfig from vllm.inputs.data import PromptType @@ -63,7 +63,7 @@ def merge_sampling_params( def merge_pooling_params( self, params: PoolingParams | None = None, - request: Any = None, + request: object = None, ) -> PoolingParams: if callable( validate_or_generate_params := getattr( diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 559956d43890..62fa8c90bc63 100644 --- a/vllm/v1/engine/async_llm.py 
+++ b/vllm/v1/engine/async_llm.py @@ -134,8 +134,8 @@ def __init__( self.renderer = renderer = renderer_from_config(self.vllm_config) self.io_processor = get_io_processor( self.vllm_config, - self.model_config.io_processor_plugin, self.renderer, + self.model_config.io_processor_plugin, ) # Convert TokPrompt --> EngineCoreRequest. diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index bfa8ddbe7d87..0d9279331d02 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -92,8 +92,8 @@ def __init__( self.renderer = renderer = renderer_from_config(self.vllm_config) self.io_processor = get_io_processor( self.vllm_config, - self.model_config.io_processor_plugin, self.renderer, + self.model_config.io_processor_plugin, ) # Convert TokPrompt --> EngineCoreRequest. From 0782705e8029d040caabebe2d1da1be259b91516 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Wed, 25 Feb 2026 10:36:05 +0800 Subject: [PATCH 21/44] remove request parameter in merge_pooling_params Signed-off-by: augusto.yjh --- .../sparse_embeddings_processor.py | 8 +------- vllm/plugins/io_processors/interface.py | 1 - 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index 6f2d1015c125..2383e74ec032 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -5,7 +5,6 @@ from vllm.config import VllmConfig from vllm.entrypoints.openai.engine.protocol import UsageInfo -from vllm.entrypoints.pooling.pooling.protocol import IOProcessorRequest from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.outputs import PoolingRequestOutput @@ -38,16 +37,11 @@ def __init__(self, vllm_config: VllmConfig, renderer: 
BaseRenderer): def merge_pooling_params( self, params: PoolingParams | None = None, - request: object = None, ) -> PoolingParams: if params is None: params = PoolingParams() # refer to PoolingCompletionRequest.to_pooling_params - if isinstance(request, IOProcessorRequest): - if params.task is None: - params.task = request.task - if params.truncate_prompt_tokens is None: - params.truncate_prompt_tokens = request.truncate_prompt_tokens + params.task = "token_classify" return params def parse_request( diff --git a/vllm/plugins/io_processors/interface.py b/vllm/plugins/io_processors/interface.py index f6d8d76642e1..f73eb99abd73 100644 --- a/vllm/plugins/io_processors/interface.py +++ b/vllm/plugins/io_processors/interface.py @@ -63,7 +63,6 @@ def merge_sampling_params( def merge_pooling_params( self, params: PoolingParams | None = None, - request: object = None, ) -> PoolingParams: if callable( validate_or_generate_params := getattr( From dc4ec89ae0b27b02b666bc01bd218c48f2039a63 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Wed, 25 Feb 2026 10:42:56 +0800 Subject: [PATCH 22/44] fix bugs in call merge_pooling_params Signed-off-by: augusto.yjh --- vllm/entrypoints/pooling/pooling/serving.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index 2972398a33e4..f27a27191f99 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -145,9 +145,7 @@ async def create_pooling( if use_io_processor: assert self.io_processor is not None - pooling_params = self.io_processor.merge_pooling_params( - params=None, request=request - ) + pooling_params = self.io_processor.merge_pooling_params() if pooling_params.task is None: pooling_params.task = "plugin" else: From 0739106f11a3d480a5ff1062ce18b125a96be640 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Wed, 25 Feb 2026 10:47:24 +0800 Subject: [PATCH 23/44] update 
io_processor_plugins.md as abstract class IOProcessor is updated Signed-off-by: augusto.yjh --- docs/design/io_processor_plugins.md | 50 +++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index c6945e443c37..69575c201ca0 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -13,26 +13,70 @@ IOProcessorInput = TypeVar("IOProcessorInput") IOProcessorOutput = TypeVar("IOProcessorOutput") class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): - def __init__(self, vllm_config: VllmConfig): + """Abstract interface for pre/post-processing of engine I/O.""" + + def __init__(self, vllm_config: VllmConfig, renderer: BaseRenderer): super().__init__() self.vllm_config = vllm_config - @abstractmethod def parse_data(self, data: object) -> IOProcessorInput: + if callable(parse_request := getattr(self, "parse_request", None)): + warnings.warn( + "`parse_request` has been renamed to `parse_data`. " + "Please update your IO Processor Plugin to use the new name. " + "The old name will be removed in v0.19.", + DeprecationWarning, + stacklevel=2, + ) + + return parse_request(data) # type: ignore + raise NotImplementedError def merge_sampling_params( self, params: SamplingParams | None = None, ) -> SamplingParams: + if callable( + validate_or_generate_params := getattr( + self, "validate_or_generate_params", None + ) + ): + warnings.warn( + "`validate_or_generate_params` has been split into " + "`merge_sampling_params` and `merge_pooling_params`." + "Please update your IO Processor Plugin to use the new methods. 
" + "The old name will be removed in v0.19.", + DeprecationWarning, + stacklevel=2, + ) + + return validate_or_generate_params(params) # type: ignore + return params or SamplingParams() def merge_pooling_params( self, params: PoolingParams | None = None, ) -> PoolingParams: - return params or PoolingParams() + if callable( + validate_or_generate_params := getattr( + self, "validate_or_generate_params", None + ) + ): + warnings.warn( + "`validate_or_generate_params` has been split into " + "`merge_sampling_params` and `merge_pooling_params`." + "Please update your IO Processor Plugin to use the new methods. " + "The old name will be removed in v0.19.", + DeprecationWarning, + stacklevel=2, + ) + + return validate_or_generate_params(params) # type: ignore + + return params or PoolingParams(task="plugin") @abstractmethod def pre_process( From a5d518bc2dfe400d3d2035d2bd1258869e27e67b Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Wed, 25 Feb 2026 14:06:10 +0800 Subject: [PATCH 24/44] remove fallbacks in update io_processor_plugins.md, return correct error in parse_data Signed-off-by: augusto.yjh --- docs/design/io_processor_plugins.md | 43 ------------------- .../sparse_embeddings_processor.py | 2 +- 2 files changed, 1 insertion(+), 44 deletions(-) diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index 69575c201ca0..68b532108672 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -21,61 +21,18 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): self.vllm_config = vllm_config def parse_data(self, data: object) -> IOProcessorInput: - if callable(parse_request := getattr(self, "parse_request", None)): - warnings.warn( - "`parse_request` has been renamed to `parse_data`. " - "Please update your IO Processor Plugin to use the new name. 
" - "The old name will be removed in v0.19.", - DeprecationWarning, - stacklevel=2, - ) - - return parse_request(data) # type: ignore - raise NotImplementedError def merge_sampling_params( self, params: SamplingParams | None = None, ) -> SamplingParams: - if callable( - validate_or_generate_params := getattr( - self, "validate_or_generate_params", None - ) - ): - warnings.warn( - "`validate_or_generate_params` has been split into " - "`merge_sampling_params` and `merge_pooling_params`." - "Please update your IO Processor Plugin to use the new methods. " - "The old name will be removed in v0.19.", - DeprecationWarning, - stacklevel=2, - ) - - return validate_or_generate_params(params) # type: ignore - return params or SamplingParams() def merge_pooling_params( self, params: PoolingParams | None = None, ) -> PoolingParams: - if callable( - validate_or_generate_params := getattr( - self, "validate_or_generate_params", None - ) - ): - warnings.warn( - "`validate_or_generate_params` has been split into " - "`merge_sampling_params` and `merge_pooling_params`." - "Please update your IO Processor Plugin to use the new methods. " - "The old name will be removed in v0.19.", - DeprecationWarning, - stacklevel=2, - ) - - return validate_or_generate_params(params) # type: ignore - return params or PoolingParams(task="plugin") @abstractmethod diff --git a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py index 2383e74ec032..4749d3e81fed 100644 --- a/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py +++ b/tests/plugins/bge_m3_sparse_plugin/bge_m3_sparse_processor/sparse_embeddings_processor.py @@ -50,7 +50,7 @@ def parse_request( # for vllm.entrypoints.llm.LLM, offline mode, calls `encode` directly. 
if isinstance(request_data, dict): return SparseEmbeddingCompletionRequestMixin(**request_data) - raise ValueError("Unable to parse request_data") + raise TypeError("request_data should be a dictionary") def pre_process( self, From 49a2cef67397a1fc9a5edcb5e4c44e386bd6c62a Mon Sep 17 00:00:00 2001 From: Augusto Yao Date: Wed, 25 Feb 2026 14:37:38 +0800 Subject: [PATCH 25/44] Update vllm/plugins/io_processors/__init__.py Co-authored-by: Cyrus Leung Signed-off-by: Augusto Yao --- vllm/plugins/io_processors/__init__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/plugins/io_processors/__init__.py b/vllm/plugins/io_processors/__init__.py index 3a348e4a7342..86ebe41b0355 100644 --- a/vllm/plugins/io_processors/__init__.py +++ b/vllm/plugins/io_processors/__init__.py @@ -69,12 +69,14 @@ def get_io_processor( activated_plugin_cls = loadable_plugins[model_plugin] + activated_plugin_typ = resolve_obj_by_qualname(activated_plugin_cls) + # for backward compatibility, the plugin does not have a renderer argument - if "renderer" not in inspect.signature(activated_plugin_cls.__init__).parameters: + if "renderer" not in inspect.signature(activated_plugin_typ.__init__).parameters: logger.warning( "The renderer argument will be required in v0.18, " "please update your IOProcessor plugin: %s", activated_plugin_cls, ) - return resolve_obj_by_qualname(activated_plugin_cls)(vllm_config) - return resolve_obj_by_qualname(activated_plugin_cls)(vllm_config, renderer) + return activated_plugin_typ(vllm_config) + return activated_plugin_typ(vllm_config, renderer) From 50db3264acd94dc05d607133300fcf6937c1d8b5 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Wed, 25 Feb 2026 16:07:57 +0800 Subject: [PATCH 26/44] fix testcase for loading wrong io_processor plugin Signed-off-by: augusto.yjh --- tests/plugins_tests/test_io_processor_plugins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins_tests/test_io_processor_plugins.py 
b/tests/plugins_tests/test_io_processor_plugins.py index 04cb19499296..f11d00316ff7 100644 --- a/tests/plugins_tests/test_io_processor_plugins.py +++ b/tests/plugins_tests/test_io_processor_plugins.py @@ -39,7 +39,7 @@ def _compute_image_hash(base64_data: str) -> str: def test_loading_missing_plugin(): vllm_config = VllmConfig() with pytest.raises(ValueError): - get_io_processor(vllm_config, "wrong_plugin") + get_io_processor(vllm_config, None, "wrong_plugin") @pytest.fixture(scope="function") From 29378ab3c3f82f905de35ef04e27e554ceadc4ad Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 14:53:46 +0800 Subject: [PATCH 27/44] add e2e test case for bge_m3_sparse_plugin Signed-off-by: augusto.yjh --- .../test_bge_m3_sparse_plugin.py | 215 ++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 tests/plugins_tests/test_bge_m3_sparse_plugin.py diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py new file mode 100644 index 000000000000..e9c1699035d7 --- /dev/null +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -0,0 +1,215 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import pytest +import requests + +# Test configuration for BGE-M3 sparse plugin +from tests.plugins.bge_m3_sparse_plugin.bge_m3_sparse_processor.types import ( + SparseEmbeddingResponse, +) +from tests.utils import RemoteOpenAIServer +from vllm.entrypoints.pooling.pooling.protocol import IOProcessorResponse + +model_config = { + "model_name": "BAAI/bge-m3", + "plugin": "bge_m3_sparse_plugin", + "test_input": "What is the capital of France?", + "hf_overrides": json.dumps( + {"architectures": ["BgeM3EmbeddingModel"], "head_dtype": "float16"} + ), +} + + +@pytest.fixture(scope="function") +def server(): + args = [ + "--runner", + "pooling", + "--enforce-eager", + "--max-num-seqs", + "32", + "--hf_overrides", + 
model_config["hf_overrides"], + "--io-processor-plugin", + model_config["plugin"], + ] + + with RemoteOpenAIServer(model_config["model_name"], args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +async def test_bge_m3_sparse_plugin_online(server: RemoteOpenAIServer): + """Test BGE-M3 sparse plugin in online mode via API.""" + request_payload = { + "model": model_config["model_name"], + "data": {"input": model_config["test_input"], "return_tokens": True}, + } + + ret = requests.post( + server.url_for("pooling"), + json=request_payload, + ) + + response = ret.json() + + # Verify the request response is in the correct format + assert (parsed_response := IOProcessorResponse(**response)) + + # Verify the output is formatted as expected for this plugin + assert parsed_response.data + assert len(parsed_response.data) > 0 + + data_entry = parsed_response.data[0] + assert data_entry.object == "sparse-embedding" + assert hasattr(data_entry, "sparse_embedding") + + # Verify sparse embedding format + sparse_embedding = data_entry.sparse_embedding + assert isinstance(sparse_embedding, list) + if sparse_embedding: + entry = sparse_embedding[0] + assert hasattr(entry, "token_id") + assert hasattr(entry, "weight") + # When return_tokens=True, token should be present + assert entry.token is not None + assert isinstance(entry.token_id, int) + assert isinstance(entry.weight, float) + assert entry.weight >= 0 # SPLADE outputs are non-negative + + # Verify usage information + assert parsed_response.usage + assert parsed_response.usage.prompt_tokens > 0 + assert parsed_response.usage.total_tokens == parsed_response.usage.prompt_tokens + + +@pytest.mark.asyncio +async def test_bge_m3_sparse_plugin_online_no_tokens(server: RemoteOpenAIServer): + """Test BGE-M3 sparse plugin in online mode without returning tokens.""" + request_payload = { + "model": model_config["model_name"], + "input": model_config["test_input"], + "return_tokens": False, + } + + ret = requests.post( 
+ server.url_for("pooling"), + json=request_payload, + ) + + response = ret.json() + print(f"Response: {response}") + + # Verify the request response is in the correct format + assert (parsed_response := SparseEmbeddingResponse(**response)) + + # Verify sparse embedding format + sparse_embedding = parsed_response.data[0].sparse_embedding + if sparse_embedding: + entry = sparse_embedding[0] + # When return_tokens=False, token should be None + assert entry.token is None + + +@pytest.mark.parametrize( + "return_tokens", + [True, False], +) +def test_bge_m3_sparse_plugin_offline(vllm_runner, return_tokens: bool): + """Test BGE-M3 sparse plugin in offline mode.""" + prompt = { + "data": { + "input": model_config["test_input"], + "return_tokens": return_tokens, + } + } + + with vllm_runner( + model_config["model_name"], + runner="pooling", + enforce_eager=True, + max_num_seqs=32, + io_processor_plugin=model_config["plugin"], + hf_overrides=model_config["hf_overrides"], + default_torch_num_threads=1, + ) as llm_runner: + llm = llm_runner.get_llm() + output = llm.encode(prompt, pooling_task="token_classify") + + print(f"Output: {output}") + + # Verify output structure + assert len(output) > 0 + result = output[0] + assert hasattr(result, "outputs") + + # The outputs should be SparseEmbeddingResponse + response = result.outputs + assert hasattr(response, "data") + assert hasattr(response, "usage") + + # Verify response data + data_entry = response.data[0] + assert data_entry.object == "sparse-embedding" + assert hasattr(data_entry, "sparse_embedding") + + # Verify sparse embedding format + sparse_embedding = data_entry.sparse_embedding + assert isinstance(sparse_embedding, list) + if sparse_embedding: + entry = sparse_embedding[0] + assert hasattr(entry, "token_id") + assert hasattr(entry, "weight") + assert isinstance(entry.token_id, int) + assert isinstance(entry.weight, float) + assert entry.weight >= 0 # SPLADE outputs are non-negative + + # Verify token presence based 
on return_tokens + if return_tokens: + # For offline mode, tokens might be None depending on renderer + # but token_id and weight should always be present + assert isinstance(entry.token_id, int) + assert isinstance(entry.weight, (float, int)) + + +def test_bge_m3_sparse_plugin_offline_multiple_inputs(vllm_runner): + """Test BGE-M3 sparse plugin with multiple inputs in offline mode.""" + prompts = [ + {"input": "What is the capital of France?", "return_tokens": True}, + {"input": "What is the capital of Germany?", "return_tokens": True}, + {"input": "What is the capital of Spain?", "return_tokens": True}, + ] + + with vllm_runner( + model_config["model_name"], + runner="pooling", + enforce_eager=True, + max_num_seqs=32, + io_processor_plugin=model_config["plugin"], + hf_overrides=model_config["hf_overrides"], + default_torch_num_threads=1, + ) as llm_runner: + llm = llm_runner.get_llm() + outputs = llm.encode(prompts, pooling_task="token_classify") + + print(f"Outputs: {outputs}") + + # Verify output structure + assert len(outputs) == 3 + for i, output in enumerate(outputs): + result = output + assert hasattr(result, "outputs") + + response = result.outputs + assert len(response.data) > 0 + + # Each output should have sparse embeddings + sparse_embedding = response.data[0].sparse_embedding + assert isinstance(sparse_embedding, list) + + # Verify usage + assert response.usage.prompt_tokens > 0 + assert response.usage.total_tokens == response.usage.prompt_tokens From ea6e9c18ee0b635d018fa2ffdf824ecc6d4463bd Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 15:19:22 +0800 Subject: [PATCH 28/44] fix bugs in passing hf_overrides Signed-off-by: augusto.yjh --- tests/plugins_tests/test_bge_m3_sparse_plugin.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index e9c1699035d7..11e827fa1880 100644 --- 
a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -17,9 +17,7 @@ "model_name": "BAAI/bge-m3", "plugin": "bge_m3_sparse_plugin", "test_input": "What is the capital of France?", - "hf_overrides": json.dumps( - {"architectures": ["BgeM3EmbeddingModel"], "head_dtype": "float16"} - ), + "hf_overrides": {"architectures": ["BgeM3EmbeddingModel"], "head_dtype": "float16"}, } @@ -32,7 +30,7 @@ def server(): "--max-num-seqs", "32", "--hf_overrides", - model_config["hf_overrides"], + json.dumps(model_config["hf_overrides"]), "--io-processor-plugin", model_config["plugin"], ] From 759314b161de1535c7dc9dfdd29f4782ef605a27 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 15:27:44 +0800 Subject: [PATCH 29/44] fix bugs in construct prompts for offline mode Signed-off-by: augusto.yjh --- tests/plugins_tests/test_bge_m3_sparse_plugin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index 11e827fa1880..85c24a00ff80 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -176,9 +176,9 @@ def test_bge_m3_sparse_plugin_offline(vllm_runner, return_tokens: bool): def test_bge_m3_sparse_plugin_offline_multiple_inputs(vllm_runner): """Test BGE-M3 sparse plugin with multiple inputs in offline mode.""" prompts = [ - {"input": "What is the capital of France?", "return_tokens": True}, - {"input": "What is the capital of Germany?", "return_tokens": True}, - {"input": "What is the capital of Spain?", "return_tokens": True}, + {"data": {"input": "What is the capital of France?", "return_tokens": True}}, + {"data": {"input": "What is the capital of Germany?", "return_tokens": True}}, + {"data": {"input": "What is the capital of Spain?", "return_tokens": True}}, ] with vllm_runner( From 15d6cf5c1056a472087186e3ff57b33bc5139f0b Mon Sep 
17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 15:40:59 +0800 Subject: [PATCH 30/44] fix bugs in construct prompts for multi inputs in offline mode Signed-off-by: augusto.yjh --- tests/plugins_tests/test_bge_m3_sparse_plugin.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index 85c24a00ff80..5465c2e82caf 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -175,11 +175,16 @@ def test_bge_m3_sparse_plugin_offline(vllm_runner, return_tokens: bool): def test_bge_m3_sparse_plugin_offline_multiple_inputs(vllm_runner): """Test BGE-M3 sparse plugin with multiple inputs in offline mode.""" - prompts = [ - {"data": {"input": "What is the capital of France?", "return_tokens": True}}, - {"data": {"input": "What is the capital of Germany?", "return_tokens": True}}, - {"data": {"input": "What is the capital of Spain?", "return_tokens": True}}, - ] + prompts = { + "data": { + "input": [ + "What is the capital of France?", + "What is the capital of Germany?", + "What is the capital of Spain?", + ], + "return_tokens": True, + } + } with vllm_runner( model_config["model_name"], From d67346f8687f423eafbde524e67db0b8fed33c01 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 15:48:35 +0800 Subject: [PATCH 31/44] update verify logic for bge_m3_sparse_plugin Signed-off-by: augusto.yjh --- .../test_bge_m3_sparse_plugin.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index 5465c2e82caf..457f0059189d 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -201,18 +201,15 @@ def test_bge_m3_sparse_plugin_offline_multiple_inputs(vllm_runner): 
print(f"Outputs: {outputs}") # Verify output structure - assert len(outputs) == 3 - for i, output in enumerate(outputs): - result = output - assert hasattr(result, "outputs") - - response = result.outputs - assert len(response.data) > 0 - + assert hasattr(outputs, "outputs") + response = outputs.outputs + assert hasattr(response, "data") + assert len(outputs.outputs.data) == 3 + for i, output in enumerate(response.data): # Each output should have sparse embeddings - sparse_embedding = response.data[0].sparse_embedding + sparse_embedding = output.sparse_embedding assert isinstance(sparse_embedding, list) - # Verify usage - assert response.usage.prompt_tokens > 0 - assert response.usage.total_tokens == response.usage.prompt_tokens + # Verify usage + assert response.usage.prompt_tokens > 0 + assert response.usage.total_tokens == response.usage.prompt_tokens From cb01d533c39356957bf674c29445a099c454d737 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 15:52:30 +0800 Subject: [PATCH 32/44] fix bugs in get pooler_output Signed-off-by: augusto.yjh --- tests/plugins_tests/test_bge_m3_sparse_plugin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index 457f0059189d..9af3e3785fd5 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -196,7 +196,9 @@ def test_bge_m3_sparse_plugin_offline_multiple_inputs(vllm_runner): default_torch_num_threads=1, ) as llm_runner: llm = llm_runner.get_llm() - outputs = llm.encode(prompts, pooling_task="token_classify") + pooler_output = llm.encode(prompts, pooling_task="token_classify") + + outputs = pooler_output[0] print(f"Outputs: {outputs}") From a16d521734e974d8a93151596b85f9d760ab8a43 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 16:03:03 +0800 Subject: [PATCH 33/44] fix bugs in offline testcase Signed-off-by: 
augusto.yjh --- .../test_bge_m3_sparse_plugin.py | 55 +++++++------------ 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index 9af3e3785fd5..0095cc306f8a 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -135,42 +135,31 @@ def test_bge_m3_sparse_plugin_offline(vllm_runner, return_tokens: bool): default_torch_num_threads=1, ) as llm_runner: llm = llm_runner.get_llm() - output = llm.encode(prompt, pooling_task="token_classify") + pooler_output = llm.encode(prompt, pooling_task="token_classify") - print(f"Output: {output}") + outputs = pooler_output[0] # Verify output structure - assert len(output) > 0 - result = output[0] - assert hasattr(result, "outputs") - - # The outputs should be SparseEmbeddingResponse - response = result.outputs + assert hasattr(outputs, "outputs") + response = outputs.outputs assert hasattr(response, "data") - assert hasattr(response, "usage") - + assert len(response.data) == 1 + print("") # Verify response data - data_entry = response.data[0] - assert data_entry.object == "sparse-embedding" - assert hasattr(data_entry, "sparse_embedding") - - # Verify sparse embedding format - sparse_embedding = data_entry.sparse_embedding - assert isinstance(sparse_embedding, list) - if sparse_embedding: - entry = sparse_embedding[0] - assert hasattr(entry, "token_id") - assert hasattr(entry, "weight") - assert isinstance(entry.token_id, int) - assert isinstance(entry.weight, float) - assert entry.weight >= 0 # SPLADE outputs are non-negative - - # Verify token presence based on return_tokens - if return_tokens: - # For offline mode, tokens might be None depending on renderer - # but token_id and weight should always be present - assert isinstance(entry.token_id, int) - assert isinstance(entry.weight, (float, int)) + for i, output in enumerate(response.data): + # Each output 
should have sparse embeddings + sparse_embedding = output.sparse_embedding + assert isinstance(sparse_embedding, list) + for idx, entry in enumerate(sparse_embedding): + # Verify token presence based on return_tokens + if return_tokens: + # For offline mode, tokens might be None depending on renderer + # but token_id and weight should always be present + assert isinstance(entry.token_id, int) + assert isinstance(entry.weight, (float, int)) + # Verify usage + assert response.usage.prompt_tokens > 0 + assert response.usage.total_tokens == response.usage.prompt_tokens def test_bge_m3_sparse_plugin_offline_multiple_inputs(vllm_runner): @@ -200,13 +189,11 @@ def test_bge_m3_sparse_plugin_offline_multiple_inputs(vllm_runner): outputs = pooler_output[0] - print(f"Outputs: {outputs}") - # Verify output structure assert hasattr(outputs, "outputs") response = outputs.outputs assert hasattr(response, "data") - assert len(outputs.outputs.data) == 3 + assert len(response.data) == 3 for i, output in enumerate(response.data): # Each output should have sparse embeddings sparse_embedding = output.sparse_embedding From bee36a20ccb71072b40309b17eb440da11059707 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 16:51:02 +0800 Subject: [PATCH 34/44] check embed result Signed-off-by: augusto.yjh --- .../test_bge_m3_sparse_plugin.py | 31 ++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index 0095cc306f8a..4e81b1dbe09c 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -112,6 +112,28 @@ async def test_bge_m3_sparse_plugin_online_no_tokens(server: RemoteOpenAIServer) assert entry.token is None +def _check_sparse_embedding(data, check_tokens=False): + expected_weights = [ + {"token_id": 32, "weight": 0.0552978515625, "token": "?"}, + {"token_id": 70, "weight": 
0.09808349609375, "token": "the"}, + {"token_id": 83, "weight": 0.08154296875, "token": "is"}, + {"token_id": 111, "weight": 0.11810302734375, "token": "of"}, + {"token_id": 4865, "weight": 0.1171875, "token": "What"}, + {"token_id": 9942, "weight": 0.292236328125, "token": "France"}, + {"token_id": 10323, "weight": 0.2802734375, "token": "capital"}, + ] + expected_embed = {x["token_id"]: x for x in expected_weights} + + assert len(data) == len(expected_embed) + for entry in data: + expected_val = expected_embed[entry["token_id"]] + assert expected_val["weight"] == entry["weight"], ( + f"actual embed {entry} not equal to {expected_val}" + ) + if check_tokens: + assert expected_val["token"] == entry["token"] + + @pytest.mark.parametrize( "return_tokens", [True, False], @@ -150,13 +172,8 @@ def test_bge_m3_sparse_plugin_offline(vllm_runner, return_tokens: bool): # Each output should have sparse embeddings sparse_embedding = output.sparse_embedding assert isinstance(sparse_embedding, list) - for idx, entry in enumerate(sparse_embedding): - # Verify token presence based on return_tokens - if return_tokens: - # For offline mode, tokens might be None depending on renderer - # but token_id and weight should always be present - assert isinstance(entry.token_id, int) - assert isinstance(entry.weight, (float, int)) + _check_sparse_embedding(sparse_embedding, return_tokens) + # Verify usage assert response.usage.prompt_tokens > 0 assert response.usage.total_tokens == response.usage.prompt_tokens From 15f54ba66094547bb1d9e2ba94ee8c80cdc0f37a Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 16:53:33 +0800 Subject: [PATCH 35/44] fix bugs in check offline mode result Signed-off-by: augusto.yjh --- tests/plugins_tests/test_bge_m3_sparse_plugin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index 4e81b1dbe09c..2fe7cbcad498 100644 --- 
a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -126,12 +126,12 @@ def _check_sparse_embedding(data, check_tokens=False): assert len(data) == len(expected_embed) for entry in data: - expected_val = expected_embed[entry["token_id"]] - assert expected_val["weight"] == entry["weight"], ( + expected_val = expected_embed[entry.token_id] + assert expected_val["weight"] == entry.weight, ( f"actual embed {entry} not equal to {expected_val}" ) if check_tokens: - assert expected_val["token"] == entry["token"] + assert expected_val["token"] == entry.token @pytest.mark.parametrize( From 7efcc16d44586f5586e345580558ae23dd0fc9bb Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 16:56:24 +0800 Subject: [PATCH 36/44] check token is None for return_tokens=False Signed-off-by: augusto.yjh --- tests/plugins_tests/test_bge_m3_sparse_plugin.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index 2fe7cbcad498..c4177f816a22 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -131,7 +131,11 @@ def _check_sparse_embedding(data, check_tokens=False): f"actual embed {entry} not equal to {expected_val}" ) if check_tokens: - assert expected_val["token"] == entry.token + assert expected_val["token"] == entry.token, ( + f"actual embed {entry} not equal to {expected_val}" + ) + else: + assert entry.token is None, f"{entry} should not return token" @pytest.mark.parametrize( @@ -166,7 +170,6 @@ def test_bge_m3_sparse_plugin_offline(vllm_runner, return_tokens: bool): response = outputs.outputs assert hasattr(response, "data") assert len(response.data) == 1 - print("") # Verify response data for i, output in enumerate(response.data): # Each output should have sparse embeddings From a29968fcc1e020d8c3082fc95b0b5608f9fa23f2 Mon Sep 17 
00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 17:15:42 +0800 Subject: [PATCH 37/44] make _check_sparse_embedding compatible for both online serving and offline mode Signed-off-by: augusto.yjh --- .../test_bge_m3_sparse_plugin.py | 60 +++++++++++-------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index c4177f816a22..d8ff9315c2fd 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -21,6 +21,40 @@ } +def _get_attr_or_val(obj: object | dict, key: str): + if isinstance(obj, dict) and key in obj: + return obj[key] + return getattr(obj, key, None) + + +def _check_sparse_embedding(data, check_tokens=False): + expected_weights = [ + {"token_id": 32, "weight": 0.0552978515625, "token": "?"}, + {"token_id": 70, "weight": 0.09808349609375, "token": "the"}, + {"token_id": 83, "weight": 0.08154296875, "token": "is"}, + {"token_id": 111, "weight": 0.11810302734375, "token": "of"}, + {"token_id": 4865, "weight": 0.1171875, "token": "What"}, + {"token_id": 9942, "weight": 0.292236328125, "token": "France"}, + {"token_id": 10323, "weight": 0.2802734375, "token": "capital"}, + ] + expected_embed = {x["token_id"]: x for x in expected_weights} + + assert len(data) == len(expected_embed) + for entry in data: + expected_val = expected_embed[_get_attr_or_val(entry, "token_id")] + assert expected_val["weight"] == _get_attr_or_val(entry, "weight"), ( + f"actual embed {entry} not equal to {expected_val}" + ) + if check_tokens: + assert expected_val["token"] == _get_attr_or_val(entry, "token"), ( + f"actual embed {entry} not equal to {expected_val}" + ) + else: + assert _get_attr_or_val(entry, "token") is None, ( + f"{entry} should not return token" + ) + + @pytest.fixture(scope="function") def server(): args = [ @@ -112,32 +146,6 @@ async def test_bge_m3_sparse_plugin_online_no_tokens(server: 
RemoteOpenAIServer) assert entry.token is None -def _check_sparse_embedding(data, check_tokens=False): - expected_weights = [ - {"token_id": 32, "weight": 0.0552978515625, "token": "?"}, - {"token_id": 70, "weight": 0.09808349609375, "token": "the"}, - {"token_id": 83, "weight": 0.08154296875, "token": "is"}, - {"token_id": 111, "weight": 0.11810302734375, "token": "of"}, - {"token_id": 4865, "weight": 0.1171875, "token": "What"}, - {"token_id": 9942, "weight": 0.292236328125, "token": "France"}, - {"token_id": 10323, "weight": 0.2802734375, "token": "capital"}, - ] - expected_embed = {x["token_id"]: x for x in expected_weights} - - assert len(data) == len(expected_embed) - for entry in data: - expected_val = expected_embed[entry.token_id] - assert expected_val["weight"] == entry.weight, ( - f"actual embed {entry} not equal to {expected_val}" - ) - if check_tokens: - assert expected_val["token"] == entry.token, ( - f"actual embed {entry} not equal to {expected_val}" - ) - else: - assert entry.token is None, f"{entry} should not return token" - - @pytest.mark.parametrize( "return_tokens", [True, False], From 4a18af71ea0c86b03ff918f94b2b3fcd024855d9 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 17:53:10 +0800 Subject: [PATCH 38/44] fix test online Signed-off-by: augusto.yjh --- tests/plugins_tests/test_bge_m3_sparse_plugin.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index d8ff9315c2fd..41f08e12d3d8 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -89,7 +89,7 @@ async def test_bge_m3_sparse_plugin_online(server: RemoteOpenAIServer): response = ret.json() # Verify the request response is in the correct format - assert (parsed_response := IOProcessorResponse(**response)) + assert (parsed_response := IOProcessorResponse(**response).data) # 
Verify the output is formatted as expected for this plugin assert parsed_response.data @@ -102,15 +102,7 @@ async def test_bge_m3_sparse_plugin_online(server: RemoteOpenAIServer): # Verify sparse embedding format sparse_embedding = data_entry.sparse_embedding assert isinstance(sparse_embedding, list) - if sparse_embedding: - entry = sparse_embedding[0] - assert hasattr(entry, "token_id") - assert hasattr(entry, "weight") - # When return_tokens=True, token should be present - assert entry.token is not None - assert isinstance(entry.token_id, int) - assert isinstance(entry.weight, float) - assert entry.weight >= 0 # SPLADE outputs are non-negative + _check_sparse_embedding(sparse_embedding, True) # Verify usage information assert parsed_response.usage From a07bed7e1cd8ef7eba1121f9cdc68217a69a9fcf Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 18:02:16 +0800 Subject: [PATCH 39/44] fix verify logic for online mode Signed-off-by: augusto.yjh --- .../test_bge_m3_sparse_plugin.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index 41f08e12d3d8..d6c7c3890eb1 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -78,6 +78,7 @@ async def test_bge_m3_sparse_plugin_online(server: RemoteOpenAIServer): """Test BGE-M3 sparse plugin in online mode via API.""" request_payload = { "model": model_config["model_name"], + "task": "token_classify", "data": {"input": model_config["test_input"], "return_tokens": True}, } @@ -92,22 +93,25 @@ async def test_bge_m3_sparse_plugin_online(server: RemoteOpenAIServer): assert (parsed_response := IOProcessorResponse(**response).data) # Verify the output is formatted as expected for this plugin - assert parsed_response.data - assert len(parsed_response.data) > 0 + assert _get_attr_or_val(parsed_response, "data") + assert 
len(_get_attr_or_val(parsed_response, "data")) > 0 - data_entry = parsed_response.data[0] - assert data_entry.object == "sparse-embedding" - assert hasattr(data_entry, "sparse_embedding") + data_entry = _get_attr_or_val(parsed_response, "data")[0] + assert _get_attr_or_val(data_entry, "object") == "sparse-embedding" + assert _get_attr_or_val(data_entry, "sparse_embedding") # Verify sparse embedding format - sparse_embedding = data_entry.sparse_embedding + sparse_embedding = _get_attr_or_val(data_entry, "sparse_embedding") assert isinstance(sparse_embedding, list) _check_sparse_embedding(sparse_embedding, True) # Verify usage information - assert parsed_response.usage - assert parsed_response.usage.prompt_tokens > 0 - assert parsed_response.usage.total_tokens == parsed_response.usage.prompt_tokens + usage = _get_attr_or_val(parsed_response, "usage") + assert usage, f"usage not found for {parsed_response}" + assert _get_attr_or_val(usage, "prompt_tokens") > 0 + assert _get_attr_or_val(usage, "total_tokens") == _get_attr_or_val( + usage, "prompt_tokens" + ) @pytest.mark.asyncio From 43b1f5466f1bcbd6109b3c3ee9ee2a7365169392 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 18:30:49 +0800 Subject: [PATCH 40/44] update online test case Signed-off-by: augusto.yjh --- .../test_bge_m3_sparse_plugin.py | 43 ++++--------------- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_plugin.py index d6c7c3890eb1..80f65ac13905 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_plugin.py +++ b/tests/plugins_tests/test_bge_m3_sparse_plugin.py @@ -7,9 +7,6 @@ import requests # Test configuration for BGE-M3 sparse plugin -from tests.plugins.bge_m3_sparse_plugin.bge_m3_sparse_processor.types import ( - SparseEmbeddingResponse, -) from tests.utils import RemoteOpenAIServer from vllm.entrypoints.pooling.pooling.protocol import IOProcessorResponse @@ -74,12 +71,18 @@ def 
server(): @pytest.mark.asyncio -async def test_bge_m3_sparse_plugin_online(server: RemoteOpenAIServer): +@pytest.mark.parametrize( + "return_tokens", + [True, False], +) +async def test_bge_m3_sparse_plugin_online( + server: RemoteOpenAIServer, return_tokens: bool +): """Test BGE-M3 sparse plugin in online mode via API.""" request_payload = { "model": model_config["model_name"], "task": "token_classify", - "data": {"input": model_config["test_input"], "return_tokens": True}, + "data": {"input": model_config["test_input"], "return_tokens": return_tokens}, } ret = requests.post( @@ -103,7 +106,7 @@ async def test_bge_m3_sparse_plugin_online(server: RemoteOpenAIServer): # Verify sparse embedding format sparse_embedding = _get_attr_or_val(data_entry, "sparse_embedding") assert isinstance(sparse_embedding, list) - _check_sparse_embedding(sparse_embedding, True) + _check_sparse_embedding(sparse_embedding, return_tokens) # Verify usage information usage = _get_attr_or_val(parsed_response, "usage") @@ -114,34 +117,6 @@ async def test_bge_m3_sparse_plugin_online(server: RemoteOpenAIServer): ) -@pytest.mark.asyncio -async def test_bge_m3_sparse_plugin_online_no_tokens(server: RemoteOpenAIServer): - """Test BGE-M3 sparse plugin in online mode without returning tokens.""" - request_payload = { - "model": model_config["model_name"], - "input": model_config["test_input"], - "return_tokens": False, - } - - ret = requests.post( - server.url_for("pooling"), - json=request_payload, - ) - - response = ret.json() - print(f"Response: {response}") - - # Verify the request response is in the correct format - assert (parsed_response := SparseEmbeddingResponse(**response)) - - # Verify sparse embedding format - sparse_embedding = parsed_response.data[0].sparse_embedding - if sparse_embedding: - entry = sparse_embedding[0] - # When return_tokens=False, token should be None - assert entry.token is None - - @pytest.mark.parametrize( "return_tokens", [True, False], From 
dd77e52c9779140b38d044f2ed5fe5111d76a6ba Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Thu, 26 Feb 2026 18:34:34 +0800 Subject: [PATCH 41/44] rename test file for bge_m3_sparse_plugin Signed-off-by: augusto.yjh --- ...parse_plugin.py => test_bge_m3_sparse_io_processor_plugins.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/plugins_tests/{test_bge_m3_sparse_plugin.py => test_bge_m3_sparse_io_processor_plugins.py} (100%) diff --git a/tests/plugins_tests/test_bge_m3_sparse_plugin.py b/tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py similarity index 100% rename from tests/plugins_tests/test_bge_m3_sparse_plugin.py rename to tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py From 58ebcc7a457c36bf10f511f206ab92ce888495d5 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Fri, 27 Feb 2026 09:37:27 +0800 Subject: [PATCH 42/44] add bge_m3_sparse io processor plugin test into .buildkite Signed-off-by: augusto.yjh --- .buildkite/test-amd.yaml | 10 +++++++++- .buildkite/test_areas/plugins.yaml | 4 ++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index ffdf4b83c0e2..529ab88049b4 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1370,6 +1370,10 @@ steps: - pip install -e ./plugins/prithvi_io_processor_plugin - pytest -v -s plugins_tests/test_io_processor_plugins.py - pip uninstall prithvi_io_processor_plugin -y + # test bge_m3_sparse io_processor plugin + - pip install -e ./plugins/bge_m3_sparse_plugin + - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py + - pip uninstall bge_m3_sparse_plugin -y # end io_processor plugins test # begin stat_logger plugins test - pip install -e ./plugins/vllm_add_dummy_stat_logger @@ -2946,6 +2950,10 @@ steps: - pip install -e ./plugins/prithvi_io_processor_plugin - pytest -v -s plugins_tests/test_io_processor_plugins.py - pip uninstall prithvi_io_processor_plugin -y + # test 
bge_m3_sparse io_processor plugin + - pip install -e ./plugins/bge_m3_sparse_plugin + - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py + - pip uninstall bge_m3_sparse_plugin -y # end io_processor plugins test # begin stat_logger plugins test - pip install -e ./plugins/vllm_add_dummy_stat_logger @@ -3227,4 +3235,4 @@ steps: num_gpus: 4 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 \ No newline at end of file + - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml index ccc54b47abd4..16f9abccf6e1 100644 --- a/.buildkite/test_areas/plugins.yaml +++ b/.buildkite/test_areas/plugins.yaml @@ -19,6 +19,10 @@ steps: - pip install -e ./plugins/prithvi_io_processor_plugin - pytest -v -s plugins_tests/test_io_processor_plugins.py - pip uninstall prithvi_io_processor_plugin -y + # test bge_m3_sparse io_processor plugin + - pip install -e ./plugins/bge_m3_sparse_plugin + - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py + - pip uninstall bge_m3_sparse_plugin -y # end io_processor plugins test # begin stat_logger plugins test - pip install -e ./plugins/vllm_add_dummy_stat_logger From 166cef82d9828592348ae9b82c28363621152a67 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Fri, 27 Feb 2026 09:46:20 +0800 Subject: [PATCH 43/44] fix pre-commit check Signed-off-by: augusto.yjh --- .../test_bge_m3_sparse_io_processor_plugins.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py b/tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py index 80f65ac13905..65dd02daf801 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py +++ b/tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py @@ -14,7 +14,9 @@ 
"model_name": "BAAI/bge-m3", "plugin": "bge_m3_sparse_plugin", "test_input": "What is the capital of France?", - "hf_overrides": {"architectures": ["BgeM3EmbeddingModel"], "head_dtype": "float16"}, + "hf_overrides": json.dumps( + {"architectures": ["BgeM3EmbeddingModel"], "head_dtype": "float16"} + ), } @@ -61,7 +63,7 @@ def server(): "--max-num-seqs", "32", "--hf_overrides", - json.dumps(model_config["hf_overrides"]), + model_config["hf_overrides"], "--io-processor-plugin", model_config["plugin"], ] @@ -136,7 +138,7 @@ def test_bge_m3_sparse_plugin_offline(vllm_runner, return_tokens: bool): enforce_eager=True, max_num_seqs=32, io_processor_plugin=model_config["plugin"], - hf_overrides=model_config["hf_overrides"], + hf_overrides=json.loads(model_config["hf_overrides"]), default_torch_num_threads=1, ) as llm_runner: llm = llm_runner.get_llm() @@ -180,7 +182,7 @@ def test_bge_m3_sparse_plugin_offline_multiple_inputs(vllm_runner): enforce_eager=True, max_num_seqs=32, io_processor_plugin=model_config["plugin"], - hf_overrides=model_config["hf_overrides"], + hf_overrides=json.loads(model_config["hf_overrides"]), default_torch_num_threads=1, ) as llm_runner: llm = llm_runner.get_llm() From 1f4f9696bbb5cf65a4c961b0ac377238ec8e7840 Mon Sep 17 00:00:00 2001 From: "augusto.yjh" Date: Fri, 27 Feb 2026 11:36:53 +0800 Subject: [PATCH 44/44] check sparse-embedding weight using loose equality Signed-off-by: augusto.yjh --- .../test_bge_m3_sparse_io_processor_plugins.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py b/tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py index 65dd02daf801..20c400e59795 100644 --- a/tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py +++ b/tests/plugins_tests/test_bge_m3_sparse_io_processor_plugins.py @@ -20,6 +20,13 @@ } +def _float_close(expected: object, result: object): + assert isinstance(expected, float) and 
isinstance(result, float), (
+        f"{expected=} or {result=} is not float"
+    )
+    return abs(expected - result) < 1e-3 or abs(expected / result - 1) < 1e-3
+
+
 def _get_attr_or_val(obj: object | dict, key: str):
     if isinstance(obj, dict) and key in obj:
         return obj[key]
@@ -41,9 +48,9 @@ def _check_sparse_embedding(data, check_tokens=False):
         assert len(data) == len(expected_embed)
         for entry in data:
             expected_val = expected_embed[_get_attr_or_val(entry, "token_id")]
-            assert expected_val["weight"] == _get_attr_or_val(entry, "weight"), (
-                f"actual embed {entry} not equal to {expected_val}"
-            )
+            assert _float_close(
+                expected_val["weight"], _get_attr_or_val(entry, "weight")
+            ), f"actual embed {entry} not equal to {expected_val}"
             if check_tokens:
                 assert expected_val["token"] == _get_attr_or_val(entry, "token"), (
                     f"actual embed {entry} not equal to {expected_val}"