From e0d8d484fb44ea4c695f29473e5994a60215e2e0 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 19 Mar 2026 15:57:38 +0800 Subject: [PATCH 1/4] init Signed-off-by: wang.yuqi --- tests/test_pooling_params.py | 2 +- vllm/config/model.py | 4 +- vllm/entrypoints/llm.py | 4 +- vllm/entrypoints/openai/api_server.py | 12 +++-- vllm/entrypoints/pooling/__init__.py | 40 +++++++++++----- .../layers/pooler/activations.py | 33 ++++--------- .../layers/pooler/seqwise/heads.py | 48 ++++++++++--------- .../layers/pooler/seqwise/methods.py | 2 +- vllm/model_executor/layers/pooler/special.py | 9 +--- .../layers/pooler/tokwise/heads.py | 30 ++++++------ vllm/model_executor/models/interfaces_base.py | 14 +++--- vllm/pooling_params.py | 7 +-- vllm/tasks.py | 5 -- vllm/v1/worker/gpu_model_runner.py | 10 +--- 14 files changed, 105 insertions(+), 115 deletions(-) diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py index 54a577d2bf84..6cf2a82d2ff1 100644 --- a/tests/test_pooling_params.py +++ b/tests/test_pooling_params.py @@ -74,7 +74,7 @@ def test_embed_dimensions(model_info: EmbedModelInfo): pooling_params.verify(model_config) -@pytest.mark.parametrize("task", ["score", "classify"]) +@pytest.mark.parametrize("task", ["classify"]) def test_classify(task): model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS")) diff --git a/vllm/config/model.py b/vllm/config/model.py index b12202f9c712..e0dd83728d33 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1435,8 +1435,8 @@ def requires_raw_input_tokens(self) -> bool: @property def score_type(self) -> ScoreType: """ - Score API handles score/rerank for: - - "score" task (score_type: cross-encoder models) + Scoring API handles score/rerank for: + - "classify" task (score_type: cross-encoder models) - "embed" task (score_type: bi-encoder models) - "token_embed" task (score_type: late interaction models) """ diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 
5909b3043007..4b617333c02f 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1477,9 +1477,9 @@ def _cross_encoding_score( data_1 = data_1 * len(data_2) if pooling_params is None: - pooling_params = PoolingParams(task="score") + pooling_params = PoolingParams(task="classify") elif pooling_params.task is None: - pooling_params.task = "score" + pooling_params.task = "classify" pooling_params_list = list[PoolingParams]() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 39e9076a7cc6..5f7d06f14193 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -22,7 +22,7 @@ from starlette.datastructures import State import vllm.envs as envs -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import load_chat_template @@ -164,7 +164,9 @@ async def build_async_engine_client_from_engine_args( def build_app( - args: Namespace, supported_tasks: tuple["SupportedTask", ...] | None = None + args: Namespace, + model_config: ModelConfig | None = None, + supported_tasks: tuple["SupportedTask", ...] 
| None = None, ) -> FastAPI: if supported_tasks is None: warnings.warn( @@ -251,7 +253,7 @@ def build_app( if any(task in POOLING_TASKS for task in supported_tasks): from vllm.entrypoints.pooling import register_pooling_api_routers - register_pooling_api_routers(app, supported_tasks) + register_pooling_api_routers(app, supported_tasks, model_config) app.root_path = args.root_path app.add_middleware( @@ -592,8 +594,10 @@ async def build_and_serve( uvicorn_kwargs["log_config"] = log_config supported_tasks = await engine_client.get_supported_tasks() + model_config = engine_client.model_config + logger.info("Supported tasks: %s", supported_tasks) - app = build_app(args, supported_tasks) + app = build_app(args, model_config, supported_tasks) await init_app_state(engine_client, app.state, args, supported_tasks) logger.info("Starting vLLM server on %s", listen_address) diff --git a/vllm/entrypoints/pooling/__init__.py b/vllm/entrypoints/pooling/__init__.py index d2baea8959d2..e115b710ceeb 100644 --- a/vllm/entrypoints/pooling/__init__.py +++ b/vllm/entrypoints/pooling/__init__.py @@ -5,6 +5,9 @@ from fastapi import FastAPI +from vllm.config import ModelConfig +from vllm.logger import init_logger + if TYPE_CHECKING: from argparse import Namespace @@ -17,9 +20,30 @@ RequestLogger = object SupportedTask = object +logger = init_logger(__name__) + + +def enable_scoring_api( + supported_tasks: tuple["SupportedTask", ...], + model_config: ModelConfig | None = None, +) -> bool: + if any(t in supported_tasks for t in ("embed", "token_embed")): + return True + + if model_config is not None and "classify" in supported_tasks: + num_labels = getattr(model_config.hf_config, "num_labels", 0) + if num_labels != 1: + logger.debug_once("Score API is only enabled for num_labels == 1.") + return False + return True + + return False + def register_pooling_api_routers( - app: FastAPI, supported_tasks: tuple["SupportedTask", ...] 
+ app: FastAPI, + supported_tasks: tuple["SupportedTask", ...], + model_config: ModelConfig | None = None, ): from vllm.entrypoints.pooling.pooling.api_router import router as pooling_router @@ -37,11 +61,7 @@ def register_pooling_api_routers( app.include_router(embed_router) - # Score API handles score/rerank for: - # - "score" task (score_type: cross-encoder models) - # - "embed" task (score_type: bi-encoder models) - # - "token_embed" task (score_type: late interaction models) - if any(t in supported_tasks for t in ("score", "embed", "token_embed")): + if enable_scoring_api(supported_tasks, model_config): from vllm.entrypoints.pooling.score.api_router import router as score_router app.include_router(score_router) @@ -61,6 +81,8 @@ def init_pooling_state( from vllm.entrypoints.pooling.score.serving import ServingScores from vllm.tasks import POOLING_TASKS + model_config = engine_client.model_config + resolved_chat_template = load_chat_template(args.chat_template) state.serving_pooling = ( @@ -102,10 +124,6 @@ def init_pooling_state( if "classify" in supported_tasks else None ) - # Score API handles score/rerank for: - # - "score" task (score_type: cross-encoder models) - # - "embed" task (score_type: bi-encoder models) - # - "token_embed" task (score_type: late interaction models) state.serving_scores = ( ServingScores( engine_client, @@ -114,6 +132,6 @@ def init_pooling_state( score_template=resolved_chat_template, log_error_stack=args.log_error_stack, ) - if any(t in supported_tasks for t in ("embed", "score", "token_embed")) + if enable_scoring_api(supported_tasks, model_config) else None ) diff --git a/vllm/model_executor/layers/pooler/activations.py b/vllm/model_executor/layers/pooler/activations.py index b57e6ba68b94..9ec5bd253720 100644 --- a/vllm/model_executor/layers/pooler/activations.py +++ b/vllm/model_executor/layers/pooler/activations.py @@ -15,26 +15,22 @@ logger = init_logger(__name__) - -def get_classification_act_fn( +def get_act_fn( config: 
PretrainedConfig, + static_num_labels: bool = True, ) -> "PoolerActivation": + # get classification act_fn # Implement alignment with transformers ForSequenceClassificationLoss # https://github.com/huggingface/transformers/blob/57bb6db6ee4cfaccc45b8d474dfad5a17811ca60/src/transformers/loss/loss_utils.py#L92 problem_type = getattr(config, "problem_type", "") if problem_type == "regression": return PoolerIdentity() if problem_type == "single_label_classification": - return PoolerClassify() + return PoolerClassify(static_num_labels=static_num_labels) if problem_type == "multi_label_classification": return PoolerMultiLabelClassify() - return PoolerClassify() - - -def get_cross_encoder_act_fn( - config: PretrainedConfig, -) -> "PoolerActivation": + # get cross_encoder act_fn function_name: str | None = None if ( hasattr(config, "sentence_transformers") @@ -55,24 +51,16 @@ def get_cross_encoder_act_fn( fn = resolve_obj_by_qualname(function_name)() return PoolerActivation.wraps(fn) - return PoolerClassify() + return PoolerClassify(static_num_labels=static_num_labels) def resolve_classifier_act_fn( model_config: ModelConfig, static_num_labels: bool = True, - act_fn: "PoolerActivation | str | None" = None, + act_fn: "PoolerActivation | None" = None, ): - if isinstance(act_fn, str): - if act_fn == "classify": - return get_classification_act_fn(model_config.hf_config) - if act_fn == "score": - return get_cross_encoder_act_fn(model_config.hf_config) - - raise ValueError(f"act_fn [{act_fn=}] not supported.") - if act_fn is None: - return PoolerClassify(static_num_labels=static_num_labels) + return get_act_fn(model_config.hf_config, static_num_labels) assert callable(act_fn) return act_fn @@ -97,9 +85,8 @@ def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: def forward(self, pooled_data: _T) -> _T: # shape: - # classify (& score) -> (batch_size, num_classes) - # embed -> (batch_size, embedding_dim) or list(embedding_dim) - # (batch_size, dimensions) or 
list(dimensions) if using MRL + # classify -> (batch_size, num_classes) + # embed -> (batch_size, embedding_size) or list(embedding_size) if isinstance(pooled_data, list): return [self.forward_chunk(data) for data in pooled_data] diff --git a/vllm/model_executor/layers/pooler/seqwise/heads.py b/vllm/model_executor/layers/pooler/seqwise/heads.py index 42059284e5cd..31a961223927 100644 --- a/vllm/model_executor/layers/pooler/seqwise/heads.py +++ b/vllm/model_executor/layers/pooler/seqwise/heads.py @@ -56,29 +56,31 @@ def forward( if isinstance(pooled_data, list): pooled_data = torch.stack(pooled_data) - # pooled_data shape: [batchsize, hidden_dimension] + # pooled_data shape: [batchsize, hidden_size] if self.head_dtype is not None: pooled_data = pooled_data.to(self.head_dtype) # Apply ST projector if self.projector is not None: - pooled_data = self.projector(pooled_data) - # pooled_data shape: [batchsize, embedding_dimension] + embeddings = self.projector(pooled_data) + else: + embeddings = pooled_data + # embeddings shape: [batchsize, embedding_size] # for matryoshka representation dimensions_list = [pooling_param.dimensions for pooling_param in pooling_params] if any(d is not None for d in dimensions_list): # change the output dimension - assert len(pooled_data) == len(dimensions_list) - if len(set(dimensions_list)) == 1 and not isinstance(pooled_data, list): + assert len(embeddings) == len(dimensions_list) + if len(set(dimensions_list)) == 1 and not isinstance(embeddings, list): # if all dimensions are the same d = dimensions_list[0] - pooled_data = pooled_data[..., :d] + embeddings = embeddings[..., :d] else: - pooled_data = [ + embeddings = [ vecs if d is None else vecs[..., :d] - for vecs, d in zip(pooled_data, dimensions_list) + for vecs, d in zip(embeddings, dimensions_list) ] # for normalize @@ -86,15 +88,15 @@ def forward( flags = [p.use_activation for p in pooling_params] if len(set(flags)) == 1: if flags[0]: - pooled_data = self.activation(pooled_data) + 
embeddings = self.activation(embeddings) else: - pooled_data = [ + embeddings = [ self.activation(vecs) if f else vecs - for vecs, f in zip(pooled_data, flags) + for vecs, f in zip(embeddings, flags) ] - # pooled_data shape: [batchsize, embedding_dimension] - return pooled_data + # embeddings shape: [batchsize, embedding_size] + return embeddings class ClassifierPoolerHead(SequencePoolerHead): @@ -113,7 +115,7 @@ def __init__( self.activation = activation def get_supported_tasks(self) -> Set[PoolingTask]: - return {"classify", "score"} + return {"classify"} def forward( self, @@ -131,21 +133,23 @@ def forward( pooled_data = pooled_data.to(self.head_dtype) if self.classifier is not None: - pooled_data = self.classifier(pooled_data) - # pooled_data shape: [batchsize, num_labels] + logits = self.classifier(pooled_data) + else: + logits = pooled_data + # logits shape: [batchsize, num_labels] if self.logit_bias is not None: - pooled_data -= self.logit_bias + logits -= self.logit_bias if self.activation is not None: flags = [p.use_activation for p in pooling_params] if len(set(flags)) == 1: - pooled_data = self.activation(pooled_data) if flags[0] else pooled_data + logits = self.activation(logits) if flags[0] else logits else: - pooled_data = [ + logits = [ self.activation(vecs) if f else vecs - for vecs, f in zip(pooled_data, flags) + for vecs, f in zip(logits, flags) ] - # pooled_data shape: [batchsize, num_labels] - return pooled_data + # logits shape: [batchsize, num_labels] + return logits diff --git a/vllm/model_executor/layers/pooler/seqwise/methods.py b/vllm/model_executor/layers/pooler/seqwise/methods.py index 5d8551095096..f3c7f29d6092 100644 --- a/vllm/model_executor/layers/pooler/seqwise/methods.py +++ b/vllm/model_executor/layers/pooler/seqwise/methods.py @@ -17,7 +17,7 @@ class SequencePoolingMethod(nn.Module, ABC): def get_supported_tasks(self) -> Set[PoolingTask]: - return {"token_embed", "token_classify", "embed", "classify", "score"} + return 
{"token_embed", "token_classify", "embed", "classify"} def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: return PoolingParamsUpdate() diff --git a/vllm/model_executor/layers/pooler/special.py b/vllm/model_executor/layers/pooler/special.py index 5e0f9ec75597..686072632685 100644 --- a/vllm/model_executor/layers/pooler/special.py +++ b/vllm/model_executor/layers/pooler/special.py @@ -52,13 +52,6 @@ def for_seq_cls( pooler_config, pooling=pooling, classifier=classifier, - act_fn="classify", - ), - "score": pooler_for_classify( - pooler_config, - pooling=pooling, - classifier=classifier, - act_fn="score", ), } ) @@ -115,7 +108,7 @@ def extra_repr(self) -> str: class IdentityPooler(Pooler): def get_supported_tasks(self) -> Set[PoolingTask]: - return {"plugin", "score"} + return {"plugin"} def forward( self, diff --git a/vllm/model_executor/layers/pooler/tokwise/heads.py b/vllm/model_executor/layers/pooler/tokwise/heads.py index 4183f5b1ba25..80c5c831fa08 100644 --- a/vllm/model_executor/layers/pooler/tokwise/heads.py +++ b/vllm/model_executor/layers/pooler/tokwise/heads.py @@ -68,22 +68,24 @@ def forward_chunk( if self.head_dtype is not None: pooled_data = pooled_data.to(self.head_dtype) - # pooled_data shape: [n_tokens, hidden_dimension] + # pooled_data shape: [n_tokens, hidden_size] # Apply ST projector if self.projector is not None: - pooled_data = self.projector(pooled_data) - # pooled_data shape: [n_tokens, embedding_dimension] + embeddings = self.projector(pooled_data) + else: + embeddings = pooled_data + # embeddings shape: [n_tokens, embedding_size] # for matryoshka representation - pooled_data = pooled_data[..., : pooling_param.dimensions] + embeddings = embeddings[..., : pooling_param.dimensions] # for normalize if self.activation is not None and pooling_param.use_activation: - pooled_data = self.activation(pooled_data) + embeddings = self.activation(embeddings) - # pooled_data shape: [n_tokens, embedding_dimension] - return pooled_data 
+ # embeddings shape: [n_tokens, embedding_size] + return embeddings class TokenClassifierPoolerHead(TokenPoolerHead): @@ -118,16 +120,16 @@ def forward_chunk( # hidden_states shape: [n_token, hidden_size] if self.classifier is not None: - scores = self.classifier(pooled_data) + logits = self.classifier(pooled_data) else: - scores = pooled_data - # scores shape: [n_token, num_labels] + logits = pooled_data + # logits shape: [n_token, num_labels] if self.logit_bias is not None: - scores -= self.logit_bias + logits -= self.logit_bias if self.activation is not None and pooling_param.use_activation: - scores = self.activation(scores) + logits = self.activation(logits) - # scores shape: [n_token, num_labels] - return scores + # logits shape: [n_token, num_labels] + return logits diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 55c42e5fa57e..0c182a891cd3 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -194,18 +194,18 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): [vllm.config.model.ModelConfig.score_type][] to use by default. - Score API handles score/rerank for: - - "score" task (score_type: cross-encoder models) - - "embed" task (score_type: bi-encoder models) - - "token_embed" task (score_type: late interaction models) + Scoring API handles score/rerank for:\n + - "classify" task (score_type: cross-encoder models)\n + - "embed" task (score_type: bi-encoder models)\n + - "token_embed" task (score_type: late interaction models)\n - score_type defaults to bi-encoder, then the Score API uses the "embed" task. + score_type defaults to bi-encoder, then the Score API uses the "embed" task.\n If you set score_type to cross-encoder via [vllm.model_executor.models.interfaces.SupportsCrossEncoding][], - then the Score API uses the "score" task. 
+ then the Score API uses the "classify" task.\n If you set score_type to late-interaction via [vllm.model_executor.models.interfaces.SupportsLateInteraction][], - then the Score API uses the "token_embed" task. + then the Score API uses the "token_embed" task.\n """ pooler: Pooler diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index e5e993b75556..a1a49e41948f 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -54,10 +54,6 @@ class PoolingParams( dimensions: int | None = None # --8<-- [end:embed-pooling-params] - ## for classification, scoring and rerank - # --8<-- [start:classify-pooling-params] - # --8<-- [end:classify-pooling-params] - ## for step pooling models step_tag_id: int | None = None returned_token_ids: list[int] | None = None @@ -79,7 +75,6 @@ def valid_parameters(self): return { "embed": ["dimensions", "use_activation"], "classify": ["use_activation"], - "score": ["use_activation"], "token_embed": ["dimensions", "use_activation"], "token_classify": ["use_activation"], } @@ -184,7 +179,7 @@ def _set_default_parameters(self, model_config: ModelConfig): elif self.dimensions < 1: raise ValueError("Dimensions must be greater than 0") - elif self.task in ["classify", "score", "token_classify"]: + elif self.task in ["classify", "token_classify"]: if self.use_activation is None: self.use_activation = True else: diff --git a/vllm/tasks.py b/vllm/tasks.py index 83dd7f85eee0..4e324c188519 100644 --- a/vllm/tasks.py +++ b/vllm/tasks.py @@ -8,7 +8,6 @@ PoolingTask = Literal[ "embed", "classify", - "score", "token_embed", "token_classify", "plugin", @@ -16,10 +15,6 @@ ] POOLING_TASKS: tuple[PoolingTask, ...] 
= get_args(PoolingTask) -# Score API handles score/rerank for: -# - "score" task (score_type: cross-encoder models) -# - "embed" task (score_type: bi-encoder models) -# - "token_embed" task (score_type: late interaction models) ScoreType = Literal["bi-encoder", "cross-encoder", "late-interaction"] FrontendTask = Literal["render"] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 595e8cc39b70..94042afae700 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2827,15 +2827,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: if not is_pooling_model(model): return [] - supported_tasks = list(model.pooler.get_supported_tasks()) - - if "score" in supported_tasks: - num_labels = getattr(self.model_config.hf_config, "num_labels", 0) - if num_labels != 1: - supported_tasks.remove("score") - logger.debug_once("Score API is only enabled for num_labels == 1.") - - return supported_tasks + return list(model.pooler.get_supported_tasks()) def get_supported_tasks(self) -> tuple[SupportedTask, ...]: tasks = list[SupportedTask]() From fb987a91039b5c0aed6fb4d9a8b44d1aba8e7015 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 19 Mar 2026 16:48:17 +0800 Subject: [PATCH 2/4] refine Signed-off-by: wang.yuqi --- vllm/entrypoints/openai/api_server.py | 2 +- vllm/entrypoints/pooling/score/protocol.py | 4 ++-- vllm/entrypoints/pooling/score/serving.py | 2 +- vllm/entrypoints/sagemaker/api_router.py | 18 +++++++++++++----- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 5f7d06f14193..2dd6c138cfb6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -202,7 +202,7 @@ def build_app( attach_router as register_sagemaker_api_router, ) - register_sagemaker_api_router(app, supported_tasks) + register_sagemaker_api_router(app, supported_tasks, model_config) if 
"generate" in supported_tasks: from vllm.entrypoints.openai.generate.api_router import ( diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py index 2aea1bd7b27a..bb633fc28b3c 100644 --- a/vllm/entrypoints/pooling/score/protocol.py +++ b/vllm/entrypoints/pooling/score/protocol.py @@ -35,7 +35,7 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams: max_total_tokens_param="max_model_len", ) - def to_pooling_params(self, task: PoolingTask = "score"): + def to_pooling_params(self, task: PoolingTask = "classify"): return PoolingParams( task=task, use_activation=self.use_activation, @@ -111,7 +111,7 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams: max_total_tokens_param="max_model_len", ) - def to_pooling_params(self, task: PoolingTask = "score"): + def to_pooling_params(self, task: PoolingTask = "classify"): return PoolingParams( task=task, use_activation=self.use_activation, diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index c58fe6d36c07..d8cbff99d068 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -413,7 +413,7 @@ async def _cross_encoding_score( # Schedule the request and get the result generator. 
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] - default_pooling_params = request.to_pooling_params("score") + default_pooling_params = request.to_pooling_params("classify") for i, engine_prompt in enumerate(engine_prompts): request_id_item = f"{request_id}-{i}" diff --git a/vllm/entrypoints/sagemaker/api_router.py b/vllm/entrypoints/sagemaker/api_router.py index 32faaa02e681..e8c48d1c6d53 100644 --- a/vllm/entrypoints/sagemaker/api_router.py +++ b/vllm/entrypoints/sagemaker/api_router.py @@ -10,9 +10,11 @@ from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request from fastapi.responses import JSONResponse, Response +from vllm.config import ModelConfig from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.utils import validate_json_request +from vllm.entrypoints.pooling import enable_scoring_api from vllm.entrypoints.pooling.base.serving import PoolingServing from vllm.entrypoints.serve.instrumentator.basic import base from vllm.entrypoints.serve.instrumentator.health import health @@ -25,7 +27,10 @@ EndpointFn = Callable[[RequestType, Request], Awaitable[Any]] -def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]): +def get_invocation_types( + supported_tasks: tuple["SupportedTask", ...], + model_config: ModelConfig | None = None, +): # NOTE: Items defined earlier take higher priority INVOCATION_TYPES: list[tuple[RequestType, tuple[GetHandlerFn, EndpointFn]]] = [] @@ -70,7 +75,7 @@ def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]): (ClassificationRequest, (classify, create_classify)), ] - if "score" in supported_tasks: + if enable_scoring_api(supported_tasks, model_config): from vllm.entrypoints.pooling.score.api_router import do_rerank, rerank from vllm.entrypoints.pooling.score.protocol import RerankRequest @@ -78,7 +83,6 @@ def get_invocation_types(supported_tasks: tuple["SupportedTask", 
...]): (RerankRequest, (rerank, do_rerank)), ] - if "score" in supported_tasks or "embed" in supported_tasks: from vllm.entrypoints.pooling.score.api_router import create_score, score from vllm.entrypoints.pooling.score.protocol import ScoreRequest @@ -97,11 +101,15 @@ def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]): return INVOCATION_TYPES -def attach_router(app: FastAPI, supported_tasks: tuple["SupportedTask", ...]): +def attach_router( + app: FastAPI, + supported_tasks: tuple["SupportedTask", ...], + model_config: ModelConfig | None = None, +): router = APIRouter() # NOTE: Construct the TypeAdapters only once - INVOCATION_TYPES = get_invocation_types(supported_tasks) + INVOCATION_TYPES = get_invocation_types(supported_tasks, model_config) INVOCATION_VALIDATORS = [ (pydantic.TypeAdapter(request_type), (get_handler, endpoint)) for request_type, (get_handler, endpoint) in INVOCATION_TYPES From d7ad6d10735926ebd42e8ee6fe4a5e944247a753 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 19 Mar 2026 17:58:59 +0800 Subject: [PATCH 3/4] refine Signed-off-by: wang.yuqi --- vllm/config/model.py | 8 ++++---- vllm/entrypoints/openai/api_server.py | 4 ++-- vllm/model_executor/layers/pooler/activations.py | 1 + vllm/model_executor/layers/pooler/seqwise/poolers.py | 2 +- vllm/model_executor/layers/pooler/tokwise/poolers.py | 2 +- vllm/pooling_params.py | 10 ++++++++++ 6 files changed, 19 insertions(+), 8 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index e0dd83728d33..6d382837062c 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1435,10 +1435,10 @@ def requires_raw_input_tokens(self) -> bool: @property def score_type(self) -> ScoreType: """ - Scoring API handles score/rerank for: - - "classify" task (score_type: cross-encoder models) - - "embed" task (score_type: bi-encoder models) - - "token_embed" task (score_type: late interaction models) + Scoring API handles score/rerank for:\n + - "classify" task 
(score_type: cross-encoder models)\n + - "embed" task (score_type: bi-encoder models)\n + - "token_embed" task (score_type: late interaction models)\n """ # fixme: self._model_info.score_type is the score type before # as_seq_cls_model, which is "bi-encoder", rather than the diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2dd6c138cfb6..5095c4e14fcb 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -165,8 +165,8 @@ async def build_async_engine_client_from_engine_args( def build_app( args: Namespace, - model_config: ModelConfig | None = None, supported_tasks: tuple["SupportedTask", ...] | None = None, + model_config: ModelConfig | None = None, ) -> FastAPI: if supported_tasks is None: warnings.warn( @@ -597,7 +597,7 @@ async def build_and_serve( model_config = engine_client.model_config logger.info("Supported tasks: %s", supported_tasks) - app = build_app(args, model_config, supported_tasks) + app = build_app(args, supported_tasks, model_config) await init_app_state(engine_client, app.state, args, supported_tasks) logger.info("Starting vLLM server on %s", listen_address) diff --git a/vllm/model_executor/layers/pooler/activations.py b/vllm/model_executor/layers/pooler/activations.py index 9ec5bd253720..4213ee7b85cb 100644 --- a/vllm/model_executor/layers/pooler/activations.py +++ b/vllm/model_executor/layers/pooler/activations.py @@ -15,6 +15,7 @@ logger = init_logger(__name__) + def get_act_fn( config: PretrainedConfig, static_num_labels: bool = True, diff --git a/vllm/model_executor/layers/pooler/seqwise/poolers.py b/vllm/model_executor/layers/pooler/seqwise/poolers.py index 8bf3e25e66b6..f46834a7c3f2 100644 --- a/vllm/model_executor/layers/pooler/seqwise/poolers.py +++ b/vllm/model_executor/layers/pooler/seqwise/poolers.py @@ -108,7 +108,7 @@ def pooler_for_classify( *, pooling: SequencePoolingMethod | SequencePoolingFn | None = None, classifier: ClassifierFn | None = 
None, - act_fn: PoolerActivation | str | None = None, + act_fn: PoolerActivation | None = None, ): if pooling is None: pooling = get_seq_pooling_method(pooler_config.get_seq_pooling_type()) diff --git a/vllm/model_executor/layers/pooler/tokwise/poolers.py b/vllm/model_executor/layers/pooler/tokwise/poolers.py index 996f20d98cc9..c56970fcabaa 100644 --- a/vllm/model_executor/layers/pooler/tokwise/poolers.py +++ b/vllm/model_executor/layers/pooler/tokwise/poolers.py @@ -116,7 +116,7 @@ def pooler_for_token_classify( *, pooling: TokenPoolingMethod | TokenPoolingFn | None = None, classifier: ClassifierFn | None = None, - act_fn: PoolerActivation | str | None = None, + act_fn: PoolerActivation | None = None, ): if pooling is None: pooling = get_tok_pooling_method(pooler_config.get_tok_pooling_type()) diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index a1a49e41948f..b347ec831abc 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -7,9 +7,12 @@ import msgspec from vllm.config import ModelConfig, PoolerConfig +from vllm.logger import init_logger from vllm.sampling_params import RequestOutputKind from vllm.tasks import PoolingTask +logger = init_logger(__name__) + class LateInteractionParams( msgspec.Struct, @@ -84,6 +87,13 @@ def clone(self) -> "PoolingParams": return deepcopy(self) def verify(self, model_config: ModelConfig) -> None: + if self.task == "score": + logger.warning_once( + "`score` task is deprecated and will be removed in v0.20. " + "Please use `classify` instead." 
+ ) + self.task = "classify" + # plugin task uses io_processor.parse_request to verify inputs, # skipping PoolingParams verify if self.task == "plugin": From 4fddf96b0552fa8540e356e92f0a16af4be0e5c2 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 20 Mar 2026 08:11:07 +0800 Subject: [PATCH 4/4] + docs Signed-off-by: wang.yuqi --- docs/models/pooling_models/README.md | 63 ++++++++++++++------------ docs/models/pooling_models/classify.md | 4 +- docs/models/pooling_models/scoring.md | 17 ++++--- 3 files changed, 48 insertions(+), 36 deletions(-) diff --git a/docs/models/pooling_models/README.md b/docs/models/pooling_models/README.md index b34cc1efe6ae..02e2c82cf009 100644 --- a/docs/models/pooling_models/README.md +++ b/docs/models/pooling_models/README.md @@ -31,28 +31,29 @@ Of course, we also have "plugin" tasks that allow users to customize input and o ### Pooling Tasks -| Pooling Tasks | Granularity | Outputs | -|--------------------|---------------|-------------------------------------------------| -| `classify` | Sequence-wise | probability vector of classes for each sequence | -| `score` (see note) | Sequence-wise | reranker score for each sequence | -| `embed` | Sequence-wise | vector representations for each sequence | -| `token_classify` | Token-wise | probability vector of classes for each token | -| `token_embed` | Token-wise | vector representations for each token | +| Pooling Tasks | Granularity | Outputs | +|-----------------------|---------------|-------------------------------------------------| +| `classify` (see note) | Sequence-wise | probability vector of classes for each sequence | +| `embed` | Sequence-wise | vector representations for each sequence | +| `token_classify` | Token-wise | probability vector of classes for each token | +| `token_embed` | Token-wise | vector representations for each token | !!! note Within classification tasks, there is a specialized subcategory: Cross-encoder (aka reranker) models. 
These models are a subset of classification models that accept two prompts as input and output num_labels equal to 1. ### Score Types -| Pooling Tasks | Granularity | Outputs | Score Types | scoring function | -|--------------------|---------------|-------------------------------------------------|--------------------|--------------------------| -| `classify` | Sequence-wise | probability vector of classes for each sequence | nan | nan | -| `score` (see note) | Sequence-wise | reranker score for each sequence | `cross-encoder` | linear classifier | -| `embed` | Sequence-wise | vector representations for each sequence | `bi-encoder` | cosine similarity | -| `token_classify` | Token-wise | probability vector of classes for each token | nan | nan | -| `token_embed` | Token-wise | vector representations for each token | `late-interaction` | late interaction(MaxSim) | +Scoring models are designed to compute similarity scores between two input prompts. They support three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`. -The score models is designed to compute similarity scores between two input prompts. It supports three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`. +| Pooling Tasks | Granularity | Outputs | Score Types | scoring function | +|-----------------------|---------------|----------------------------------------------|--------------------|--------------------------| +| `classify` (see note) | Sequence-wise | reranker score for each sequence | `cross-encoder` | linear classifier | +| `embed` | Sequence-wise | vector representations for each sequence | `bi-encoder` | cosine similarity | +| `token_classify` | Token-wise | probability vector of classes for each token | nan | nan | +| `token_embed` | Token-wise | vector representations for each token | `late-interaction` | late interaction(MaxSim) | + +!!! 
note + Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled. ### Pooling Usages @@ -85,14 +86,16 @@ enabling the corresponding APIs. ### Offline APIs corresponding to pooling tasks -| Task | APIs | -|------------------|----------------------------------------------------------------------------| -| `embed` | `LLM.embed(...)`,`LLM.encode(..., pooling_task="embed")`, `LLM.score(...)` | -| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")` | -| `score` | `LLM.score(...)` | -| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` | -| `token_embed` | `LLM.encode(..., pooling_task="token_embed")`, `LLM.score(...)` | -| `plugin` | `LLM.encode(..., pooling_task="plugin")` | +| Task | APIs | +|------------------|---------------------------------------------------------------------------------------| +| `embed` | `LLM.embed(...)`, `LLM.encode(..., pooling_task="embed")`, `LLM.score(...)` | +| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")`, `LLM.score(...)` (see note) | +| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` | +| `token_embed` | `LLM.encode(..., pooling_task="token_embed")`, `LLM.score(...)` | +| `plugin` | `LLM.encode(..., pooling_task="plugin")` | + +!!! note + Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled. ### `LLM.classify` @@ -206,11 +209,11 @@ If `--runner pooling` has been set (manually or automatically) but the model doe vLLM will attempt to automatically convert the model according to the architecture names shown in the table below.
-| Architecture | `--convert` | Supported pooling tasks | -| ----------------------------------------------- | ----------- | ------------------------------------- | -| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` | -| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` | -| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify`, `score` | +| Architecture | `--convert` | Supported pooling tasks | +|-------------------------------------------------|-------------|------------------------------| +| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` | +| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` | +| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify` | !!! tip You can explicitly set `--convert ` to specify how to convert the model. @@ -251,3 +254,7 @@ Pooling models now default support all pooling, you can use it without any setti - Extracting hidden states prefers using `token_embed` task. - Named Entity Recognition (NER) and reward models prefers using `token_classify` task. + +### Score task + +The `score` task is deprecated and will be removed in v0.20. Please use `classify` instead. Only when a classification model outputs `num_labels` equal to 1 can it be used as a scoring model and have its scoring API enabled. diff --git a/docs/models/pooling_models/classify.md b/docs/models/pooling_models/classify.md index 10d7892b5361..1247bb4a0bbc 100644 --- a/docs/models/pooling_models/classify.md +++ b/docs/models/pooling_models/classify.md @@ -17,6 +17,8 @@ The key distinction between (sequence) classification and token classification l Many classification models support both (sequence) classification and token classification. For further details on token classification, please refer to [this page](token_classify.md).
+Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled. For details, please refer to [this page](scoring.md). + ## Typical Use Cases ### Classification @@ -54,7 +56,7 @@ If your model is not in the above list, we will try to automatically convert the Cross-encoder (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1. Most classification models can also be used as [cross-encoder models](scoring.md#cross-encoder-models). For more information on cross-encoder models, please refer to [this page](scoring.md). ---8<-- "docs/models/pooling_models/scoring.md:supported-score-models" +--8<-- "docs/models/pooling_models/scoring.md:supported-cross-encoder-models" ### Reward Models diff --git a/docs/models/pooling_models/scoring.md b/docs/models/pooling_models/scoring.md index 6227b689acb0..ac94a0cd76bc 100644 --- a/docs/models/pooling_models/scoring.md +++ b/docs/models/pooling_models/scoring.md @@ -10,11 +10,11 @@ The score models is designed to compute similarity scores between two input prom - Model Usage: Scoring - Pooling Task: -| Score Types | Pooling Tasks | scoring function | -|--------------------|---------------|--------------------------| -| `cross-encoder` | `score` | linear classifier | -| `late-interaction` | `token_embed` | late interaction(MaxSim) | -| `bi-encoder` | `embed` | cosine similarity | +| Score Types | Pooling Tasks | scoring function | +|--------------------|-----------------------|--------------------------| +| `cross-encoder` | `classify` (see note) | linear classifier | +| `late-interaction` | `token_embed` | late interaction(MaxSim) | +| `bi-encoder` | `embed` | cosine similarity | - Offline APIs: - `LLM.score` @@ -22,13 +22,16 @@ The score models is designed to compute similarity scores between two input prom - [Score API](scoring.md#score-api) (`/score`) - [Rerank API](scoring.md#rerank-api) (`/rerank`,
`/v1/rerank`, `/v2/rerank`) +!!! note + Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled. + ## Supported Models ### Cross-encoder models [Cross-encoder](https://www.sbert.net/examples/applications/cross-encoder/README.html) (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1. ---8<-- [start:supported-score-models] +--8<-- [start:supported-cross-encoder-models] #### Text-only Models @@ -99,7 +102,7 @@ The score models is designed to compute similarity scores between two input prom vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' ``` ---8<-- [end:supported-score-models] +--8<-- [end:supported-cross-encoder-models] ### Late-interaction models