From f6ef43018b803f5ce5cecbd81ee8bfc967d5cdc7 Mon Sep 17 00:00:00 2001 From: vedant jhaveri Date: Fri, 30 Jan 2026 21:20:40 +0000 Subject: [PATCH 01/28] add scoring API --- docs/serving/openai_compatible_server.md | 84 +++ .../openai/test_generative_scores.py | 509 ++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 14 + .../openai/generative_scores/__init__.py | 2 + .../openai/generative_scores/api_router.py | 170 ++++++ .../openai/generative_scores/protocol.py | 138 +++++ .../openai/generative_scores/serving.py | 419 ++++++++++++++ 7 files changed, 1336 insertions(+) create mode 100644 tests/entrypoints/openai/test_generative_scores.py create mode 100644 vllm/entrypoints/openai/generative_scores/__init__.py create mode 100644 vllm/entrypoints/openai/generative_scores/api_router.py create mode 100644 vllm/entrypoints/openai/generative_scores/protocol.py create mode 100644 vllm/entrypoints/openai/generative_scores/serving.py diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index a7b1c18a62a5..4bad9ce4b951 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -77,6 +77,9 @@ In addition, we have the following custom APIs: - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank) - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response. - Only applicable to [cross-encoder models](../models/pooling_models.md). +- [Generative Scores API](#generative-scores-api) (`/v1/generative-scores`) + - Computes next-token probabilities for specified token IDs. + - Only applicable to [text generation models](../models/generative_models.md). ## Chat Template @@ -1147,6 +1150,87 @@ The following extra parameters are supported: --8<-- "vllm/entrypoints/pooling/score/protocol.py:rerank-extra-params" ``` +### Generative Scores API + +The Generative Scores API computes the probability of specified token IDs appearing as the next token after a given query+item prompt. This is useful for classification tasks, sentiment analysis, or any scenario where you want to score the likelihood of specific tokens without generating them. + +Unlike traditional logprobs which are limited to top-k tokens, this API: +- Returns probabilities for any specified token IDs in the vocabulary +- Supports batch scoring of multiple items against a single query +- Offers both subset softmax (normalize over label tokens only) and true model probabilities + +#### Example: Sentiment Classification + +```python +import requests + +# Token IDs for "Yes" and "No" (model-specific) +YES_TOKEN_ID = 2332 +NO_TOKEN_ID = 1223 + +response = requests.post( + "http://localhost:8000/v1/generative-scores", + json={ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "query": "<|user|>Is the following city the capital of France? 
", + "items": [ + "Paris <|assistant|>", + "London <|assistant|>", + "Berlin <|assistant|>" + ], + "label_token_ids": [YES_TOKEN_ID, NO_TOKEN_ID], + "apply_softmax": True, + "item_first": False + } +) + +# Response: +# { +# "results": [ +# {"index": 0, "token_probs": {"2332": 0.95, "1223": 0.05}}, # Paris: Yes=95% +# {"index": 1, "token_probs": {"2332": 0.10, "1223": 0.90}}, # London: No=90% +# {"index": 2, "token_probs": {"2332": 0.05, "1223": 0.95}} # Berlin: No=95% +# ] +# } +``` + +#### Request Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `model` | string | null | Model to use for scoring. | +| `query` | string \| list[int] | required | Query text or pre-tokenized token IDs. | +| `items` | list[string] \| list[list[int]] | required | Items to score against the query. | +| `label_token_ids` | list[int] | required | Token IDs to compute probabilities for. | +| `apply_softmax` | bool | true | If true, normalize over label tokens. If false, return true model probs. | +| `item_first` | bool | false | If true, prepend items to query. Otherwise append. | +| `temperature` | float | 0.0 | Temperature for logits (0.0 = greedy). | +| `top_k` | int | 0 | Top-k filtering (0 = disabled for scoring). | +| `top_p` | float | 1.0 | Top-p filtering (1.0 = disabled for scoring). | +| `add_special_tokens` | bool | true | Whether to add special tokens when tokenizing. | + +#### Probability Computation + +The endpoint computes probabilities from the model's next-token distribution: + +1. **Prompt Construction**: For each item, build `prompt = query + item` (or `item + query` if `item_first=True`) +2. **Forward Pass**: Run the model to get next-token logits +3. **Probability Extraction**: Get logprobs for specified `label_token_ids` + +The `apply_softmax` parameter controls normalization: + +- **`apply_softmax=True`** (default): Softmax over only the label tokens + ``` + P(token_i | prompt) = exp(logit_i) / Σ exp(logit_j) for j in label_token_ids + ``` + Probabilities sum to 1 over the label tokens. + +- **`apply_softmax=False`**: True model probabilities + ``` + P(token_i | prompt) = exp(logprob_i) # logprob is already normalized over full vocab + ``` + Probabilities are the actual model confidence over the full vocabulary. + ## Ray Serve LLM Ray Serve LLM enables scalable, production-grade serving of the vLLM engine. It integrates tightly with vLLM and extends it with features such as auto-scaling, load balancing, and back-pressure. diff --git a/tests/entrypoints/openai/test_generative_scores.py b/tests/entrypoints/openai/test_generative_scores.py new file mode 100644 index 000000000000..8bee13e0e0ba --- /dev/null +++ b/tests/entrypoints/openai/test_generative_scores.py @@ -0,0 +1,509 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for the Generative Scores API. + +These tests verify: +1. Request/Response protocol models +2. Probability computation (apply_softmax=True and apply_softmax=False) +3. Input validation +4. 
Error handling +""" + +import math +from dataclasses import dataclass, field +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from vllm.config.multimodal import MultiModalConfig +from vllm.entrypoints.openai.engine.protocol import ErrorResponse +from vllm.entrypoints.openai.generative_scores.protocol import ( + GenerativeScoreItemResult, + GenerativeScoreRequest, + GenerativeScoreResponse, +) +from vllm.entrypoints.openai.generative_scores.serving import ( + OpenAIServingGenerativeScores, +) +from vllm.entrypoints.openai.models.protocol import BaseModelPath +from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.logprobs import Logprob +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.tokenizers import get_tokenizer +from vllm.v1.engine.async_llm import AsyncLLM + +MODEL_NAME = "openai-community/gpt2" +BASE_MODEL_PATHS = [ + BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME), +] + + +@dataclass +class MockHFConfig: + model_type: str = "any" + + +@dataclass +class MockModelConfig: + task = "generate" + runner_type = "generate" + tokenizer = MODEL_NAME + trust_remote_code = False + tokenizer_mode = "auto" + max_model_len = 100 + tokenizer_revision = None + multimodal_config = MultiModalConfig() + hf_config = MockHFConfig() + logits_processor_pattern = None + logits_processors: list[str] | None = None + diff_sampling_param: dict | None = None + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + encoder_config = None + generation_config: str = "auto" + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + skip_tokenizer_init = False + vocab_size = 50257 # GPT-2 vocab size + + def get_diff_sampling_param(self): + return self.diff_sampling_param or {} + + def get_vocab_size(self): + return self.vocab_size + + +# ============================================================================ +# Protocol Tests +# ============================================================================ + + +class TestGenerativeScoreProtocol: + """Tests for protocol models.""" + + def test_request_basic_fields(self): + """Test request with basic required fields.""" + request = GenerativeScoreRequest( + query="Is this city the capital?", + items=["Paris", "London"], + label_token_ids=[1234, 5678], + ) + assert request.query == "Is this city the capital?" 
+ assert request.items == ["Paris", "London"] + assert request.label_token_ids == [1234, 5678] + assert request.apply_softmax is True # default + assert request.item_first is False # default + assert request.temperature == 0.0 # default for scoring + assert request.top_k == 0 # default (disabled) + assert request.top_p == 1.0 # default (disabled) + + def test_request_with_pretokenized_input(self): + """Test request with pre-tokenized token IDs.""" + request = GenerativeScoreRequest( + query=[100, 200, 300], + items=[[400, 500], [600, 700, 800]], + label_token_ids=[1234, 5678], + ) + assert request.query == [100, 200, 300] + assert request.items == [[400, 500], [600, 700, 800]] + + def test_request_custom_options(self): + """Test request with custom options.""" + request = GenerativeScoreRequest( + query="Test query", + items=["Item1"], + label_token_ids=[100], + apply_softmax=False, + item_first=True, + temperature=0.5, + add_special_tokens=False, + ) + assert request.apply_softmax is False + assert request.item_first is True + assert request.temperature == 0.5 + assert request.add_special_tokens is False + + def test_response_structure(self): + """Test response model structure.""" + response = GenerativeScoreResponse( + model="test-model", + results=[ + GenerativeScoreItemResult( + index=0, + token_probs={"1234": 0.7, "5678": 0.3}, + ) + ], + usage={"prompt_tokens": 10, "total_tokens": 11, "completion_tokens": 1}, + ) + assert response.object == "generative_score" + assert response.model == "test-model" + assert len(response.results) == 1 + assert response.results[0].token_probs["1234"] == 0.7 + + +# ============================================================================ +# Probability Computation Tests +# ============================================================================ + + +class TestProbabilityComputation: + """Tests for probability computation logic.""" + + def test_compute_probabilities_with_softmax(self): + """Test subset softmax normalization (apply_softmax=True). + + When apply_softmax=True, we normalize only over the label tokens. + softmax([logprob_a, logprob_b]) should sum to 1. + """ + serving = OpenAIServingGenerativeScores.__new__( + OpenAIServingGenerativeScores + ) + + # Example logprobs (log probabilities from the model) + # These are already log(softmax(logits)) values + label_logprobs = { + 100: -1.0, # ~0.368 before normalization + 200: -2.0, # ~0.135 before normalization + } + + probs = serving._compute_probabilities(label_logprobs, apply_softmax=True) + + # With subset softmax, probs should sum to 1 + total = sum(probs.values()) + assert abs(total - 1.0) < 1e-6, f"Probabilities should sum to 1, got {total}" + + # Check relative ordering is preserved + assert probs[100] > probs[200], "Higher logprob should have higher probability" + + # Verify the math: softmax([−1, −2]) = [e^−1/(e^−1+e^−2), e^−2/(e^−1+e^−2)] + exp_neg1 = math.exp(-1) + exp_neg2 = math.exp(-2) + expected_prob_100 = exp_neg1 / (exp_neg1 + exp_neg2) + expected_prob_200 = exp_neg2 / (exp_neg1 + exp_neg2) + assert abs(probs[100] - expected_prob_100) < 1e-6 + assert abs(probs[200] - expected_prob_200) < 1e-6 + + def test_compute_probabilities_without_softmax(self): + """Test true model probabilities (apply_softmax=False). + + When apply_softmax=False, we return exp(logprob) which gives the + true model probability for each token over the full vocab. 
+ """ + serving = OpenAIServingGenerativeScores.__new__( + OpenAIServingGenerativeScores + ) + + # Example logprobs (already normalized over full vocab by the model) + label_logprobs = { + 100: -1.0, # exp(-1) ≈ 0.368 + 200: -2.0, # exp(-2) ≈ 0.135 + } + + probs = serving._compute_probabilities(label_logprobs, apply_softmax=False) + + # These should NOT sum to 1 (they're just exp of the logprobs) + expected_prob_100 = math.exp(-1.0) + expected_prob_200 = math.exp(-2.0) + + assert abs(probs[100] - expected_prob_100) < 1e-6 + assert abs(probs[200] - expected_prob_200) < 1e-6 + + # These probabilities don't sum to 1 (unless we happened to pick + # the only tokens with probability mass) + total = sum(probs.values()) + assert total < 1.0, "True probs over subset shouldn't sum to 1" + + def test_compute_probabilities_numerical_stability(self): + """Test that computation is numerically stable with extreme values.""" + serving = OpenAIServingGenerativeScores.__new__( + OpenAIServingGenerativeScores + ) + + # Very negative logprobs (very unlikely tokens) + label_logprobs = { + 100: -100.0, + 200: -100.5, + } + + # Should not overflow/underflow with subset softmax + probs = serving._compute_probabilities(label_logprobs, apply_softmax=True) + total = sum(probs.values()) + assert abs(total - 1.0) < 1e-6 + assert probs[100] > probs[200] + + +# ============================================================================ +# Mock Engine Tests +# ============================================================================ + + +def _create_mock_engine(): + """Create a mock AsyncLLM engine.""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + return mock_engine + + +def _create_serving(mock_engine) -> OpenAIServingGenerativeScores: + """Create an OpenAIServingGenerativeScores instance with mocks.""" + models = OpenAIServingModels( + engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + ) + return OpenAIServingGenerativeScores( + mock_engine, + models, + request_logger=None, + ) + + +def _create_mock_request_output( + logprobs_dict: dict[int, float], + token_id: int = 100, +) -> RequestOutput: + """Create a mock RequestOutput with specified logprobs.""" + # Convert to Logprob objects + logprobs_with_objs = { + tid: Logprob(logprob=lp, rank=i + 1) + for i, (tid, lp) in enumerate(logprobs_dict.items()) + } + + completion_output = CompletionOutput( + index=0, + text="", + token_ids=[token_id], + cumulative_logprob=-1.0, + logprobs=[logprobs_with_objs], + finish_reason="length", + ) + + return RequestOutput( + request_id="test-request", + prompt="test prompt", + prompt_token_ids=[1, 2, 3], + outputs=[completion_output], + finished=True, + ) + + +# ============================================================================ +# Validation Tests +# ============================================================================ + + +class TestValidation: + """Tests for input validation.""" + + @pytest.mark.asyncio + async def test_invalid_token_id_out_of_range(self): + """Test that out-of-range token IDs return an error.""" + mock_engine = _create_mock_engine() + serving = _create_serving(mock_engine) + + request = GenerativeScoreRequest( + model=MODEL_NAME, + query="test query", + items=["item1"], + label_token_ids=[999999], # Way beyond vocab size + ) + + result = await 
serving.create_generative_score(request, None) + + assert isinstance(result, ErrorResponse) + assert "out of vocabulary range" in result.error.message.lower() + + @pytest.mark.asyncio + async def test_empty_label_token_ids(self): + """Test that empty label_token_ids returns an error.""" + mock_engine = _create_mock_engine() + serving = _create_serving(mock_engine) + + request = GenerativeScoreRequest( + model=MODEL_NAME, + query="test query", + items=["item1"], + label_token_ids=[], # Empty + ) + + result = await serving.create_generative_score(request, None) + + assert isinstance(result, ErrorResponse) + assert "at least one token" in result.error.message.lower() + + @pytest.mark.asyncio + async def test_empty_items(self): + """Test that empty items list returns an error.""" + mock_engine = _create_mock_engine() + serving = _create_serving(mock_engine) + + request = GenerativeScoreRequest( + model=MODEL_NAME, + query="test query", + items=[], # Empty + label_token_ids=[100], + ) + + result = await serving.create_generative_score(request, None) + + assert isinstance(result, ErrorResponse) + assert "at least one item" in result.error.message.lower() + + +# ============================================================================ +# Integration-style Tests (with mocked engine) +# ============================================================================ + + +class TestGenerativeScoreGeneration: + """Tests for the full generation flow with mocked engine.""" + + @pytest.mark.asyncio + async def test_successful_generation(self): + """Test successful score generation with mocked engine output.""" + mock_engine = _create_mock_engine() + serving = _create_serving(mock_engine) + + # Set up the mock to return logprobs for our label tokens + label_token_ids = [1234, 5678] + mock_logprobs = { + 1234: -0.5, # Higher probability + 5678: -2.0, # Lower probability + # Include some other tokens that would be in full vocab + 100: -3.0, + 200: -4.0, + } + + mock_output = _create_mock_request_output(mock_logprobs) + + async def mock_generate(*args, **kwargs): + yield mock_output + + mock_engine.generate = mock_generate + + request = GenerativeScoreRequest( + model=MODEL_NAME, + query="Is Paris the capital of France?", + items=["Yes", "No"], + label_token_ids=label_token_ids, + apply_softmax=True, + ) + + result = await serving.create_generative_score(request, None) + + # Should succeed + assert isinstance(result, GenerativeScoreResponse) + assert len(result.results) == 2 # One per item + + # Check probabilities are in valid range + for item_result in result.results: + for prob in item_result.token_probs.values(): + assert 0.0 <= prob <= 1.0 + + @pytest.mark.asyncio + async def test_item_first_ordering(self): + """Test that item_first=True prepends item to query.""" + mock_engine = _create_mock_engine() + serving = _create_serving(mock_engine) + + # Track what prompts are built + built_prompts = [] + + async def mock_tokenizer_call(text, **kwargs): + result = MagicMock() + result.input_ids = [ord(c) for c in text[:5]] # Simple mock + return result + + # We can verify the prompt building by checking the _build_prompts method + tokenizer = MagicMock() + tokenizer.return_value = MagicMock(input_ids=[1, 2, 3]) + + request = GenerativeScoreRequest( + query=[100, 101, 102], # Pre-tokenized + items=[[200, 201], [300, 301]], # Pre-tokenized + label_token_ids=[500], + item_first=False, + ) + + # Build prompts and check ordering + engine_prompts, _ = await serving._build_prompts(request, tokenizer) + + # With 
item_first=False: query + item + assert engine_prompts[0]["prompt_token_ids"] == [100, 101, 102, 200, 201] + assert engine_prompts[1]["prompt_token_ids"] == [100, 101, 102, 300, 301] + + # Now test with item_first=True + request.item_first = True + engine_prompts, _ = await serving._build_prompts(request, tokenizer) + + # With item_first=True: item + query + assert engine_prompts[0]["prompt_token_ids"] == [200, 201, 100, 101, 102] + assert engine_prompts[1]["prompt_token_ids"] == [300, 301, 100, 101, 102] + + +# ============================================================================ +# Math Verification Tests +# ============================================================================ + + +class TestMathVerification: + """Detailed tests to verify the probability math is correct.""" + + def test_softmax_over_subset(self): + """Verify: apply_softmax=True gives softmax over subset.""" + serving = OpenAIServingGenerativeScores.__new__( + OpenAIServingGenerativeScores + ) + + # logits (before log) for full vocab might be [1.0, 2.0, 3.0, ...] + # After softmax over full vocab, logprobs become log(softmax(logits)) + # For our test, let's say: + # - Token A has logprob -0.5 (exp(-0.5) ≈ 0.606 true prob) + # - Token B has logprob -1.5 (exp(-1.5) ≈ 0.223 true prob) + + label_logprobs = {10: -0.5, 20: -1.5} + + # With apply_softmax=True, we do softmax over just [10, 20] + # softmax([-0.5, -1.5]) = [exp(-0.5)/(exp(-0.5)+exp(-1.5)), + # exp(-1.5)/(exp(-0.5)+exp(-1.5))] + probs = serving._compute_probabilities(label_logprobs, apply_softmax=True) + + exp_a = math.exp(-0.5) + exp_b = math.exp(-1.5) + denom = exp_a + exp_b + + expected_a = exp_a / denom + expected_b = exp_b / denom + + assert abs(probs[10] - expected_a) < 1e-9 + assert abs(probs[20] - expected_b) < 1e-9 + assert abs(sum(probs.values()) - 1.0) < 1e-9 + + def test_true_probs_without_softmax(self): + """Verify: apply_softmax=False gives exp(logprob) = true model prob.""" + serving = OpenAIServingGenerativeScores.__new__( + OpenAIServingGenerativeScores + ) + + # These logprobs come from log(softmax(logits)) computed by the model + # So exp(logprob) gives the true probability + label_logprobs = {10: -0.5, 20: -1.5} + + probs = serving._compute_probabilities(label_logprobs, apply_softmax=False) + + # Just exp of the logprobs + expected_a = math.exp(-0.5) + expected_b = math.exp(-1.5) + + assert abs(probs[10] - expected_a) < 1e-9 + assert abs(probs[20] - expected_b) < 1e-9 + + # They don't sum to 1 (unless we selected all tokens) + assert sum(probs.values()) < 1.0 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index a1ee3607a051..a0b34c0f0fa6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -189,6 +189,12 @@ def build_app(args: Namespace, supported_tasks: tuple["SupportedTask", ...]) -> register_generate_api_routers(app) + from vllm.entrypoints.openai.generative_scores.api_router import ( + register_generative_scores_api_routers, + ) + + register_generative_scores_api_routers(app) + if "transcription" in supported_tasks: from vllm.entrypoints.openai.translations.api_router import ( attach_router as register_translations_api_router, @@ -317,6 +323,14 @@ async def init_app_state( engine_client, state, args, request_logger, supported_tasks ) + from vllm.entrypoints.openai.generative_scores.api_router import ( + init_generative_scores_state, + ) + + init_generative_scores_state( + 
engine_client, state, args, request_logger, supported_tasks + ) + if "transcription" in supported_tasks: from vllm.entrypoints.openai.translations.api_router import ( init_transcription_state, diff --git a/vllm/entrypoints/openai/generative_scores/__init__.py b/vllm/entrypoints/openai/generative_scores/__init__.py new file mode 100644 index 000000000000..208f01a7cb5e --- /dev/null +++ b/vllm/entrypoints/openai/generative_scores/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/entrypoints/openai/generative_scores/api_router.py b/vllm/entrypoints/openai/generative_scores/api_router.py new file mode 100644 index 000000000000..55773f25a461 --- /dev/null +++ b/vllm/entrypoints/openai/generative_scores/api_router.py @@ -0,0 +1,170 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""API router for the Generative Scores endpoint. + +This module defines the FastAPI routes for the /v1/generative-scores endpoint. +""" + +from http import HTTPStatus +from typing import TYPE_CHECKING + +from fastapi import APIRouter, Depends, Request +from fastapi.responses import JSONResponse +from typing_extensions import assert_never + +from vllm.entrypoints.openai.engine.protocol import ErrorResponse +from vllm.entrypoints.openai.generative_scores.protocol import ( + GenerativeScoreRequest, + GenerativeScoreResponse, +) +from vllm.entrypoints.openai.utils import validate_json_request +from vllm.entrypoints.utils import load_aware_call, with_cancellation +from vllm.logger import init_logger + +if TYPE_CHECKING: + from argparse import Namespace + + from starlette.datastructures import State + + from vllm.engine.protocol import EngineClient + from vllm.entrypoints.logger import RequestLogger + from vllm.tasks import SupportedTask + + +router = APIRouter() + +logger = init_logger(__name__) + + +def generative_scores(request: Request): + """Get the generative scores handler from app state.""" + return request.app.state.openai_serving_generative_scores + + +@router.post( + "/v1/generative-scores", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def create_generative_score( + request: GenerativeScoreRequest, + raw_request: Request, +): + """Compute generative scores for the given query and items. + + This endpoint scores the probability of specified token IDs appearing after + the given query and item are appended together. For example: + + query = "<|user|>Is the following city the capital of France? " + items = ["Paris <|assistant|>", "London <|assistant|>", "Berlin <|assistant|>"] + label_token_ids = [2332, 1223] # Token IDs for "Yes" and "No" + item_first = False + + This would pass the following prompts to the model: + - "<|user|>Is the following city the capital of France? Paris <|assistant|>" + - "<|user|>Is the following city the capital of France? London <|assistant|>" + - "<|user|>Is the following city the capital of France? Berlin <|assistant|>" + + The API would return the probabilities of the model producing "Yes" and "No" + as the next token for each prompt. 
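+
+    An illustrative response body for the example above (probability values are
+    made up, shown only to indicate the shape of GenerativeScoreResponse):
+
+        {"results": [{"index": 0, "token_probs": {"2332": 0.95, "1223": 0.05}},
+                     {"index": 1, "token_probs": {"2332": 0.10, "1223": 0.90}},
+                     {"index": 2, "token_probs": {"2332": 0.05, "1223": 0.95}}]}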
+ + Args: + request: The GenerativeScoreRequest containing: + - model: The model to use (optional) + - query: The query text or pre-tokenized token IDs + - items: List of item texts or pre-tokenized token IDs + - label_token_ids: List of token IDs to compute probabilities for + - apply_softmax: Whether to normalize over only label tokens (default: True) + - item_first: Whether to prepend items to query (default: False) + + Returns: + GenerativeScoreResponse containing probabilities for each item. + + Raises: + 400 Bad Request: If label_token_ids are out of vocabulary range. + 500 Internal Server Error: If an internal error occurs. + """ + handler = generative_scores(raw_request) + if handler is None: + base_server = raw_request.app.state.openai_serving_tokenization + return base_server.create_error_response( + message="The model does not support Generative Scores API" + ) + + try: + generator = await handler.create_generative_score(request, raw_request) + except Exception as e: + return handler.create_error_response(e) + + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + elif isinstance(generator, GenerativeScoreResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post( + "/generative-scores", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def create_generative_score_unversioned( + request: GenerativeScoreRequest, + raw_request: Request, +): + """Unversioned alias for the generative-scores endpoint. + + This provides the same functionality as /v1/generative-scores. + See create_generative_score for full documentation. + """ + return await create_generative_score(request, raw_request) + + +def register_generative_scores_api_routers(app): + """Register the generative scores API router with the app.""" + app.include_router(router) + + +def init_generative_scores_state( + engine_client: "EngineClient", + state: "State", + args: "Namespace", + request_logger: "RequestLogger | None", + supported_tasks: tuple["SupportedTask", ...], +): + """Initialize the generative scores serving state. + + Args: + engine_client: The engine client for model inference. + state: The application state to store the handler. + args: Command line arguments. + request_logger: Logger for request logging. + supported_tasks: Tuple of supported tasks. + """ + from vllm.entrypoints.openai.generative_scores.serving import ( + OpenAIServingGenerativeScores, + ) + + # Only initialize for generative models + if "generate" in supported_tasks: + state.openai_serving_generative_scores = OpenAIServingGenerativeScores( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + log_error_stack=args.log_error_stack, + ) + else: + state.openai_serving_generative_scores = None diff --git a/vllm/entrypoints/openai/generative_scores/protocol.py b/vllm/entrypoints/openai/generative_scores/protocol.py new file mode 100644 index 000000000000..e2feb1cca085 --- /dev/null +++ b/vllm/entrypoints/openai/generative_scores/protocol.py @@ -0,0 +1,138 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Protocol definitions for the Generative Scores API. 
+ +This module defines the request and response models for the /v1/generative-scores +endpoint, which computes the probability of specified token IDs appearing as the +next token after a given query+item prompt. +""" + +import time +from typing import Literal + +from pydantic import Field + +from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo +from vllm.utils import random_uuid + + +class GenerativeScoreRequest(OpenAIBaseModel): + """Request for computing generative scores. + + This endpoint scores the probability of specified token IDs appearing after + the given query and item are appended together. For example: + + query = "<|user|>Is the following city the capital of France? " + items = ["Paris <|assistant|>", "London <|assistant|>", "Berlin <|assistant|>"] + label_token_ids = [2332, 1223] # Token IDs for "Yes" and "No" + item_first = False + + This would pass the following prompts to the model: + "<|user|>Is the following city the capital of France? Paris <|assistant|>" + "<|user|>Is the following city the capital of France? London <|assistant|>" + "<|user|>Is the following city the capital of France? Berlin <|assistant|>" + + The API would then return the probabilities of the model producing "Yes" + and "No" as the next token. + + Attributes: + model: The model to use for scoring. Optional, follows existing patterns. + query: The query text or pre-tokenized query token IDs. + items: The item text(s) or pre-tokenized item token IDs. + label_token_ids: List of token IDs to compute probabilities for. + apply_softmax: Whether to normalize probabilities using softmax over only + the label_token_ids (True) or return true model probabilities over + the full vocab for those ids (False). + item_first: If True, prepend items to query. Otherwise append items to query. + temperature: Temperature for logits. Default 0.0 for scoring (greedy). + top_k: Top-k filtering. Default 0 (disabled) for scoring. + top_p: Top-p filtering. Default 1.0 (disabled) for scoring. + add_special_tokens: Whether to add special tokens when tokenizing. + """ + + model: str | None = None + query: str | list[int] = Field( + ..., + description="The query text or pre-tokenized query token IDs.", + ) + items: list[str] | list[list[int]] = Field( + ..., + description="List of item texts or pre-tokenized item token IDs.", + ) + label_token_ids: list[int] = Field( + ..., + description="List of token IDs to compute probabilities for.", + ) + apply_softmax: bool = Field( + default=True, + description=( + "If True, normalize probabilities using softmax over only the " + "label_token_ids. If False, return the true model probabilities " + "over the full vocab for those ids." + ), + ) + item_first: bool = Field( + default=False, + description="If True, prepend items to query. Otherwise append items to query.", + ) + temperature: float | None = Field( + default=0.0, + description="Temperature for logits. Default 0.0 for scoring.", + ) + top_k: int | None = Field( + default=0, + description="Top-k filtering. Default 0 (disabled) for scoring.", + ) + top_p: float | None = Field( + default=1.0, + description="Top-p filtering. Default 1.0 (disabled) for scoring.", + ) + add_special_tokens: bool = Field( + default=True, + description="Whether to add special tokens when tokenizing.", + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0)." 
+ ), + ) + request_id: str = Field( + default_factory=random_uuid, + description="The request_id related to this request.", + ) + + +class GenerativeScoreItemResult(OpenAIBaseModel): + """Result for a single item in the generative scores response. + + Attributes: + index: The index of this item in the input items list. + token_probs: Dictionary mapping token IDs (as strings) to their probabilities. + """ + + index: int + token_probs: dict[str, float] = Field( + description="Mapping of token ID (as string) to probability." + ) + + +class GenerativeScoreResponse(OpenAIBaseModel): + """Response from the generative scores endpoint. + + Attributes: + id: Unique identifier for this response. + object: Type of object, always "generative_score". + created: Unix timestamp of when the response was created. + model: The model used for scoring. + results: List of scoring results, one per input item. + usage: Token usage information. + """ + + id: str = Field(default_factory=lambda: f"genscore-{random_uuid()}") + object: Literal["generative_score"] = "generative_score" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + results: list[GenerativeScoreItemResult] + usage: UsageInfo diff --git a/vllm/entrypoints/openai/generative_scores/serving.py b/vllm/entrypoints/openai/generative_scores/serving.py new file mode 100644 index 000000000000..5dce294510ff --- /dev/null +++ b/vllm/entrypoints/openai/generative_scores/serving.py @@ -0,0 +1,419 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Serving class for the Generative Scores API. + +This module implements the OpenAIServingGenerativeScores class which handles +requests to compute the probability of specified token IDs appearing as the +next token after a given query+item prompt. +""" + +import asyncio +import math +import time +from collections.abc import AsyncGenerator, Mapping + +from fastapi import Request + +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.engine.protocol import ErrorResponse, UsageInfo +from vllm.entrypoints.openai.engine.serving import OpenAIServing +from vllm.entrypoints.openai.generative_scores.protocol import ( + GenerativeScoreItemResult, + GenerativeScoreRequest, + GenerativeScoreResponse, +) +from vllm.entrypoints.openai.models.serving import OpenAIServingModels +from vllm.inputs.data import TokensPrompt +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams +from vllm.utils.async_utils import merge_async_iterators + +logger = init_logger(__name__) + + +class OpenAIServingGenerativeScores(OpenAIServing): + """Serving class for the Generative Scores API. + + This class handles computing the probability of specified token IDs + appearing as the next token after concatenating query and item prompts. + + The key operation is: + 1. For each item, build a prompt: query + item (or item + query if item_first) + 2. Run a forward pass to get the next token distribution + 3. Extract probabilities for the specified label_token_ids + 4. 
Normalize either over the full vocab (apply_softmax=False) or + over just the label_token_ids (apply_softmax=True) + """ + + def __init__( + self, + engine_client: EngineClient, + models: OpenAIServingModels, + *, + request_logger: RequestLogger | None, + log_error_stack: bool = False, + ) -> None: + super().__init__( + engine_client=engine_client, + models=models, + request_logger=request_logger, + log_error_stack=log_error_stack, + ) + + async def create_generative_score( + self, + request: GenerativeScoreRequest, + raw_request: Request | None = None, + ) -> GenerativeScoreResponse | ErrorResponse: + """Create generative scores for the given request. + + Args: + request: The GenerativeScoreRequest containing query, items, and + label_token_ids. + raw_request: The raw FastAPI request object. + + Returns: + GenerativeScoreResponse with probabilities for each item, or + ErrorResponse if an error occurred. + """ + # Check model + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + # Check if engine is alive + if self.engine_client.errored: + raise self.engine_client.dead_error + + # Get tokenizer + tokenizer = self.renderer.tokenizer + if tokenizer is None: + return self.create_error_response( + "Tokenizer not available. Cannot process generative score request." + ) + + # Validate label_token_ids + vocab_size = self.model_config.get_vocab_size() + for token_id in request.label_token_ids: + if token_id < 0 or token_id >= vocab_size: + return self.create_error_response( + f"label_token_id {token_id} is out of vocabulary range " + f"[0, {vocab_size}). Please provide valid token IDs." + ) + + if len(request.label_token_ids) == 0: + return self.create_error_response( + "label_token_ids must contain at least one token ID." + ) + + # Validate items + if len(request.items) == 0: + return self.create_error_response( + "items must contain at least one item." 
+ ) + + try: + lora_request = self._maybe_get_adapters(request) + except (ValueError, TypeError, RuntimeError) as e: + logger.exception("Error preparing request components") + return self.create_error_response(e) + + request_id = f"genscore-{self._base_request_id(raw_request, request.request_id)}" + created_time = int(time.time()) + + # Build prompts for each item + try: + engine_prompts, prompt_token_counts = await self._build_prompts( + request, tokenizer + ) + except (ValueError, TypeError) as e: + logger.exception("Error building prompts") + return self.create_error_response(e) + + # Create sampling params for scoring + # We use max_tokens=1 with logprobs=-1 to get full vocab logprobs + # for the next token distribution + sampling_params = SamplingParams( + max_tokens=1, + temperature=request.temperature if request.temperature else 0.0, + top_k=request.top_k if request.top_k is not None else 0, + top_p=request.top_p if request.top_p is not None else 1.0, + logprobs=-1, # Get all vocab logprobs + n=1, + ) + + # Get trace headers + trace_headers = ( + None + if raw_request is None + else await self._get_trace_headers(raw_request.headers) + ) + + # Schedule requests for all prompts + generators: list[AsyncGenerator[RequestOutput, None]] = [] + for i, engine_prompt in enumerate(engine_prompts): + request_id_item = f"{request_id}-{i}" + + self._log_inputs( + request_id_item, + engine_prompt, + params=sampling_params, + lora_request=lora_request, + ) + + generator = self.engine_client.generate( + engine_prompt, + sampling_params, + request_id_item, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + generators.append(generator) + + # Collect results + result_generator = merge_async_iterators(*generators) + results: list[RequestOutput | None] = [None] * len(engine_prompts) + + try: + async for i, res in result_generator: + results[i] = res + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except Exception as e: + logger.exception("Error during generation") + return self.create_error_response(e) + + # Process results to extract label token probabilities + item_results: list[GenerativeScoreItemResult] = [] + total_prompt_tokens = 0 + total_completion_tokens = 0 + + for i, result in enumerate(results): + if result is None: + return self.create_error_response( + f"Failed to generate result for item {i}" + ) + + # Check for errors + if result.outputs and result.outputs[0].finish_reason == "error": + return self.create_error_response( + f"Generation error for item {i}" + ) + + # Get logprobs from the generated token + if not result.outputs or len(result.outputs) == 0: + return self.create_error_response( + f"No output generated for item {i}" + ) + + output = result.outputs[0] + if output.logprobs is None or len(output.logprobs) == 0: + return self.create_error_response( + f"No logprobs available for item {i}. " + "This might indicate an issue with logprobs configuration." 
+ ) + + # The logprobs dict maps token_id -> Logprob object + # For logprobs=-1, this contains all vocab tokens + logprobs_dict = output.logprobs[0] + + # Extract logprobs for label tokens + label_logprobs: dict[int, float] = {} + missing_tokens = [] + for token_id in request.label_token_ids: + if token_id in logprobs_dict: + label_logprobs[token_id] = logprobs_dict[token_id].logprob + else: + missing_tokens.append(token_id) + + if missing_tokens: + return self.create_error_response( + f"Token IDs {missing_tokens} not found in logprobs for item {i}. " + "This might indicate the tokens are outside the model's vocabulary." + ) + + # Compute probabilities based on apply_softmax setting + token_probs = self._compute_probabilities( + label_logprobs, + apply_softmax=request.apply_softmax, + ) + + item_results.append( + GenerativeScoreItemResult( + index=i, + token_probs={str(k): v for k, v in token_probs.items()}, + ) + ) + + # Update token counts + total_prompt_tokens += prompt_token_counts[i] + total_completion_tokens += len(output.token_ids) + + # Build response + model_name = self.models.model_name(lora_request) + response = GenerativeScoreResponse( + id=request_id, + created=created_time, + model=model_name, + results=item_results, + usage=UsageInfo( + prompt_tokens=total_prompt_tokens, + total_tokens=total_prompt_tokens + total_completion_tokens, + completion_tokens=total_completion_tokens, + ), + ) + + return response + + async def _build_prompts( + self, + request: GenerativeScoreRequest, + tokenizer, + ) -> tuple[list[TokensPrompt], list[int]]: + """Build prompts by concatenating query and items. + + Args: + request: The request containing query, items, and settings. + tokenizer: The tokenizer to use. + + Returns: + Tuple of (list of TokensPrompt, list of prompt token counts). + """ + # Tokenize query if it's a string + if isinstance(request.query, str): + async_tokenizer = self._get_async_tokenizer(tokenizer) + query_result = await async_tokenizer( + request.query, + add_special_tokens=request.add_special_tokens, + ) + query_token_ids = query_result.input_ids + else: + query_token_ids = request.query + + engine_prompts: list[TokensPrompt] = [] + prompt_token_counts: list[int] = [] + + for item in request.items: + # Tokenize item if it's a string + if isinstance(item, str): + async_tokenizer = self._get_async_tokenizer(tokenizer) + # Don't add special tokens for items to avoid duplicate BOS/EOS + item_result = await async_tokenizer( + item, + add_special_tokens=False, + ) + item_token_ids = item_result.input_ids + else: + item_token_ids = item + + # Concatenate based on item_first setting + if request.item_first: + prompt_token_ids = item_token_ids + query_token_ids + else: + prompt_token_ids = query_token_ids + item_token_ids + + engine_prompts.append( + TokensPrompt(prompt_token_ids=prompt_token_ids) + ) + prompt_token_counts.append(len(prompt_token_ids)) + + return engine_prompts, prompt_token_counts + + def _compute_probabilities( + self, + label_logprobs: dict[int, float], + apply_softmax: bool, + ) -> dict[int, float]: + """Compute probabilities from logprobs. + + Args: + label_logprobs: Dictionary mapping token_id to logprob. + apply_softmax: If True, normalize over only the label tokens. + If False, return true model probabilities (exp(logprob)). + + Returns: + Dictionary mapping token_id to probability. 
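+
+        Example (illustrative logprobs, the same values used in the math tests):
+            _compute_probabilities({10: -0.5, 20: -1.5}, apply_softmax=True)
+                -> {10: 0.7311, 20: 0.2689}  # subset softmax, sums to 1
+            _compute_probabilities({10: -0.5, 20: -1.5}, apply_softmax=False)
+                -> {10: 0.6065, 20: 0.2231}  # plain exp(logprob), true model probs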
+ """ + if apply_softmax: + # Normalize over only the label tokens (subset softmax) + # softmax(gathered_logits) over the subset + logprobs_list = list(label_logprobs.values()) + max_logprob = max(logprobs_list) + + # Compute exp(logprob - max) for numerical stability + exp_values = { + token_id: math.exp(logprob - max_logprob) + for token_id, logprob in label_logprobs.items() + } + sum_exp = sum(exp_values.values()) + + return { + token_id: exp_val / sum_exp + for token_id, exp_val in exp_values.items() + } + else: + # Return true model probabilities + # Since logprobs are already log(softmax(logits)), + # we just need to exp() them + return { + token_id: math.exp(logprob) + for token_id, logprob in label_logprobs.items() + } + + async def _get_trace_headers( + self, + headers: Mapping[str, str], + ) -> Mapping[str, str] | None: + """Extract trace headers from request headers.""" + from vllm.tracing import ( + contains_trace_headers, + extract_trace_headers, + log_tracing_disabled_warning, + ) + + if not contains_trace_headers(headers): + return None + + if not await self.engine_client.is_tracing_enabled(): + log_tracing_disabled_warning() + return None + + return extract_trace_headers(headers) + + def _base_request_id( + self, + raw_request: Request | None, + request_id: str | None, + ) -> str: + """Get base request ID from raw request or generate one.""" + if request_id: + return request_id + if raw_request: + return getattr(raw_request.state, "request_id", None) or \ + str(id(raw_request)) + from vllm.utils import random_uuid + return random_uuid() + + def _log_inputs( + self, + request_id: str, + prompt: TokensPrompt, + params: SamplingParams, + lora_request: LoRARequest | None, + ) -> None: + """Log request inputs.""" + if self.request_logger is None: + return + + self.request_logger.log_inputs( + request_id=request_id, + prompt=str(prompt.get("prompt_token_ids", [])[:10]) + "...", + prompt_token_ids=None, + prompt_embeds=None, + params=params, + lora_request=lora_request, + ) From c4d13da49df26b4e858fe209e2cb220ce2e9d97c Mon Sep 17 00:00:00 2001 From: vedant jhaveri Date: Fri, 30 Jan 2026 23:56:01 +0000 Subject: [PATCH 02/28] only compute necessary tokens for scoring rather than all the tokens which made the API super slow for large batch sizes --- .../openai/test_generative_scores.py | 524 ++++++++---------- .../openai/generative_scores/api_router.py | 22 - .../openai/generative_scores/serving.py | 12 +- vllm/sampling_params.py | 6 + vllm/v1/sample/metadata.py | 5 + vllm/v1/sample/sampler.py | 72 ++- vllm/v1/worker/gpu_input_batch.py | 19 + vllm/v1/worker/gpu_model_runner.py | 1 + 8 files changed, 335 insertions(+), 326 deletions(-) diff --git a/tests/entrypoints/openai/test_generative_scores.py b/tests/entrypoints/openai/test_generative_scores.py index 8bee13e0e0ba..6a48a54e35f2 100644 --- a/tests/entrypoints/openai/test_generative_scores.py +++ b/tests/entrypoints/openai/test_generative_scores.py @@ -12,7 +12,7 @@ import math from dataclasses import dataclass, field from typing import Any -from unittest.mock import AsyncMock, MagicMock +from unittest.mock import MagicMock import pytest @@ -74,166 +74,7 @@ def get_vocab_size(self): # ============================================================================ -# Protocol Tests -# ============================================================================ - - -class TestGenerativeScoreProtocol: - """Tests for protocol models.""" - - def test_request_basic_fields(self): - """Test request with basic required fields.""" - request = 
GenerativeScoreRequest( - query="Is this city the capital?", - items=["Paris", "London"], - label_token_ids=[1234, 5678], - ) - assert request.query == "Is this city the capital?" - assert request.items == ["Paris", "London"] - assert request.label_token_ids == [1234, 5678] - assert request.apply_softmax is True # default - assert request.item_first is False # default - assert request.temperature == 0.0 # default for scoring - assert request.top_k == 0 # default (disabled) - assert request.top_p == 1.0 # default (disabled) - - def test_request_with_pretokenized_input(self): - """Test request with pre-tokenized token IDs.""" - request = GenerativeScoreRequest( - query=[100, 200, 300], - items=[[400, 500], [600, 700, 800]], - label_token_ids=[1234, 5678], - ) - assert request.query == [100, 200, 300] - assert request.items == [[400, 500], [600, 700, 800]] - - def test_request_custom_options(self): - """Test request with custom options.""" - request = GenerativeScoreRequest( - query="Test query", - items=["Item1"], - label_token_ids=[100], - apply_softmax=False, - item_first=True, - temperature=0.5, - add_special_tokens=False, - ) - assert request.apply_softmax is False - assert request.item_first is True - assert request.temperature == 0.5 - assert request.add_special_tokens is False - - def test_response_structure(self): - """Test response model structure.""" - response = GenerativeScoreResponse( - model="test-model", - results=[ - GenerativeScoreItemResult( - index=0, - token_probs={"1234": 0.7, "5678": 0.3}, - ) - ], - usage={"prompt_tokens": 10, "total_tokens": 11, "completion_tokens": 1}, - ) - assert response.object == "generative_score" - assert response.model == "test-model" - assert len(response.results) == 1 - assert response.results[0].token_probs["1234"] == 0.7 - - -# ============================================================================ -# Probability Computation Tests -# ============================================================================ - - -class TestProbabilityComputation: - """Tests for probability computation logic.""" - - def test_compute_probabilities_with_softmax(self): - """Test subset softmax normalization (apply_softmax=True). - - When apply_softmax=True, we normalize only over the label tokens. - softmax([logprob_a, logprob_b]) should sum to 1. - """ - serving = OpenAIServingGenerativeScores.__new__( - OpenAIServingGenerativeScores - ) - - # Example logprobs (log probabilities from the model) - # These are already log(softmax(logits)) values - label_logprobs = { - 100: -1.0, # ~0.368 before normalization - 200: -2.0, # ~0.135 before normalization - } - - probs = serving._compute_probabilities(label_logprobs, apply_softmax=True) - - # With subset softmax, probs should sum to 1 - total = sum(probs.values()) - assert abs(total - 1.0) < 1e-6, f"Probabilities should sum to 1, got {total}" - - # Check relative ordering is preserved - assert probs[100] > probs[200], "Higher logprob should have higher probability" - - # Verify the math: softmax([−1, −2]) = [e^−1/(e^−1+e^−2), e^−2/(e^−1+e^−2)] - exp_neg1 = math.exp(-1) - exp_neg2 = math.exp(-2) - expected_prob_100 = exp_neg1 / (exp_neg1 + exp_neg2) - expected_prob_200 = exp_neg2 / (exp_neg1 + exp_neg2) - assert abs(probs[100] - expected_prob_100) < 1e-6 - assert abs(probs[200] - expected_prob_200) < 1e-6 - - def test_compute_probabilities_without_softmax(self): - """Test true model probabilities (apply_softmax=False). 
- - When apply_softmax=False, we return exp(logprob) which gives the - true model probability for each token over the full vocab. - """ - serving = OpenAIServingGenerativeScores.__new__( - OpenAIServingGenerativeScores - ) - - # Example logprobs (already normalized over full vocab by the model) - label_logprobs = { - 100: -1.0, # exp(-1) ≈ 0.368 - 200: -2.0, # exp(-2) ≈ 0.135 - } - - probs = serving._compute_probabilities(label_logprobs, apply_softmax=False) - - # These should NOT sum to 1 (they're just exp of the logprobs) - expected_prob_100 = math.exp(-1.0) - expected_prob_200 = math.exp(-2.0) - - assert abs(probs[100] - expected_prob_100) < 1e-6 - assert abs(probs[200] - expected_prob_200) < 1e-6 - - # These probabilities don't sum to 1 (unless we happened to pick - # the only tokens with probability mass) - total = sum(probs.values()) - assert total < 1.0, "True probs over subset shouldn't sum to 1" - - def test_compute_probabilities_numerical_stability(self): - """Test that computation is numerically stable with extreme values.""" - serving = OpenAIServingGenerativeScores.__new__( - OpenAIServingGenerativeScores - ) - - # Very negative logprobs (very unlikely tokens) - label_logprobs = { - 100: -100.0, - 200: -100.5, - } - - # Should not overflow/underflow with subset softmax - probs = serving._compute_probabilities(label_logprobs, apply_softmax=True) - total = sum(probs.values()) - assert abs(total - 1.0) < 1e-6 - assert probs[100] > probs[200] - - -# ============================================================================ -# Mock Engine Tests +# Test Fixtures and Helpers # ============================================================================ @@ -266,7 +107,6 @@ def _create_mock_request_output( token_id: int = 100, ) -> RequestOutput: """Create a mock RequestOutput with specified logprobs.""" - # Convert to Logprob objects logprobs_with_objs = { tid: Logprob(logprob=lp, rank=i + 1) for i, (tid, lp) in enumerate(logprobs_dict.items()) @@ -291,70 +131,223 @@ def _create_mock_request_output( # ============================================================================ -# Validation Tests +# Protocol Tests (Parameterized) # ============================================================================ -class TestValidation: - """Tests for input validation.""" - - @pytest.mark.asyncio - async def test_invalid_token_id_out_of_range(self): - """Test that out-of-range token IDs return an error.""" - mock_engine = _create_mock_engine() - serving = _create_serving(mock_engine) - +class TestGenerativeScoreProtocol: + """Tests for protocol models - parameterized for efficiency.""" + + @pytest.mark.parametrize( + "query,items,label_ids,extra_kwargs,expected_attrs", + [ + # Basic string input with defaults + ( + "Is this city the capital?", + ["Paris", "London"], + [1234, 5678], + {}, + { + "apply_softmax": True, + "item_first": False, + "temperature": 0.0, + "top_k": 0, + "top_p": 1.0, + }, + ), + # Pre-tokenized input + ( + [100, 200, 300], + [[400, 500], [600, 700, 800]], + [1234], + {}, + {"apply_softmax": True, "item_first": False}, + ), + # Custom options + ( + "Test query", + ["Item1"], + [100], + { + "apply_softmax": False, + "item_first": True, + "temperature": 0.5, + "add_special_tokens": False, + }, + { + "apply_softmax": False, + "item_first": True, + "temperature": 0.5, + "add_special_tokens": False, + }, + ), + ], + ids=["basic_defaults", "pretokenized", "custom_options"], + ) + def test_request_construction( + self, query, items, label_ids, extra_kwargs, expected_attrs + 
): + """Test request construction with various inputs and options.""" request = GenerativeScoreRequest( - model=MODEL_NAME, - query="test query", - items=["item1"], - label_token_ids=[999999], # Way beyond vocab size + query=query, + items=items, + label_token_ids=label_ids, + **extra_kwargs, ) + assert request.query == query + assert request.items == items + assert request.label_token_ids == label_ids + for attr, expected in expected_attrs.items(): + assert getattr(request, attr) == expected, f"{attr} mismatch" - result = await serving.create_generative_score(request, None) + def test_response_structure(self): + """Test response model structure.""" + response = GenerativeScoreResponse( + model="test-model", + results=[ + GenerativeScoreItemResult( + index=0, + token_probs={"1234": 0.7, "5678": 0.3}, + ) + ], + usage={ + "prompt_tokens": 10, + "total_tokens": 11, + "completion_tokens": 1, + }, + ) + assert response.object == "generative_score" + assert response.model == "test-model" + assert len(response.results) == 1 + assert response.results[0].token_probs["1234"] == 0.7 - assert isinstance(result, ErrorResponse) - assert "out of vocabulary range" in result.error.message.lower() - @pytest.mark.asyncio - async def test_empty_label_token_ids(self): - """Test that empty label_token_ids returns an error.""" - mock_engine = _create_mock_engine() - serving = _create_serving(mock_engine) +# ============================================================================ +# Probability Computation Tests (Parameterized - replaces 2 test classes) +# ============================================================================ - request = GenerativeScoreRequest( - model=MODEL_NAME, - query="test query", - items=["item1"], - label_token_ids=[], # Empty + +class TestProbabilityComputation: + """Unified tests for probability computation - covers both softmax modes.""" + + @pytest.mark.parametrize( + "label_logprobs,apply_softmax,should_sum_to_one", + [ + # apply_softmax=True cases (subset softmax, sums to 1) + ({100: -1.0, 200: -2.0}, True, True), + ({10: -0.5, 20: -1.5}, True, True), + # Numerical stability with extreme values + ({100: -100.0, 200: -100.5}, True, True), + # apply_softmax=False cases (true probs, don't sum to 1) + ({100: -1.0, 200: -2.0}, False, False), + ({10: -0.5, 20: -1.5}, False, False), + ], + ids=[ + "softmax_basic", + "softmax_different_values", + "softmax_numerical_stability", + "true_probs_basic", + "true_probs_different_values", + ], + ) + def test_compute_probabilities( + self, label_logprobs, apply_softmax, should_sum_to_one + ): + """Test probability computation with various inputs and modes.""" + serving = OpenAIServingGenerativeScores.__new__( + OpenAIServingGenerativeScores ) - result = await serving.create_generative_score(request, None) + probs = serving._compute_probabilities( + label_logprobs, apply_softmax=apply_softmax + ) + + # Check sum behavior + total = sum(probs.values()) + if should_sum_to_one: + assert abs(total - 1.0) < 1e-6, f"Expected sum=1, got {total}" + else: + assert total < 1.0, f"True probs should sum <1, got {total}" + + # Verify ordering is preserved (higher logprob = higher prob) + sorted_logprobs = sorted( + label_logprobs.items(), key=lambda x: x[1], reverse=True + ) + sorted_probs = sorted(probs.items(), key=lambda x: x[1], reverse=True) + assert [x[0] for x in sorted_logprobs] == [x[0] for x in sorted_probs] + + # Verify math for specific cases + if apply_softmax: + # softmax: exp(x_i - max) / sum(exp(x_j - max)) + max_lp = 
max(label_logprobs.values()) + exp_vals = {k: math.exp(v - max_lp) for k, v in label_logprobs.items()} + sum_exp = sum(exp_vals.values()) + for token_id, logprob in label_logprobs.items(): + expected = exp_vals[token_id] / sum_exp + assert abs(probs[token_id] - expected) < 1e-9 + else: + # true probs: just exp(logprob) + for token_id, logprob in label_logprobs.items(): + expected = math.exp(logprob) + assert abs(probs[token_id] - expected) < 1e-9 + + +# ============================================================================ +# Validation Tests (Parameterized) +# ============================================================================ - assert isinstance(result, ErrorResponse) - assert "at least one token" in result.error.message.lower() + +class TestValidation: + """Tests for input validation - parameterized.""" @pytest.mark.asyncio - async def test_empty_items(self): - """Test that empty items list returns an error.""" + @pytest.mark.parametrize( + "request_kwargs,expected_error_substring", + [ + # Out of range token ID + ( + { + "query": "test query", + "items": ["item1"], + "label_token_ids": [999999], + }, + "out of vocabulary range", + ), + # Empty label_token_ids + ( + { + "query": "test query", + "items": ["item1"], + "label_token_ids": [], + }, + "at least one token", + ), + # Empty items + ( + { + "query": "test query", + "items": [], + "label_token_ids": [100], + }, + "at least one item", + ), + ], + ids=["invalid_token_id", "empty_label_tokens", "empty_items"], + ) + async def test_validation_errors(self, request_kwargs, expected_error_substring): + """Test that invalid inputs return appropriate errors.""" mock_engine = _create_mock_engine() serving = _create_serving(mock_engine) - request = GenerativeScoreRequest( - model=MODEL_NAME, - query="test query", - items=[], # Empty - label_token_ids=[100], - ) - + request = GenerativeScoreRequest(model=MODEL_NAME, **request_kwargs) result = await serving.create_generative_score(request, None) assert isinstance(result, ErrorResponse) - assert "at least one item" in result.error.message.lower() + assert expected_error_substring in result.error.message.lower() # ============================================================================ -# Integration-style Tests (with mocked engine) +# Integration Tests (with mocked engine) # ============================================================================ @@ -367,12 +360,10 @@ async def test_successful_generation(self): mock_engine = _create_mock_engine() serving = _create_serving(mock_engine) - # Set up the mock to return logprobs for our label tokens label_token_ids = [1234, 5678] mock_logprobs = { - 1234: -0.5, # Higher probability - 5678: -2.0, # Lower probability - # Include some other tokens that would be in full vocab + 1234: -0.5, + 5678: -2.0, 100: -3.0, 200: -4.0, } @@ -394,9 +385,8 @@ async def mock_generate(*args, **kwargs): result = await serving.create_generative_score(request, None) - # Should succeed assert isinstance(result, GenerativeScoreResponse) - assert len(result.results) == 2 # One per item + assert len(result.results) == 2 # Check probabilities are in valid range for item_result in result.results: @@ -404,105 +394,43 @@ async def mock_generate(*args, **kwargs): assert 0.0 <= prob <= 1.0 @pytest.mark.asyncio - async def test_item_first_ordering(self): - """Test that item_first=True prepends item to query.""" + @pytest.mark.parametrize( + "item_first,expected_prompts", + [ + ( + False, + [ + [100, 101, 102, 200, 201], + [100, 101, 102, 300, 301], + ], + ), + 
( + True, + [ + [200, 201, 100, 101, 102], + [300, 301, 100, 101, 102], + ], + ), + ], + ids=["query_first", "item_first"], + ) + async def test_item_ordering(self, item_first, expected_prompts): + """Test that item_first flag correctly controls prompt ordering.""" mock_engine = _create_mock_engine() serving = _create_serving(mock_engine) - - # Track what prompts are built - built_prompts = [] - - async def mock_tokenizer_call(text, **kwargs): - result = MagicMock() - result.input_ids = [ord(c) for c in text[:5]] # Simple mock - return result - - # We can verify the prompt building by checking the _build_prompts method tokenizer = MagicMock() - tokenizer.return_value = MagicMock(input_ids=[1, 2, 3]) request = GenerativeScoreRequest( - query=[100, 101, 102], # Pre-tokenized - items=[[200, 201], [300, 301]], # Pre-tokenized + query=[100, 101, 102], + items=[[200, 201], [300, 301]], label_token_ids=[500], - item_first=False, + item_first=item_first, ) - # Build prompts and check ordering engine_prompts, _ = await serving._build_prompts(request, tokenizer) - # With item_first=False: query + item - assert engine_prompts[0]["prompt_token_ids"] == [100, 101, 102, 200, 201] - assert engine_prompts[1]["prompt_token_ids"] == [100, 101, 102, 300, 301] - - # Now test with item_first=True - request.item_first = True - engine_prompts, _ = await serving._build_prompts(request, tokenizer) - - # With item_first=True: item + query - assert engine_prompts[0]["prompt_token_ids"] == [200, 201, 100, 101, 102] - assert engine_prompts[1]["prompt_token_ids"] == [300, 301, 100, 101, 102] - - -# ============================================================================ -# Math Verification Tests -# ============================================================================ - - -class TestMathVerification: - """Detailed tests to verify the probability math is correct.""" - - def test_softmax_over_subset(self): - """Verify: apply_softmax=True gives softmax over subset.""" - serving = OpenAIServingGenerativeScores.__new__( - OpenAIServingGenerativeScores - ) - - # logits (before log) for full vocab might be [1.0, 2.0, 3.0, ...] 
- # After softmax over full vocab, logprobs become log(softmax(logits)) - # For our test, let's say: - # - Token A has logprob -0.5 (exp(-0.5) ≈ 0.606 true prob) - # - Token B has logprob -1.5 (exp(-1.5) ≈ 0.223 true prob) - - label_logprobs = {10: -0.5, 20: -1.5} - - # With apply_softmax=True, we do softmax over just [10, 20] - # softmax([-0.5, -1.5]) = [exp(-0.5)/(exp(-0.5)+exp(-1.5)), - # exp(-1.5)/(exp(-0.5)+exp(-1.5))] - probs = serving._compute_probabilities(label_logprobs, apply_softmax=True) - - exp_a = math.exp(-0.5) - exp_b = math.exp(-1.5) - denom = exp_a + exp_b - - expected_a = exp_a / denom - expected_b = exp_b / denom - - assert abs(probs[10] - expected_a) < 1e-9 - assert abs(probs[20] - expected_b) < 1e-9 - assert abs(sum(probs.values()) - 1.0) < 1e-9 - - def test_true_probs_without_softmax(self): - """Verify: apply_softmax=False gives exp(logprob) = true model prob.""" - serving = OpenAIServingGenerativeScores.__new__( - OpenAIServingGenerativeScores - ) - - # These logprobs come from log(softmax(logits)) computed by the model - # So exp(logprob) gives the true probability - label_logprobs = {10: -0.5, 20: -1.5} - - probs = serving._compute_probabilities(label_logprobs, apply_softmax=False) - - # Just exp of the logprobs - expected_a = math.exp(-0.5) - expected_b = math.exp(-1.5) - - assert abs(probs[10] - expected_a) < 1e-9 - assert abs(probs[20] - expected_b) < 1e-9 - - # They don't sum to 1 (unless we selected all tokens) - assert sum(probs.values()) < 1.0 + for i, expected in enumerate(expected_prompts): + assert engine_prompts[i]["prompt_token_ids"] == expected if __name__ == "__main__": diff --git a/vllm/entrypoints/openai/generative_scores/api_router.py b/vllm/entrypoints/openai/generative_scores/api_router.py index 55773f25a461..8a2b91dd6f0f 100644 --- a/vllm/entrypoints/openai/generative_scores/api_router.py +++ b/vllm/entrypoints/openai/generative_scores/api_router.py @@ -111,28 +111,6 @@ async def create_generative_score( assert_never(generator) -@router.post( - "/generative-scores", - dependencies=[Depends(validate_json_request)], - responses={ - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - }, -) -@with_cancellation -@load_aware_call -async def create_generative_score_unversioned( - request: GenerativeScoreRequest, - raw_request: Request, -): - """Unversioned alias for the generative-scores endpoint. - - This provides the same functionality as /v1/generative-scores. - See create_generative_score for full documentation. 
- """ - return await create_generative_score(request, raw_request) - - def register_generative_scores_api_routers(app): """Register the generative scores API router with the app.""" app.include_router(router) diff --git a/vllm/entrypoints/openai/generative_scores/serving.py b/vllm/entrypoints/openai/generative_scores/serving.py index 5dce294510ff..30693eb1aed5 100644 --- a/vllm/entrypoints/openai/generative_scores/serving.py +++ b/vllm/entrypoints/openai/generative_scores/serving.py @@ -134,14 +134,15 @@ async def create_generative_score( return self.create_error_response(e) # Create sampling params for scoring - # We use max_tokens=1 with logprobs=-1 to get full vocab logprobs - # for the next token distribution + # We use max_tokens=1 with logprob_token_ids to efficiently get + # logprobs for only the specified label tokens (not full vocab) sampling_params = SamplingParams( max_tokens=1, temperature=request.temperature if request.temperature else 0.0, top_k=request.top_k if request.top_k is not None else 0, top_p=request.top_p if request.top_p is not None else 1.0, - logprobs=-1, # Get all vocab logprobs + logprobs=len(request.label_token_ids), # Request enough logprobs + logprob_token_ids=request.label_token_ids, # Efficient: only these tokens n=1, ) @@ -283,9 +284,11 @@ async def _build_prompts( Returns: Tuple of (list of TokensPrompt, list of prompt token counts). """ + # Get async tokenizer once for efficiency + async_tokenizer = self._get_async_tokenizer(tokenizer) + # Tokenize query if it's a string if isinstance(request.query, str): - async_tokenizer = self._get_async_tokenizer(tokenizer) query_result = await async_tokenizer( request.query, add_special_tokens=request.add_special_tokens, @@ -300,7 +303,6 @@ async def _build_prompts( for item in request.items: # Tokenize item if it's a string if isinstance(item, str): - async_tokenizer = self._get_async_tokenizer(tokenizer) # Don't add special tokens for items to avoid duplicate BOS/EOS item_result = await async_tokenizer( item, diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 1d097852e194..8f081b2ca822 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -190,6 +190,12 @@ class SamplingParams( prompt_logprobs: int | None = None """Number of log probabilities to return per prompt token. When set to -1, return all `vocab_size` log probabilities.""" + logprob_token_ids: list[int] | None = None + """Specific token IDs to return logprobs for. More efficient than + logprobs=-1 when you only need logprobs for a small set of tokens. + When set, logprobs for exactly these token IDs will be returned, + in addition to the sampled token. This is useful for scoring tasks + where you want to compare probabilities of specific label tokens.""" flat_logprobs: bool = False """Whether to return logprobs in flatten format (i.e. FlatLogprob) for better performance. 
diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index b1101b1b2318..fda90bec6b96 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -22,6 +22,11 @@ class SamplingMetadata: # None means no logprobs, 0 means sampled token logprobs only max_num_logprobs: int | None + # Specific token IDs to compute logprobs for (more efficient than full vocab) + # When set, logprobs are computed only for these token IDs using gather + # req_index -> list of token IDs to get logprobs for + logprob_token_ids: dict[int, list[int]] | None + no_penalties: bool prompt_token_ids: torch.Tensor | None frequency_penalties: torch.Tensor diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index c75b4f0543c0..6bf4e488fda4 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -13,6 +13,7 @@ from vllm.v1.sample.ops.logprobs import batched_count_greater_than from vllm.v1.sample.ops.penalties import apply_all_penalties from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler +from vllm.v1.worker.gpu.sample.logprob import compute_token_logprobs _SAMPLING_EPS = 1e-5 @@ -102,8 +103,16 @@ def forward( # return int32 (while PyTorch argmax and topk return int64). sampled = sampled.long() + # Handle logprob_token_ids if specified (more efficient than full vocab) + # This is used by generative_scores API to get logprobs for specific tokens + logprob_token_ids_tensors = None + if sampling_metadata.logprob_token_ids: + logprob_token_ids_tensors = self.gather_specific_token_logprobs( + logits, sampling_metadata.logprob_token_ids, sampled + ) + if num_logprobs is None: - logprobs_tensors = None + logprobs_tensors = logprob_token_ids_tensors elif num_logprobs == -1: # Return the full unsorted and unranked logprobs. logprobs_tensors = LogprobsTensors( @@ -115,6 +124,11 @@ def forward( raw_logprobs, num_logprobs, token_ids=sampled ) + # If we have both num_logprobs and logprob_token_ids, prefer + # logprob_token_ids as it's more specific + if logprob_token_ids_tensors is not None and num_logprobs is not None: + logprobs_tensors = logprob_token_ids_tensors + # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) @@ -128,6 +142,62 @@ def forward( ) return sampler_output + def gather_specific_token_logprobs( + self, + logits: torch.Tensor, + logprob_token_ids: dict[int, list[int]], + sampled: torch.Tensor, + ) -> LogprobsTensors | None: + """Compute logprobs for specific token IDs. + + This is more efficient than computing full vocab logprobs when you only + need logprobs for a small set of tokens (e.g., for scoring tasks). + + Args: + logits: [batch_size, vocab_size] tensor of logits + logprob_token_ids: dict mapping req_index -> list of token IDs + sampled: [batch_size] tensor of sampled token IDs + + Returns: + LogprobsTensors with logprobs for the specified tokens, or None + if no requests have logprob_token_ids. 
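+
+        Illustrative example (shapes are indicative only, not asserted by
+        this patch): with a batch of two requests that both score label
+        tokens [11, 22], this receives ``logits`` of shape
+        ``[2, vocab_size]``, ``logprob_token_ids = {0: [11, 22], 1: [11, 22]}``
+        and ``sampled`` of shape ``[2]``, and gathers logprobs for the
+        sampled token plus the two label tokens of each request.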
+ """ + if not logprob_token_ids: + return None + + batch_size = logits.shape[0] + vocab_size = logits.shape[1] + + # For now, assume all requests in the batch have the same token IDs + # (this is the common case for generative_scores API) + # Get the first request's token IDs as the common set + first_token_ids = next(iter(logprob_token_ids.values())) + num_tokens = len(first_token_ids) + + # Create token_ids tensor: [batch_size, num_tokens] + # Include sampled token as first element (like gather_logprobs does) + token_ids_tensor = torch.zeros( + batch_size, num_tokens + 1, dtype=torch.int64, device=logits.device + ) + token_ids_tensor[:, 0] = sampled # First column is sampled token + token_ids_tensor[:, 1:] = torch.tensor( + first_token_ids, dtype=torch.int64, device=logits.device + ) + + # Compute logprobs efficiently using the Triton kernel + logprobs = compute_token_logprobs(logits, token_ids_tensor) + + # Compute ranks for the sampled token + token_ranks = torch.empty(batch_size, dtype=torch.int64, device=logits.device) + sampled_logits = logits.gather(-1, sampled.unsqueeze(-1)) + token_ranks = (logits > sampled_logits).sum(dim=-1) + + return LogprobsTensors( + logprob_token_ids=token_ids_tensor.to(torch.int32), + logprobs=logprobs, + selected_token_ranks=token_ranks, + ) + @staticmethod def apply_temperature( logits: torch.Tensor, diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index c70970fdc06e..02e9c42cafa4 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -223,6 +223,10 @@ def __init__( self.num_logprobs: dict[str, int] = {} + # req_id -> list of specific token IDs to compute logprobs for + # More efficient than num_logprobs=-1 when only a few tokens are needed + self.logprob_token_ids: dict[str, list[int]] = {} + # To accumulate prompt logprobs tensor chunks across prefill steps. 
self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {} @@ -383,6 +387,10 @@ def add_request( else sampling_params.logprobs ) + # Store specific token IDs to compute logprobs for (more efficient) + if sampling_params.logprob_token_ids is not None: + self.logprob_token_ids[req_id] = sampling_params.logprob_token_ids + if sampling_params.allowed_token_ids: self.has_allowed_token_ids.add(req_id) if self.allowed_token_ids_mask_cpu_tensor is None: @@ -509,6 +517,7 @@ def remove_request(self, req_id: str) -> int | None: self.repetition_penalties_reqs.discard(req_id) self.generators.pop(req_index, None) self.num_logprobs.pop(req_id, None) + self.logprob_token_ids.pop(req_id, None) self.in_progress_prompt_logprobs_cpu.pop(req_id, None) if self.prev_req_id_to_index is not None: self.prev_req_id_to_index.pop(req_id, None) @@ -835,6 +844,15 @@ def _make_sampling_metadata(self) -> SamplingMetadata: ) allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs] + # Build per-request logprob_token_ids mapping: req_index -> token_ids + logprob_token_ids_by_index: dict[int, list[int]] | None = None + if self.logprob_token_ids: + logprob_token_ids_by_index = {} + for req_id, token_ids in self.logprob_token_ids.items(): + if req_id in self.req_id_to_index: + req_index = self.req_id_to_index[req_id] + logprob_token_ids_by_index[req_index] = token_ids + return SamplingMetadata( temperature=temperature, all_greedy=self.all_greedy, @@ -843,6 +861,7 @@ def _make_sampling_metadata(self) -> SamplingMetadata: top_k=None if self.no_top_k else self.top_k[:num_reqs], generators=self.generators, max_num_logprobs=self.max_num_logprobs, + logprob_token_ids=logprob_token_ids_by_index, prompt_token_ids=prompt_token_ids, frequency_penalties=self.frequency_penalties[:num_reqs], presence_penalties=self.presence_penalties[:num_reqs], diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1bd6e5116a7d..d0316091509f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4926,6 +4926,7 @@ def _dummy_sampler_run( top_k=dummy_tensors(logits.size(1) - 1), generators={}, max_num_logprobs=None, + logprob_token_ids=None, no_penalties=True, prompt_token_ids=None, frequency_penalties=dummy_tensors(0.1), From 8288f48a7f5659bc40481d8e1b06e7664b4825e5 Mon Sep 17 00:00:00 2001 From: vedant jhaveri Date: Sat, 31 Jan 2026 00:34:41 +0000 Subject: [PATCH 03/28] clean up code --- .../openai/test_generative_scores.py | 15 +- .../openai/test_generative_scores_e2e.py | 341 ++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 2 +- vllm/entrypoints/openai/engine/serving.py | 6 + .../openai/generative_scores/__init__.py | 22 ++ .../openai/generative_scores/api_router.py | 2 +- .../openai/generative_scores/protocol.py | 2 +- .../openai/generative_scores/serving.py | 3 + 8 files changed, 385 insertions(+), 8 deletions(-) create mode 100644 tests/entrypoints/openai/test_generative_scores_e2e.py diff --git a/tests/entrypoints/openai/test_generative_scores.py b/tests/entrypoints/openai/test_generative_scores.py index 6a48a54e35f2..a89efe772ae1 100644 --- a/tests/entrypoints/openai/test_generative_scores.py +++ b/tests/entrypoints/openai/test_generative_scores.py @@ -33,9 +33,11 @@ from vllm.tokenizers import get_tokenizer from vllm.v1.engine.async_llm import AsyncLLM -MODEL_NAME = "openai-community/gpt2" +# Use local model path for testing +MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_PATH = 
"/shared/public/elr-models/Qwen/Qwen3-0.6B/e6de91484c29aa9480d55605af694f39b081c455/" BASE_MODEL_PATHS = [ - BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME), + BaseModelPath(name=MODEL_NAME, model_path=MODEL_PATH), ] @@ -48,7 +50,7 @@ class MockHFConfig: class MockModelConfig: task = "generate" runner_type = "generate" - tokenizer = MODEL_NAME + tokenizer = MODEL_PATH trust_remote_code = False tokenizer_mode = "auto" max_model_len = 100 @@ -64,7 +66,7 @@ class MockModelConfig: generation_config: str = "auto" media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) skip_tokenizer_init = False - vocab_size = 50257 # GPT-2 vocab size + vocab_size = 151936 # Qwen3-0.6B vocab size def get_diff_sampling_param(self): return self.diff_sampling_param or {} @@ -81,7 +83,7 @@ def get_vocab_size(self): def _create_mock_engine(): """Create a mock AsyncLLM engine.""" mock_engine = MagicMock(spec=AsyncLLM) - mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_PATH) mock_engine.errored = False mock_engine.model_config = MockModelConfig() mock_engine.input_processor = MagicMock() @@ -125,6 +127,7 @@ def _create_mock_request_output( request_id="test-request", prompt="test prompt", prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, outputs=[completion_output], finished=True, ) @@ -331,6 +334,8 @@ class TestValidation: }, "at least one item", ), + # Note: mixed_item_types (string and token list) is validated by + # Pydantic before our code runs, so we test it in e2e tests instead ], ids=["invalid_token_id", "empty_label_tokens", "empty_items"], ) diff --git a/tests/entrypoints/openai/test_generative_scores_e2e.py b/tests/entrypoints/openai/test_generative_scores_e2e.py new file mode 100644 index 000000000000..8e9dcc54c2fa --- /dev/null +++ b/tests/entrypoints/openai/test_generative_scores_e2e.py @@ -0,0 +1,341 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""End-to-end tests for the Generative Scores API. + +These tests verify the full HTTP request/response flow using RemoteOpenAIServer. +""" + +import pytest +import requests + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_PATH = "/shared/public/elr-models/Qwen/Qwen3-0.6B/e6de91484c29aa9480d55605af694f39b081c455/" + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--dtype", + "bfloat16", + "--max-model-len", + "512", + "--enforce-eager", + "--max-num-seqs", + "32", + ] + + with RemoteOpenAIServer(MODEL_PATH, args) as remote_server: + yield remote_server + + +class TestGenerativeScoresE2E: + """End-to-end tests for generative scores API.""" + + @pytest.mark.asyncio + async def test_basic_generative_score_request(self, server: RemoteOpenAIServer): + """Test basic generative score request with string inputs.""" + # Get some token IDs to test with - we'll use common tokens + # For Qwen3-0.6B, let's use tokens for "Yes" and "No" + # First, let's make a simple request to verify the endpoint works + response = requests.post( + server.url_for("v1/generative-scores"), + json={ + "model": MODEL_NAME, + "query": "Is Paris the capital of France? 
Answer with Yes or No: ", + "items": ["Paris is beautiful.", "London is rainy."], + "label_token_ids": [9454, 2753], # Common token IDs + "apply_softmax": True, + "item_first": False, + }, + ) + + assert response.status_code == 200, f"Response: {response.text}" + data = response.json() + + # Verify response structure + assert "id" in data + assert data["id"].startswith("genscore-") + assert data["object"] == "generative_score" + assert "model" in data + assert "results" in data + assert "usage" in data + assert len(data["results"]) == 2 + + # Verify each result has expected structure + for i, result in enumerate(data["results"]): + assert result["index"] == i + assert "token_probs" in result + # Probabilities should be between 0 and 1 + for token_id, prob in result["token_probs"].items(): + assert 0.0 <= prob <= 1.0 + + # With apply_softmax=True, probabilities should sum to ~1 + for result in data["results"]: + prob_sum = sum(result["token_probs"].values()) + assert abs(prob_sum - 1.0) < 1e-5, f"Prob sum: {prob_sum}" + + @pytest.mark.asyncio + async def test_generative_score_with_pretokenized_input( + self, server: RemoteOpenAIServer + ): + """Test generative score with pre-tokenized inputs.""" + response = requests.post( + server.url_for("v1/generative-scores"), + json={ + "model": MODEL_NAME, + "query": [100, 200, 300, 400, 500], # Pre-tokenized query + "items": [[600, 700], [800, 900, 1000]], # Pre-tokenized items + "label_token_ids": [1, 2, 3], + "apply_softmax": True, + }, + ) + + assert response.status_code == 200, f"Response: {response.text}" + data = response.json() + + assert data["object"] == "generative_score" + assert len(data["results"]) == 2 + + @pytest.mark.asyncio + async def test_generative_score_apply_softmax_false( + self, server: RemoteOpenAIServer + ): + """Test generative score with apply_softmax=False returns true model probs.""" + response = requests.post( + server.url_for("v1/generative-scores"), + json={ + "model": MODEL_NAME, + "query": "Test query ", + "items": ["item1", "item2"], + "label_token_ids": [100, 200, 300], + "apply_softmax": False, + }, + ) + + assert response.status_code == 200, f"Response: {response.text}" + data = response.json() + + # With apply_softmax=False, probabilities should NOT sum to 1 + # (they are true model probs over full vocab for those tokens) + for result in data["results"]: + prob_sum = sum(result["token_probs"].values()) + # True probs typically sum to much less than 1 + assert prob_sum < 1.0, f"Prob sum should be < 1: {prob_sum}" + + @pytest.mark.asyncio + async def test_generative_score_item_first(self, server: RemoteOpenAIServer): + """Test generative score with item_first=True.""" + response = requests.post( + server.url_for("v1/generative-scores"), + json={ + "model": MODEL_NAME, + "query": " is the answer", + "items": ["Yes", "No"], + "label_token_ids": [100, 200], + "item_first": True, # Items prepended to query + }, + ) + + assert response.status_code == 200, f"Response: {response.text}" + data = response.json() + assert len(data["results"]) == 2 + + @pytest.mark.asyncio + async def test_generative_score_validation_empty_items( + self, server: RemoteOpenAIServer + ): + """Test that empty items returns an error.""" + response = requests.post( + server.url_for("v1/generative-scores"), + json={ + "model": MODEL_NAME, + "query": "Test query", + "items": [], + "label_token_ids": [100], + }, + ) + + assert response.status_code == 400 + data = response.json() + assert "error" in data + assert "at least one item" in 
data["error"]["message"].lower() + + @pytest.mark.asyncio + async def test_generative_score_validation_empty_label_tokens( + self, server: RemoteOpenAIServer + ): + """Test that empty label_token_ids returns an error.""" + response = requests.post( + server.url_for("v1/generative-scores"), + json={ + "model": MODEL_NAME, + "query": "Test query", + "items": ["item1"], + "label_token_ids": [], + }, + ) + + assert response.status_code == 400 + data = response.json() + assert "error" in data + assert "at least one token" in data["error"]["message"].lower() + + @pytest.mark.asyncio + async def test_generative_score_validation_invalid_token_id( + self, server: RemoteOpenAIServer + ): + """Test that out-of-range token IDs return an error.""" + response = requests.post( + server.url_for("v1/generative-scores"), + json={ + "model": MODEL_NAME, + "query": "Test query", + "items": ["item1"], + "label_token_ids": [9999999999], # Way out of vocab range + }, + ) + + assert response.status_code == 400 + data = response.json() + assert "error" in data + assert "out of vocabulary range" in data["error"]["message"].lower() + + @pytest.mark.asyncio + async def test_generative_score_validation_mixed_item_types( + self, server: RemoteOpenAIServer + ): + """Test that mixed item types (string and token list) returns a validation error. + + Note: Pydantic validates types at request parsing, so this returns a 422 + Unprocessable Entity error, not a 400 from our validation logic. + """ + response = requests.post( + server.url_for("v1/generative-scores"), + json={ + "model": MODEL_NAME, + "query": "Test query", + "items": ["string item", [100, 200]], # Mixed types + "label_token_ids": [100], + }, + ) + + # Pydantic returns 422 for type validation errors + assert response.status_code == 422 + + @pytest.mark.asyncio + async def test_generative_score_usage_tracking(self, server: RemoteOpenAIServer): + """Test that usage info is properly tracked.""" + response = requests.post( + server.url_for("v1/generative-scores"), + json={ + "model": MODEL_NAME, + "query": "A test query with multiple tokens ", + "items": ["item one", "item two", "item three"], + "label_token_ids": [100, 200], + }, + ) + + assert response.status_code == 200 + data = response.json() + + usage = data["usage"] + assert "prompt_tokens" in usage + assert "completion_tokens" in usage + assert "total_tokens" in usage + assert usage["prompt_tokens"] > 0 + assert usage["completion_tokens"] > 0 + assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] + + +class TestLogprobTokenIds: + """Tests to verify logprob_token_ids feature works correctly. + + These tests verify that the logprob_token_ids field in SamplingParams + works correctly, which is the underlying mechanism for generative scores. 
+ """ + + @pytest.mark.asyncio + async def test_logprob_token_ids_via_completion(self, server: RemoteOpenAIServer): + """Test that logprob_token_ids returns correct logprobs for specified tokens.""" + # Use the completions API directly to test logprob_token_ids + client = server.get_client() + + # Request completion with logprobs + response = client.completions.create( + model=MODEL_NAME, + prompt="The capital of France is", + max_tokens=1, + logprobs=5, # Get top 5 logprobs + temperature=0.0, + ) + + assert len(response.choices) == 1 + choice = response.choices[0] + assert choice.logprobs is not None + assert len(choice.logprobs.top_logprobs) > 0 + + @pytest.mark.asyncio + async def test_generative_score_with_many_label_tokens( + self, server: RemoteOpenAIServer + ): + """Test generative score with many label tokens to stress test logprob_token_ids.""" + # Use a larger set of label tokens + label_token_ids = list(range(100, 200)) # 100 tokens + + response = requests.post( + server.url_for("v1/generative-scores"), + json={ + "model": MODEL_NAME, + "query": "Test query ", + "items": ["item1"], + "label_token_ids": label_token_ids, + "apply_softmax": True, + }, + ) + + assert response.status_code == 200, f"Response: {response.text}" + data = response.json() + + # Should have probs for all 100 tokens + result = data["results"][0] + assert len(result["token_probs"]) == 100 + + @pytest.mark.asyncio + async def test_generative_score_consistency(self, server: RemoteOpenAIServer): + """Test that generative scores are consistent across identical requests.""" + request_body = { + "model": MODEL_NAME, + "query": "Is this consistent? ", + "items": ["Yes it is."], + "label_token_ids": [100, 200, 300], + "apply_softmax": True, + "temperature": 0.0, # Deterministic + } + + response1 = requests.post( + server.url_for("v1/generative-scores"), + json=request_body, + ) + response2 = requests.post( + server.url_for("v1/generative-scores"), + json=request_body, + ) + + assert response1.status_code == 200 + assert response2.status_code == 200 + + data1 = response1.json() + data2 = response2.json() + + # Probabilities should be identical for deterministic inference + probs1 = data1["results"][0]["token_probs"] + probs2 = data2["results"][0]["token_probs"] + + for token_id in probs1: + assert abs(probs1[token_id] - probs2[token_id]) < 1e-6 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index a0b34c0f0fa6..8aafb435d9fb 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -327,7 +327,7 @@ async def init_app_state( init_generative_scores_state, ) - init_generative_scores_state( + await init_generative_scores_state( engine_client, state, args, request_logger, supported_tasks ) diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 8c3c028d05ed..3e92dfdb58b4 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -86,6 +86,10 @@ ScoreResponse, ScoreTextRequest, ) +from vllm.entrypoints.openai.generative_scores.protocol import ( + GenerativeScoreRequest, + GenerativeScoreResponse, +) from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse from vllm.entrypoints.serve.tokenize.protocol import ( @@ -166,6 +170,7 @@ def __init__(self, message: str = "Internal 
server error"): | ResponsesRequest | IOProcessorRequest | GenerateRequest + | GenerativeScoreRequest ) AnyResponse: TypeAlias = ( @@ -179,6 +184,7 @@ def __init__(self, message: str = "Internal server error"): | ClassificationResponse | ScoreResponse | GenerateResponse + | GenerativeScoreResponse ) diff --git a/vllm/entrypoints/openai/generative_scores/__init__.py b/vllm/entrypoints/openai/generative_scores/__init__.py index 208f01a7cb5e..fdebec0a71b8 100644 --- a/vllm/entrypoints/openai/generative_scores/__init__.py +++ b/vllm/entrypoints/openai/generative_scores/__init__.py @@ -1,2 +1,24 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.entrypoints.openai.generative_scores.api_router import ( + init_generative_scores_state, + register_generative_scores_api_routers, +) +from vllm.entrypoints.openai.generative_scores.protocol import ( + GenerativeScoreItemResult, + GenerativeScoreRequest, + GenerativeScoreResponse, +) +from vllm.entrypoints.openai.generative_scores.serving import ( + OpenAIServingGenerativeScores, +) + +__all__ = [ + "GenerativeScoreItemResult", + "GenerativeScoreRequest", + "GenerativeScoreResponse", + "OpenAIServingGenerativeScores", + "init_generative_scores_state", + "register_generative_scores_api_routers", +] diff --git a/vllm/entrypoints/openai/generative_scores/api_router.py b/vllm/entrypoints/openai/generative_scores/api_router.py index 8a2b91dd6f0f..6074db195889 100644 --- a/vllm/entrypoints/openai/generative_scores/api_router.py +++ b/vllm/entrypoints/openai/generative_scores/api_router.py @@ -116,7 +116,7 @@ def register_generative_scores_api_routers(app): app.include_router(router) -def init_generative_scores_state( +async def init_generative_scores_state( engine_client: "EngineClient", state: "State", args: "Namespace", diff --git a/vllm/entrypoints/openai/generative_scores/protocol.py b/vllm/entrypoints/openai/generative_scores/protocol.py index e2feb1cca085..7e500b6bcff4 100644 --- a/vllm/entrypoints/openai/generative_scores/protocol.py +++ b/vllm/entrypoints/openai/generative_scores/protocol.py @@ -130,7 +130,7 @@ class GenerativeScoreResponse(OpenAIBaseModel): usage: Token usage information. """ - id: str = Field(default_factory=lambda: f"genscore-{random_uuid()}") + id: str = Field(default="") object: Literal["generative_score"] = "generative_score" created: int = Field(default_factory=lambda: int(time.time())) model: str diff --git a/vllm/entrypoints/openai/generative_scores/serving.py b/vllm/entrypoints/openai/generative_scores/serving.py index 30693eb1aed5..6da011ee3832 100644 --- a/vllm/entrypoints/openai/generative_scores/serving.py +++ b/vllm/entrypoints/openai/generative_scores/serving.py @@ -115,6 +115,9 @@ async def create_generative_score( "items must contain at least one item." ) + # Note: Mixed item types (string and token list) are validated by + # Pydantic at request parsing time, so we don't need to check here. + try: lora_request = self._maybe_get_adapters(request) except (ValueError, TypeError, RuntimeError) as e: From 2dc625019c4e24eb56eee42902baab269e6f449a Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Wed, 4 Feb 2026 21:01:56 +0000 Subject: [PATCH 04/28] combine generative score API in v1/score to unify the scoring endpoint. 
generative used when we launch server with a *FaorCausalLM* model --- .../openai/test_generative_scores.py | 4 +- vllm/config/model.py | 12 ++ vllm/entrypoints/openai/api_server.py | 40 +++-- vllm/entrypoints/openai/engine/serving.py | 6 - .../openai/generative_scores/__init__.py | 24 --- .../openai/generative_scores/api_router.py | 148 ------------------ .../openai/generative_scores/protocol.py | 138 ---------------- vllm/entrypoints/pooling/__init__.py | 20 +++ .../score/generative_scores.py} | 143 +++++++++++++++-- vllm/entrypoints/pooling/score/protocol.py | 7 + vllm/entrypoints/pooling/score/serving.py | 120 +++++++++++++- 11 files changed, 318 insertions(+), 344 deletions(-) delete mode 100644 vllm/entrypoints/openai/generative_scores/__init__.py delete mode 100644 vllm/entrypoints/openai/generative_scores/api_router.py delete mode 100644 vllm/entrypoints/openai/generative_scores/protocol.py rename vllm/entrypoints/{openai/generative_scores/serving.py => pooling/score/generative_scores.py} (75%) diff --git a/tests/entrypoints/openai/test_generative_scores.py b/tests/entrypoints/openai/test_generative_scores.py index a89efe772ae1..318b8e5d2163 100644 --- a/tests/entrypoints/openai/test_generative_scores.py +++ b/tests/entrypoints/openai/test_generative_scores.py @@ -18,12 +18,12 @@ from vllm.config.multimodal import MultiModalConfig from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.generative_scores.protocol import ( +from vllm.entrypoints.pooling.score.generative_scores import ( GenerativeScoreItemResult, GenerativeScoreRequest, GenerativeScoreResponse, ) -from vllm.entrypoints.openai.generative_scores.serving import ( +from vllm.entrypoints.pooling.score.generative_scores import ( OpenAIServingGenerativeScores, ) from vllm.entrypoints.openai.models.protocol import BaseModelPath diff --git a/vllm/config/model.py b/vllm/config/model.py index 1254d75ace80..1c6b5b813cca 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1431,6 +1431,18 @@ def is_cross_encoder(self) -> bool: self._model_info.supports_cross_encoding or self.convert_type == "classify" ) + @property + def is_causal_lm(self) -> bool: + """Check if the model architecture is a CausalLM model. + + Returns True if any architecture in hf_config.architectures matches + the pattern .*ForCausalLM.* (e.g., LlamaForCausalLM, Qwen2ForCausalLM). 
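+
+        For example, ``architectures = ["Qwen2ForCausalLM"]`` matches, while
+        ``["Qwen2ForSequenceClassification"]`` does not.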
+ """ + import re + architectures = getattr(self.hf_config, "architectures", []) + pattern = re.compile(r".*ForCausalLM.*") + return any(pattern.match(arch) for arch in architectures) + @property def is_pp_supported(self) -> bool: return self._model_info.supports_pp diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 8aafb435d9fb..26d75eaf1568 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -189,12 +189,6 @@ def build_app(args: Namespace, supported_tasks: tuple["SupportedTask", ...]) -> register_generate_api_routers(app) - from vllm.entrypoints.openai.generative_scores.api_router import ( - register_generative_scores_api_routers, - ) - - register_generative_scores_api_routers(app) - if "transcription" in supported_tasks: from vllm.entrypoints.openai.translations.api_router import ( attach_router as register_translations_api_router, @@ -213,6 +207,11 @@ def build_app(args: Namespace, supported_tasks: tuple["SupportedTask", ...]) -> from vllm.entrypoints.pooling import register_pooling_api_routers register_pooling_api_routers(app, supported_tasks) + elif "generate" in supported_tasks: + # For CausalLM models, register score routes to enable generative scoring + from vllm.entrypoints.pooling.score.api_router import router as score_router + + app.include_router(score_router) app.root_path = args.root_path app.add_middleware( @@ -323,14 +322,6 @@ async def init_app_state( engine_client, state, args, request_logger, supported_tasks ) - from vllm.entrypoints.openai.generative_scores.api_router import ( - init_generative_scores_state, - ) - - await init_generative_scores_state( - engine_client, state, args, request_logger, supported_tasks - ) - if "transcription" in supported_tasks: from vllm.entrypoints.openai.translations.api_router import ( init_transcription_state, @@ -349,6 +340,27 @@ async def init_app_state( from vllm.entrypoints.pooling import init_pooling_state init_pooling_state(engine_client, state, args, request_logger, supported_tasks) + elif "generate" in supported_tasks: + # For CausalLM models, initialize score state for generative scoring + from vllm.entrypoints.pooling.score.generative_scores import ( + OpenAIServingGenerativeScores, + ) + from vllm.entrypoints.pooling.score.serving import ServingScores + + generative_scores_handler = OpenAIServingGenerativeScores( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + log_error_stack=args.log_error_stack, + ) + state.openai_serving_scores = ServingScores( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + score_template=None, + log_error_stack=args.log_error_stack, + generative_scores_handler=generative_scores_handler, + ) state.enable_server_load_tracking = args.enable_server_load_tracking state.server_load_metrics = 0 diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 3e92dfdb58b4..8c3c028d05ed 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -86,10 +86,6 @@ ScoreResponse, ScoreTextRequest, ) -from vllm.entrypoints.openai.generative_scores.protocol import ( - GenerativeScoreRequest, - GenerativeScoreResponse, -) from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse from vllm.entrypoints.serve.tokenize.protocol import ( @@ -170,7 +166,6 @@ def 
__init__(self, message: str = "Internal server error"): | ResponsesRequest | IOProcessorRequest | GenerateRequest - | GenerativeScoreRequest ) AnyResponse: TypeAlias = ( @@ -184,7 +179,6 @@ def __init__(self, message: str = "Internal server error"): | ClassificationResponse | ScoreResponse | GenerateResponse - | GenerativeScoreResponse ) diff --git a/vllm/entrypoints/openai/generative_scores/__init__.py b/vllm/entrypoints/openai/generative_scores/__init__.py deleted file mode 100644 index fdebec0a71b8..000000000000 --- a/vllm/entrypoints/openai/generative_scores/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from vllm.entrypoints.openai.generative_scores.api_router import ( - init_generative_scores_state, - register_generative_scores_api_routers, -) -from vllm.entrypoints.openai.generative_scores.protocol import ( - GenerativeScoreItemResult, - GenerativeScoreRequest, - GenerativeScoreResponse, -) -from vllm.entrypoints.openai.generative_scores.serving import ( - OpenAIServingGenerativeScores, -) - -__all__ = [ - "GenerativeScoreItemResult", - "GenerativeScoreRequest", - "GenerativeScoreResponse", - "OpenAIServingGenerativeScores", - "init_generative_scores_state", - "register_generative_scores_api_routers", -] diff --git a/vllm/entrypoints/openai/generative_scores/api_router.py b/vllm/entrypoints/openai/generative_scores/api_router.py deleted file mode 100644 index 6074db195889..000000000000 --- a/vllm/entrypoints/openai/generative_scores/api_router.py +++ /dev/null @@ -1,148 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""API router for the Generative Scores endpoint. - -This module defines the FastAPI routes for the /v1/generative-scores endpoint. -""" - -from http import HTTPStatus -from typing import TYPE_CHECKING - -from fastapi import APIRouter, Depends, Request -from fastapi.responses import JSONResponse -from typing_extensions import assert_never - -from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.generative_scores.protocol import ( - GenerativeScoreRequest, - GenerativeScoreResponse, -) -from vllm.entrypoints.openai.utils import validate_json_request -from vllm.entrypoints.utils import load_aware_call, with_cancellation -from vllm.logger import init_logger - -if TYPE_CHECKING: - from argparse import Namespace - - from starlette.datastructures import State - - from vllm.engine.protocol import EngineClient - from vllm.entrypoints.logger import RequestLogger - from vllm.tasks import SupportedTask - - -router = APIRouter() - -logger = init_logger(__name__) - - -def generative_scores(request: Request): - """Get the generative scores handler from app state.""" - return request.app.state.openai_serving_generative_scores - - -@router.post( - "/v1/generative-scores", - dependencies=[Depends(validate_json_request)], - responses={ - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - }, -) -@with_cancellation -@load_aware_call -async def create_generative_score( - request: GenerativeScoreRequest, - raw_request: Request, -): - """Compute generative scores for the given query and items. - - This endpoint scores the probability of specified token IDs appearing after - the given query and item are appended together. For example: - - query = "<|user|>Is the following city the capital of France? 
" - items = ["Paris <|assistant|>", "London <|assistant|>", "Berlin <|assistant|>"] - label_token_ids = [2332, 1223] # Token IDs for "Yes" and "No" - item_first = False - - This would pass the following prompts to the model: - - "<|user|>Is the following city the capital of France? Paris <|assistant|>" - - "<|user|>Is the following city the capital of France? London <|assistant|>" - - "<|user|>Is the following city the capital of France? Berlin <|assistant|>" - - The API would return the probabilities of the model producing "Yes" and "No" - as the next token for each prompt. - - Args: - request: The GenerativeScoreRequest containing: - - model: The model to use (optional) - - query: The query text or pre-tokenized token IDs - - items: List of item texts or pre-tokenized token IDs - - label_token_ids: List of token IDs to compute probabilities for - - apply_softmax: Whether to normalize over only label tokens (default: True) - - item_first: Whether to prepend items to query (default: False) - - Returns: - GenerativeScoreResponse containing probabilities for each item. - - Raises: - 400 Bad Request: If label_token_ids are out of vocabulary range. - 500 Internal Server Error: If an internal error occurs. - """ - handler = generative_scores(raw_request) - if handler is None: - base_server = raw_request.app.state.openai_serving_tokenization - return base_server.create_error_response( - message="The model does not support Generative Scores API" - ) - - try: - generator = await handler.create_generative_score(request, raw_request) - except Exception as e: - return handler.create_error_response(e) - - if isinstance(generator, ErrorResponse): - return JSONResponse( - content=generator.model_dump(), status_code=generator.error.code - ) - elif isinstance(generator, GenerativeScoreResponse): - return JSONResponse(content=generator.model_dump()) - - assert_never(generator) - - -def register_generative_scores_api_routers(app): - """Register the generative scores API router with the app.""" - app.include_router(router) - - -async def init_generative_scores_state( - engine_client: "EngineClient", - state: "State", - args: "Namespace", - request_logger: "RequestLogger | None", - supported_tasks: tuple["SupportedTask", ...], -): - """Initialize the generative scores serving state. - - Args: - engine_client: The engine client for model inference. - state: The application state to store the handler. - args: Command line arguments. - request_logger: Logger for request logging. - supported_tasks: Tuple of supported tasks. - """ - from vllm.entrypoints.openai.generative_scores.serving import ( - OpenAIServingGenerativeScores, - ) - - # Only initialize for generative models - if "generate" in supported_tasks: - state.openai_serving_generative_scores = OpenAIServingGenerativeScores( - engine_client, - state.openai_serving_models, - request_logger=request_logger, - log_error_stack=args.log_error_stack, - ) - else: - state.openai_serving_generative_scores = None diff --git a/vllm/entrypoints/openai/generative_scores/protocol.py b/vllm/entrypoints/openai/generative_scores/protocol.py deleted file mode 100644 index 7e500b6bcff4..000000000000 --- a/vllm/entrypoints/openai/generative_scores/protocol.py +++ /dev/null @@ -1,138 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Protocol definitions for the Generative Scores API. 
- -This module defines the request and response models for the /v1/generative-scores -endpoint, which computes the probability of specified token IDs appearing as the -next token after a given query+item prompt. -""" - -import time -from typing import Literal - -from pydantic import Field - -from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo -from vllm.utils import random_uuid - - -class GenerativeScoreRequest(OpenAIBaseModel): - """Request for computing generative scores. - - This endpoint scores the probability of specified token IDs appearing after - the given query and item are appended together. For example: - - query = "<|user|>Is the following city the capital of France? " - items = ["Paris <|assistant|>", "London <|assistant|>", "Berlin <|assistant|>"] - label_token_ids = [2332, 1223] # Token IDs for "Yes" and "No" - item_first = False - - This would pass the following prompts to the model: - "<|user|>Is the following city the capital of France? Paris <|assistant|>" - "<|user|>Is the following city the capital of France? London <|assistant|>" - "<|user|>Is the following city the capital of France? Berlin <|assistant|>" - - The API would then return the probabilities of the model producing "Yes" - and "No" as the next token. - - Attributes: - model: The model to use for scoring. Optional, follows existing patterns. - query: The query text or pre-tokenized query token IDs. - items: The item text(s) or pre-tokenized item token IDs. - label_token_ids: List of token IDs to compute probabilities for. - apply_softmax: Whether to normalize probabilities using softmax over only - the label_token_ids (True) or return true model probabilities over - the full vocab for those ids (False). - item_first: If True, prepend items to query. Otherwise append items to query. - temperature: Temperature for logits. Default 0.0 for scoring (greedy). - top_k: Top-k filtering. Default 0 (disabled) for scoring. - top_p: Top-p filtering. Default 1.0 (disabled) for scoring. - add_special_tokens: Whether to add special tokens when tokenizing. - """ - - model: str | None = None - query: str | list[int] = Field( - ..., - description="The query text or pre-tokenized query token IDs.", - ) - items: list[str] | list[list[int]] = Field( - ..., - description="List of item texts or pre-tokenized item token IDs.", - ) - label_token_ids: list[int] = Field( - ..., - description="List of token IDs to compute probabilities for.", - ) - apply_softmax: bool = Field( - default=True, - description=( - "If True, normalize probabilities using softmax over only the " - "label_token_ids. If False, return the true model probabilities " - "over the full vocab for those ids." - ), - ) - item_first: bool = Field( - default=False, - description="If True, prepend items to query. Otherwise append items to query.", - ) - temperature: float | None = Field( - default=0.0, - description="Temperature for logits. Default 0.0 for scoring.", - ) - top_k: int | None = Field( - default=0, - description="Top-k filtering. Default 0 (disabled) for scoring.", - ) - top_p: float | None = Field( - default=1.0, - description="Top-p filtering. Default 1.0 (disabled) for scoring.", - ) - add_special_tokens: bool = Field( - default=True, - description="Whether to add special tokens when tokenizing.", - ) - priority: int = Field( - default=0, - description=( - "The priority of the request (lower means earlier handling; " - "default: 0)." 
- ), - ) - request_id: str = Field( - default_factory=random_uuid, - description="The request_id related to this request.", - ) - - -class GenerativeScoreItemResult(OpenAIBaseModel): - """Result for a single item in the generative scores response. - - Attributes: - index: The index of this item in the input items list. - token_probs: Dictionary mapping token IDs (as strings) to their probabilities. - """ - - index: int - token_probs: dict[str, float] = Field( - description="Mapping of token ID (as string) to probability." - ) - - -class GenerativeScoreResponse(OpenAIBaseModel): - """Response from the generative scores endpoint. - - Attributes: - id: Unique identifier for this response. - object: Type of object, always "generative_score". - created: Unix timestamp of when the response was created. - model: The model used for scoring. - results: List of scoring results, one per input item. - usage: Token usage information. - """ - - id: str = Field(default="") - object: Literal["generative_score"] = "generative_score" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - results: list[GenerativeScoreItemResult] - usage: UsageInfo diff --git a/vllm/entrypoints/pooling/__init__.py b/vllm/entrypoints/pooling/__init__.py index 737f1efe895e..6293a3ea6955 100644 --- a/vllm/entrypoints/pooling/__init__.py +++ b/vllm/entrypoints/pooling/__init__.py @@ -54,6 +54,9 @@ def init_pooling_state( from vllm.entrypoints.pooling.classify.serving import ServingClassification from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding from vllm.entrypoints.pooling.pooling.serving import OpenAIServingPooling + from vllm.entrypoints.pooling.score.generative_scores import ( + OpenAIServingGenerativeScores, + ) from vllm.entrypoints.pooling.score.serving import ServingScores from vllm.tasks import POOLING_TASKS @@ -101,6 +104,22 @@ def init_pooling_state( if "classify" in supported_tasks else None ) + + # Initialize generative scores handler for CausalLM models + # This handler is used to route /v1/score requests to generative scores + # when the model architecture is CausalLM + generative_scores_handler = None + if "embed" in supported_tasks or "score" in supported_tasks: + # Check if we should create the generative scores handler + # by checking if generate task is supported (CausalLM models) + if "generate" in supported_tasks: + generative_scores_handler = OpenAIServingGenerativeScores( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + log_error_stack=args.log_error_stack, + ) + state.openai_serving_scores = ( ServingScores( engine_client, @@ -108,6 +127,7 @@ def init_pooling_state( request_logger=request_logger, score_template=resolved_chat_template, log_error_stack=args.log_error_stack, + generative_scores_handler=generative_scores_handler, ) if ("embed" in supported_tasks or "score" in supported_tasks) else None diff --git a/vllm/entrypoints/openai/generative_scores/serving.py b/vllm/entrypoints/pooling/score/generative_scores.py similarity index 75% rename from vllm/entrypoints/openai/generative_scores/serving.py rename to vllm/entrypoints/pooling/score/generative_scores.py index 6da011ee3832..6999f3449094 100644 --- a/vllm/entrypoints/openai/generative_scores/serving.py +++ b/vllm/entrypoints/pooling/score/generative_scores.py @@ -1,41 +1,162 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Serving class for the Generative Scores API. 
+"""Generative Scores implementation for CausalLM models. -This module implements the OpenAIServingGenerativeScores class which handles -requests to compute the probability of specified token IDs appearing as the -next token after a given query+item prompt. +This module implements generative scoring functionality that computes the +probability of specified token IDs appearing as the next token after a +given query+item prompt. This is used internally by the score endpoint +when the model architecture is a CausalLM. """ import asyncio import math import time from collections.abc import AsyncGenerator, Mapping +from typing import Literal from fastapi import Request +from pydantic import Field from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.engine.protocol import ErrorResponse, UsageInfo -from vllm.entrypoints.openai.engine.serving import OpenAIServing -from vllm.entrypoints.openai.generative_scores.protocol import ( - GenerativeScoreItemResult, - GenerativeScoreRequest, - GenerativeScoreResponse, +from vllm.entrypoints.openai.engine.protocol import ( + ErrorResponse, + OpenAIBaseModel, + UsageInfo, ) +from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.models.serving import OpenAIServingModels from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams +from vllm.utils import random_uuid from vllm.utils.async_utils import merge_async_iterators logger = init_logger(__name__) +# ============================================================================ +# Protocol definitions +# ============================================================================ + + +class GenerativeScoreRequest(OpenAIBaseModel): + """Request for computing generative scores. + + This is used internally when routing score requests to CausalLM models. + + Attributes: + model: The model to use for scoring. Optional, follows existing patterns. + query: The query text or pre-tokenized query token IDs. + items: The item text(s) or pre-tokenized item token IDs. + label_token_ids: List of token IDs to compute probabilities for. + apply_softmax: Whether to normalize probabilities using softmax over only + the label_token_ids (True) or return true model probabilities over + the full vocab for those ids (False). + item_first: If True, prepend items to query. Otherwise append items to query. + temperature: Temperature for logits. Default 0.0 for scoring (greedy). + top_k: Top-k filtering. Default 0 (disabled) for scoring. + top_p: Top-p filtering. Default 1.0 (disabled) for scoring. + add_special_tokens: Whether to add special tokens when tokenizing. + """ + + model: str | None = None + query: str | list[int] = Field( + ..., + description="The query text or pre-tokenized query token IDs.", + ) + items: list[str] | list[list[int]] = Field( + ..., + description="List of item texts or pre-tokenized item token IDs.", + ) + label_token_ids: list[int] = Field( + ..., + description="List of token IDs to compute probabilities for.", + ) + apply_softmax: bool = Field( + default=True, + description=( + "If True, normalize probabilities using softmax over only the " + "label_token_ids. If False, return the true model probabilities " + "over the full vocab for those ids." + ), + ) + item_first: bool = Field( + default=False, + description="If True, prepend items to query. 
Otherwise append items to query.", + ) + temperature: float | None = Field( + default=0.0, + description="Temperature for logits. Default 0.0 for scoring.", + ) + top_k: int | None = Field( + default=0, + description="Top-k filtering. Default 0 (disabled) for scoring.", + ) + top_p: float | None = Field( + default=1.0, + description="Top-p filtering. Default 1.0 (disabled) for scoring.", + ) + add_special_tokens: bool = Field( + default=True, + description="Whether to add special tokens when tokenizing.", + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0)." + ), + ) + request_id: str = Field( + default_factory=random_uuid, + description="The request_id related to this request.", + ) + + +class GenerativeScoreItemResult(OpenAIBaseModel): + """Result for a single item in the generative scores response. + + Attributes: + index: The index of this item in the input items list. + token_probs: Dictionary mapping token IDs (as strings) to their probabilities. + """ + + index: int + token_probs: dict[str, float] = Field( + description="Mapping of token ID (as string) to probability." + ) + + +class GenerativeScoreResponse(OpenAIBaseModel): + """Response from the generative scores computation. + + Attributes: + id: Unique identifier for this response. + object: Type of object, always "generative_score". + created: Unix timestamp of when the response was created. + model: The model used for scoring. + results: List of scoring results, one per input item. + usage: Token usage information. + """ + + id: str = Field(default="") + object: Literal["generative_score"] = "generative_score" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + results: list[GenerativeScoreItemResult] + usage: UsageInfo + + +# ============================================================================ +# Serving class +# ============================================================================ + + class OpenAIServingGenerativeScores(OpenAIServing): - """Serving class for the Generative Scores API. + """Serving class for generative scores computation. This class handles computing the probability of specified token IDs appearing as the next token after concatenating query and item prompts. diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py index 2af43c4a8115..b9e3b58b538f 100644 --- a/vllm/entrypoints/pooling/score/protocol.py +++ b/vllm/entrypoints/pooling/score/protocol.py @@ -28,6 +28,13 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin): default=None, description=("Additional kwargs to pass to the HF processor."), ) + label_token_ids: list[int] | None = Field( + default=None, + description=( + "List of token IDs to compute probabilities for when using " + "CausalLM models. Required for generative scoring." 
+ ), + ) # --8<-- [end:score-extra-params] def to_pooling_params(self): diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index 85c74e5a26c2..018dc01af2c9 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -4,7 +4,7 @@ import time from collections.abc import AsyncGenerator, Mapping from concurrent.futures import ThreadPoolExecutor -from typing import Any +from typing import TYPE_CHECKING, Any from fastapi import Request @@ -43,6 +43,11 @@ from vllm.tokenizers.mistral import MistralTokenizer from vllm.utils.async_utils import make_async, merge_async_iterators +if TYPE_CHECKING: + from vllm.entrypoints.pooling.score.generative_scores import ( + OpenAIServingGenerativeScores, + ) + logger = init_logger(__name__) @@ -55,6 +60,7 @@ def __init__( request_logger: RequestLogger | None, score_template: str | None = None, log_error_stack: bool = False, + generative_scores_handler: "OpenAIServingGenerativeScores | None" = None, ) -> None: super().__init__( engine_client=engine_client, @@ -63,6 +69,7 @@ def __init__( log_error_stack=log_error_stack, ) self.score_template = score_template + self.generative_scores_handler = generative_scores_handler self._tokenizer_executor = ThreadPoolExecutor(max_workers=1) @@ -344,6 +351,107 @@ async def _run_scoring( trace_headers=trace_headers, ) + async def _generative_score( + self, + request: ScoreRequest, + raw_request: Request | None = None, + ) -> ScoreResponse | ErrorResponse: + """ + Handle scoring for CausalLM models using the generative scores API. + + Converts the score request to generative score format, calls the + generative scores handler, and converts the response back to + the standard score response format. + """ + from vllm.entrypoints.pooling.score.generative_scores import ( + GenerativeScoreRequest, + ) + + if self.generative_scores_handler is None: + return self.create_error_response( + "Generative scores handler not initialized. " + "This is required for CausalLM models." 
+ ) + + # Extract data from request + data_1 = request.data_1 + data_2 = request.data_2 + + # Normalize data_1 and data_2 to lists + if isinstance(data_1, str): + data_1 = [data_1] + elif isinstance(data_1, dict): + data_1 = data_1.get("content", []) + + if isinstance(data_2, str): + data_2 = [data_2] + elif isinstance(data_2, dict): + data_2 = data_2.get("content", []) + + # Validate input lens + _validate_score_input_lens(data_1, data_2) + + # If data_1 has single item, expand to match data_2 + if len(data_1) == 1: + data_1 = data_1 * len(data_2) + + request_id = f"score-{self._base_request_id(raw_request)}" + created_time = int(time.time()) + + # Build generative score requests for each pair + # Each pair becomes: query=data_1[i], items=[data_2[i]] + all_results: list[ScoreResponseData] = [] + total_prompt_tokens = 0 + total_completion_tokens = 0 + + for idx, (d1, d2) in enumerate(zip(data_1, data_2)): + gen_request = GenerativeScoreRequest( + model=request.model, + query=d1, + items=[d2], + label_token_ids=request.label_token_ids, + apply_softmax=True, # Always use softmax for normalized probabilities + item_first=False, + add_special_tokens=True, + priority=request.priority, + ) + + gen_response = await self.generative_scores_handler.create_generative_score( + gen_request, raw_request + ) + + if isinstance(gen_response, ErrorResponse): + return gen_response + + # Extract the first label token probability as the score + if gen_response.results and len(gen_response.results) > 0: + token_probs = gen_response.results[0].token_probs + # Get probability of the first label token + first_token_id = str(request.label_token_ids[0]) + score = token_probs.get(first_token_id, 0.0) + + all_results.append( + ScoreResponseData( + index=idx, + score=score, + ) + ) + + total_prompt_tokens += gen_response.usage.prompt_tokens + total_completion_tokens += (gen_response.usage.completion_tokens or 0) + + return ScoreResponse( + id=request_id, + created=created_time, + model=self.models.model_name(), + data=all_results, + usage=UsageInfo( + prompt_tokens=total_prompt_tokens, + total_tokens=total_prompt_tokens + total_completion_tokens, + completion_tokens=total_completion_tokens, + ), + ) + async def create_score( self, request: ScoreRequest, @@ -358,6 +466,16 @@ async def create_score( if error_check_ret is not None: return error_check_ret + # Check if model is CausalLM and route accordingly + if self.model_config.is_causal_lm: + # For CausalLM models, require label_token_ids + if request.label_token_ids is None: + return self.create_error_response( + "label_token_ids is required for CausalLM models. " + "Please provide a list of token IDs to compute probabilities for." + ) + return await self._generative_score(request, raw_request) + request_id = f"score-{self._base_request_id(raw_request)}" created_time = int(time.time()) From 977731776b63a9bcc48e77434a45921d94a9bac5 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Wed, 4 Feb 2026 22:25:00 +0000 Subject: [PATCH 05/28] update docs --- docs/serving/openai_compatible_server.md | 158 ++++++++++------------- 1 file changed, 70 insertions(+), 88 deletions(-) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 4bad9ce4b951..8adc98aa7ffb 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -70,16 +70,14 @@ In addition, we have the following custom APIs: - Applicable to all [pooling models](../models/pooling_models.md). 
- [Classification API](#classification-api) (`/classify`) - Only applicable to [classification models](../models/pooling_models.md). -- [Score API](#score-api) (`/score`) - - Applicable to [embedding models and cross-encoder models](../models/pooling_models.md). +- [Score API](#score-api) (`/score`, `/v1/score`) + - Applicable to [embedding models, cross-encoder models](../models/pooling_models.md), and [CausalLM models](../models/generative_models.md). + - For CausalLM models, computes next-token probabilities for specified `label_token_ids`. - [Re-rank API](#re-rank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`) - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/) - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank) - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response. - Only applicable to [cross-encoder models](../models/pooling_models.md). -- [Generative Scores API](#generative-scores-api) (`/v1/generative-scores`) - - Computes next-token probabilities for specified token IDs. - - Only applicable to [text generation models](../models/generative_models.md). ## Chat Template @@ -829,8 +827,13 @@ these extra parameters are supported instead: ### Score API -Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence or multimodal pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair. -Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1. +Our Score API provides a unified interface for computing similarity or relevance scores: + +- **Embedding models**: Computes cosine similarity between embeddings. +- **Cross-encoder models**: Predicts relevance scores for sentence pairs. +- **CausalLM models**: Computes next-token probabilities for specified `label_token_ids` (requires the `label_token_ids` parameter). + +For embedding and cross-encoder models, the score typically represents similarity on a scale of 0 to 1. You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). @@ -1059,6 +1062,66 @@ The following extra parameters are supported: --8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params" ``` +#### CausalLM Models (Generative Scoring) + +When using a CausalLM model (e.g., Llama, Qwen, Mistral) with the Score API, the endpoint computes the probability of specified token IDs appearing as the next token. This is useful for classification tasks, sentiment analysis, or any scenario where you want to score the likelihood of specific tokens. + +**Requirements for CausalLM models:** + +- The `label_token_ids` parameter is **required** and specifies which token IDs to compute probabilities for. +- The score returned is the probability of the first token in `label_token_ids`. + +##### Example: Classification with CausalLM + +```bash +curl -X POST http://localhost:8000/v1/score \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "queries": "Is this city the capital of France?", + "documents": ["Paris", "London", "Berlin"], + "label_token_ids": [9454, 2753] + }' +``` + +??? 
console "Response" + + ```json + { + "id": "score-abc123", + "object": "list", + "created": 1234567890, + "model": "Qwen/Qwen3-0.6B", + "data": [ + {"index": 0, "object": "score", "score": 0.95}, + {"index": 1, "object": "score", "score": 0.12}, + {"index": 2, "object": "score", "score": 0.08} + ], + "usage": {"prompt_tokens": 45, "total_tokens": 48, "completion_tokens": 3} + } + ``` + +##### How it works + +1. **Prompt Construction**: For each document, builds `prompt = query + document` +2. **Forward Pass**: Runs the model to get next-token logits +3. **Probability Extraction**: Extracts probabilities for specified `label_token_ids` +4. **Softmax Normalization**: Applies softmax over only the label tokens (probabilities sum to 1 over label tokens) +5. **Score Selection**: Returns the probability of the first token in `label_token_ids` as the score + +##### Finding Token IDs + +To find the token IDs for your labels, use the tokenizer: + +```python +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") +yes_id = tokenizer.encode("Yes", add_special_tokens=False)[0] +no_id = tokenizer.encode("No", add_special_tokens=False)[0] +print(f"Yes: {yes_id}, No: {no_id}") +``` + ### Re-rank API Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and @@ -1150,87 +1213,6 @@ The following extra parameters are supported: --8<-- "vllm/entrypoints/pooling/score/protocol.py:rerank-extra-params" ``` -### Generative Scores API - -The Generative Scores API computes the probability of specified token IDs appearing as the next token after a given query+item prompt. This is useful for classification tasks, sentiment analysis, or any scenario where you want to score the likelihood of specific tokens without generating them. - -Unlike traditional logprobs which are limited to top-k tokens, this API: -- Returns probabilities for any specified token IDs in the vocabulary -- Supports batch scoring of multiple items against a single query -- Offers both subset softmax (normalize over label tokens only) and true model probabilities - -#### Example: Sentiment Classification - -```python -import requests - -# Token IDs for "Yes" and "No" (model-specific) -YES_TOKEN_ID = 2332 -NO_TOKEN_ID = 1223 - -response = requests.post( - "http://localhost:8000/v1/generative-scores", - json={ - "model": "meta-llama/Meta-Llama-3-8B-Instruct", - "query": "<|user|>Is the following city the capital of France? ", - "items": [ - "Paris <|assistant|>", - "London <|assistant|>", - "Berlin <|assistant|>" - ], - "label_token_ids": [YES_TOKEN_ID, NO_TOKEN_ID], - "apply_softmax": True, - "item_first": False - } -) - -# Response: -# { -# "results": [ -# {"index": 0, "token_probs": {"2332": 0.95, "1223": 0.05}}, # Paris: Yes=95% -# {"index": 1, "token_probs": {"2332": 0.10, "1223": 0.90}}, # London: No=90% -# {"index": 2, "token_probs": {"2332": 0.05, "1223": 0.95}} # Berlin: No=95% -# ] -# } -``` - -#### Request Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `model` | string | null | Model to use for scoring. | -| `query` | string \| list[int] | required | Query text or pre-tokenized token IDs. | -| `items` | list[string] \| list[list[int]] | required | Items to score against the query. | -| `label_token_ids` | list[int] | required | Token IDs to compute probabilities for. | -| `apply_softmax` | bool | true | If true, normalize over label tokens. 
If false, return true model probs. | -| `item_first` | bool | false | If true, prepend items to query. Otherwise append. | -| `temperature` | float | 0.0 | Temperature for logits (0.0 = greedy). | -| `top_k` | int | 0 | Top-k filtering (0 = disabled for scoring). | -| `top_p` | float | 1.0 | Top-p filtering (1.0 = disabled for scoring). | -| `add_special_tokens` | bool | true | Whether to add special tokens when tokenizing. | - -#### Probability Computation - -The endpoint computes probabilities from the model's next-token distribution: - -1. **Prompt Construction**: For each item, build `prompt = query + item` (or `item + query` if `item_first=True`) -2. **Forward Pass**: Run the model to get next-token logits -3. **Probability Extraction**: Get logprobs for specified `label_token_ids` - -The `apply_softmax` parameter controls normalization: - -- **`apply_softmax=True`** (default): Softmax over only the label tokens - ``` - P(token_i | prompt) = exp(logit_i) / Σ exp(logit_j) for j in label_token_ids - ``` - Probabilities sum to 1 over the label tokens. - -- **`apply_softmax=False`**: True model probabilities - ``` - P(token_i | prompt) = exp(logprob_i) # logprob is already normalized over full vocab - ``` - Probabilities are the actual model confidence over the full vocabulary. - ## Ray Serve LLM Ray Serve LLM enables scalable, production-grade serving of the vLLM engine. It integrates tightly with vLLM and extends it with features such as auto-scaling, load balancing, and back-pressure. From b2b52234ad925af5665549907cf4326213975c14 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Thu, 5 Feb 2026 01:53:54 +0000 Subject: [PATCH 06/28] change test end point to v1/score --- .../openai/test_generative_scores_e2e.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/entrypoints/openai/test_generative_scores_e2e.py b/tests/entrypoints/openai/test_generative_scores_e2e.py index 8e9dcc54c2fa..ba74410cad02 100644 --- a/tests/entrypoints/openai/test_generative_scores_e2e.py +++ b/tests/entrypoints/openai/test_generative_scores_e2e.py @@ -40,7 +40,7 @@ async def test_basic_generative_score_request(self, server: RemoteOpenAIServer): # For Qwen3-0.6B, let's use tokens for "Yes" and "No" # First, let's make a simple request to verify the endpoint works response = requests.post( - server.url_for("v1/generative-scores"), + server.url_for("v1/score"), json={ "model": MODEL_NAME, "query": "Is Paris the capital of France? 
Answer with Yes or No: ", @@ -82,7 +82,7 @@ async def test_generative_score_with_pretokenized_input( ): """Test generative score with pre-tokenized inputs.""" response = requests.post( - server.url_for("v1/generative-scores"), + server.url_for("v1/score"), json={ "model": MODEL_NAME, "query": [100, 200, 300, 400, 500], # Pre-tokenized query @@ -104,7 +104,7 @@ async def test_generative_score_apply_softmax_false( ): """Test generative score with apply_softmax=False returns true model probs.""" response = requests.post( - server.url_for("v1/generative-scores"), + server.url_for("v1/score"), json={ "model": MODEL_NAME, "query": "Test query ", @@ -128,7 +128,7 @@ async def test_generative_score_apply_softmax_false( async def test_generative_score_item_first(self, server: RemoteOpenAIServer): """Test generative score with item_first=True.""" response = requests.post( - server.url_for("v1/generative-scores"), + server.url_for("v1/score"), json={ "model": MODEL_NAME, "query": " is the answer", @@ -148,7 +148,7 @@ async def test_generative_score_validation_empty_items( ): """Test that empty items returns an error.""" response = requests.post( - server.url_for("v1/generative-scores"), + server.url_for("v1/score"), json={ "model": MODEL_NAME, "query": "Test query", @@ -168,7 +168,7 @@ async def test_generative_score_validation_empty_label_tokens( ): """Test that empty label_token_ids returns an error.""" response = requests.post( - server.url_for("v1/generative-scores"), + server.url_for("v1/score"), json={ "model": MODEL_NAME, "query": "Test query", @@ -188,7 +188,7 @@ async def test_generative_score_validation_invalid_token_id( ): """Test that out-of-range token IDs return an error.""" response = requests.post( - server.url_for("v1/generative-scores"), + server.url_for("v1/score"), json={ "model": MODEL_NAME, "query": "Test query", @@ -212,7 +212,7 @@ async def test_generative_score_validation_mixed_item_types( Unprocessable Entity error, not a 400 from our validation logic. 
""" response = requests.post( - server.url_for("v1/generative-scores"), + server.url_for("v1/score"), json={ "model": MODEL_NAME, "query": "Test query", @@ -228,7 +228,7 @@ async def test_generative_score_validation_mixed_item_types( async def test_generative_score_usage_tracking(self, server: RemoteOpenAIServer): """Test that usage info is properly tracked.""" response = requests.post( - server.url_for("v1/generative-scores"), + server.url_for("v1/score"), json={ "model": MODEL_NAME, "query": "A test query with multiple tokens ", @@ -285,7 +285,7 @@ async def test_generative_score_with_many_label_tokens( label_token_ids = list(range(100, 200)) # 100 tokens response = requests.post( - server.url_for("v1/generative-scores"), + server.url_for("v1/score"), json={ "model": MODEL_NAME, "query": "Test query ", @@ -315,11 +315,11 @@ async def test_generative_score_consistency(self, server: RemoteOpenAIServer): } response1 = requests.post( - server.url_for("v1/generative-scores"), + server.url_for("v1/score"), json=request_body, ) response2 = requests.post( - server.url_for("v1/generative-scores"), + server.url_for("v1/score"), json=request_body, ) From 21b35e1c968cec35c8b7b496b0d51daadbaced1c Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Thu, 5 Feb 2026 18:36:52 +0000 Subject: [PATCH 07/28] remove sampling params that we do not need for scoring --- .../pooling/score/generative_scores.py | 25 ++++--------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/vllm/entrypoints/pooling/score/generative_scores.py b/vllm/entrypoints/pooling/score/generative_scores.py index 6999f3449094..4f885c2db40e 100644 --- a/vllm/entrypoints/pooling/score/generative_scores.py +++ b/vllm/entrypoints/pooling/score/generative_scores.py @@ -56,9 +56,6 @@ class GenerativeScoreRequest(OpenAIBaseModel): the label_token_ids (True) or return true model probabilities over the full vocab for those ids (False). item_first: If True, prepend items to query. Otherwise append items to query. - temperature: Temperature for logits. Default 0.0 for scoring (greedy). - top_k: Top-k filtering. Default 0 (disabled) for scoring. - top_p: Top-p filtering. Default 1.0 (disabled) for scoring. add_special_tokens: Whether to add special tokens when tokenizing. """ @@ -87,18 +84,6 @@ class GenerativeScoreRequest(OpenAIBaseModel): default=False, description="If True, prepend items to query. Otherwise append items to query.", ) - temperature: float | None = Field( - default=0.0, - description="Temperature for logits. Default 0.0 for scoring.", - ) - top_k: int | None = Field( - default=0, - description="Top-k filtering. Default 0 (disabled) for scoring.", - ) - top_p: float | None = Field( - default=1.0, - description="Top-p filtering. Default 1.0 (disabled) for scoring.", - ) add_special_tokens: bool = Field( default=True, description="Whether to add special tokens when tokenizing.", @@ -260,13 +245,13 @@ async def create_generative_score( # Create sampling params for scoring # We use max_tokens=1 with logprob_token_ids to efficiently get # logprobs for only the specified label tokens (not full vocab) + # Note: temperature/top_k/top_p don't affect logprobs - they only + # affect the sampling distribution. Logprobs are computed from raw + # logits via log_softmax before any sampling transformations. 
sampling_params = SamplingParams( max_tokens=1, - temperature=request.temperature if request.temperature else 0.0, - top_k=request.top_k if request.top_k is not None else 0, - top_p=request.top_p if request.top_p is not None else 1.0, - logprobs=len(request.label_token_ids), # Request enough logprobs - logprob_token_ids=request.label_token_ids, # Efficient: only these tokens + logprobs=len(request.label_token_ids), + logprob_token_ids=request.label_token_ids, n=1, ) From 16c2f21d974ecc4cea996e667b94e31634dd764d Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Thu, 5 Feb 2026 18:46:39 +0000 Subject: [PATCH 08/28] update tests --- .../openai/test_generative_scores.py | 6 +- .../openai/test_generative_scores_e2e.py | 275 ++++++------------ 2 files changed, 98 insertions(+), 183 deletions(-) diff --git a/tests/entrypoints/openai/test_generative_scores.py b/tests/entrypoints/openai/test_generative_scores.py index 318b8e5d2163..cefb0ecf8095 100644 --- a/tests/entrypoints/openai/test_generative_scores.py +++ b/tests/entrypoints/openai/test_generative_scores.py @@ -153,9 +153,7 @@ class TestGenerativeScoreProtocol: { "apply_softmax": True, "item_first": False, - "temperature": 0.0, - "top_k": 0, - "top_p": 1.0, + "add_special_tokens": True, }, ), # Pre-tokenized input @@ -174,13 +172,11 @@ class TestGenerativeScoreProtocol: { "apply_softmax": False, "item_first": True, - "temperature": 0.5, "add_special_tokens": False, }, { "apply_softmax": False, "item_first": True, - "temperature": 0.5, "add_special_tokens": False, }, ), diff --git a/tests/entrypoints/openai/test_generative_scores_e2e.py b/tests/entrypoints/openai/test_generative_scores_e2e.py index ba74410cad02..5f46e39dff2c 100644 --- a/tests/entrypoints/openai/test_generative_scores_e2e.py +++ b/tests/entrypoints/openai/test_generative_scores_e2e.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""End-to-end tests for the Generative Scores API. +"""End-to-end tests for the Score API with CausalLM models. These tests verify the full HTTP request/response flow using RemoteOpenAIServer. +The Score API with label_token_ids enables generative scoring for CausalLM models. """ import pytest @@ -30,140 +31,85 @@ def server(): yield remote_server -class TestGenerativeScoresE2E: - """End-to-end tests for generative scores API.""" +class TestScoreAPIWithCausalLM: + """End-to-end tests for Score API with CausalLM models using label_token_ids.""" @pytest.mark.asyncio - async def test_basic_generative_score_request(self, server: RemoteOpenAIServer): - """Test basic generative score request with string inputs.""" - # Get some token IDs to test with - we'll use common tokens - # For Qwen3-0.6B, let's use tokens for "Yes" and "No" - # First, let's make a simple request to verify the endpoint works + async def test_basic_score_request(self, server: RemoteOpenAIServer): + """Test basic score request with label_token_ids for CausalLM.""" response = requests.post( server.url_for("v1/score"), json={ "model": MODEL_NAME, - "query": "Is Paris the capital of France? Answer with Yes or No: ", - "items": ["Paris is beautiful.", "London is rainy."], + "queries": "Is Paris the capital of France? 
Answer with Yes or No: ", + "documents": ["Paris is beautiful.", "London is rainy."], "label_token_ids": [9454, 2753], # Common token IDs - "apply_softmax": True, - "item_first": False, }, ) assert response.status_code == 200, f"Response: {response.text}" data = response.json() - # Verify response structure + # Verify response structure matches ScoreResponse format assert "id" in data - assert data["id"].startswith("genscore-") - assert data["object"] == "generative_score" + assert data["id"].startswith("score-") + assert data["object"] == "list" assert "model" in data - assert "results" in data + assert "data" in data assert "usage" in data - assert len(data["results"]) == 2 + assert len(data["data"]) == 2 # Verify each result has expected structure - for i, result in enumerate(data["results"]): + for i, result in enumerate(data["data"]): assert result["index"] == i - assert "token_probs" in result - # Probabilities should be between 0 and 1 - for token_id, prob in result["token_probs"].items(): - assert 0.0 <= prob <= 1.0 - - # With apply_softmax=True, probabilities should sum to ~1 - for result in data["results"]: - prob_sum = sum(result["token_probs"].values()) - assert abs(prob_sum - 1.0) < 1e-5, f"Prob sum: {prob_sum}" + assert result["object"] == "score" + assert "score" in result + # Score should be between 0 and 1 (probability) + assert 0.0 <= result["score"] <= 1.0 @pytest.mark.asyncio - async def test_generative_score_with_pretokenized_input( - self, server: RemoteOpenAIServer - ): - """Test generative score with pre-tokenized inputs.""" + async def test_score_with_multiple_documents(self, server: RemoteOpenAIServer): + """Test score request with multiple documents.""" response = requests.post( server.url_for("v1/score"), json={ "model": MODEL_NAME, - "query": [100, 200, 300, 400, 500], # Pre-tokenized query - "items": [[600, 700], [800, 900, 1000]], # Pre-tokenized items - "label_token_ids": [1, 2, 3], - "apply_softmax": True, + "queries": "Is this city a capital? 
", + "documents": ["Paris", "London", "Berlin", "New York", "Tokyo"], + "label_token_ids": [9454, 2753], }, ) assert response.status_code == 200, f"Response: {response.text}" data = response.json() - assert data["object"] == "generative_score" - assert len(data["results"]) == 2 - - @pytest.mark.asyncio - async def test_generative_score_apply_softmax_false( - self, server: RemoteOpenAIServer - ): - """Test generative score with apply_softmax=False returns true model probs.""" - response = requests.post( - server.url_for("v1/score"), - json={ - "model": MODEL_NAME, - "query": "Test query ", - "items": ["item1", "item2"], - "label_token_ids": [100, 200, 300], - "apply_softmax": False, - }, - ) - - assert response.status_code == 200, f"Response: {response.text}" - data = response.json() - - # With apply_softmax=False, probabilities should NOT sum to 1 - # (they are true model probs over full vocab for those tokens) - for result in data["results"]: - prob_sum = sum(result["token_probs"].values()) - # True probs typically sum to much less than 1 - assert prob_sum < 1.0, f"Prob sum should be < 1: {prob_sum}" - - @pytest.mark.asyncio - async def test_generative_score_item_first(self, server: RemoteOpenAIServer): - """Test generative score with item_first=True.""" - response = requests.post( - server.url_for("v1/score"), - json={ - "model": MODEL_NAME, - "query": " is the answer", - "items": ["Yes", "No"], - "label_token_ids": [100, 200], - "item_first": True, # Items prepended to query - }, - ) - - assert response.status_code == 200, f"Response: {response.text}" - data = response.json() - assert len(data["results"]) == 2 + assert len(data["data"]) == 5 + for i, result in enumerate(data["data"]): + assert result["index"] == i + assert 0.0 <= result["score"] <= 1.0 @pytest.mark.asyncio - async def test_generative_score_validation_empty_items( + async def test_score_validation_missing_label_token_ids( self, server: RemoteOpenAIServer ): - """Test that empty items returns an error.""" + """Test that missing label_token_ids returns an error for CausalLM.""" response = requests.post( server.url_for("v1/score"), json={ "model": MODEL_NAME, - "query": "Test query", - "items": [], - "label_token_ids": [100], + "queries": "Test query", + "documents": ["doc1", "doc2"], + # label_token_ids missing - should error for CausalLM }, ) assert response.status_code == 400 data = response.json() assert "error" in data - assert "at least one item" in data["error"]["message"].lower() + assert "label_token_ids" in data["error"]["message"].lower() @pytest.mark.asyncio - async def test_generative_score_validation_empty_label_tokens( + async def test_score_validation_empty_label_tokens( self, server: RemoteOpenAIServer ): """Test that empty label_token_ids returns an error.""" @@ -171,8 +117,8 @@ async def test_generative_score_validation_empty_label_tokens( server.url_for("v1/score"), json={ "model": MODEL_NAME, - "query": "Test query", - "items": ["item1"], + "queries": "Test query", + "documents": ["item1"], "label_token_ids": [], }, ) @@ -183,7 +129,7 @@ async def test_generative_score_validation_empty_label_tokens( assert "at least one token" in data["error"]["message"].lower() @pytest.mark.asyncio - async def test_generative_score_validation_invalid_token_id( + async def test_score_validation_invalid_token_id( self, server: RemoteOpenAIServer ): """Test that out-of-range token IDs return an error.""" @@ -191,8 +137,8 @@ async def test_generative_score_validation_invalid_token_id( server.url_for("v1/score"), json={ 
"model": MODEL_NAME, - "query": "Test query", - "items": ["item1"], + "queries": "Test query", + "documents": ["item1"], "label_token_ids": [9999999999], # Way out of vocab range }, ) @@ -203,36 +149,14 @@ async def test_generative_score_validation_invalid_token_id( assert "out of vocabulary range" in data["error"]["message"].lower() @pytest.mark.asyncio - async def test_generative_score_validation_mixed_item_types( - self, server: RemoteOpenAIServer - ): - """Test that mixed item types (string and token list) returns a validation error. - - Note: Pydantic validates types at request parsing, so this returns a 422 - Unprocessable Entity error, not a 400 from our validation logic. - """ - response = requests.post( - server.url_for("v1/score"), - json={ - "model": MODEL_NAME, - "query": "Test query", - "items": ["string item", [100, 200]], # Mixed types - "label_token_ids": [100], - }, - ) - - # Pydantic returns 422 for type validation errors - assert response.status_code == 422 - - @pytest.mark.asyncio - async def test_generative_score_usage_tracking(self, server: RemoteOpenAIServer): + async def test_score_usage_tracking(self, server: RemoteOpenAIServer): """Test that usage info is properly tracked.""" response = requests.post( server.url_for("v1/score"), json={ "model": MODEL_NAME, - "query": "A test query with multiple tokens ", - "items": ["item one", "item two", "item three"], + "queries": "A test query with multiple tokens ", + "documents": ["item one", "item two", "item three"], "label_token_ids": [100, 200], }, ) @@ -248,6 +172,61 @@ async def test_generative_score_usage_tracking(self, server: RemoteOpenAIServer) assert usage["completion_tokens"] > 0 assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] + @pytest.mark.asyncio + async def test_score_consistency(self, server: RemoteOpenAIServer): + """Test that scores are consistent across identical requests.""" + request_body = { + "model": MODEL_NAME, + "queries": "Is this consistent? ", + "documents": ["Yes it is."], + "label_token_ids": [100, 200, 300], + } + + response1 = requests.post( + server.url_for("v1/score"), + json=request_body, + ) + response2 = requests.post( + server.url_for("v1/score"), + json=request_body, + ) + + assert response1.status_code == 200 + assert response2.status_code == 200 + + data1 = response1.json() + data2 = response2.json() + + # Scores should be identical for deterministic inference + score1 = data1["data"][0]["score"] + score2 = data2["data"][0]["score"] + + assert abs(score1 - score2) < 1e-6 + + @pytest.mark.asyncio + async def test_score_with_many_label_tokens(self, server: RemoteOpenAIServer): + """Test score with many label tokens.""" + # Use a larger set of label tokens + label_token_ids = list(range(100, 200)) # 100 tokens + + response = requests.post( + server.url_for("v1/score"), + json={ + "model": MODEL_NAME, + "queries": "Test query ", + "documents": ["item1"], + "label_token_ids": label_token_ids, + }, + ) + + assert response.status_code == 200, f"Response: {response.text}" + data = response.json() + + # Score should be the probability of the first label token + result = data["data"][0] + assert "score" in result + assert 0.0 <= result["score"] <= 1.0 + class TestLogprobTokenIds: """Tests to verify logprob_token_ids feature works correctly. 
@@ -276,66 +255,6 @@ async def test_logprob_token_ids_via_completion(self, server: RemoteOpenAIServer assert choice.logprobs is not None assert len(choice.logprobs.top_logprobs) > 0 - @pytest.mark.asyncio - async def test_generative_score_with_many_label_tokens( - self, server: RemoteOpenAIServer - ): - """Test generative score with many label tokens to stress test logprob_token_ids.""" - # Use a larger set of label tokens - label_token_ids = list(range(100, 200)) # 100 tokens - - response = requests.post( - server.url_for("v1/score"), - json={ - "model": MODEL_NAME, - "query": "Test query ", - "items": ["item1"], - "label_token_ids": label_token_ids, - "apply_softmax": True, - }, - ) - - assert response.status_code == 200, f"Response: {response.text}" - data = response.json() - - # Should have probs for all 100 tokens - result = data["results"][0] - assert len(result["token_probs"]) == 100 - - @pytest.mark.asyncio - async def test_generative_score_consistency(self, server: RemoteOpenAIServer): - """Test that generative scores are consistent across identical requests.""" - request_body = { - "model": MODEL_NAME, - "query": "Is this consistent? ", - "items": ["Yes it is."], - "label_token_ids": [100, 200, 300], - "apply_softmax": True, - "temperature": 0.0, # Deterministic - } - - response1 = requests.post( - server.url_for("v1/score"), - json=request_body, - ) - response2 = requests.post( - server.url_for("v1/score"), - json=request_body, - ) - - assert response1.status_code == 200 - assert response2.status_code == 200 - - data1 = response1.json() - data2 = response2.json() - - # Probabilities should be identical for deterministic inference - probs1 = data1["results"][0]["token_probs"] - probs2 = data2["results"][0]["token_probs"] - - for token_id in probs1: - assert abs(probs1[token_id] - probs2[token_id]) < 1e-6 - if __name__ == "__main__": pytest.main([__file__, "-v"]) From c169f6f286c5ad12f19b137cd7a170cfd581eaf2 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Thu, 5 Feb 2026 18:59:53 +0000 Subject: [PATCH 09/28] remove circular import safety net --- vllm/entrypoints/pooling/score/serving.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index 018dc01af2c9..18480c8927fa 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -4,7 +4,7 @@ import time from collections.abc import AsyncGenerator, Mapping from concurrent.futures import ThreadPoolExecutor -from typing import TYPE_CHECKING, Any +from typing import Any from fastapi import Request @@ -43,11 +43,6 @@ from vllm.tokenizers.mistral import MistralTokenizer from vllm.utils.async_utils import make_async, merge_async_iterators -if TYPE_CHECKING: - from vllm.entrypoints.pooling.score.generative_scores import ( - OpenAIServingGenerativeScores, - ) - logger = init_logger(__name__) @@ -60,7 +55,7 @@ def __init__( request_logger: RequestLogger | None, score_template: str | None = None, log_error_stack: bool = False, - generative_scores_handler: "OpenAIServingGenerativeScores | None" = None, + generative_scores_handler: Any = None, # OpenAIServingGenerativeScores ) -> None: super().__init__( engine_client=engine_client, From 1a259d279f41a70223758778c76f0cd298d5d842 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Thu, 5 Feb 2026 22:11:05 +0000 Subject: [PATCH 10/28] move files into the correct test folder --- .../{openai => pooling/score}/test_generative_scores.py | 0 
.../{openai => pooling/score}/test_generative_scores_e2e.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename tests/entrypoints/{openai => pooling/score}/test_generative_scores.py (100%) rename tests/entrypoints/{openai => pooling/score}/test_generative_scores_e2e.py (99%) diff --git a/tests/entrypoints/openai/test_generative_scores.py b/tests/entrypoints/pooling/score/test_generative_scores.py similarity index 100% rename from tests/entrypoints/openai/test_generative_scores.py rename to tests/entrypoints/pooling/score/test_generative_scores.py diff --git a/tests/entrypoints/openai/test_generative_scores_e2e.py b/tests/entrypoints/pooling/score/test_generative_scores_e2e.py similarity index 99% rename from tests/entrypoints/openai/test_generative_scores_e2e.py rename to tests/entrypoints/pooling/score/test_generative_scores_e2e.py index 5f46e39dff2c..40ff9185358b 100644 --- a/tests/entrypoints/openai/test_generative_scores_e2e.py +++ b/tests/entrypoints/pooling/score/test_generative_scores_e2e.py @@ -9,7 +9,7 @@ import pytest import requests -from ...utils import RemoteOpenAIServer +from ....utils import RemoteOpenAIServer MODEL_NAME = "Qwen/Qwen3-0.6B" MODEL_PATH = "/shared/public/elr-models/Qwen/Qwen3-0.6B/e6de91484c29aa9480d55605af694f39b081c455/" From f4b81b57026cbc89c5b948e6f3cd52e32bb61d3d Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Fri, 6 Feb 2026 22:54:16 +0000 Subject: [PATCH 11/28] allow for heterogenous token_id batching to occur in a batch and limit number of token ids in request to 2 --- vllm/entrypoints/pooling/score/protocol.py | 17 ++++++++- vllm/v1/sample/sampler.py | 42 ++++++++++++++-------- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py index b9e3b58b538f..079fd05b040d 100644 --- a/vllm/entrypoints/pooling/score/protocol.py +++ b/vllm/entrypoints/pooling/score/protocol.py @@ -6,6 +6,7 @@ from pydantic import ( BaseModel, Field, + field_validator, ) from vllm import PoolingParams @@ -21,6 +22,9 @@ ) from vllm.utils import random_uuid +# Maximum number of token IDs allowed in label_token_ids +MAX_LABEL_TOKEN_IDS = 2 + class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin): # --8<-- [start:score-extra-params] @@ -32,11 +36,22 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin): default=None, description=( "List of token IDs to compute probabilities for when using " - "CausalLM models. Required for generative scoring." + f"CausalLM models. Required for generative scoring. " + f"Maximum {MAX_LABEL_TOKEN_IDS} token IDs allowed." ), ) # --8<-- [end:score-extra-params] + @field_validator('label_token_ids') + @classmethod + def validate_label_token_ids(cls, v: list[int] | None) -> list[int] | None: + if v is not None and len(v) > MAX_LABEL_TOKEN_IDS: + raise ValueError( + f"label_token_ids must contain at most {MAX_LABEL_TOKEN_IDS} " + f"token IDs, but got {len(v)}" + ) + return v + def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 6bf4e488fda4..8046d8a9acd4 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -153,6 +153,9 @@ def gather_specific_token_logprobs( This is more efficient than computing full vocab logprobs when you only need logprobs for a small set of tokens (e.g., for scoring tasks). 
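+        For example (illustrative), one request in a batch may score the two
+        label token IDs [9454, 2753] while another scores a single token ID,
+        without materializing logprobs for the full vocabulary.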
+ Handles heterogeneous token ID lists across requests in a batch by + padding shorter lists to max length and masking invalid positions. + Args: logits: [batch_size, vocab_size] tensor of logits logprob_token_ids: dict mapping req_index -> list of token IDs @@ -166,29 +169,38 @@ def gather_specific_token_logprobs( return None batch_size = logits.shape[0] - vocab_size = logits.shape[1] - - # For now, assume all requests in the batch have the same token IDs - # (this is the common case for generative_scores API) - # Get the first request's token IDs as the common set - first_token_ids = next(iter(logprob_token_ids.values())) - num_tokens = len(first_token_ids) - - # Create token_ids tensor: [batch_size, num_tokens] - # Include sampled token as first element (like gather_logprobs does) + + # Find max number of tokens across all requests + max_num_tokens = max(len(tids) for tids in logprob_token_ids.values()) + + # Create padded token_ids tensor: [batch_size, max_num_tokens + 1] + # +1 for sampled token in first position token_ids_tensor = torch.zeros( - batch_size, num_tokens + 1, dtype=torch.int64, device=logits.device + batch_size, max_num_tokens + 1, dtype=torch.int64, device=logits.device ) token_ids_tensor[:, 0] = sampled # First column is sampled token - token_ids_tensor[:, 1:] = torch.tensor( - first_token_ids, dtype=torch.int64, device=logits.device + + # Create mask for valid positions (True = valid, False = padded) + valid_mask = torch.zeros( + batch_size, max_num_tokens + 1, dtype=torch.bool, device=logits.device ) + valid_mask[:, 0] = True # Sampled token is always valid + + # Fill in token IDs for each request + for req_idx, token_ids in logprob_token_ids.items(): + num_tokens = len(token_ids) + token_ids_tensor[req_idx, 1:num_tokens + 1] = torch.tensor( + token_ids, dtype=torch.int64, device=logits.device + ) + valid_mask[req_idx, 1:num_tokens + 1] = True - # Compute logprobs efficiently using the Triton kernel + # Compute logprobs using the Triton kernel logprobs = compute_token_logprobs(logits, token_ids_tensor) + + # Mask invalid positions with -inf + logprobs = logprobs.masked_fill(~valid_mask, float('-inf')) # Compute ranks for the sampled token - token_ranks = torch.empty(batch_size, dtype=torch.int64, device=logits.device) sampled_logits = logits.gather(-1, sampled.unsqueeze(-1)) token_ranks = (logits > sampled_logits).sum(dim=-1) From d973f816d4611e3f9765f5af66d6551fc3d30b61 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Fri, 6 Feb 2026 23:16:27 +0000 Subject: [PATCH 12/28] require exactly 2 token ids for generative scoring --- docs/serving/openai_compatible_server.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 8adc98aa7ffb..df16a8c716b0 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -1064,14 +1064,14 @@ The following extra parameters are supported: #### CausalLM Models (Generative Scoring) -When using a CausalLM model (e.g., Llama, Qwen, Mistral) with the Score API, the endpoint computes the probability of specified token IDs appearing as the next token. This is useful for classification tasks, sentiment analysis, or any scenario where you want to score the likelihood of specific tokens. +When using a CausalLM model (e.g., Llama, Qwen, Mistral) with the Score API, the endpoint computes the probability of specified token IDs appearing as the next token. 
This is useful for generative scoring tasks, sentiment analysis, or any scenario where you want to score the likelihood of specific tokens. **Requirements for CausalLM models:** -- The `label_token_ids` parameter is **required** and specifies which token IDs to compute probabilities for. -- The score returned is the probability of the first token in `label_token_ids`. +- The `label_token_ids` parameter is **required** and must contain **exactly 2 token IDs** (for generative scoring). +- The score is computed as: `P(label_token_ids[0]) / (P(label_token_ids[0]) + P(label_token_ids[1]))` -##### Example: Classification with CausalLM +##### Example: Score with CausalLM ```bash curl -X POST http://localhost:8000/v1/score \ @@ -1105,9 +1105,9 @@ curl -X POST http://localhost:8000/v1/score \ 1. **Prompt Construction**: For each document, builds `prompt = query + document` 2. **Forward Pass**: Runs the model to get next-token logits -3. **Probability Extraction**: Extracts probabilities for specified `label_token_ids` -4. **Softmax Normalization**: Applies softmax over only the label tokens (probabilities sum to 1 over label tokens) -5. **Score Selection**: Returns the probability of the first token in `label_token_ids` as the score +3. **Probability Extraction**: Extracts logprobs for the 2 specified `label_token_ids` +4. **Softmax Normalization**: Applies softmax over only the 2 label tokens +5. **Score Computation**: Returns `P(token[0]) / (P(token[0]) + P(token[1]))` as the score ##### Finding Token IDs From 8a9316fcf9a7a2577a72ddefbf1ddb9769c25d05 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Fri, 6 Feb 2026 23:16:36 +0000 Subject: [PATCH 13/28] require exactly 2 token ids for generative scoring --- vllm/entrypoints/pooling/score/protocol.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py index 079fd05b040d..f46ed251d5db 100644 --- a/vllm/entrypoints/pooling/score/protocol.py +++ b/vllm/entrypoints/pooling/score/protocol.py @@ -22,8 +22,8 @@ ) from vllm.utils import random_uuid -# Maximum number of token IDs allowed in label_token_ids -MAX_LABEL_TOKEN_IDS = 2 +# Exact number of token IDs required in label_token_ids for generative scoring +REQUIRED_LABEL_TOKEN_IDS = 2 class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin): @@ -37,7 +37,7 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin): description=( "List of token IDs to compute probabilities for when using " f"CausalLM models. Required for generative scoring. " - f"Maximum {MAX_LABEL_TOKEN_IDS} token IDs allowed." + f"Must contain exactly {REQUIRED_LABEL_TOKEN_IDS} token IDs." 
        ),
    )

    # --8<-- [end:score-extra-params]

@@ -45,10 +45,10 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
    @field_validator('label_token_ids')
    @classmethod
    def validate_label_token_ids(cls, v: list[int] | None) -> list[int] | None:
-        if v is not None and len(v) > MAX_LABEL_TOKEN_IDS:
+        if v is not None and len(v) != REQUIRED_LABEL_TOKEN_IDS:
            raise ValueError(
-                f"label_token_ids must contain at most {MAX_LABEL_TOKEN_IDS} "
-                f"token IDs, but got {len(v)}"
+                f"label_token_ids must contain exactly {REQUIRED_LABEL_TOKEN_IDS} "
+                f"token IDs for generative scoring, but got {len(v)}"
            )
        return v

From 396fb3546253eafb75d43ee48ed1ae81e6770433 Mon Sep 17 00:00:00 2001
From: Vedant Jhaveri
Date: Sat, 7 Feb 2026 00:17:02 +0000
Subject: [PATCH 14/28] move imports to top and solve circular import

---
 vllm/config/model.py | 2 +-
 .../pooling/score/generative_scores.py | 12 +++----
 vllm/entrypoints/pooling/score/serving.py | 10 +++---
 vllm/v1/sample/sampler.py | 33 ++++++++++---------
 4 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index 1c6b5b813cca..0ef93cac5cbb 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import re
 import warnings
 from collections.abc import Callable
 from dataclasses import InitVar, field
@@ -1438,7 +1439,6 @@ def is_causal_lm(self) -> bool:
         Returns True if any architecture in hf_config.architectures
         matches the pattern .*ForCausalLM.* (e.g., LlamaForCausalLM, Qwen2ForCausalLM).
         """
-        import re
         architectures = getattr(self.hf_config, "architectures", [])
         pattern = re.compile(r".*ForCausalLM.*")
         return any(pattern.match(arch) for arch in architectures)
diff --git a/vllm/entrypoints/pooling/score/generative_scores.py b/vllm/entrypoints/pooling/score/generative_scores.py
index 4f885c2db40e..c64b7c9d3f44 100644
--- a/vllm/entrypoints/pooling/score/generative_scores.py
+++ b/vllm/entrypoints/pooling/score/generative_scores.py
@@ -31,6 +31,11 @@
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
+from vllm.tracing import (
+    contains_trace_headers,
+    extract_trace_headers,
+    log_tracing_disabled_warning,
+)
 from vllm.utils import random_uuid
 from vllm.utils.async_utils import merge_async_iterators
 
@@ -480,12 +485,6 @@ async def _get_trace_headers(
         headers: Mapping[str, str],
     ) -> Mapping[str, str] | None:
         """Extract trace headers from request headers."""
-        from vllm.tracing import (
-            contains_trace_headers,
-            extract_trace_headers,
-            log_tracing_disabled_warning,
-        )
-
         if not contains_trace_headers(headers):
             return None
 
@@ -506,7 +505,6 @@ def _base_request_id(
         if raw_request:
             return getattr(raw_request.state, "request_id", None) or \
                 str(id(raw_request))
-        from vllm.utils import random_uuid
         return random_uuid()
 
     def _log_inputs(
diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py
index 18480c8927fa..bccf1590eb24 100644
--- a/vllm/entrypoints/pooling/score/serving.py
+++ b/vllm/entrypoints/pooling/score/serving.py
@@ -26,6 +26,10 @@
     ScoreResponse,
     ScoreResponseData,
 )
+from vllm.entrypoints.pooling.score.generative_scores import (
+    GenerativeScoreRequest,
+    OpenAIServingGenerativeScores,
+)
 from vllm.entrypoints.pooling.score.utils import (
     ScoreContentPartParam,
     ScoreMultiModalParam,
@@ -55,7 +59,7 @@ def __init__(
         request_logger: RequestLogger |
None, score_template: str | None = None, log_error_stack: bool = False, - generative_scores_handler: Any = None, # OpenAIServingGenerativeScores + generative_scores_handler: OpenAIServingGenerativeScores | None = None, ) -> None: super().__init__( engine_client=engine_client, @@ -358,10 +362,6 @@ async def _generative_score( generative scores handler, and converts the response back to the standard score response format. """ - from vllm.entrypoints.pooling.score.generative_scores import ( - GenerativeScoreRequest, - ) - if self.generative_scores_handler is None: return self.create_error_response( "Generative scores handler not initialized. " diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 8046d8a9acd4..21e472802a56 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -148,13 +148,15 @@ def gather_specific_token_logprobs( logprob_token_ids: dict[int, list[int]], sampled: torch.Tensor, ) -> LogprobsTensors | None: - """Compute logprobs for specific token IDs. + """Compute logprobs for specific token IDs using Triton kernel. - This is more efficient than computing full vocab logprobs when you only - need logprobs for a small set of tokens (e.g., for scoring tasks). + This method handles heterogeneous token ID lists across requests by + padding shorter lists to max length and using a fused Triton kernel + for efficient log_softmax + gather computation. - Handles heterogeneous token ID lists across requests in a batch by - padding shorter lists to max length and masking invalid positions. + Benchmarks show the Triton kernel approach is ~1.4x faster than sparse + gather for batch sizes > 1 due to the fused kernel reducing memory + bandwidth requirements. Args: logits: [batch_size, vocab_size] tensor of logits @@ -169,35 +171,36 @@ def gather_specific_token_logprobs( return None batch_size = logits.shape[0] - + device = logits.device + # Find max number of tokens across all requests max_num_tokens = max(len(tids) for tids in logprob_token_ids.values()) - + # Create padded token_ids tensor: [batch_size, max_num_tokens + 1] # +1 for sampled token in first position token_ids_tensor = torch.zeros( - batch_size, max_num_tokens + 1, dtype=torch.int64, device=logits.device + batch_size, max_num_tokens + 1, dtype=torch.int64, device=device ) token_ids_tensor[:, 0] = sampled # First column is sampled token - + # Create mask for valid positions (True = valid, False = padded) valid_mask = torch.zeros( - batch_size, max_num_tokens + 1, dtype=torch.bool, device=logits.device + batch_size, max_num_tokens + 1, dtype=torch.bool, device=device ) valid_mask[:, 0] = True # Sampled token is always valid - + # Fill in token IDs for each request for req_idx, token_ids in logprob_token_ids.items(): num_tokens = len(token_ids) token_ids_tensor[req_idx, 1:num_tokens + 1] = torch.tensor( - token_ids, dtype=torch.int64, device=logits.device + token_ids, dtype=torch.int64, device=device ) valid_mask[req_idx, 1:num_tokens + 1] = True - # Compute logprobs using the Triton kernel + # Compute logprobs using the fused Triton kernel (log_softmax + gather) logprobs = compute_token_logprobs(logits, token_ids_tensor) - - # Mask invalid positions with -inf + + # Mask invalid (padded) positions with -inf logprobs = logprobs.masked_fill(~valid_mask, float('-inf')) # Compute ranks for the sampled token From 933be09a5f85aeafcaeef90cdc652948506118af Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Sat, 7 Feb 2026 00:53:50 +0000 Subject: [PATCH 15/28] add truncation for tokens --- 
vllm/entrypoints/pooling/score/generative_scores.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/pooling/score/generative_scores.py b/vllm/entrypoints/pooling/score/generative_scores.py index c64b7c9d3f44..26f812a15b0f 100644 --- a/vllm/entrypoints/pooling/score/generative_scores.py +++ b/vllm/entrypoints/pooling/score/generative_scores.py @@ -241,7 +241,7 @@ async def create_generative_score( # Build prompts for each item try: engine_prompts, prompt_token_counts = await self._build_prompts( - request, tokenizer + request, tokenizer, self.model_config.max_model_len ) except (ValueError, TypeError) as e: logger.exception("Error building prompts") @@ -388,12 +388,14 @@ async def _build_prompts( self, request: GenerativeScoreRequest, tokenizer, + max_model_len: int, ) -> tuple[list[TokensPrompt], list[int]]: """Build prompts by concatenating query and items. Args: request: The request containing query, items, and settings. tokenizer: The tokenizer to use. + max_model_len: Maximum model context length for truncation. Returns: Tuple of (list of TokensPrompt, list of prompt token counts). @@ -432,6 +434,11 @@ async def _build_prompts( else: prompt_token_ids = query_token_ids + item_token_ids + # Truncate to max_model_len - 1 to leave room for 1 output token + max_prompt_len = max_model_len - 1 + if len(prompt_token_ids) > max_prompt_len: + prompt_token_ids = prompt_token_ids[:max_prompt_len] + engine_prompts.append( TokensPrompt(prompt_token_ids=prompt_token_ids) ) From 8e5b19c7baffd7aec1e2fc312d24697bdd7bd126 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Sat, 7 Feb 2026 01:18:52 +0000 Subject: [PATCH 16/28] consolidate tests and include changes from recent updates --- .../pooling/score/test_generative_scores.py | 375 ++++++------------ .../score/test_generative_scores_e2e.py | 203 ++-------- 2 files changed, 163 insertions(+), 415 deletions(-) diff --git a/tests/entrypoints/pooling/score/test_generative_scores.py b/tests/entrypoints/pooling/score/test_generative_scores.py index cefb0ecf8095..fd022fa27abd 100644 --- a/tests/entrypoints/pooling/score/test_generative_scores.py +++ b/tests/entrypoints/pooling/score/test_generative_scores.py @@ -2,11 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the Generative Scores API. -These tests verify: -1. Request/Response protocol models -2. Probability computation (apply_softmax=True and apply_softmax=False) +Tests cover: +1. Protocol models (request/response construction) +2. Probability computation (softmax normalization) 3. Input validation -4. Error handling +4. Score formula: P(token[0]) / (P(token[0]) + P(token[1])) +5. 
Prompt building and item ordering """ import math @@ -22,8 +23,6 @@ GenerativeScoreItemResult, GenerativeScoreRequest, GenerativeScoreResponse, -) -from vllm.entrypoints.pooling.score.generative_scores import ( OpenAIServingGenerativeScores, ) from vllm.entrypoints.openai.models.protocol import BaseModelPath @@ -33,12 +32,9 @@ from vllm.tokenizers import get_tokenizer from vllm.v1.engine.async_llm import AsyncLLM -# Use local model path for testing MODEL_NAME = "Qwen/Qwen3-0.6B" MODEL_PATH = "/shared/public/elr-models/Qwen/Qwen3-0.6B/e6de91484c29aa9480d55605af694f39b081c455/" -BASE_MODEL_PATHS = [ - BaseModelPath(name=MODEL_NAME, model_path=MODEL_PATH), -] +BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_PATH)] @dataclass @@ -66,7 +62,7 @@ class MockModelConfig: generation_config: str = "auto" media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) skip_tokenizer_init = False - vocab_size = 151936 # Qwen3-0.6B vocab size + vocab_size = 151936 def get_diff_sampling_param(self): return self.diff_sampling_param or {} @@ -75,11 +71,6 @@ def get_vocab_size(self): return self.vocab_size -# ============================================================================ -# Test Fixtures and Helpers -# ============================================================================ - - def _create_mock_engine(): """Create a mock AsyncLLM engine.""" mock_engine = MagicMock(spec=AsyncLLM) @@ -97,32 +88,23 @@ def _create_serving(mock_engine) -> OpenAIServingGenerativeScores: engine_client=mock_engine, base_model_paths=BASE_MODEL_PATHS, ) - return OpenAIServingGenerativeScores( - mock_engine, - models, - request_logger=None, - ) + return OpenAIServingGenerativeScores(mock_engine, models, request_logger=None) -def _create_mock_request_output( - logprobs_dict: dict[int, float], - token_id: int = 100, -) -> RequestOutput: +def _create_mock_request_output(logprobs_dict: dict[int, float]) -> RequestOutput: """Create a mock RequestOutput with specified logprobs.""" logprobs_with_objs = { tid: Logprob(logprob=lp, rank=i + 1) for i, (tid, lp) in enumerate(logprobs_dict.items()) } - completion_output = CompletionOutput( index=0, text="", - token_ids=[token_id], + token_ids=[100], cumulative_logprob=-1.0, logprobs=[logprobs_with_objs], finish_reason="length", ) - return RequestOutput( request_id="test-request", prompt="test prompt", @@ -133,242 +115,174 @@ def _create_mock_request_output( ) -# ============================================================================ -# Protocol Tests (Parameterized) -# ============================================================================ - +class TestProtocolModels: + """Tests for GenerativeScoreRequest and GenerativeScoreResponse.""" -class TestGenerativeScoreProtocol: - """Tests for protocol models - parameterized for efficiency.""" - - @pytest.mark.parametrize( - "query,items,label_ids,extra_kwargs,expected_attrs", - [ - # Basic string input with defaults - ( - "Is this city the capital?", - ["Paris", "London"], - [1234, 5678], - {}, - { - "apply_softmax": True, - "item_first": False, - "add_special_tokens": True, - }, - ), - # Pre-tokenized input - ( - [100, 200, 300], - [[400, 500], [600, 700, 800]], - [1234], - {}, - {"apply_softmax": True, "item_first": False}, - ), - # Custom options - ( - "Test query", - ["Item1"], - [100], - { - "apply_softmax": False, - "item_first": True, - "add_special_tokens": False, - }, - { - "apply_softmax": False, - "item_first": True, - "add_special_tokens": False, - }, - ), - ], - ids=["basic_defaults", 
"pretokenized", "custom_options"], - ) - def test_request_construction( - self, query, items, label_ids, extra_kwargs, expected_attrs - ): - """Test request construction with various inputs and options.""" - request = GenerativeScoreRequest( - query=query, - items=items, - label_token_ids=label_ids, - **extra_kwargs, + def test_request_and_response_all_fields(self): + """Test request construction with all field types and response structure.""" + # Test request with string inputs + req_str = GenerativeScoreRequest( + query="Is this the capital?", + items=["Paris", "London"], + label_token_ids=[9454, 2753], ) - assert request.query == query - assert request.items == items - assert request.label_token_ids == label_ids - for attr, expected in expected_attrs.items(): - assert getattr(request, attr) == expected, f"{attr} mismatch" - - def test_response_structure(self): - """Test response model structure.""" + assert req_str.query == "Is this the capital?" + assert req_str.items == ["Paris", "London"] + assert req_str.label_token_ids == [9454, 2753] + assert req_str.apply_softmax is True # default + assert req_str.item_first is False # default + assert req_str.add_special_tokens is True # default + + # Test request with pre-tokenized inputs and custom options + req_tok = GenerativeScoreRequest( + query=[100, 200, 300], + items=[[400, 500], [600, 700]], + label_token_ids=[1234, 5678], + apply_softmax=False, + item_first=True, + add_special_tokens=False, + ) + assert req_tok.query == [100, 200, 300] + assert req_tok.items == [[400, 500], [600, 700]] + assert req_tok.apply_softmax is False + assert req_tok.item_first is True + assert req_tok.add_special_tokens is False + + # Test response structure response = GenerativeScoreResponse( model="test-model", results=[ - GenerativeScoreItemResult( - index=0, - token_probs={"1234": 0.7, "5678": 0.3}, - ) + GenerativeScoreItemResult(index=0, token_probs={"9454": 0.7, "2753": 0.3}), + GenerativeScoreItemResult(index=1, token_probs={"9454": 0.4, "2753": 0.6}), ], - usage={ - "prompt_tokens": 10, - "total_tokens": 11, - "completion_tokens": 1, - }, + usage={"prompt_tokens": 10, "total_tokens": 12, "completion_tokens": 2}, ) assert response.object == "generative_score" assert response.model == "test-model" - assert len(response.results) == 1 - assert response.results[0].token_probs["1234"] == 0.7 - - -# ============================================================================ -# Probability Computation Tests (Parameterized - replaces 2 test classes) -# ============================================================================ + assert len(response.results) == 2 + assert response.results[0].token_probs["9454"] == 0.7 + assert response.results[1].token_probs["2753"] == 0.6 + assert response.usage.prompt_tokens == 10 class TestProbabilityComputation: - """Unified tests for probability computation - covers both softmax modes.""" + """Tests for _compute_probabilities with both softmax modes.""" @pytest.mark.parametrize( "label_logprobs,apply_softmax,should_sum_to_one", [ - # apply_softmax=True cases (subset softmax, sums to 1) ({100: -1.0, 200: -2.0}, True, True), - ({10: -0.5, 20: -1.5}, True, True), - # Numerical stability with extreme values - ({100: -100.0, 200: -100.5}, True, True), - # apply_softmax=False cases (true probs, don't sum to 1) + ({100: -100.0, 200: -100.5}, True, True), # numerical stability ({100: -1.0, 200: -2.0}, False, False), - ({10: -0.5, 20: -1.5}, False, False), - ], - ids=[ - "softmax_basic", - "softmax_different_values", - 
"softmax_numerical_stability", - "true_probs_basic", - "true_probs_different_values", ], + ids=["softmax_basic", "softmax_extreme_values", "true_probs"], ) - def test_compute_probabilities( - self, label_logprobs, apply_softmax, should_sum_to_one - ): - """Test probability computation with various inputs and modes.""" - serving = OpenAIServingGenerativeScores.__new__( - OpenAIServingGenerativeScores - ) + def test_compute_probabilities(self, label_logprobs, apply_softmax, should_sum_to_one): + """Test probability computation for softmax and true probability modes.""" + serving = OpenAIServingGenerativeScores.__new__(OpenAIServingGenerativeScores) + probs = serving._compute_probabilities(label_logprobs, apply_softmax=apply_softmax) - probs = serving._compute_probabilities( - label_logprobs, apply_softmax=apply_softmax - ) - - # Check sum behavior + # Verify sum behavior total = sum(probs.values()) if should_sum_to_one: - assert abs(total - 1.0) < 1e-6, f"Expected sum=1, got {total}" + assert abs(total - 1.0) < 1e-6 else: - assert total < 1.0, f"True probs should sum <1, got {total}" - - # Verify ordering is preserved (higher logprob = higher prob) - sorted_logprobs = sorted( - label_logprobs.items(), key=lambda x: x[1], reverse=True - ) - sorted_probs = sorted(probs.items(), key=lambda x: x[1], reverse=True) - assert [x[0] for x in sorted_logprobs] == [x[0] for x in sorted_probs] + assert total < 1.0 - # Verify math for specific cases + # Verify math if apply_softmax: - # softmax: exp(x_i - max) / sum(exp(x_j - max)) max_lp = max(label_logprobs.values()) exp_vals = {k: math.exp(v - max_lp) for k, v in label_logprobs.items()} sum_exp = sum(exp_vals.values()) - for token_id, logprob in label_logprobs.items(): - expected = exp_vals[token_id] / sum_exp - assert abs(probs[token_id] - expected) < 1e-9 + for tid, lp in label_logprobs.items(): + assert abs(probs[tid] - exp_vals[tid] / sum_exp) < 1e-9 else: - # true probs: just exp(logprob) - for token_id, logprob in label_logprobs.items(): - expected = math.exp(logprob) - assert abs(probs[token_id] - expected) < 1e-9 - - -# ============================================================================ -# Validation Tests (Parameterized) -# ============================================================================ + for tid, lp in label_logprobs.items(): + assert abs(probs[tid] - math.exp(lp)) < 1e-9 + + def test_score_formula(self): + """Test the score formula: P(token[0]) / (P(token[0]) + P(token[1])).""" + serving = OpenAIServingGenerativeScores.__new__(OpenAIServingGenerativeScores) + + # With logprobs -0.5 and -2.0, softmax gives higher prob to first token + logprobs = {9454: -0.5, 2753: -2.0} + probs = serving._compute_probabilities(logprobs, apply_softmax=True) + + # Score = P(9454) / (P(9454) + P(2753)) = P(9454) since they sum to 1 + score = probs[9454] + + # Manual calculation + exp_0 = math.exp(-0.5) + exp_1 = math.exp(-2.0) + expected_score = exp_0 / (exp_0 + exp_1) + + assert abs(score - expected_score) < 1e-9 + assert score > 0.5 # First token has higher logprob, so higher probability class TestValidation: - """Tests for input validation - parameterized.""" + """Tests for input validation errors.""" @pytest.mark.asyncio @pytest.mark.parametrize( - "request_kwargs,expected_error_substring", + "request_kwargs,expected_error", [ - # Out of range token ID - ( - { - "query": "test query", - "items": ["item1"], - "label_token_ids": [999999], - }, - "out of vocabulary range", - ), - # Empty label_token_ids - ( - { - "query": "test query", - 
"items": ["item1"], - "label_token_ids": [], - }, - "at least one token", - ), - # Empty items - ( - { - "query": "test query", - "items": [], - "label_token_ids": [100], - }, - "at least one item", - ), - # Note: mixed_item_types (string and token list) is validated by - # Pydantic before our code runs, so we test it in e2e tests instead + ({"query": "q", "items": ["i"], "label_token_ids": [999999, 999998]}, "out of vocabulary"), + ({"query": "q", "items": ["i"], "label_token_ids": [100]}, "at least one token"), + ({"query": "q", "items": [], "label_token_ids": [100, 200]}, "at least one item"), ], - ids=["invalid_token_id", "empty_label_tokens", "empty_items"], + ids=["invalid_token_id", "single_token", "empty_items"], ) - async def test_validation_errors(self, request_kwargs, expected_error_substring): + async def test_validation_errors(self, request_kwargs, expected_error): """Test that invalid inputs return appropriate errors.""" mock_engine = _create_mock_engine() serving = _create_serving(mock_engine) - request = GenerativeScoreRequest(model=MODEL_NAME, **request_kwargs) result = await serving.create_generative_score(request, None) assert isinstance(result, ErrorResponse) - assert expected_error_substring in result.error.message.lower() + assert expected_error in result.error.message.lower() -# ============================================================================ -# Integration Tests (with mocked engine) -# ============================================================================ +class TestPromptBuilding: + """Tests for prompt construction and item ordering.""" + @pytest.mark.asyncio + @pytest.mark.parametrize( + "item_first,expected", + [ + (False, [[100, 101, 200, 201], [100, 101, 300, 301]]), # query + item + (True, [[200, 201, 100, 101], [300, 301, 100, 101]]), # item + query + ], + ids=["query_first", "item_first"], + ) + async def test_item_ordering(self, item_first, expected): + """Test that item_first flag controls prompt concatenation order.""" + mock_engine = _create_mock_engine() + serving = _create_serving(mock_engine) -class TestGenerativeScoreGeneration: + request = GenerativeScoreRequest( + query=[100, 101], + items=[[200, 201], [300, 301]], + label_token_ids=[500, 501], + item_first=item_first, + ) + engine_prompts, _ = await serving._build_prompts(request, MagicMock()) + + for i, exp in enumerate(expected): + assert engine_prompts[i]["prompt_token_ids"] == exp + + +class TestGeneration: """Tests for the full generation flow with mocked engine.""" @pytest.mark.asyncio async def test_successful_generation(self): - """Test successful score generation with mocked engine output.""" + """Test successful score generation returns valid response.""" mock_engine = _create_mock_engine() serving = _create_serving(mock_engine) - label_token_ids = [1234, 5678] - mock_logprobs = { - 1234: -0.5, - 5678: -2.0, - 100: -3.0, - 200: -4.0, - } - + mock_logprobs = {1234: -0.5, 5678: -2.0, 100: -3.0} mock_output = _create_mock_request_output(mock_logprobs) async def mock_generate(*args, **kwargs): @@ -378,61 +292,18 @@ async def mock_generate(*args, **kwargs): request = GenerativeScoreRequest( model=MODEL_NAME, - query="Is Paris the capital of France?", + query="Is Paris the capital?", items=["Yes", "No"], - label_token_ids=label_token_ids, - apply_softmax=True, + label_token_ids=[1234, 5678], ) - result = await serving.create_generative_score(request, None) assert isinstance(result, GenerativeScoreResponse) assert len(result.results) == 2 - - # Check probabilities are in valid 
range for item_result in result.results: for prob in item_result.token_probs.values(): assert 0.0 <= prob <= 1.0 - @pytest.mark.asyncio - @pytest.mark.parametrize( - "item_first,expected_prompts", - [ - ( - False, - [ - [100, 101, 102, 200, 201], - [100, 101, 102, 300, 301], - ], - ), - ( - True, - [ - [200, 201, 100, 101, 102], - [300, 301, 100, 101, 102], - ], - ), - ], - ids=["query_first", "item_first"], - ) - async def test_item_ordering(self, item_first, expected_prompts): - """Test that item_first flag correctly controls prompt ordering.""" - mock_engine = _create_mock_engine() - serving = _create_serving(mock_engine) - tokenizer = MagicMock() - - request = GenerativeScoreRequest( - query=[100, 101, 102], - items=[[200, 201], [300, 301]], - label_token_ids=[500], - item_first=item_first, - ) - - engine_prompts, _ = await serving._build_prompts(request, tokenizer) - - for i, expected in enumerate(expected_prompts): - assert engine_prompts[i]["prompt_token_ids"] == expected - if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/tests/entrypoints/pooling/score/test_generative_scores_e2e.py b/tests/entrypoints/pooling/score/test_generative_scores_e2e.py index 40ff9185358b..8aea7480abf0 100644 --- a/tests/entrypoints/pooling/score/test_generative_scores_e2e.py +++ b/tests/entrypoints/pooling/score/test_generative_scores_e2e.py @@ -2,8 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """End-to-end tests for the Score API with CausalLM models. -These tests verify the full HTTP request/response flow using RemoteOpenAIServer. -The Score API with label_token_ids enables generative scoring for CausalLM models. +Tests verify the full HTTP request/response flow using RemoteOpenAIServer. """ import pytest @@ -18,57 +17,54 @@ @pytest.fixture(scope="module") def server(): args = [ - "--dtype", - "bfloat16", - "--max-model-len", - "512", + "--dtype", "bfloat16", + "--max-model-len", "512", "--enforce-eager", - "--max-num-seqs", - "32", + "--max-num-seqs", "32", ] - with RemoteOpenAIServer(MODEL_PATH, args) as remote_server: yield remote_server class TestScoreAPIWithCausalLM: - """End-to-end tests for Score API with CausalLM models using label_token_ids.""" + """End-to-end tests for Score API with CausalLM models.""" @pytest.mark.asyncio - async def test_basic_score_request(self, server: RemoteOpenAIServer): - """Test basic score request with label_token_ids for CausalLM.""" + async def test_basic_score_and_response_structure(self, server: RemoteOpenAIServer): + """Test basic score request and verify response structure.""" response = requests.post( server.url_for("v1/score"), json={ "model": MODEL_NAME, - "queries": "Is Paris the capital of France? Answer with Yes or No: ", + "queries": "Is Paris the capital of France? 
Answer Yes or No: ", "documents": ["Paris is beautiful.", "London is rainy."], - "label_token_ids": [9454, 2753], # Common token IDs + "label_token_ids": [9454, 2753], }, ) - assert response.status_code == 200, f"Response: {response.text}" data = response.json() - # Verify response structure matches ScoreResponse format - assert "id" in data + # Verify response structure assert data["id"].startswith("score-") assert data["object"] == "list" assert "model" in data - assert "data" in data assert "usage" in data assert len(data["data"]) == 2 - # Verify each result has expected structure + # Verify each result for i, result in enumerate(data["data"]): assert result["index"] == i assert result["object"] == "score" - assert "score" in result - # Score should be between 0 and 1 (probability) assert 0.0 <= result["score"] <= 1.0 + # Verify usage tracking + usage = data["usage"] + assert usage["prompt_tokens"] > 0 + assert usage["completion_tokens"] > 0 + assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] + @pytest.mark.asyncio - async def test_score_with_multiple_documents(self, server: RemoteOpenAIServer): + async def test_multiple_documents(self, server: RemoteOpenAIServer): """Test score request with multiple documents.""" response = requests.post( server.url_for("v1/score"), @@ -79,19 +75,12 @@ async def test_score_with_multiple_documents(self, server: RemoteOpenAIServer): "label_token_ids": [9454, 2753], }, ) - - assert response.status_code == 200, f"Response: {response.text}" + assert response.status_code == 200 data = response.json() - assert len(data["data"]) == 5 - for i, result in enumerate(data["data"]): - assert result["index"] == i - assert 0.0 <= result["score"] <= 1.0 @pytest.mark.asyncio - async def test_score_validation_missing_label_token_ids( - self, server: RemoteOpenAIServer - ): + async def test_validation_missing_label_token_ids(self, server: RemoteOpenAIServer): """Test that missing label_token_ids returns an error for CausalLM.""" response = requests.post( server.url_for("v1/score"), @@ -99,161 +88,49 @@ async def test_score_validation_missing_label_token_ids( "model": MODEL_NAME, "queries": "Test query", "documents": ["doc1", "doc2"], - # label_token_ids missing - should error for CausalLM }, ) - assert response.status_code == 400 - data = response.json() - assert "error" in data - assert "label_token_ids" in data["error"]["message"].lower() + assert "label_token_ids" in response.json()["error"]["message"].lower() @pytest.mark.asyncio - async def test_score_validation_empty_label_tokens( - self, server: RemoteOpenAIServer - ): - """Test that empty label_token_ids returns an error.""" + @pytest.mark.parametrize( + "label_token_ids,expected_status", + [ + ([100], 422), # Wrong count (1 instead of 2) + ([100, 200, 300], 422), # Wrong count (3 instead of 2) + ([9999999999, 9999999998], 400), # Out of vocab range + ], + ids=["single_token", "three_tokens", "invalid_token_ids"], + ) + async def test_validation_errors(self, server: RemoteOpenAIServer, label_token_ids, expected_status): + """Test validation errors for various invalid inputs.""" response = requests.post( server.url_for("v1/score"), json={ "model": MODEL_NAME, "queries": "Test query", "documents": ["item1"], - "label_token_ids": [], - }, - ) - - assert response.status_code == 400 - data = response.json() - assert "error" in data - assert "at least one token" in data["error"]["message"].lower() - - @pytest.mark.asyncio - async def test_score_validation_invalid_token_id( - self, 
server: RemoteOpenAIServer - ): - """Test that out-of-range token IDs return an error.""" - response = requests.post( - server.url_for("v1/score"), - json={ - "model": MODEL_NAME, - "queries": "Test query", - "documents": ["item1"], - "label_token_ids": [9999999999], # Way out of vocab range - }, - ) - - assert response.status_code == 400 - data = response.json() - assert "error" in data - assert "out of vocabulary range" in data["error"]["message"].lower() - - @pytest.mark.asyncio - async def test_score_usage_tracking(self, server: RemoteOpenAIServer): - """Test that usage info is properly tracked.""" - response = requests.post( - server.url_for("v1/score"), - json={ - "model": MODEL_NAME, - "queries": "A test query with multiple tokens ", - "documents": ["item one", "item two", "item three"], - "label_token_ids": [100, 200], + "label_token_ids": label_token_ids, }, ) - - assert response.status_code == 200 - data = response.json() - - usage = data["usage"] - assert "prompt_tokens" in usage - assert "completion_tokens" in usage - assert "total_tokens" in usage - assert usage["prompt_tokens"] > 0 - assert usage["completion_tokens"] > 0 - assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] + assert response.status_code == expected_status @pytest.mark.asyncio async def test_score_consistency(self, server: RemoteOpenAIServer): - """Test that scores are consistent across identical requests.""" + """Test that scores are deterministic across identical requests.""" request_body = { "model": MODEL_NAME, "queries": "Is this consistent? ", "documents": ["Yes it is."], - "label_token_ids": [100, 200, 300], + "label_token_ids": [100, 200], } - response1 = requests.post( - server.url_for("v1/score"), - json=request_body, - ) - response2 = requests.post( - server.url_for("v1/score"), - json=request_body, - ) - - assert response1.status_code == 200 - assert response2.status_code == 200 - - data1 = response1.json() - data2 = response2.json() - - # Scores should be identical for deterministic inference - score1 = data1["data"][0]["score"] - score2 = data2["data"][0]["score"] - - assert abs(score1 - score2) < 1e-6 - - @pytest.mark.asyncio - async def test_score_with_many_label_tokens(self, server: RemoteOpenAIServer): - """Test score with many label tokens.""" - # Use a larger set of label tokens - label_token_ids = list(range(100, 200)) # 100 tokens - - response = requests.post( - server.url_for("v1/score"), - json={ - "model": MODEL_NAME, - "queries": "Test query ", - "documents": ["item1"], - "label_token_ids": label_token_ids, - }, - ) - - assert response.status_code == 200, f"Response: {response.text}" - data = response.json() - - # Score should be the probability of the first label token - result = data["data"][0] - assert "score" in result - assert 0.0 <= result["score"] <= 1.0 - - -class TestLogprobTokenIds: - """Tests to verify logprob_token_ids feature works correctly. - - These tests verify that the logprob_token_ids field in SamplingParams - works correctly, which is the underlying mechanism for generative scores. 
- """ - - @pytest.mark.asyncio - async def test_logprob_token_ids_via_completion(self, server: RemoteOpenAIServer): - """Test that logprob_token_ids returns correct logprobs for specified tokens.""" - # Use the completions API directly to test logprob_token_ids - client = server.get_client() - - # Request completion with logprobs - response = client.completions.create( - model=MODEL_NAME, - prompt="The capital of France is", - max_tokens=1, - logprobs=5, # Get top 5 logprobs - temperature=0.0, - ) + r1 = requests.post(server.url_for("v1/score"), json=request_body) + r2 = requests.post(server.url_for("v1/score"), json=request_body) - assert len(response.choices) == 1 - choice = response.choices[0] - assert choice.logprobs is not None - assert len(choice.logprobs.top_logprobs) > 0 + assert r1.status_code == 200 and r2.status_code == 200 + assert abs(r1.json()["data"][0]["score"] - r2.json()["data"][0]["score"]) < 1e-6 if __name__ == "__main__": From 55444b513ee44b4babf7b18598b3590cc4f40f22 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Fri, 13 Feb 2026 21:56:22 +0000 Subject: [PATCH 17/28] optimize batch processing --- vllm/entrypoints/pooling/score/serving.py | 63 ++++++++++++++--------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index bccf1590eb24..6ac0b3daf349 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import time +from collections import OrderedDict from collections.abc import AsyncGenerator, Mapping from concurrent.futures import ThreadPoolExecutor from typing import Any @@ -386,24 +387,45 @@ async def _generative_score( # Validate input lens _validate_score_input_lens(data_1, data_2) - # If data_1 has single item, expand to match data_2 - if len(data_1) == 1: - data_1 = data_1 * len(data_2) - request_id = f"score-{self._base_request_id(raw_request)}" created_time = int(time.time()) - # Build generative score requests for each pair - # Each pair becomes: query=data_1[i], items=[data_2[i]] - all_results: list[ScoreResponseData] = [] + # Group documents by query to maximize batching. + # create_generative_score already handles multiple items per query + # in parallel via merge_async_iterators, so we batch all documents + # sharing the same query into a single call. 
+ # + # Common case: 1 query, N documents -> 1 batched call (not N calls) + # N:N case: group by unique query -> 1 call per unique query + if len(data_1) == 1: + # Fast path: single query, all documents batched in one call + query_groups = [ + (data_1[0], list(data_2), list(range(len(data_2)))) + ] + else: + # N:N case: group documents by identical query text + groups = OrderedDict() + for idx, (q, d) in enumerate(zip(data_1, data_2)): + key = q if isinstance(q, str) else str(q) + if key not in groups: + groups[key] = ([], []) + groups[key][0].append(d) + groups[key][1].append(idx) + query_groups = [ + (q, items, indices) + for q, (items, indices) in groups.items() + ] + + all_results = [None] * len(data_2) if len(data_1) == 1 else [None] * len(data_1) # type: ignore[list-item] total_prompt_tokens = 0 total_completion_tokens = 0 + first_token_id = str(request.label_token_ids[0]) - for idx, (d1, d2) in enumerate(zip(data_1, data_2)): + for query, items, indices in query_groups: gen_request = GenerativeScoreRequest( model=request.model, - query=d1, - items=[d2], + query=query, + items=items, label_token_ids=request.label_token_ids, apply_softmax=True, # Always use softmax for normalized probabilities item_first=False, @@ -418,18 +440,13 @@ async def _generative_score( if isinstance(gen_response, ErrorResponse): return gen_response - # Extract the first label token probability as the score - if gen_response.results and len(gen_response.results) > 0: - token_probs = gen_response.results[0].token_probs - # Get probability of the first label token - first_token_id = str(request.label_token_ids[0]) - score = token_probs.get(first_token_id, 0.0) - - all_results.append( - ScoreResponseData( - index=idx, - score=score, - ) + # Map results back to their original indices + for result in gen_response.results: + original_idx = indices[result.index] + score = result.token_probs.get(first_token_id, 0.0) + all_results[original_idx] = ScoreResponseData( + index=original_idx, + score=score, ) total_prompt_tokens += gen_response.usage.prompt_tokens @@ -439,7 +456,7 @@ async def _generative_score( id=request_id, created=created_time, model=self.models.model_name(), - data=all_results, + data=[r for r in all_results if r is not None], usage=UsageInfo( prompt_tokens=total_prompt_tokens, total_tokens=total_prompt_tokens + total_completion_tokens, From a97b472b784a8f9c63980abcffda3ab1791f6c05 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Mon, 23 Mar 2026 13:23:27 -0700 Subject: [PATCH 18/28] Move generative scoring out of pooling into standalone /generative_scores endpoint - Create vllm/entrypoints/openai/generative_scores/ with its own api_router, serving, and protocol - Register /generative_scores route when task is "generate", independent of pooling tasks - Remove is_causal_lm routing hack; works with any generative model - Remove generative_scores_handler from pooling ServingScores - Remove label_token_ids from pooling ScoreRequestMixin - Move tests to tests/entrypoints/openai/generative_scores/ Co-Authored-By: Claude Opus 4.6 (1M context) --- .../openai/generative_scores/__init__.py | 2 + .../test_generative_scores.py | 10 +- .../test_generative_scores_e2e.py | 86 +++++++----- vllm/entrypoints/openai/api_server.py | 34 ++--- .../openai/generative_scores/__init__.py | 2 + .../openai/generative_scores/api_router.py | 90 ++++++++++++ .../generative_scores/serving.py} | 16 +-- vllm/entrypoints/pooling/__init__.py | 20 --- vllm/entrypoints/pooling/score/protocol.py | 28 +--- 
vllm/entrypoints/pooling/score/serving.py | 130 ------------------ 10 files changed, 170 insertions(+), 248 deletions(-) create mode 100644 tests/entrypoints/openai/generative_scores/__init__.py rename tests/entrypoints/{pooling/score => openai/generative_scores}/test_generative_scores.py (99%) rename tests/entrypoints/{pooling/score => openai/generative_scores}/test_generative_scores_e2e.py (54%) create mode 100644 vllm/entrypoints/openai/generative_scores/__init__.py create mode 100644 vllm/entrypoints/openai/generative_scores/api_router.py rename vllm/entrypoints/{pooling/score/generative_scores.py => openai/generative_scores/serving.py} (98%) diff --git a/tests/entrypoints/openai/generative_scores/__init__.py b/tests/entrypoints/openai/generative_scores/__init__.py new file mode 100644 index 000000000000..208f01a7cb5e --- /dev/null +++ b/tests/entrypoints/openai/generative_scores/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/tests/entrypoints/pooling/score/test_generative_scores.py b/tests/entrypoints/openai/generative_scores/test_generative_scores.py similarity index 99% rename from tests/entrypoints/pooling/score/test_generative_scores.py rename to tests/entrypoints/openai/generative_scores/test_generative_scores.py index fd022fa27abd..bbaed4d43ca2 100644 --- a/tests/entrypoints/pooling/score/test_generative_scores.py +++ b/tests/entrypoints/openai/generative_scores/test_generative_scores.py @@ -19,7 +19,7 @@ from vllm.config.multimodal import MultiModalConfig from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.pooling.score.generative_scores import ( +from vllm.entrypoints.openai.generative_scores.serving import ( GenerativeScoreItemResult, GenerativeScoreRequest, GenerativeScoreResponse, @@ -203,19 +203,19 @@ def test_compute_probabilities(self, label_logprobs, apply_softmax, should_sum_t def test_score_formula(self): """Test the score formula: P(token[0]) / (P(token[0]) + P(token[1])).""" serving = OpenAIServingGenerativeScores.__new__(OpenAIServingGenerativeScores) - + # With logprobs -0.5 and -2.0, softmax gives higher prob to first token logprobs = {9454: -0.5, 2753: -2.0} probs = serving._compute_probabilities(logprobs, apply_softmax=True) - + # Score = P(9454) / (P(9454) + P(2753)) = P(9454) since they sum to 1 score = probs[9454] - + # Manual calculation exp_0 = math.exp(-0.5) exp_1 = math.exp(-2.0) expected_score = exp_0 / (exp_0 + exp_1) - + assert abs(score - expected_score) < 1e-9 assert score > 0.5 # First token has higher logprob, so higher probability diff --git a/tests/entrypoints/pooling/score/test_generative_scores_e2e.py b/tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py similarity index 54% rename from tests/entrypoints/pooling/score/test_generative_scores_e2e.py rename to tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py index 8aea7480abf0..13fcb85d9074 100644 --- a/tests/entrypoints/pooling/score/test_generative_scores_e2e.py +++ b/tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""End-to-end tests for the Score API with CausalLM models. +"""End-to-end tests for the Generative Scores API. Tests verify the full HTTP request/response flow using RemoteOpenAIServer. 
""" @@ -26,18 +26,18 @@ def server(): yield remote_server -class TestScoreAPIWithCausalLM: - """End-to-end tests for Score API with CausalLM models.""" +class TestGenerativeScoresAPI: + """End-to-end tests for the Generative Scores API.""" @pytest.mark.asyncio async def test_basic_score_and_response_structure(self, server: RemoteOpenAIServer): - """Test basic score request and verify response structure.""" + """Test basic generative score request and verify response structure.""" response = requests.post( - server.url_for("v1/score"), + server.url_for("generative_scores"), json={ "model": MODEL_NAME, - "queries": "Is Paris the capital of France? Answer Yes or No: ", - "documents": ["Paris is beautiful.", "London is rainy."], + "query": "Is Paris the capital of France? Answer Yes or No: ", + "items": ["Paris is beautiful.", "London is rainy."], "label_token_ids": [9454, 2753], }, ) @@ -45,17 +45,18 @@ async def test_basic_score_and_response_structure(self, server: RemoteOpenAIServ data = response.json() # Verify response structure - assert data["id"].startswith("score-") - assert data["object"] == "list" + assert data["id"].startswith("genscore-") + assert data["object"] == "generative_score" assert "model" in data assert "usage" in data - assert len(data["data"]) == 2 + assert len(data["results"]) == 2 # Verify each result - for i, result in enumerate(data["data"]): + for i, result in enumerate(data["results"]): assert result["index"] == i - assert result["object"] == "score" - assert 0.0 <= result["score"] <= 1.0 + assert "token_probs" in result + for prob in result["token_probs"].values(): + assert 0.0 <= prob <= 1.0 # Verify usage tracking usage = data["usage"] @@ -64,53 +65,65 @@ async def test_basic_score_and_response_structure(self, server: RemoteOpenAIServ assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] @pytest.mark.asyncio - async def test_multiple_documents(self, server: RemoteOpenAIServer): - """Test score request with multiple documents.""" + async def test_multiple_items(self, server: RemoteOpenAIServer): + """Test generative score request with multiple items.""" response = requests.post( - server.url_for("v1/score"), + server.url_for("generative_scores"), json={ "model": MODEL_NAME, - "queries": "Is this city a capital? ", - "documents": ["Paris", "London", "Berlin", "New York", "Tokyo"], + "query": "Is this city a capital? 
", + "items": ["Paris", "London", "Berlin", "New York", "Tokyo"], "label_token_ids": [9454, 2753], }, ) assert response.status_code == 200 data = response.json() - assert len(data["data"]) == 5 + assert len(data["results"]) == 5 @pytest.mark.asyncio async def test_validation_missing_label_token_ids(self, server: RemoteOpenAIServer): - """Test that missing label_token_ids returns an error for CausalLM.""" + """Test that missing label_token_ids returns a validation error.""" response = requests.post( - server.url_for("v1/score"), + server.url_for("generative_scores"), json={ "model": MODEL_NAME, - "queries": "Test query", - "documents": ["doc1", "doc2"], + "query": "Test query", + "items": ["item1", "item2"], + }, + ) + # Pydantic validation error for missing required field + assert response.status_code == 422 + + @pytest.mark.asyncio + async def test_validation_empty_items(self, server: RemoteOpenAIServer): + """Test that empty items returns an error.""" + response = requests.post( + server.url_for("generative_scores"), + json={ + "model": MODEL_NAME, + "query": "Test query", + "items": [], + "label_token_ids": [100, 200], }, ) assert response.status_code == 400 - assert "label_token_ids" in response.json()["error"]["message"].lower() @pytest.mark.asyncio @pytest.mark.parametrize( "label_token_ids,expected_status", [ - ([100], 422), # Wrong count (1 instead of 2) - ([100, 200, 300], 422), # Wrong count (3 instead of 2) ([9999999999, 9999999998], 400), # Out of vocab range ], - ids=["single_token", "three_tokens", "invalid_token_ids"], + ids=["invalid_token_ids"], ) async def test_validation_errors(self, server: RemoteOpenAIServer, label_token_ids, expected_status): """Test validation errors for various invalid inputs.""" response = requests.post( - server.url_for("v1/score"), + server.url_for("generative_scores"), json={ "model": MODEL_NAME, - "queries": "Test query", - "documents": ["item1"], + "query": "Test query", + "items": ["item1"], "label_token_ids": label_token_ids, }, ) @@ -121,16 +134,19 @@ async def test_score_consistency(self, server: RemoteOpenAIServer): """Test that scores are deterministic across identical requests.""" request_body = { "model": MODEL_NAME, - "queries": "Is this consistent? ", - "documents": ["Yes it is."], + "query": "Is this consistent? 
", + "items": ["Yes it is."], "label_token_ids": [100, 200], } - r1 = requests.post(server.url_for("v1/score"), json=request_body) - r2 = requests.post(server.url_for("v1/score"), json=request_body) + r1 = requests.post(server.url_for("generative_scores"), json=request_body) + r2 = requests.post(server.url_for("generative_scores"), json=request_body) assert r1.status_code == 200 and r2.status_code == 200 - assert abs(r1.json()["data"][0]["score"] - r2.json()["data"][0]["score"]) < 1e-6 + r1_probs = r1.json()["results"][0]["token_probs"] + r2_probs = r2.json()["results"][0]["token_probs"] + for key in r1_probs: + assert abs(r1_probs[key] - r2_probs[key]) < 1e-6 if __name__ == "__main__": diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 944a34cdecd0..2fcd4bd659b6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -251,11 +251,13 @@ def build_app( from vllm.entrypoints.pooling import register_pooling_api_routers register_pooling_api_routers(app, supported_tasks) - elif "generate" in supported_tasks: - # For CausalLM models, register score routes to enable generative scoring - from vllm.entrypoints.pooling.score.api_router import router as score_router - app.include_router(score_router) + if "generate" in supported_tasks: + from vllm.entrypoints.openai.generative_scores.api_router import ( + register_generative_scores_api_router, + ) + + register_generative_scores_api_router(app) app.root_path = args.root_path app.add_middleware( @@ -405,26 +407,14 @@ async def init_app_state( from vllm.entrypoints.pooling import init_pooling_state init_pooling_state(engine_client, state, args, request_logger, supported_tasks) - elif "generate" in supported_tasks: - # For CausalLM models, initialize score state for generative scoring - from vllm.entrypoints.pooling.score.generative_scores import ( - OpenAIServingGenerativeScores, - ) - from vllm.entrypoints.pooling.score.serving import ServingScores - generative_scores_handler = OpenAIServingGenerativeScores( - engine_client, - state.openai_serving_models, - request_logger=request_logger, - log_error_stack=args.log_error_stack, + if "generate" in supported_tasks: + from vllm.entrypoints.openai.generative_scores.api_router import ( + init_generative_scores_state, ) - state.openai_serving_scores = ServingScores( - engine_client, - state.openai_serving_models, - request_logger=request_logger, - score_template=None, - log_error_stack=args.log_error_stack, - generative_scores_handler=generative_scores_handler, + + await init_generative_scores_state( + engine_client, state, args, request_logger ) state.enable_server_load_tracking = args.enable_server_load_tracking diff --git a/vllm/entrypoints/openai/generative_scores/__init__.py b/vllm/entrypoints/openai/generative_scores/__init__.py new file mode 100644 index 000000000000..208f01a7cb5e --- /dev/null +++ b/vllm/entrypoints/openai/generative_scores/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/entrypoints/openai/generative_scores/api_router.py b/vllm/entrypoints/openai/generative_scores/api_router.py new file mode 100644 index 000000000000..1a471e47d773 --- /dev/null +++ b/vllm/entrypoints/openai/generative_scores/api_router.py @@ -0,0 +1,90 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from http import HTTPStatus +from typing import TYPE_CHECKING + 
+from fastapi import APIRouter, Depends, FastAPI, Request +from fastapi.responses import JSONResponse + +from vllm.entrypoints.openai.engine.protocol import ErrorResponse +from vllm.entrypoints.openai.generative_scores.serving import ( + GenerativeScoreResponse, + OpenAIServingGenerativeScores, +) +from vllm.entrypoints.openai.utils import validate_json_request +from vllm.entrypoints.utils import load_aware_call, with_cancellation +from vllm.logger import init_logger + +if TYPE_CHECKING: + from argparse import Namespace + + from starlette.datastructures import State + + from vllm.engine.protocol import EngineClient + from vllm.entrypoints.logger import RequestLogger + +router = APIRouter() + +logger = init_logger(__name__) + + +def generative_scores(request: Request) -> OpenAIServingGenerativeScores | None: + return request.app.state.serving_generative_scores + + +@router.post( + "/generative_scores", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def create_generative_score(request: Request): + handler = generative_scores(request) + if handler is None: + raise NotImplementedError( + "The model does not support the Generative Scores API" + ) + + raw_body = await request.json() + + from vllm.entrypoints.openai.generative_scores.serving import ( + GenerativeScoreRequest, + ) + + gen_request = GenerativeScoreRequest(**raw_body) + result = await handler.create_generative_score(gen_request, request) + + if isinstance(result, ErrorResponse): + return JSONResponse( + content=result.model_dump(), status_code=result.error.code + ) + elif isinstance(result, GenerativeScoreResponse): + return JSONResponse(content=result.model_dump()) + + raise ValueError(f"Unexpected response type: {type(result)}") + + +def register_generative_scores_api_router(app: FastAPI): + app.include_router(router) + + +async def init_generative_scores_state( + engine_client: "EngineClient", + state: "State", + args: "Namespace", + request_logger: "RequestLogger | None", +): + from vllm.entrypoints.openai.generative_scores.serving import ( + OpenAIServingGenerativeScores, + ) + + state.serving_generative_scores = OpenAIServingGenerativeScores( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + log_error_stack=args.log_error_stack, + ) diff --git a/vllm/entrypoints/pooling/score/generative_scores.py b/vllm/entrypoints/openai/generative_scores/serving.py similarity index 98% rename from vllm/entrypoints/pooling/score/generative_scores.py rename to vllm/entrypoints/openai/generative_scores/serving.py index 26f812a15b0f..6992d7c92ad9 100644 --- a/vllm/entrypoints/pooling/score/generative_scores.py +++ b/vllm/entrypoints/openai/generative_scores/serving.py @@ -1,11 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Generative Scores implementation for CausalLM models. +"""Generative Scores implementation for generative models. This module implements generative scoring functionality that computes the probability of specified token IDs appearing as the next token after a -given query+item prompt. This is used internally by the score endpoint -when the model architecture is a CausalLM. +given query+item prompt. This works on any generative model that produces +logits (task="generate"). 
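To make the normalization this module describes concrete, here is a minimal sketch of the label-token probability computation. The standalone helper name is hypothetical; in the serving class this logic consumes the per-token logprobs returned by the engine.

```python
import math

def compute_label_probabilities(
    label_logprobs: dict[int, float], apply_softmax: bool = True
) -> dict[int, float]:
    """Hypothetical helper mirroring the normalization described above."""
    if apply_softmax:
        # Softmax restricted to the label tokens, stabilized by subtracting
        # the max logprob; the returned values sum to 1 over the labels.
        max_lp = max(label_logprobs.values())
        exp_vals = {tid: math.exp(lp - max_lp) for tid, lp in label_logprobs.items()}
        total = sum(exp_vals.values())
        return {tid: v / total for tid, v in exp_vals.items()}
    # Engine logprobs are already normalized over the full vocabulary, so
    # exponentiating gives the true model probabilities (they sum to < 1).
    return {tid: math.exp(lp) for tid, lp in label_logprobs.items()}

# Example: Yes/No label tokens with logprobs -0.5 and -2.0.
print(compute_label_probabilities({9454: -0.5, 2753: -2.0}))
```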
""" import asyncio @@ -50,8 +50,6 @@ class GenerativeScoreRequest(OpenAIBaseModel): """Request for computing generative scores. - This is used internally when routing score requests to CausalLM models. - Attributes: model: The model to use for scoring. Optional, follows existing patterns. query: The query text or pre-tokenized query token IDs. @@ -148,14 +146,14 @@ class GenerativeScoreResponse(OpenAIBaseModel): class OpenAIServingGenerativeScores(OpenAIServing): """Serving class for generative scores computation. - This class handles computing the probability of specified token IDs + This class handles computing the probability of specified token IDs appearing as the next token after concatenating query and item prompts. The key operation is: 1. For each item, build a prompt: query + item (or item + query if item_first) 2. Run a forward pass to get the next token distribution 3. Extract probabilities for the specified label_token_ids - 4. Normalize either over the full vocab (apply_softmax=False) or + 4. Normalize either over the full vocab (apply_softmax=False) or over just the label_token_ids (apply_softmax=True) """ @@ -182,7 +180,7 @@ async def create_generative_score( """Create generative scores for the given request. Args: - request: The GenerativeScoreRequest containing query, items, and + request: The GenerativeScoreRequest containing query, items, and label_token_ids. raw_request: The raw FastAPI request object. @@ -480,7 +478,7 @@ def _compute_probabilities( } else: # Return true model probabilities - # Since logprobs are already log(softmax(logits)), + # Since logprobs are already log(softmax(logits)), # we just need to exp() them return { token_id: math.exp(logprob) diff --git a/vllm/entrypoints/pooling/__init__.py b/vllm/entrypoints/pooling/__init__.py index 4ddd4cf792b7..f64675e56b68 100644 --- a/vllm/entrypoints/pooling/__init__.py +++ b/vllm/entrypoints/pooling/__init__.py @@ -58,9 +58,6 @@ def init_pooling_state( from vllm.entrypoints.pooling.classify.serving import ServingClassification from vllm.entrypoints.pooling.embed.serving import ServingEmbedding from vllm.entrypoints.pooling.pooling.serving import OpenAIServingPooling - from vllm.entrypoints.pooling.score.generative_scores import ( - OpenAIServingGenerativeScores, - ) from vllm.entrypoints.pooling.score.serving import ServingScores from vllm.tasks import POOLING_TASKS @@ -104,22 +101,6 @@ def init_pooling_state( if "classify" in supported_tasks else None ) - - # Initialize generative scores handler for CausalLM models - # This handler is used to route /v1/score requests to generative scores - # when the model architecture is CausalLM - generative_scores_handler = None - if "embed" in supported_tasks or "score" in supported_tasks: - # Check if we should create the generative scores handler - # by checking if generate task is supported (CausalLM models) - if "generate" in supported_tasks: - generative_scores_handler = OpenAIServingGenerativeScores( - engine_client, - state.openai_serving_models, - request_logger=request_logger, - log_error_stack=args.log_error_stack, - ) - # Score API handles score/rerank for: # - "score" task (score_type: cross-encoder models) # - "embed" task (score_type: bi-encoder models) @@ -131,7 +112,6 @@ def init_pooling_state( request_logger=request_logger, score_template=resolved_chat_template, log_error_stack=args.log_error_stack, - generative_scores_handler=generative_scores_handler, ) if any(t in supported_tasks for t in ("embed", "score", "token_embed")) else None diff --git 
a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py index 60dd9530c975..fafa185bd03a 100644 --- a/vllm/entrypoints/pooling/score/protocol.py +++ b/vllm/entrypoints/pooling/score/protocol.py @@ -3,11 +3,7 @@ import time from typing import Any, TypeAlias -from pydantic import ( - BaseModel, - Field, - field_validator, -) +from pydantic import BaseModel, Field from vllm import PoolingParams from vllm.config import ModelConfig @@ -25,36 +21,14 @@ from vllm.tasks import PoolingTask from vllm.utils import random_uuid -# Exact number of token IDs required in label_token_ids for generative scoring -REQUIRED_LABEL_TOKEN_IDS = 2 - - class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin): # --8<-- [start:score-extra-params] mm_processor_kwargs: dict[str, Any] | None = Field( default=None, description=("Additional kwargs to pass to the HF processor."), ) - label_token_ids: list[int] | None = Field( - default=None, - description=( - "List of token IDs to compute probabilities for when using " - f"CausalLM models. Required for generative scoring. " - f"Must contain exactly {REQUIRED_LABEL_TOKEN_IDS} token IDs." - ), - ) # --8<-- [end:score-extra-params] - @field_validator('label_token_ids') - @classmethod - def validate_label_token_ids(cls, v: list[int] | None) -> list[int] | None: - if v is not None and len(v) != REQUIRED_LABEL_TOKEN_IDS: - raise ValueError( - f"label_token_ids must contain exactly {REQUIRED_LABEL_TOKEN_IDS} " - f"token IDs for generative scoring, but got {len(v)}" - ) - return v - def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams: encoder_config = model_config.encoder_config or {} diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index d80b1de50383..c58fe6d36c07 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import time -from collections import OrderedDict from collections.abc import AsyncGenerator, Mapping from concurrent.futures import ThreadPoolExecutor from typing import Any @@ -27,10 +26,6 @@ ScoreResponse, ScoreResponseData, ) -from vllm.entrypoints.pooling.score.generative_scores import ( - GenerativeScoreRequest, - OpenAIServingGenerativeScores, -) from vllm.entrypoints.pooling.score.utils import ( ScoreData, ScoreInputs, @@ -64,7 +59,6 @@ def __init__( request_logger: RequestLogger | None, score_template: str | None = None, log_error_stack: bool = False, - generative_scores_handler: OpenAIServingGenerativeScores | None = None, ) -> None: super().__init__( engine_client=engine_client, @@ -72,7 +66,6 @@ def __init__( request_logger=request_logger, ) self.score_template = score_template - self.generative_scores_handler = generative_scores_handler self._tokenizer_executor = ThreadPoolExecutor(max_workers=1) @@ -517,119 +510,6 @@ async def _run_scoring( trace_headers=trace_headers, ) - async def _generative_score( - self, - request: ScoreRequest, - raw_request: Request | None = None, - ) -> ScoreResponse | ErrorResponse: - """ - Handle scoring for CausalLM models using the generative scores API. - - Converts the score request to generative score format, calls the - generative scores handler, and converts the response back to - the standard score response format. - """ - if self.generative_scores_handler is None: - return self.create_error_response( - "Generative scores handler not initialized. 
" - "This is required for CausalLM models." - ) - - # Extract data from request - data_1 = request.data_1 - data_2 = request.data_2 - - # Normalize data_1 and data_2 to lists - if isinstance(data_1, str): - data_1 = [data_1] - elif isinstance(data_1, dict): - data_1 = data_1.get("content", []) - - if isinstance(data_2, str): - data_2 = [data_2] - elif isinstance(data_2, dict): - data_2 = data_2.get("content", []) - - # Validate input lens - _validate_score_input_lens(data_1, data_2) - - request_id = f"score-{self._base_request_id(raw_request)}" - created_time = int(time.time()) - - # Group documents by query to maximize batching. - # create_generative_score already handles multiple items per query - # in parallel via merge_async_iterators, so we batch all documents - # sharing the same query into a single call. - # - # Common case: 1 query, N documents -> 1 batched call (not N calls) - # N:N case: group by unique query -> 1 call per unique query - if len(data_1) == 1: - # Fast path: single query, all documents batched in one call - query_groups = [ - (data_1[0], list(data_2), list(range(len(data_2)))) - ] - else: - # N:N case: group documents by identical query text - groups = OrderedDict() - for idx, (q, d) in enumerate(zip(data_1, data_2)): - key = q if isinstance(q, str) else str(q) - if key not in groups: - groups[key] = ([], []) - groups[key][0].append(d) - groups[key][1].append(idx) - query_groups = [ - (q, items, indices) - for q, (items, indices) in groups.items() - ] - - all_results = [None] * len(data_2) if len(data_1) == 1 else [None] * len(data_1) # type: ignore[list-item] - total_prompt_tokens = 0 - total_completion_tokens = 0 - first_token_id = str(request.label_token_ids[0]) - - for query, items, indices in query_groups: - gen_request = GenerativeScoreRequest( - model=request.model, - query=query, - items=items, - label_token_ids=request.label_token_ids, - apply_softmax=True, # Always use softmax for normalized probabilities - item_first=False, - add_special_tokens=True, - priority=request.priority, - ) - - gen_response = await self.generative_scores_handler.create_generative_score( - gen_request, raw_request - ) - - if isinstance(gen_response, ErrorResponse): - return gen_response - - # Map results back to their original indices - for result in gen_response.results: - original_idx = indices[result.index] - score = result.token_probs.get(first_token_id, 0.0) - all_results[original_idx] = ScoreResponseData( - index=original_idx, - score=score, - ) - - total_prompt_tokens += gen_response.usage.prompt_tokens - total_completion_tokens += (gen_response.usage.completion_tokens or 0) - - return ScoreResponse( - id=request_id, - created=created_time, - model=self.models.model_name(), - data=[r for r in all_results if r is not None], - usage=UsageInfo( - prompt_tokens=total_prompt_tokens, - total_tokens=total_prompt_tokens + total_completion_tokens, - completion_tokens=total_completion_tokens, - ), - ) - async def create_score( self, request: ScoreRequest, @@ -644,16 +524,6 @@ async def create_score( if error_check_ret is not None: return error_check_ret - # Check if model is CausalLM and route accordingly - if self.model_config.is_causal_lm: - # For CausalLM models, require label_token_ids - if request.label_token_ids is None: - return self.create_error_response( - "label_token_ids is required for CausalLM models. " - "Please provide a list of token IDs to compute probabilities for." 
- ) - return await self._generative_score(request, raw_request) - request_id = f"score-{self._base_request_id(raw_request)}" created_time = int(time.time()) From 3b300473e76946f692e7d9db16f78deb91419c28 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Mon, 23 Mar 2026 23:39:11 +0000 Subject: [PATCH 19/28] refactor generative score api --- .../test_generative_scores.py | 25 ++++++------- .../test_generative_scores_e2e.py | 36 +++++++++--------- vllm/compilation/compiler_interface.py | 11 +++++- .../openai/generative_scores/api_router.py | 11 +++--- .../openai/generative_scores/serving.py | 37 +++++++++---------- vllm/v1/sample/sampler.py | 2 +- 6 files changed, 61 insertions(+), 61 deletions(-) diff --git a/tests/entrypoints/openai/generative_scores/test_generative_scores.py b/tests/entrypoints/openai/generative_scores/test_generative_scores.py index bbaed4d43ca2..a8ced69ad2cc 100644 --- a/tests/entrypoints/openai/generative_scores/test_generative_scores.py +++ b/tests/entrypoints/openai/generative_scores/test_generative_scores.py @@ -151,17 +151,18 @@ def test_request_and_response_all_fields(self): # Test response structure response = GenerativeScoreResponse( model="test-model", - results=[ - GenerativeScoreItemResult(index=0, token_probs={"9454": 0.7, "2753": 0.3}), - GenerativeScoreItemResult(index=1, token_probs={"9454": 0.4, "2753": 0.6}), + data=[ + GenerativeScoreItemResult(index=0, score=0.7), + GenerativeScoreItemResult(index=1, score=0.4), ], usage={"prompt_tokens": 10, "total_tokens": 12, "completion_tokens": 2}, ) - assert response.object == "generative_score" + assert response.object == "list" assert response.model == "test-model" - assert len(response.results) == 2 - assert response.results[0].token_probs["9454"] == 0.7 - assert response.results[1].token_probs["2753"] == 0.6 + assert len(response.data) == 2 + assert response.data[0].score == 0.7 + assert response.data[0].object == "score" + assert response.data[1].score == 0.4 assert response.usage.prompt_tokens == 10 @@ -228,10 +229,9 @@ class TestValidation: "request_kwargs,expected_error", [ ({"query": "q", "items": ["i"], "label_token_ids": [999999, 999998]}, "out of vocabulary"), - ({"query": "q", "items": ["i"], "label_token_ids": [100]}, "at least one token"), ({"query": "q", "items": [], "label_token_ids": [100, 200]}, "at least one item"), ], - ids=["invalid_token_id", "single_token", "empty_items"], + ids=["invalid_token_id", "empty_items"], ) async def test_validation_errors(self, request_kwargs, expected_error): """Test that invalid inputs return appropriate errors.""" @@ -299,10 +299,9 @@ async def mock_generate(*args, **kwargs): result = await serving.create_generative_score(request, None) assert isinstance(result, GenerativeScoreResponse) - assert len(result.results) == 2 - for item_result in result.results: - for prob in item_result.token_probs.values(): - assert 0.0 <= prob <= 1.0 + assert len(result.data) == 2 + for item_result in result.data: + assert 0.0 <= item_result.score <= 1.0 if __name__ == "__main__": diff --git a/tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py b/tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py index 13fcb85d9074..c6cf6f28adb3 100644 --- a/tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py +++ b/tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py @@ -33,7 +33,7 @@ class TestGenerativeScoresAPI: async def test_basic_score_and_response_structure(self, server: RemoteOpenAIServer): """Test 
basic generative score request and verify response structure.""" response = requests.post( - server.url_for("generative_scores"), + server.url_for("generative_score"), json={ "model": MODEL_NAME, "query": "Is Paris the capital of France? Answer Yes or No: ", @@ -45,18 +45,17 @@ async def test_basic_score_and_response_structure(self, server: RemoteOpenAIServ data = response.json() # Verify response structure - assert data["id"].startswith("genscore-") - assert data["object"] == "generative_score" + assert data["id"].startswith("generative-score-") + assert data["object"] == "list" assert "model" in data assert "usage" in data - assert len(data["results"]) == 2 + assert len(data["data"]) == 2 # Verify each result - for i, result in enumerate(data["results"]): + for i, result in enumerate(data["data"]): assert result["index"] == i - assert "token_probs" in result - for prob in result["token_probs"].values(): - assert 0.0 <= prob <= 1.0 + assert result["object"] == "score" + assert 0.0 <= result["score"] <= 1.0 # Verify usage tracking usage = data["usage"] @@ -68,7 +67,7 @@ async def test_basic_score_and_response_structure(self, server: RemoteOpenAIServ async def test_multiple_items(self, server: RemoteOpenAIServer): """Test generative score request with multiple items.""" response = requests.post( - server.url_for("generative_scores"), + server.url_for("generative_score"), json={ "model": MODEL_NAME, "query": "Is this city a capital? ", @@ -78,13 +77,13 @@ async def test_multiple_items(self, server: RemoteOpenAIServer): ) assert response.status_code == 200 data = response.json() - assert len(data["results"]) == 5 + assert len(data["data"]) == 5 @pytest.mark.asyncio async def test_validation_missing_label_token_ids(self, server: RemoteOpenAIServer): """Test that missing label_token_ids returns a validation error.""" response = requests.post( - server.url_for("generative_scores"), + server.url_for("generative_score"), json={ "model": MODEL_NAME, "query": "Test query", @@ -98,7 +97,7 @@ async def test_validation_missing_label_token_ids(self, server: RemoteOpenAIServ async def test_validation_empty_items(self, server: RemoteOpenAIServer): """Test that empty items returns an error.""" response = requests.post( - server.url_for("generative_scores"), + server.url_for("generative_score"), json={ "model": MODEL_NAME, "query": "Test query", @@ -119,7 +118,7 @@ async def test_validation_empty_items(self, server: RemoteOpenAIServer): async def test_validation_errors(self, server: RemoteOpenAIServer, label_token_ids, expected_status): """Test validation errors for various invalid inputs.""" response = requests.post( - server.url_for("generative_scores"), + server.url_for("generative_score"), json={ "model": MODEL_NAME, "query": "Test query", @@ -139,14 +138,13 @@ async def test_score_consistency(self, server: RemoteOpenAIServer): "label_token_ids": [100, 200], } - r1 = requests.post(server.url_for("generative_scores"), json=request_body) - r2 = requests.post(server.url_for("generative_scores"), json=request_body) + r1 = requests.post(server.url_for("generative_score"), json=request_body) + r2 = requests.post(server.url_for("generative_score"), json=request_body) assert r1.status_code == 200 and r2.status_code == 200 - r1_probs = r1.json()["results"][0]["token_probs"] - r2_probs = r2.json()["results"][0]["token_probs"] - for key in r1_probs: - assert abs(r1_probs[key] - r2_probs[key]) < 1e-6 + r1_score = r1.json()["data"][0]["score"] + r2_score = r2.json()["data"][0]["score"] + assert abs(r1_score - 
r2_score) < 1e-6 if __name__ == "__main__": diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 2242f03045fb..e5f72887cb06 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -373,8 +373,15 @@ def compile( break if input_fake_mode is not None: - fake_mode_ctx: Any = patch( - "torch._inductor.standalone_compile.FakeTensorMode", + # NOTE: torch._inductor.__init__ does + # `from .standalone_compile import standalone_compile` + # which shadows the module name with the function in the + # torch._inductor namespace. We must patch via sys.modules + # to target the actual module, not the function. + import sys + sc_module = sys.modules["torch._inductor.standalone_compile"] + fake_mode_ctx: Any = patch.object( + sc_module, "FakeTensorMode", lambda *a, **kw: input_fake_mode, ) else: diff --git a/vllm/entrypoints/openai/generative_scores/api_router.py b/vllm/entrypoints/openai/generative_scores/api_router.py index 1a471e47d773..6e970942b1f7 100644 --- a/vllm/entrypoints/openai/generative_scores/api_router.py +++ b/vllm/entrypoints/openai/generative_scores/api_router.py @@ -33,7 +33,7 @@ def generative_scores(request: Request) -> OpenAIServingGenerativeScores | None: @router.post( - "/generative_scores", + "/generative_score", dependencies=[Depends(validate_json_request)], responses={ HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, @@ -42,21 +42,21 @@ def generative_scores(request: Request) -> OpenAIServingGenerativeScores | None: ) @with_cancellation @load_aware_call -async def create_generative_score(request: Request): - handler = generative_scores(request) +async def create_generative_score(raw_request: Request): + handler = generative_scores(raw_request) if handler is None: raise NotImplementedError( "The model does not support the Generative Scores API" ) - raw_body = await request.json() + raw_body = await raw_request.json() from vllm.entrypoints.openai.generative_scores.serving import ( GenerativeScoreRequest, ) gen_request = GenerativeScoreRequest(**raw_body) - result = await handler.create_generative_score(gen_request, request) + result = await handler.create_generative_score(gen_request, raw_request) if isinstance(result, ErrorResponse): return JSONResponse( @@ -86,5 +86,4 @@ async def init_generative_scores_state( engine_client, state.openai_serving_models, request_logger=request_logger, - log_error_stack=args.log_error_stack, ) diff --git a/vllm/entrypoints/openai/generative_scores/serving.py b/vllm/entrypoints/openai/generative_scores/serving.py index 6992d7c92ad9..7c0ac494fef9 100644 --- a/vllm/entrypoints/openai/generative_scores/serving.py +++ b/vllm/entrypoints/openai/generative_scores/serving.py @@ -109,13 +109,13 @@ class GenerativeScoreItemResult(OpenAIBaseModel): Attributes: index: The index of this item in the input items list. - token_probs: Dictionary mapping token IDs (as strings) to their probabilities. + object: Type of object, always "score". + score: The probability score for the first label token. """ index: int - token_probs: dict[str, float] = Field( - description="Mapping of token ID (as string) to probability." - ) + object: Literal["score"] = "score" + score: float class GenerativeScoreResponse(OpenAIBaseModel): @@ -123,18 +123,18 @@ class GenerativeScoreResponse(OpenAIBaseModel): Attributes: id: Unique identifier for this response. - object: Type of object, always "generative_score". + object: Type of object, always "list". 
created: Unix timestamp of when the response was created. model: The model used for scoring. - results: List of scoring results, one per input item. + data: List of scoring results, one per input item. usage: Token usage information. """ id: str = Field(default="") - object: Literal["generative_score"] = "generative_score" + object: Literal["list"] = "list" created: int = Field(default_factory=lambda: int(time.time())) model: str - results: list[GenerativeScoreItemResult] + data: list[GenerativeScoreItemResult] usage: UsageInfo @@ -163,13 +163,11 @@ def __init__( models: OpenAIServingModels, *, request_logger: RequestLogger | None, - log_error_stack: bool = False, ) -> None: super().__init__( engine_client=engine_client, models=models, request_logger=request_logger, - log_error_stack=log_error_stack, ) async def create_generative_score( @@ -233,7 +231,7 @@ async def create_generative_score( logger.exception("Error preparing request components") return self.create_error_response(e) - request_id = f"genscore-{self._base_request_id(raw_request, request.request_id)}" + request_id = f"generative-score-{self._base_request_id(raw_request, request.request_id)}" created_time = int(time.time()) # Build prompts for each item @@ -355,10 +353,14 @@ async def create_generative_score( apply_softmax=request.apply_softmax, ) + # Use the first label token's probability as the score + first_label_id = request.label_token_ids[0] + score = token_probs[first_label_id] + item_results.append( GenerativeScoreItemResult( index=i, - token_probs={str(k): v for k, v in token_probs.items()}, + score=score, ) ) @@ -372,7 +374,7 @@ async def create_generative_score( id=request_id, created=created_time, model=model_name, - results=item_results, + data=item_results, usage=UsageInfo( prompt_tokens=total_prompt_tokens, total_tokens=total_prompt_tokens + total_completion_tokens, @@ -398,16 +400,12 @@ async def _build_prompts( Returns: Tuple of (list of TokensPrompt, list of prompt token counts). 
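        Example (illustrative sketch only; the token IDs are arbitrary and
        mirror the values used in TestPromptBuilding.test_item_ordering):
            query=[100, 101], items=[[200, 201], [300, 301]], item_first=False
                -> prompt_token_ids: [100, 101, 200, 201] and [100, 101, 300, 301]
            query=[100, 101], items=[[200, 201], [300, 301]], item_first=True
                -> prompt_token_ids: [200, 201, 100, 101] and [300, 301, 100, 101]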
""" - # Get async tokenizer once for efficiency - async_tokenizer = self._get_async_tokenizer(tokenizer) - # Tokenize query if it's a string if isinstance(request.query, str): - query_result = await async_tokenizer( + query_token_ids = tokenizer.encode( request.query, add_special_tokens=request.add_special_tokens, ) - query_token_ids = query_result.input_ids else: query_token_ids = request.query @@ -418,11 +416,10 @@ async def _build_prompts( # Tokenize item if it's a string if isinstance(item, str): # Don't add special tokens for items to avoid duplicate BOS/EOS - item_result = await async_tokenizer( + item_token_ids = tokenizer.encode( item, add_special_tokens=False, ) - item_token_ids = item_result.input_ids else: item_token_ids = item diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 26e1b0e5f475..ffe9602f799e 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -104,7 +104,7 @@ def forward( sampled = sampled.long() # Handle logprob_token_ids if specified (more efficient than full vocab) - # This is used by generative_scores API to get logprobs for specific tokens + # This is used by generative_score API to get logprobs for specific tokens logprob_token_ids_tensors = None if sampling_metadata.logprob_token_ids: logprob_token_ids_tensors = self.gather_specific_token_logprobs( From 0ce2afee7c4118219914004454c49a7e0c5862db Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Tue, 24 Mar 2026 00:10:31 +0000 Subject: [PATCH 20/28] Remove unrelated protocol.py changes --- vllm/entrypoints/pooling/score/protocol.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py index 7a642d35b211..bb633fc28b3c 100644 --- a/vllm/entrypoints/pooling/score/protocol.py +++ b/vllm/entrypoints/pooling/score/protocol.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time -from typing import Any, TypeAlias +from typing import TypeAlias from pydantic import BaseModel, Field @@ -21,14 +21,8 @@ from vllm.tasks import PoolingTask from vllm.utils import random_uuid -class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin): - # --8<-- [start:score-extra-params] - mm_processor_kwargs: dict[str, Any] | None = Field( - default=None, - description=("Additional kwargs to pass to the HF processor."), - ) - # --8<-- [end:score-extra-params] +class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin): def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams: encoder_config = model_config.encoder_config or {} From 9de4626bb589a4cb3811390f4bb9bd9f585228b0 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Tue, 24 Mar 2026 00:19:26 +0000 Subject: [PATCH 21/28] Code review fixes: remove dead code, fix docs, fix test paths - Remove unused is_causal_lm property from ModelConfig - Fix docs: correct endpoint (/generative_score), explain how items/query form prompts, update label_token_ids to 'at least 1', fix response id prefix - Fix tests: use MODEL_NAME instead of hardcoded local path --- docs/serving/openai_compatible_server.md | 27 ++++++++++--------- .../test_generative_scores.py | 7 +++-- .../test_generative_scores_e2e.py | 3 +-- vllm/config/model.py | 12 --------- 4 files changed, 19 insertions(+), 30 deletions(-) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index daba0d350e1d..ec32ce125a33 100644 
--- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -484,31 +484,34 @@ Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](. #### CausalLM Models (Generative Scoring) -When using a CausalLM model (e.g., Llama, Qwen, Mistral) with the Score API, the endpoint computes the probability of specified token IDs appearing as the next token. This is useful for generative scoring tasks, sentiment analysis, or any scenario where you want to score the likelihood of specific tokens. +When using a CausalLM model (e.g., Llama, Qwen, Mistral) with the Score API, the `/generative_score` endpoint computes the probability of specified token IDs appearing as the next token. Each item (document) is concatenated with the query to form a prompt, and the model predicts how likely each label token is as the next token after that prompt. This lets you score items against a query — for example, asking "Is this the capital of France?" and scoring each city by how likely the model is to answer "Yes". **Requirements for CausalLM models:** -- The `label_token_ids` parameter is **required** and must contain **exactly 2 token IDs** (for generative scoring). -- The score is computed as: `P(label_token_ids[0]) / (P(label_token_ids[0]) + P(label_token_ids[1]))` +- The `label_token_ids` parameter is **required** and must contain **at least 1 token ID**. +- When 2 label tokens are provided, the score equals `P(label_token_ids[0]) / (P(label_token_ids[0]) + P(label_token_ids[1]))` (softmax over the two labels). +- When more labels are provided, the score is the softmax-normalized probability of the first label token across all label tokens. ##### Example: Score with CausalLM ```bash -curl -X POST http://localhost:8000/v1/score \ +curl -X POST http://localhost:8000/generative_score \ -H "Content-Type: application/json" \ -d '{ "model": "Qwen/Qwen3-0.6B", - "queries": "Is this city the capital of France?", - "documents": ["Paris", "London", "Berlin"], + "query": "Is this city the capital of France?", + "items": ["Paris", "London", "Berlin"], "label_token_ids": [9454, 2753] }' ``` +Here, each item is appended to the query to form prompts like `"Is this city the capital of France? Paris"`, `"... London"`, etc. The model then predicts the next token, and the score reflects the probability of "Yes" (token 9454) vs "No" (token 2753). + ??? console "Response" ```json { - "id": "score-abc123", + "id": "generative-score-abc123", "object": "list", "created": 1234567890, "model": "Qwen/Qwen3-0.6B", @@ -523,11 +526,11 @@ curl -X POST http://localhost:8000/v1/score \ ##### How it works -1. **Prompt Construction**: For each document, builds `prompt = query + document` -2. **Forward Pass**: Runs the model to get next-token logits -3. **Probability Extraction**: Extracts logprobs for the 2 specified `label_token_ids` -4. **Softmax Normalization**: Applies softmax over only the 2 label tokens -5. **Score Computation**: Returns `P(token[0]) / (P(token[0]) + P(token[1]))` as the score +1. **Prompt Construction**: For each item, builds `prompt = query + item` (or `item + query` if `item_first=true`) +2. **Forward Pass**: Runs the model on each prompt to get next-token logits +3. **Probability Extraction**: Extracts logprobs for the specified `label_token_ids` +4. **Softmax Normalization**: Applies softmax over only the label tokens (when `apply_softmax=true`) +5. 
**Score**: Returns the normalized probability of the first label token ##### Finding Token IDs diff --git a/tests/entrypoints/openai/generative_scores/test_generative_scores.py b/tests/entrypoints/openai/generative_scores/test_generative_scores.py index a8ced69ad2cc..1e8886910772 100644 --- a/tests/entrypoints/openai/generative_scores/test_generative_scores.py +++ b/tests/entrypoints/openai/generative_scores/test_generative_scores.py @@ -33,8 +33,7 @@ from vllm.v1.engine.async_llm import AsyncLLM MODEL_NAME = "Qwen/Qwen3-0.6B" -MODEL_PATH = "/shared/public/elr-models/Qwen/Qwen3-0.6B/e6de91484c29aa9480d55605af694f39b081c455/" -BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_PATH)] +BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] @dataclass @@ -46,7 +45,7 @@ class MockHFConfig: class MockModelConfig: task = "generate" runner_type = "generate" - tokenizer = MODEL_PATH + tokenizer = MODEL_NAME trust_remote_code = False tokenizer_mode = "auto" max_model_len = 100 @@ -74,7 +73,7 @@ def get_vocab_size(self): def _create_mock_engine(): """Create a mock AsyncLLM engine.""" mock_engine = MagicMock(spec=AsyncLLM) - mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_PATH) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() mock_engine.input_processor = MagicMock() diff --git a/tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py b/tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py index c6cf6f28adb3..2eb75a072e2e 100644 --- a/tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py +++ b/tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py @@ -11,7 +11,6 @@ from ....utils import RemoteOpenAIServer MODEL_NAME = "Qwen/Qwen3-0.6B" -MODEL_PATH = "/shared/public/elr-models/Qwen/Qwen3-0.6B/e6de91484c29aa9480d55605af694f39b081c455/" @pytest.fixture(scope="module") @@ -22,7 +21,7 @@ def server(): "--enforce-eager", "--max-num-seqs", "32", ] - with RemoteOpenAIServer(MODEL_PATH, args) as remote_server: + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/vllm/config/model.py b/vllm/config/model.py index 9e8042519da6..122d5eabd722 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import re import warnings from collections.abc import Callable from dataclasses import InitVar, field @@ -1476,17 +1475,6 @@ def score_type(self) -> ScoreType: else self._model_info.score_type ) - @property - def is_causal_lm(self) -> bool: - """Check if the model architecture is a CausalLM model. - - Returns True if any architecture in hf_config.architectures matches - the pattern .*ForCausalLM.* (e.g., LlamaForCausalLM, Qwen2ForCausalLM). 
- """ - architectures = getattr(self.hf_config, "architectures", []) - pattern = re.compile(r".*ForCausalLM.*") - return any(pattern.match(arch) for arch in architectures) - @property def is_pp_supported(self) -> bool: return self._model_info.supports_pp From 98490d2d772e0c81e9c471a32a29930901a3aad2 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Wed, 25 Mar 2026 18:15:15 -0700 Subject: [PATCH 22/28] change name to generative_scoring --- docs/serving/openai_compatible_server.md | 6 +-- .../__init__.py | 0 .../test_generative_scoring.py} | 46 +++++++++---------- .../test_generative_scoring_e2e.py} | 26 +++++------ vllm/entrypoints/openai/api_server.py | 12 ++--- .../__init__.py | 0 .../api_router.py | 38 +++++++-------- .../serving.py | 44 +++++++++--------- vllm/v1/sample/sampler.py | 2 +- 9 files changed, 87 insertions(+), 87 deletions(-) rename tests/entrypoints/openai/{generative_scores => generative_scoring}/__init__.py (100%) rename tests/entrypoints/openai/{generative_scores/test_generative_scores.py => generative_scoring/test_generative_scoring.py} (87%) rename tests/entrypoints/openai/{generative_scores/test_generative_scores_e2e.py => generative_scoring/test_generative_scoring_e2e.py} (85%) rename vllm/entrypoints/openai/{generative_scores => generative_scoring}/__init__.py (100%) rename vllm/entrypoints/openai/{generative_scores => generative_scoring}/api_router.py (62%) rename vllm/entrypoints/openai/{generative_scores => generative_scoring}/serving.py (93%) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index ec32ce125a33..1c06d7c1e7bf 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -484,7 +484,7 @@ Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](. #### CausalLM Models (Generative Scoring) -When using a CausalLM model (e.g., Llama, Qwen, Mistral) with the Score API, the `/generative_score` endpoint computes the probability of specified token IDs appearing as the next token. Each item (document) is concatenated with the query to form a prompt, and the model predicts how likely each label token is as the next token after that prompt. This lets you score items against a query — for example, asking "Is this the capital of France?" and scoring each city by how likely the model is to answer "Yes". +When using a CausalLM model (e.g., Llama, Qwen, Mistral) with the Score API, the `/generative_scoring` endpoint computes the probability of specified token IDs appearing as the next token. Each item (document) is concatenated with the query to form a prompt, and the model predicts how likely each label token is as the next token after that prompt. This lets you score items against a query — for example, asking "Is this the capital of France?" and scoring each city by how likely the model is to answer "Yes". 
**Requirements for CausalLM models:** @@ -495,7 +495,7 @@ When using a CausalLM model (e.g., Llama, Qwen, Mistral) with the Score API, the ##### Example: Score with CausalLM ```bash -curl -X POST http://localhost:8000/generative_score \ +curl -X POST http://localhost:8000/generative_scoring \ -H "Content-Type: application/json" \ -d '{ "model": "Qwen/Qwen3-0.6B", @@ -511,7 +511,7 @@ Here, each item is appended to the query to form prompts like `"Is this city the ```json { - "id": "generative-score-abc123", + "id": "generative-scoring-abc123", "object": "list", "created": 1234567890, "model": "Qwen/Qwen3-0.6B", diff --git a/tests/entrypoints/openai/generative_scores/__init__.py b/tests/entrypoints/openai/generative_scoring/__init__.py similarity index 100% rename from tests/entrypoints/openai/generative_scores/__init__.py rename to tests/entrypoints/openai/generative_scoring/__init__.py diff --git a/tests/entrypoints/openai/generative_scores/test_generative_scores.py b/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py similarity index 87% rename from tests/entrypoints/openai/generative_scores/test_generative_scores.py rename to tests/entrypoints/openai/generative_scoring/test_generative_scoring.py index 1e8886910772..45cda905ddbf 100644 --- a/tests/entrypoints/openai/generative_scores/test_generative_scores.py +++ b/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for the Generative Scores API. +"""Tests for the Generative Scoring API. Tests cover: 1. Protocol models (request/response construction) @@ -19,11 +19,11 @@ from vllm.config.multimodal import MultiModalConfig from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.generative_scores.serving import ( - GenerativeScoreItemResult, - GenerativeScoreRequest, - GenerativeScoreResponse, - OpenAIServingGenerativeScores, +from vllm.entrypoints.openai.generative_scoring.serving import ( + GenerativeScoringItemResult, + GenerativeScoringRequest, + GenerativeScoringResponse, + OpenAIServingGenerativeScoring, ) from vllm.entrypoints.openai.models.protocol import BaseModelPath from vllm.entrypoints.openai.models.serving import OpenAIServingModels @@ -81,13 +81,13 @@ def _create_mock_engine(): return mock_engine -def _create_serving(mock_engine) -> OpenAIServingGenerativeScores: - """Create an OpenAIServingGenerativeScores instance with mocks.""" +def _create_serving(mock_engine) -> OpenAIServingGenerativeScoring: + """Create an OpenAIServingGenerativeScoring instance with mocks.""" models = OpenAIServingModels( engine_client=mock_engine, base_model_paths=BASE_MODEL_PATHS, ) - return OpenAIServingGenerativeScores(mock_engine, models, request_logger=None) + return OpenAIServingGenerativeScoring(mock_engine, models, request_logger=None) def _create_mock_request_output(logprobs_dict: dict[int, float]) -> RequestOutput: @@ -115,12 +115,12 @@ def _create_mock_request_output(logprobs_dict: dict[int, float]) -> RequestOutpu class TestProtocolModels: - """Tests for GenerativeScoreRequest and GenerativeScoreResponse.""" + """Tests for GenerativeScoringRequest and GenerativeScoringResponse.""" def test_request_and_response_all_fields(self): """Test request construction with all field types and response structure.""" # Test request with string inputs - req_str = GenerativeScoreRequest( + req_str = GenerativeScoringRequest( query="Is this the 
capital?", items=["Paris", "London"], label_token_ids=[9454, 2753], @@ -133,7 +133,7 @@ def test_request_and_response_all_fields(self): assert req_str.add_special_tokens is True # default # Test request with pre-tokenized inputs and custom options - req_tok = GenerativeScoreRequest( + req_tok = GenerativeScoringRequest( query=[100, 200, 300], items=[[400, 500], [600, 700]], label_token_ids=[1234, 5678], @@ -148,11 +148,11 @@ def test_request_and_response_all_fields(self): assert req_tok.add_special_tokens is False # Test response structure - response = GenerativeScoreResponse( + response = GenerativeScoringResponse( model="test-model", data=[ - GenerativeScoreItemResult(index=0, score=0.7), - GenerativeScoreItemResult(index=1, score=0.4), + GenerativeScoringItemResult(index=0, score=0.7), + GenerativeScoringItemResult(index=1, score=0.4), ], usage={"prompt_tokens": 10, "total_tokens": 12, "completion_tokens": 2}, ) @@ -179,7 +179,7 @@ class TestProbabilityComputation: ) def test_compute_probabilities(self, label_logprobs, apply_softmax, should_sum_to_one): """Test probability computation for softmax and true probability modes.""" - serving = OpenAIServingGenerativeScores.__new__(OpenAIServingGenerativeScores) + serving = OpenAIServingGenerativeScoring.__new__(OpenAIServingGenerativeScoring) probs = serving._compute_probabilities(label_logprobs, apply_softmax=apply_softmax) # Verify sum behavior @@ -202,7 +202,7 @@ def test_compute_probabilities(self, label_logprobs, apply_softmax, should_sum_t def test_score_formula(self): """Test the score formula: P(token[0]) / (P(token[0]) + P(token[1])).""" - serving = OpenAIServingGenerativeScores.__new__(OpenAIServingGenerativeScores) + serving = OpenAIServingGenerativeScoring.__new__(OpenAIServingGenerativeScoring) # With logprobs -0.5 and -2.0, softmax gives higher prob to first token logprobs = {9454: -0.5, 2753: -2.0} @@ -236,8 +236,8 @@ async def test_validation_errors(self, request_kwargs, expected_error): """Test that invalid inputs return appropriate errors.""" mock_engine = _create_mock_engine() serving = _create_serving(mock_engine) - request = GenerativeScoreRequest(model=MODEL_NAME, **request_kwargs) - result = await serving.create_generative_score(request, None) + request = GenerativeScoringRequest(model=MODEL_NAME, **request_kwargs) + result = await serving.create_generative_scoring(request, None) assert isinstance(result, ErrorResponse) assert expected_error in result.error.message.lower() @@ -260,7 +260,7 @@ async def test_item_ordering(self, item_first, expected): mock_engine = _create_mock_engine() serving = _create_serving(mock_engine) - request = GenerativeScoreRequest( + request = GenerativeScoringRequest( query=[100, 101], items=[[200, 201], [300, 301]], label_token_ids=[500, 501], @@ -289,15 +289,15 @@ async def mock_generate(*args, **kwargs): mock_engine.generate = mock_generate - request = GenerativeScoreRequest( + request = GenerativeScoringRequest( model=MODEL_NAME, query="Is Paris the capital?", items=["Yes", "No"], label_token_ids=[1234, 5678], ) - result = await serving.create_generative_score(request, None) + result = await serving.create_generative_scoring(request, None) - assert isinstance(result, GenerativeScoreResponse) + assert isinstance(result, GenerativeScoringResponse) assert len(result.data) == 2 for item_result in result.data: assert 0.0 <= item_result.score <= 1.0 diff --git a/tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py 
b/tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py similarity index 85% rename from tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py rename to tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py index 2eb75a072e2e..783061f17921 100644 --- a/tests/entrypoints/openai/generative_scores/test_generative_scores_e2e.py +++ b/tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""End-to-end tests for the Generative Scores API. +"""End-to-end tests for the Generative Scoring API. Tests verify the full HTTP request/response flow using RemoteOpenAIServer. """ @@ -25,14 +25,14 @@ def server(): yield remote_server -class TestGenerativeScoresAPI: - """End-to-end tests for the Generative Scores API.""" +class TestGenerativeScoringAPI: + """End-to-end tests for the Generative Scoring API.""" @pytest.mark.asyncio async def test_basic_score_and_response_structure(self, server: RemoteOpenAIServer): - """Test basic generative score request and verify response structure.""" + """Test basic generative scoring request and verify response structure.""" response = requests.post( - server.url_for("generative_score"), + server.url_for("generative_scoring"), json={ "model": MODEL_NAME, "query": "Is Paris the capital of France? Answer Yes or No: ", @@ -44,7 +44,7 @@ async def test_basic_score_and_response_structure(self, server: RemoteOpenAIServ data = response.json() # Verify response structure - assert data["id"].startswith("generative-score-") + assert data["id"].startswith("generative-scoring-") assert data["object"] == "list" assert "model" in data assert "usage" in data @@ -64,9 +64,9 @@ async def test_basic_score_and_response_structure(self, server: RemoteOpenAIServ @pytest.mark.asyncio async def test_multiple_items(self, server: RemoteOpenAIServer): - """Test generative score request with multiple items.""" + """Test generative scoring request with multiple items.""" response = requests.post( - server.url_for("generative_score"), + server.url_for("generative_scoring"), json={ "model": MODEL_NAME, "query": "Is this city a capital? 
", @@ -82,7 +82,7 @@ async def test_multiple_items(self, server: RemoteOpenAIServer): async def test_validation_missing_label_token_ids(self, server: RemoteOpenAIServer): """Test that missing label_token_ids returns a validation error.""" response = requests.post( - server.url_for("generative_score"), + server.url_for("generative_scoring"), json={ "model": MODEL_NAME, "query": "Test query", @@ -96,7 +96,7 @@ async def test_validation_missing_label_token_ids(self, server: RemoteOpenAIServ async def test_validation_empty_items(self, server: RemoteOpenAIServer): """Test that empty items returns an error.""" response = requests.post( - server.url_for("generative_score"), + server.url_for("generative_scoring"), json={ "model": MODEL_NAME, "query": "Test query", @@ -117,7 +117,7 @@ async def test_validation_empty_items(self, server: RemoteOpenAIServer): async def test_validation_errors(self, server: RemoteOpenAIServer, label_token_ids, expected_status): """Test validation errors for various invalid inputs.""" response = requests.post( - server.url_for("generative_score"), + server.url_for("generative_scoring"), json={ "model": MODEL_NAME, "query": "Test query", @@ -137,8 +137,8 @@ async def test_score_consistency(self, server: RemoteOpenAIServer): "label_token_ids": [100, 200], } - r1 = requests.post(server.url_for("generative_score"), json=request_body) - r2 = requests.post(server.url_for("generative_score"), json=request_body) + r1 = requests.post(server.url_for("generative_scoring"), json=request_body) + r2 = requests.post(server.url_for("generative_scoring"), json=request_body) assert r1.status_code == 200 and r2.status_code == 200 r1_score = r1.json()["data"][0]["score"] diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 6600e4f64154..999cb1a5cbe1 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -247,11 +247,11 @@ def build_app( register_pooling_api_routers(app, supported_tasks, model_config) if "generate" in supported_tasks: - from vllm.entrypoints.openai.generative_scores.api_router import ( - register_generative_scores_api_router, + from vllm.entrypoints.openai.generative_scoring.api_router import ( + register_generative_scoring_api_router, ) - register_generative_scores_api_router(app) + register_generative_scoring_api_router(app) app.root_path = args.root_path app.add_middleware( @@ -421,11 +421,11 @@ async def init_app_state( init_pooling_state(engine_client, state, args, request_logger, supported_tasks) if "generate" in supported_tasks: - from vllm.entrypoints.openai.generative_scores.api_router import ( - init_generative_scores_state, + from vllm.entrypoints.openai.generative_scoring.api_router import ( + init_generative_scoring_state, ) - await init_generative_scores_state( + await init_generative_scoring_state( engine_client, state, args, request_logger ) diff --git a/vllm/entrypoints/openai/generative_scores/__init__.py b/vllm/entrypoints/openai/generative_scoring/__init__.py similarity index 100% rename from vllm/entrypoints/openai/generative_scores/__init__.py rename to vllm/entrypoints/openai/generative_scoring/__init__.py diff --git a/vllm/entrypoints/openai/generative_scores/api_router.py b/vllm/entrypoints/openai/generative_scoring/api_router.py similarity index 62% rename from vllm/entrypoints/openai/generative_scores/api_router.py rename to vllm/entrypoints/openai/generative_scoring/api_router.py index 6e970942b1f7..6a49e72dc082 100644 --- 
a/vllm/entrypoints/openai/generative_scores/api_router.py +++ b/vllm/entrypoints/openai/generative_scoring/api_router.py @@ -7,9 +7,9 @@ from fastapi.responses import JSONResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.generative_scores.serving import ( - GenerativeScoreResponse, - OpenAIServingGenerativeScores, +from vllm.entrypoints.openai.generative_scoring.serving import ( + GenerativeScoringResponse, + OpenAIServingGenerativeScoring, ) from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.utils import load_aware_call, with_cancellation @@ -28,12 +28,12 @@ logger = init_logger(__name__) -def generative_scores(request: Request) -> OpenAIServingGenerativeScores | None: - return request.app.state.serving_generative_scores +def generative_scoring(request: Request) -> OpenAIServingGenerativeScoring | None: + return request.app.state.serving_generative_scoring @router.post( - "/generative_score", + "/generative_scoring", dependencies=[Depends(validate_json_request)], responses={ HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, @@ -42,47 +42,47 @@ def generative_scores(request: Request) -> OpenAIServingGenerativeScores | None: ) @with_cancellation @load_aware_call -async def create_generative_score(raw_request: Request): - handler = generative_scores(raw_request) +async def create_generative_scoring(raw_request: Request): + handler = generative_scoring(raw_request) if handler is None: raise NotImplementedError( - "The model does not support the Generative Scores API" + "The model does not support the Generative Scoring API" ) raw_body = await raw_request.json() - from vllm.entrypoints.openai.generative_scores.serving import ( - GenerativeScoreRequest, + from vllm.entrypoints.openai.generative_scoring.serving import ( + GenerativeScoringRequest, ) - gen_request = GenerativeScoreRequest(**raw_body) - result = await handler.create_generative_score(gen_request, raw_request) + gen_request = GenerativeScoringRequest(**raw_body) + result = await handler.create_generative_scoring(gen_request, raw_request) if isinstance(result, ErrorResponse): return JSONResponse( content=result.model_dump(), status_code=result.error.code ) - elif isinstance(result, GenerativeScoreResponse): + elif isinstance(result, GenerativeScoringResponse): return JSONResponse(content=result.model_dump()) raise ValueError(f"Unexpected response type: {type(result)}") -def register_generative_scores_api_router(app: FastAPI): +def register_generative_scoring_api_router(app: FastAPI): app.include_router(router) -async def init_generative_scores_state( +async def init_generative_scoring_state( engine_client: "EngineClient", state: "State", args: "Namespace", request_logger: "RequestLogger | None", ): - from vllm.entrypoints.openai.generative_scores.serving import ( - OpenAIServingGenerativeScores, + from vllm.entrypoints.openai.generative_scoring.serving import ( + OpenAIServingGenerativeScoring, ) - state.serving_generative_scores = OpenAIServingGenerativeScores( + state.serving_generative_scoring = OpenAIServingGenerativeScoring( engine_client, state.openai_serving_models, request_logger=request_logger, diff --git a/vllm/entrypoints/openai/generative_scores/serving.py b/vllm/entrypoints/openai/generative_scoring/serving.py similarity index 93% rename from vllm/entrypoints/openai/generative_scores/serving.py rename to vllm/entrypoints/openai/generative_scoring/serving.py index 7c0ac494fef9..e8d43e62f19d 100644 --- 
a/vllm/entrypoints/openai/generative_scores/serving.py +++ b/vllm/entrypoints/openai/generative_scoring/serving.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Generative Scores implementation for generative models. +"""Generative Scoring implementation for generative models. This module implements generative scoring functionality that computes the probability of specified token IDs appearing as the next token after a @@ -47,8 +47,8 @@ # ============================================================================ -class GenerativeScoreRequest(OpenAIBaseModel): - """Request for computing generative scores. +class GenerativeScoringRequest(OpenAIBaseModel): + """Request for computing generative scoring. Attributes: model: The model to use for scoring. Optional, follows existing patterns. @@ -104,8 +104,8 @@ class GenerativeScoreRequest(OpenAIBaseModel): ) -class GenerativeScoreItemResult(OpenAIBaseModel): - """Result for a single item in the generative scores response. +class GenerativeScoringItemResult(OpenAIBaseModel): + """Result for a single item in the generative scoring response. Attributes: index: The index of this item in the input items list. @@ -118,8 +118,8 @@ class GenerativeScoreItemResult(OpenAIBaseModel): score: float -class GenerativeScoreResponse(OpenAIBaseModel): - """Response from the generative scores computation. +class GenerativeScoringResponse(OpenAIBaseModel): + """Response from the generative scoring computation. Attributes: id: Unique identifier for this response. @@ -134,7 +134,7 @@ class GenerativeScoreResponse(OpenAIBaseModel): object: Literal["list"] = "list" created: int = Field(default_factory=lambda: int(time.time())) model: str - data: list[GenerativeScoreItemResult] + data: list[GenerativeScoringItemResult] usage: UsageInfo @@ -143,8 +143,8 @@ class GenerativeScoreResponse(OpenAIBaseModel): # ============================================================================ -class OpenAIServingGenerativeScores(OpenAIServing): - """Serving class for generative scores computation. +class OpenAIServingGenerativeScoring(OpenAIServing): + """Serving class for generative scoring computation. This class handles computing the probability of specified token IDs appearing as the next token after concatenating query and item prompts. @@ -170,20 +170,20 @@ def __init__( request_logger=request_logger, ) - async def create_generative_score( + async def create_generative_scoring( self, - request: GenerativeScoreRequest, + request: GenerativeScoringRequest, raw_request: Request | None = None, - ) -> GenerativeScoreResponse | ErrorResponse: - """Create generative scores for the given request. + ) -> GenerativeScoringResponse | ErrorResponse: + """Create generative scoring for the given request. Args: - request: The GenerativeScoreRequest containing query, items, and + request: The GenerativeScoringRequest containing query, items, and label_token_ids. raw_request: The raw FastAPI request object. Returns: - GenerativeScoreResponse with probabilities for each item, or + GenerativeScoringResponse with probabilities for each item, or ErrorResponse if an error occurred. """ # Check model @@ -199,7 +199,7 @@ async def create_generative_score( tokenizer = self.renderer.tokenizer if tokenizer is None: return self.create_error_response( - "Tokenizer not available. Cannot process generative score request." + "Tokenizer not available. Cannot process generative scoring request." 
) # Validate label_token_ids @@ -231,7 +231,7 @@ async def create_generative_score( logger.exception("Error preparing request components") return self.create_error_response(e) - request_id = f"generative-score-{self._base_request_id(raw_request, request.request_id)}" + request_id = f"generative-scoring-{self._base_request_id(raw_request, request.request_id)}" created_time = int(time.time()) # Build prompts for each item @@ -299,7 +299,7 @@ async def create_generative_score( return self.create_error_response(e) # Process results to extract label token probabilities - item_results: list[GenerativeScoreItemResult] = [] + item_results: list[GenerativeScoringItemResult] = [] total_prompt_tokens = 0 total_completion_tokens = 0 @@ -358,7 +358,7 @@ async def create_generative_score( score = token_probs[first_label_id] item_results.append( - GenerativeScoreItemResult( + GenerativeScoringItemResult( index=i, score=score, ) @@ -370,7 +370,7 @@ async def create_generative_score( # Build response model_name = self.models.model_name(lora_request) - response = GenerativeScoreResponse( + response = GenerativeScoringResponse( id=request_id, created=created_time, model=model_name, @@ -386,7 +386,7 @@ async def create_generative_score( async def _build_prompts( self, - request: GenerativeScoreRequest, + request: GenerativeScoringRequest, tokenizer, max_model_len: int, ) -> tuple[list[TokensPrompt], list[int]]: diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index ffe9602f799e..2aece7a03328 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -104,7 +104,7 @@ def forward( sampled = sampled.long() # Handle logprob_token_ids if specified (more efficient than full vocab) - # This is used by generative_score API to get logprobs for specific tokens + # This is used by generative_scoring API to get logprobs for specific tokens logprob_token_ids_tensors = None if sampling_metadata.logprob_token_ids: logprob_token_ids_tensors = self.gather_specific_token_logprobs( From 6f74962f9107f20101d9346656413d021a906932 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Thu, 26 Mar 2026 14:37:12 -0700 Subject: [PATCH 23/28] update docs to separate out pooling and gen scoring --- docs/serving/openai_compatible_server.md | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 1c06d7c1e7bf..28e643ea5f07 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -74,8 +74,10 @@ In addition, we have the following custom APIs: - Compatible with [Cohere's Embed API](https://docs.cohere.com/reference/embed) - Works with any [embedding model](../models/pooling_models/embed.md#supported-models), including multimodal models. - [Score API](../models/pooling_models/scoring.md#score-api) (`/score`, `/v1/score`) - - Applicable to [score models](../models/pooling_models/scoring.md) and [CausalLM models](../models/generative_models.md). - - For CausalLM models, computes next-token probabilities for specified `label_token_ids`. + - Applicable to [score models](../models/pooling_models/scoring.md) (cross-encoder, bi-encoder, late-interaction). +- [Generative Scoring API](#generative-scoring-api) (`/generative_scoring`) + - Applicable to [CausalLM models](../models/generative_models.md) (task `"generate"`). + - Computes next-token probabilities for specified `label_token_ids`. 
- [Rerank API](../models/pooling_models/scoring.md#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`) - Implements [Jina AI's v1 rerank API](https://jina.ai/reranker/) - Also compatible with [Cohere's v1 & v2 rerank APIs](https://docs.cohere.com/v2/reference/rerank) @@ -482,17 +484,19 @@ This approach is more robust than index-based access (`messages[0]`, `messages[1 Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja) -#### CausalLM Models (Generative Scoring) +### Generative Scoring API -When using a CausalLM model (e.g., Llama, Qwen, Mistral) with the Score API, the `/generative_scoring` endpoint computes the probability of specified token IDs appearing as the next token. Each item (document) is concatenated with the query to form a prompt, and the model predicts how likely each label token is as the next token after that prompt. This lets you score items against a query — for example, asking "Is this the capital of France?" and scoring each city by how likely the model is to answer "Yes". +The `/generative_scoring` endpoint uses a CausalLM model (e.g., Llama, Qwen, Mistral) to compute the probability of specified token IDs appearing as the next token. Each item (document) is concatenated with the query to form a prompt, and the model predicts how likely each label token is as the next token after that prompt. This lets you score items against a query — for example, asking "Is this the capital of France?" and scoring each city by how likely the model is to answer "Yes". -**Requirements for CausalLM models:** +This endpoint is automatically available when the server is started with a generative model (task `"generate"`). It is separate from the pooling-based [Score API](#score-api), which uses cross-encoder, bi-encoder, or late-interaction models. + +**Requirements:** - The `label_token_ids` parameter is **required** and must contain **at least 1 token ID**. - When 2 label tokens are provided, the score equals `P(label_token_ids[0]) / (P(label_token_ids[0]) + P(label_token_ids[1]))` (softmax over the two labels). - When more labels are provided, the score is the softmax-normalized probability of the first label token across all label tokens. -##### Example: Score with CausalLM +#### Example ```bash curl -X POST http://localhost:8000/generative_scoring \ @@ -524,7 +528,7 @@ Here, each item is appended to the query to form prompts like `"Is this city the } ``` -##### How it works +#### How it works 1. **Prompt Construction**: For each item, builds `prompt = query + item` (or `item + query` if `item_first=true`) 2. **Forward Pass**: Runs the model on each prompt to get next-token logits @@ -532,7 +536,7 @@ Here, each item is appended to the query to form prompts like `"Is this city the 4. **Softmax Normalization**: Applies softmax over only the label tokens (when `apply_softmax=true`) 5. 
**Score**: Returns the normalized probability of the first label token -##### Finding Token IDs +#### Finding Token IDs To find the token IDs for your labels, use the tokenizer: From 24f1337a02c36e07eaec167171f32fece5190ab4 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Fri, 27 Mar 2026 19:22:53 -0700 Subject: [PATCH 24/28] update engine input to use renderer and lint fixes Signed-off-by: Vedant Jhaveri --- .../test_generative_scoring.py | 24 ++++-- .../test_generative_scoring_e2e.py | 19 +++-- vllm/entrypoints/openai/api_server.py | 4 +- .../openai/generative_scoring/api_router.py | 4 +- .../openai/generative_scoring/serving.py | 76 ++++++------------- vllm/v1/sample/sampler.py | 8 +- 6 files changed, 61 insertions(+), 74 deletions(-) diff --git a/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py b/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py index 45cda905ddbf..7214ca82054b 100644 --- a/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py +++ b/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py @@ -177,10 +177,14 @@ class TestProbabilityComputation: ], ids=["softmax_basic", "softmax_extreme_values", "true_probs"], ) - def test_compute_probabilities(self, label_logprobs, apply_softmax, should_sum_to_one): + def test_compute_probabilities( + self, label_logprobs, apply_softmax, should_sum_to_one + ): """Test probability computation for softmax and true probability modes.""" serving = OpenAIServingGenerativeScoring.__new__(OpenAIServingGenerativeScoring) - probs = serving._compute_probabilities(label_logprobs, apply_softmax=apply_softmax) + probs = serving._compute_probabilities( + label_logprobs, apply_softmax=apply_softmax + ) # Verify sum behavior total = sum(probs.values()) @@ -227,8 +231,14 @@ class TestValidation: @pytest.mark.parametrize( "request_kwargs,expected_error", [ - ({"query": "q", "items": ["i"], "label_token_ids": [999999, 999998]}, "out of vocabulary"), - ({"query": "q", "items": [], "label_token_ids": [100, 200]}, "at least one item"), + ( + {"query": "q", "items": ["i"], "label_token_ids": [999999, 999998]}, + "out of vocabulary", + ), + ( + {"query": "q", "items": [], "label_token_ids": [100, 200]}, + "at least one item", + ), ], ids=["invalid_token_id", "empty_items"], ) @@ -251,7 +261,7 @@ class TestPromptBuilding: "item_first,expected", [ (False, [[100, 101, 200, 201], [100, 101, 300, 301]]), # query + item - (True, [[200, 201, 100, 101], [300, 301, 100, 101]]), # item + query + (True, [[200, 201, 100, 101], [300, 301, 100, 101]]), # item + query ], ids=["query_first", "item_first"], ) @@ -266,10 +276,10 @@ async def test_item_ordering(self, item_first, expected): label_token_ids=[500, 501], item_first=item_first, ) - engine_prompts, _ = await serving._build_prompts(request, MagicMock()) + engine_inputs, _ = await serving._build_prompts(request, MagicMock()) for i, exp in enumerate(expected): - assert engine_prompts[i]["prompt_token_ids"] == exp + assert engine_inputs[i]["prompt_token_ids"] == exp class TestGeneration: diff --git a/tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py b/tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py index 783061f17921..53923b1a668a 100644 --- a/tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py +++ b/tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py @@ -16,10 +16,13 @@ @pytest.fixture(scope="module") def server(): args = [ - "--dtype", "bfloat16", 
- "--max-model-len", "512", + "--dtype", + "bfloat16", + "--max-model-len", + "512", "--enforce-eager", - "--max-num-seqs", "32", + "--max-num-seqs", + "32", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @@ -60,7 +63,9 @@ async def test_basic_score_and_response_structure(self, server: RemoteOpenAIServ usage = data["usage"] assert usage["prompt_tokens"] > 0 assert usage["completion_tokens"] > 0 - assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] + assert ( + usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"] + ) @pytest.mark.asyncio async def test_multiple_items(self, server: RemoteOpenAIServer): @@ -110,11 +115,13 @@ async def test_validation_empty_items(self, server: RemoteOpenAIServer): @pytest.mark.parametrize( "label_token_ids,expected_status", [ - ([9999999999, 9999999998], 400), # Out of vocab range + ([9999999999, 9999999998], 400), # Out of vocab range ], ids=["invalid_token_ids"], ) - async def test_validation_errors(self, server: RemoteOpenAIServer, label_token_ids, expected_status): + async def test_validation_errors( + self, server: RemoteOpenAIServer, label_token_ids, expected_status + ): """Test validation errors for various invalid inputs.""" response = requests.post( server.url_for("generative_scoring"), diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 999cb1a5cbe1..bf23c7e2e96b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -425,9 +425,7 @@ async def init_app_state( init_generative_scoring_state, ) - await init_generative_scoring_state( - engine_client, state, args, request_logger - ) + await init_generative_scoring_state(engine_client, state, args, request_logger) state.enable_server_load_tracking = args.enable_server_load_tracking state.server_load_metrics = 0 diff --git a/vllm/entrypoints/openai/generative_scoring/api_router.py b/vllm/entrypoints/openai/generative_scoring/api_router.py index 6a49e72dc082..ed0a81d149cc 100644 --- a/vllm/entrypoints/openai/generative_scoring/api_router.py +++ b/vllm/entrypoints/openai/generative_scoring/api_router.py @@ -59,9 +59,7 @@ async def create_generative_scoring(raw_request: Request): result = await handler.create_generative_scoring(gen_request, raw_request) if isinstance(result, ErrorResponse): - return JSONResponse( - content=result.model_dump(), status_code=result.error.code - ) + return JSONResponse(content=result.model_dump(), status_code=result.error.code) elif isinstance(result, GenerativeScoringResponse): return JSONResponse(content=result.model_dump()) diff --git a/vllm/entrypoints/openai/generative_scoring/serving.py b/vllm/entrypoints/openai/generative_scoring/serving.py index e8d43e62f19d..66f00e5b8637 100644 --- a/vllm/entrypoints/openai/generative_scoring/serving.py +++ b/vllm/entrypoints/openai/generative_scoring/serving.py @@ -26,9 +26,8 @@ ) from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.models.serving import OpenAIServingModels -from vllm.inputs.data import TokensPrompt +from vllm.inputs.data import ProcessorInputs, token_inputs from vllm.logger import init_logger -from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.tracing import ( @@ -94,8 +93,7 @@ class GenerativeScoringRequest(OpenAIBaseModel): priority: int = Field( default=0, description=( - "The priority of the request (lower means 
earlier handling; " - "default: 0)." + "The priority of the request (lower means earlier handling; default: 0)." ), ) request_id: str = Field( @@ -218,9 +216,7 @@ async def create_generative_scoring( # Validate items if len(request.items) == 0: - return self.create_error_response( - "items must contain at least one item." - ) + return self.create_error_response("items must contain at least one item.") # Note: Mixed item types (string and token list) are validated by # Pydantic at request parsing time, so we don't need to check here. @@ -231,12 +227,13 @@ async def create_generative_scoring( logger.exception("Error preparing request components") return self.create_error_response(e) - request_id = f"generative-scoring-{self._base_request_id(raw_request, request.request_id)}" + base_id = self._base_request_id(raw_request, request.request_id) + request_id = f"generative-scoring-{base_id}" created_time = int(time.time()) # Build prompts for each item try: - engine_prompts, prompt_token_counts = await self._build_prompts( + engine_inputs, prompt_token_counts = await self._build_prompts( request, tokenizer, self.model_config.max_model_len ) except (ValueError, TypeError) as e: @@ -263,20 +260,20 @@ async def create_generative_scoring( else await self._get_trace_headers(raw_request.headers) ) - # Schedule requests for all prompts + # Schedule requests for all inputs generators: list[AsyncGenerator[RequestOutput, None]] = [] - for i, engine_prompt in enumerate(engine_prompts): + for i, engine_input in enumerate(engine_inputs): request_id_item = f"{request_id}-{i}" self._log_inputs( request_id_item, - engine_prompt, + engine_input, params=sampling_params, lora_request=lora_request, ) generator = self.engine_client.generate( - engine_prompt, + engine_input, sampling_params, request_id_item, lora_request=lora_request, @@ -287,7 +284,7 @@ async def create_generative_scoring( # Collect results result_generator = merge_async_iterators(*generators) - results: list[RequestOutput | None] = [None] * len(engine_prompts) + results: list[RequestOutput | None] = [None] * len(engine_inputs) try: async for i, res in result_generator: @@ -311,15 +308,11 @@ async def create_generative_scoring( # Check for errors if result.outputs and result.outputs[0].finish_reason == "error": - return self.create_error_response( - f"Generation error for item {i}" - ) + return self.create_error_response(f"Generation error for item {i}") # Get logprobs from the generated token if not result.outputs or len(result.outputs) == 0: - return self.create_error_response( - f"No output generated for item {i}" - ) + return self.create_error_response(f"No output generated for item {i}") output = result.outputs[0] if output.logprobs is None or len(output.logprobs) == 0: @@ -389,16 +382,19 @@ async def _build_prompts( request: GenerativeScoringRequest, tokenizer, max_model_len: int, - ) -> tuple[list[TokensPrompt], list[int]]: + ) -> tuple[list[ProcessorInputs], list[int]]: """Build prompts by concatenating query and items. + Uses the Renderer's tokenizer to tokenize text inputs, then + creates ProcessorInputs via token_inputs() for engine consumption. + Args: request: The request containing query, items, and settings. tokenizer: The tokenizer to use. max_model_len: Maximum model context length for truncation. Returns: - Tuple of (list of TokensPrompt, list of prompt token counts). + Tuple of (list of ProcessorInputs, list of prompt token counts). 
""" # Tokenize query if it's a string if isinstance(request.query, str): @@ -409,7 +405,7 @@ async def _build_prompts( else: query_token_ids = request.query - engine_prompts: list[TokensPrompt] = [] + engine_inputs: list[ProcessorInputs] = [] prompt_token_counts: list[int] = [] for item in request.items: @@ -434,12 +430,10 @@ async def _build_prompts( if len(prompt_token_ids) > max_prompt_len: prompt_token_ids = prompt_token_ids[:max_prompt_len] - engine_prompts.append( - TokensPrompt(prompt_token_ids=prompt_token_ids) - ) + engine_inputs.append(token_inputs(prompt_token_ids)) prompt_token_counts.append(len(prompt_token_ids)) - return engine_prompts, prompt_token_counts + return engine_inputs, prompt_token_counts def _compute_probabilities( self, @@ -470,8 +464,7 @@ def _compute_probabilities( sum_exp = sum(exp_values.values()) return { - token_id: exp_val / sum_exp - for token_id, exp_val in exp_values.items() + token_id: exp_val / sum_exp for token_id, exp_val in exp_values.items() } else: # Return true model probabilities @@ -505,26 +498,7 @@ def _base_request_id( if request_id: return request_id if raw_request: - return getattr(raw_request.state, "request_id", None) or \ - str(id(raw_request)) + return getattr(raw_request.state, "request_id", None) or str( + id(raw_request) + ) return random_uuid() - - def _log_inputs( - self, - request_id: str, - prompt: TokensPrompt, - params: SamplingParams, - lora_request: LoRARequest | None, - ) -> None: - """Log request inputs.""" - if self.request_logger is None: - return - - self.request_logger.log_inputs( - request_id=request_id, - prompt=str(prompt.get("prompt_token_ids", [])[:10]) + "...", - prompt_token_ids=None, - prompt_embeds=None, - params=params, - lora_request=lora_request, - ) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 2aece7a03328..acb969360f46 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -124,7 +124,7 @@ def forward( raw_logprobs, num_logprobs, token_ids=sampled ) - # If we have both num_logprobs and logprob_token_ids, prefer + # If we have both num_logprobs and logprob_token_ids, prefer # logprob_token_ids as it's more specific if logprob_token_ids_tensors is not None and num_logprobs is not None: logprobs_tensors = logprob_token_ids_tensors @@ -192,16 +192,16 @@ def gather_specific_token_logprobs( # Fill in token IDs for each request for req_idx, token_ids in logprob_token_ids.items(): num_tokens = len(token_ids) - token_ids_tensor[req_idx, 1:num_tokens + 1] = torch.tensor( + token_ids_tensor[req_idx, 1 : num_tokens + 1] = torch.tensor( token_ids, dtype=torch.int64, device=device ) - valid_mask[req_idx, 1:num_tokens + 1] = True + valid_mask[req_idx, 1 : num_tokens + 1] = True # Compute logprobs using the fused Triton kernel (log_softmax + gather) logprobs = compute_token_logprobs(logits, token_ids_tensor) # Mask invalid (padded) positions with -inf - logprobs = logprobs.masked_fill(~valid_mask, float('-inf')) + logprobs = logprobs.masked_fill(~valid_mask, float("-inf")) # Compute ranks for the sampled token sampled_logits = logits.gather(-1, sampled.unsqueeze(-1)) From 6fce10b37698eeca1978af2ff7b03dc0bcbea577 Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Sun, 29 Mar 2026 22:15:23 -0700 Subject: [PATCH 25/28] fix changes after integrating upstream for failing ci and testing locally Signed-off-by: Vedant Jhaveri --- .../test_generative_scoring.py | 4 ++- .../openai/generative_scoring/serving.py | 32 ++++++------------- 2 files changed, 12 insertions(+), 24 

diff --git a/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py b/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py
index 7214ca82054b..b110454315c9 100644
--- a/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py
+++ b/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py
@@ -276,7 +276,9 @@ async def test_item_ordering(self, item_first, expected):
             label_token_ids=[500, 501],
             item_first=item_first,
         )
-        engine_inputs, _ = await serving._build_prompts(request, MagicMock())
+        engine_inputs, _ = await serving._build_prompts(
+            request, MagicMock(), max_model_len=4096
+        )

         for i, exp in enumerate(expected):
             assert engine_inputs[i]["prompt_token_ids"] == exp
diff --git a/vllm/entrypoints/openai/generative_scoring/serving.py b/vllm/entrypoints/openai/generative_scoring/serving.py
index 66f00e5b8637..4bdc65480712 100644
--- a/vllm/entrypoints/openai/generative_scoring/serving.py
+++ b/vllm/entrypoints/openai/generative_scoring/serving.py
@@ -26,7 +26,7 @@
 )
 from vllm.entrypoints.openai.engine.serving import OpenAIServing
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
-from vllm.inputs.data import ProcessorInputs, token_inputs
+from vllm.inputs import EngineInput, tokens_input
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.tracing import (
@@ -185,7 +185,7 @@ async def create_generative_scoring(
             ErrorResponse if an error occurred.
         """
         # Check model
-        error_check_ret = await self._check_model(request)
+        error_check_ret = await self._check_model(request)  # type: ignore[arg-type]
         if error_check_ret is not None:
             return error_check_ret
@@ -222,12 +222,12 @@ async def create_generative_scoring(
         # Pydantic at request parsing time, so we don't need to check here.

         try:
-            lora_request = self._maybe_get_adapters(request)
+            lora_request = self._maybe_get_adapters(request)  # type: ignore[arg-type]
         except (ValueError, TypeError, RuntimeError) as e:
             logger.exception("Error preparing request components")
             return self.create_error_response(e)

-        base_id = self._base_request_id(raw_request, request.request_id)
+        base_id = self._base_request_id(raw_request, default=request.request_id)
         request_id = f"generative-scoring-{base_id}"
         created_time = int(time.time())
@@ -382,11 +382,11 @@ async def _build_prompts(
         request: GenerativeScoringRequest,
         tokenizer,
         max_model_len: int,
-    ) -> tuple[list[ProcessorInputs], list[int]]:
+    ) -> tuple[list[EngineInput], list[int]]:
         """Build prompts by concatenating query and items.

         Uses the Renderer's tokenizer to tokenize text inputs, then
-        creates ProcessorInputs via token_inputs() for engine consumption.
+        creates EngineInput via tokens_input() for engine consumption.

         Args:
             request: The request containing query, items, and settings.
@@ -394,7 +394,7 @@ async def _build_prompts(
             max_model_len: Maximum model context length for truncation.

         Returns:
-            Tuple of (list of ProcessorInputs, list of prompt token counts).
+            Tuple of (list of EngineInput, list of prompt token counts).
""" # Tokenize query if it's a string if isinstance(request.query, str): @@ -405,7 +405,7 @@ async def _build_prompts( else: query_token_ids = request.query - engine_inputs: list[ProcessorInputs] = [] + engine_inputs: list[EngineInput] = [] prompt_token_counts: list[int] = [] for item in request.items: @@ -430,7 +430,7 @@ async def _build_prompts( if len(prompt_token_ids) > max_prompt_len: prompt_token_ids = prompt_token_ids[:max_prompt_len] - engine_inputs.append(token_inputs(prompt_token_ids)) + engine_inputs.append(tokens_input(prompt_token_ids)) prompt_token_counts.append(len(prompt_token_ids)) return engine_inputs, prompt_token_counts @@ -488,17 +488,3 @@ async def _get_trace_headers( return None return extract_trace_headers(headers) - - def _base_request_id( - self, - raw_request: Request | None, - request_id: str | None, - ) -> str: - """Get base request ID from raw request or generate one.""" - if request_id: - return request_id - if raw_request: - return getattr(raw_request.state, "request_id", None) or str( - id(raw_request) - ) - return random_uuid() From 264dd940ea4aa551c431bd52205bef0a51f3995c Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Mon, 30 Mar 2026 08:04:07 -0700 Subject: [PATCH 26/28] fix failing metadata CI Signed-off-by: Vedant Jhaveri --- vllm/v1/sample/metadata.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index fda90bec6b96..4682cde1098b 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -22,11 +22,6 @@ class SamplingMetadata: # None means no logprobs, 0 means sampled token logprobs only max_num_logprobs: int | None - # Specific token IDs to compute logprobs for (more efficient than full vocab) - # When set, logprobs are computed only for these token IDs using gather - # req_index -> list of token IDs to get logprobs for - logprob_token_ids: dict[int, list[int]] | None - no_penalties: bool prompt_token_ids: torch.Tensor | None frequency_penalties: torch.Tensor @@ -45,5 +40,10 @@ class SamplingMetadata: # Loaded logits processors logitsprocs: LogitsProcessors + # Specific token IDs to compute logprobs for (more efficient than full vocab) + # When set, logprobs are computed only for these token IDs using gather + # req_index -> list of token IDs to get logprobs for + logprob_token_ids: dict[int, list[int]] | None = None + # Speculative token ids spec_token_ids: list[list[int]] | None = None From 814db14a4c282e2ce3a90aa5a418a0f2f177db6e Mon Sep 17 00:00:00 2001 From: Vedant Jhaveri Date: Mon, 30 Mar 2026 08:09:26 -0700 Subject: [PATCH 27/28] Fix test mocks: add renderer to mock engine, fix expected status code Signed-off-by: Vedant Jhaveri --- .../openai/generative_scoring/test_generative_scoring.py | 6 ++++++ .../generative_scoring/test_generative_scoring_e2e.py | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py b/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py index b110454315c9..a260027af0fc 100644 --- a/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py +++ b/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py @@ -78,6 +78,12 @@ def _create_mock_engine(): mock_engine.model_config = MockModelConfig() mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() + + # renderer is accessed by OpenAIServing.__init__ and serving.py + mock_renderer = MagicMock() + 
+    mock_renderer.tokenizer = get_tokenizer(MODEL_NAME)
+    mock_engine.renderer = mock_renderer
+
     return mock_engine
diff --git a/tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py b/tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py
index 53923b1a668a..64a59b270f14 100644
--- a/tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py
+++ b/tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py
@@ -94,8 +94,8 @@ async def test_validation_missing_label_token_ids(self, server: RemoteOpenAIServ
                 "items": ["item1", "item2"],
             },
         )
-        # Pydantic validation error for missing required field
-        assert response.status_code == 422
+        # Missing required field returns 400 (manual JSON parsing)
+        assert response.status_code == 400

     @pytest.mark.asyncio
     async def test_validation_empty_items(self, server: RemoteOpenAIServer):

From c4c0c1d1ff4633cc20be5684182d9d0025f37dae Mon Sep 17 00:00:00 2001
From: Vedant Jhaveri
Date: Tue, 31 Mar 2026 10:35:09 -0700
Subject: [PATCH 28/28] Add type annotation for tokenizer parameter to fix docs build

Signed-off-by: Vedant Jhaveri
---
 vllm/entrypoints/openai/generative_scoring/serving.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/generative_scoring/serving.py b/vllm/entrypoints/openai/generative_scoring/serving.py
index 4bdc65480712..fd8f89cadadf 100644
--- a/vllm/entrypoints/openai/generative_scoring/serving.py
+++ b/vllm/entrypoints/openai/generative_scoring/serving.py
@@ -30,6 +30,7 @@
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
+from vllm.tokenizers import TokenizerLike
 from vllm.tracing import (
     contains_trace_headers,
     extract_trace_headers,
@@ -380,7 +381,7 @@ async def create_generative_scoring(
     async def _build_prompts(
         self,
         request: GenerativeScoringRequest,
-        tokenizer,
+        tokenizer: TokenizerLike,
         max_model_len: int,
     ) -> tuple[list[EngineInput], list[int]]:
         """Build prompts by concatenating query and items.