From 893be3bef8f5c7c7329e516eb70febc0676d1fc7 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 1 Jun 2026 11:31:49 +0800 Subject: [PATCH 1/7] init Signed-off-by: wang.yuqi --- .buildkite/test-amd.yaml | 4 +++ .buildkite/test_areas/entrypoints.yaml | 4 ++- .../entrypoints}/generate/__init__.py | 0 .../generative_scoring/__init__.py | 0 .../test_generative_scoring.py | 16 ++++----- .../test_generative_scoring_e2e.py | 2 +- .../{openai => }/generate/api_router.py | 12 +++++++ .../{openai => }/generate/factories.py | 0 .../generative_scoring/__init__.py | 0 .../generative_scoring/api_router.py | 34 ++++--------------- .../generative_scoring/serving.py | 2 +- vllm/entrypoints/openai/api_server.py | 16 ++------- 12 files changed, 37 insertions(+), 53 deletions(-) rename {vllm/entrypoints/openai => tests/entrypoints}/generate/__init__.py (100%) rename tests/entrypoints/{openai => generate}/generative_scoring/__init__.py (100%) rename tests/entrypoints/{openai => generate}/generative_scoring/test_generative_scoring.py (95%) rename tests/entrypoints/{openai => generate}/generative_scoring/test_generative_scoring_e2e.py (99%) rename vllm/entrypoints/{openai => }/generate/api_router.py (95%) rename vllm/entrypoints/{openai => }/generate/factories.py (100%) rename vllm/entrypoints/{openai => generate}/generative_scoring/__init__.py (100%) rename vllm/entrypoints/{openai => generate}/generative_scoring/api_router.py (66%) rename vllm/entrypoints/{openai => generate}/generative_scoring/serving.py (99%) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 50c13e4a89a6..620e5b66c62e 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1275,10 +1275,12 @@ steps: - vllm/ - tests/entrypoints/openai - tests/entrypoints/test_chat_utils + - tests/entrypoints/generate commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/test_chat_utils.py + - pytest -v -s tests/entrypoints/generate - label: Entrypoints Integration (API Server openai - Part 3) # TBD timeout_in_minutes: 180 @@ -2782,10 +2784,12 @@ steps: - vllm/ - tests/entrypoints/openai - tests/entrypoints/test_chat_utils + - tests/entrypoints/generate commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/test_chat_utils.py + - pytest -v -s tests/entrypoints/generate - label: Entrypoints Integration (API Server openai - Part 3) # TBD timeout_in_minutes: 180 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 1ae8c79fab7f..b9f6806809f8 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -11,7 +11,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate - label: Entrypoints Integration (LLM) key: entrypoints-integration-llm @@ -60,9 +60,11 @@ steps: - vllm/ - tests/entrypoints/openai - tests/entrypoints/test_chat_utils + - tests/entrypoints/generate commands: - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/test_chat_utils.py + - pytest -v -s tests/entrypoints/generate mirror: amd: device: mi325_1 diff --git a/vllm/entrypoints/openai/generate/__init__.py b/tests/entrypoints/generate/__init__.py similarity index 100% rename from vllm/entrypoints/openai/generate/__init__.py rename to tests/entrypoints/generate/__init__.py diff --git a/tests/entrypoints/openai/generative_scoring/__init__.py b/tests/entrypoints/generate/generative_scoring/__init__.py similarity index 100% rename from tests/entrypoints/openai/generative_scoring/__init__.py rename to tests/entrypoints/generate/generative_scoring/__init__.py diff --git a/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py b/tests/entrypoints/generate/generative_scoring/test_generative_scoring.py similarity index 95% rename from tests/entrypoints/openai/generative_scoring/test_generative_scoring.py rename to tests/entrypoints/generate/generative_scoring/test_generative_scoring.py index 632c4bcc90ae..d80082992297 100644 --- a/tests/entrypoints/openai/generative_scoring/test_generative_scoring.py +++ b/tests/entrypoints/generate/generative_scoring/test_generative_scoring.py @@ -18,13 +18,13 @@ import pytest from vllm.config.multimodal import MultiModalConfig -from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.generative_scoring.serving import ( +from vllm.entrypoints.generate.generative_scoring.serving import ( GenerativeScoringItemResult, GenerativeScoringRequest, GenerativeScoringResponse, - OpenAIServingGenerativeScoring, + ServingGenerativeScoring, ) +from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.models.protocol import BaseModelPath from vllm.entrypoints.openai.models.serving import OpenAIServingModels from vllm.logprobs import Logprob @@ -86,13 +86,13 @@ def _create_mock_engine(): return mock_engine -def _create_serving(mock_engine) -> OpenAIServingGenerativeScoring: - """Create an OpenAIServingGenerativeScoring instance with mocks.""" +def _create_serving(mock_engine) -> ServingGenerativeScoring: + """Create an ServingGenerativeScoring instance with mocks.""" models = OpenAIServingModels( engine_client=mock_engine, base_model_paths=BASE_MODEL_PATHS, ) - return OpenAIServingGenerativeScoring(mock_engine, models, request_logger=None) + return ServingGenerativeScoring(mock_engine, models, request_logger=None) def _create_mock_request_output(logprobs_dict: dict[int, float]) -> RequestOutput: @@ -186,7 +186,7 @@ def test_compute_probabilities( self, label_logprobs, apply_softmax, should_sum_to_one ): """Test probability computation for softmax and true probability modes.""" - serving = OpenAIServingGenerativeScoring.__new__(OpenAIServingGenerativeScoring) + serving = ServingGenerativeScoring.__new__(ServingGenerativeScoring) probs = serving._compute_probabilities( label_logprobs, apply_softmax=apply_softmax ) @@ -211,7 +211,7 @@ def test_compute_probabilities( def test_score_formula(self): """Test the score formula: P(token[0]) / (P(token[0]) + P(token[1])).""" - serving = OpenAIServingGenerativeScoring.__new__(OpenAIServingGenerativeScoring) + serving = ServingGenerativeScoring.__new__(ServingGenerativeScoring) # With logprobs -0.5 and -2.0, softmax gives higher prob to first token logprobs = {9454: -0.5, 2753: -2.0} diff --git a/tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py b/tests/entrypoints/generate/generative_scoring/test_generative_scoring_e2e.py similarity index 99% rename from tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py rename to tests/entrypoints/generate/generative_scoring/test_generative_scoring_e2e.py index 64a59b270f14..4fe8dbe791bc 100644 --- a/tests/entrypoints/openai/generative_scoring/test_generative_scoring_e2e.py +++ b/tests/entrypoints/generate/generative_scoring/test_generative_scoring_e2e.py @@ -8,7 +8,7 @@ import pytest import requests -from ....utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer MODEL_NAME = "Qwen/Qwen3-0.6B" diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/generate/api_router.py similarity index 95% rename from vllm/entrypoints/openai/generate/api_router.py rename to vllm/entrypoints/generate/api_router.py index 84a7fddeabe3..55f9493ac477 100644 --- a/vllm/entrypoints/openai/generate/api_router.py +++ b/vllm/entrypoints/generate/api_router.py @@ -41,6 +41,10 @@ def register_generate_api_routers(app: FastAPI): register_anthropic_api_router(app) + from generative_scoring.api_router import register_generative_scoring_api_router + + register_generative_scoring_api_router(app) + async def init_generate_state( engine_client: "EngineClient", @@ -185,3 +189,11 @@ async def init_generate_state( if "generate" in supported_tasks else None ) + + from .generative_scoring.serving import ServingGenerativeScoring + + state.generative_scoring = ServingGenerativeScoring( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + ) diff --git a/vllm/entrypoints/openai/generate/factories.py b/vllm/entrypoints/generate/factories.py similarity index 100% rename from vllm/entrypoints/openai/generate/factories.py rename to vllm/entrypoints/generate/factories.py diff --git a/vllm/entrypoints/openai/generative_scoring/__init__.py b/vllm/entrypoints/generate/generative_scoring/__init__.py similarity index 100% rename from vllm/entrypoints/openai/generative_scoring/__init__.py rename to vllm/entrypoints/generate/generative_scoring/__init__.py diff --git a/vllm/entrypoints/openai/generative_scoring/api_router.py b/vllm/entrypoints/generate/generative_scoring/api_router.py similarity index 66% rename from vllm/entrypoints/openai/generative_scoring/api_router.py rename to vllm/entrypoints/generate/generative_scoring/api_router.py index ed0a81d149cc..2d3570252e35 100644 --- a/vllm/entrypoints/openai/generative_scoring/api_router.py +++ b/vllm/entrypoints/generate/generative_scoring/api_router.py @@ -6,29 +6,24 @@ from fastapi import APIRouter, Depends, FastAPI, Request from fastapi.responses import JSONResponse -from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.generative_scoring.serving import ( +from vllm.entrypoints.generate.generative_scoring.serving import ( GenerativeScoringResponse, - OpenAIServingGenerativeScoring, + ServingGenerativeScoring, ) +from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.utils import load_aware_call, with_cancellation from vllm.logger import init_logger if TYPE_CHECKING: - from argparse import Namespace - - from starlette.datastructures import State - - from vllm.engine.protocol import EngineClient - from vllm.entrypoints.logger import RequestLogger + pass router = APIRouter() logger = init_logger(__name__) -def generative_scoring(request: Request) -> OpenAIServingGenerativeScoring | None: +def generative_scoring(request: Request) -> ServingGenerativeScoring | None: return request.app.state.serving_generative_scoring @@ -51,7 +46,7 @@ async def create_generative_scoring(raw_request: Request): raw_body = await raw_request.json() - from vllm.entrypoints.openai.generative_scoring.serving import ( + from vllm.entrypoints.generate.generative_scoring.serving import ( GenerativeScoringRequest, ) @@ -68,20 +63,3 @@ async def create_generative_scoring(raw_request: Request): def register_generative_scoring_api_router(app: FastAPI): app.include_router(router) - - -async def init_generative_scoring_state( - engine_client: "EngineClient", - state: "State", - args: "Namespace", - request_logger: "RequestLogger | None", -): - from vllm.entrypoints.openai.generative_scoring.serving import ( - OpenAIServingGenerativeScoring, - ) - - state.serving_generative_scoring = OpenAIServingGenerativeScoring( - engine_client, - state.openai_serving_models, - request_logger=request_logger, - ) diff --git a/vllm/entrypoints/openai/generative_scoring/serving.py b/vllm/entrypoints/generate/generative_scoring/serving.py similarity index 99% rename from vllm/entrypoints/openai/generative_scoring/serving.py rename to vllm/entrypoints/generate/generative_scoring/serving.py index fd8f89cadadf..0592d0b29afb 100644 --- a/vllm/entrypoints/openai/generative_scoring/serving.py +++ b/vllm/entrypoints/generate/generative_scoring/serving.py @@ -142,7 +142,7 @@ class GenerativeScoringResponse(OpenAIBaseModel): # ============================================================================ -class OpenAIServingGenerativeScoring(OpenAIServing): +class ServingGenerativeScoring(OpenAIServing): """Serving class for generative scoring computation. This class handles computing the probability of specified token IDs diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 461128ed9053..5455f1ca427b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -196,7 +196,7 @@ def build_app( register_sagemaker_api_router(app, supported_tasks, model_config) if "generate" in supported_tasks: - from vllm.entrypoints.openai.generate.api_router import ( + from vllm.entrypoints.generate.api_router import ( register_generate_api_routers, ) @@ -220,12 +220,6 @@ def build_app( elastic_ep_attach_router(app) - from vllm.entrypoints.openai.generative_scoring.api_router import ( - register_generative_scoring_api_router, - ) - - register_generative_scoring_api_router(app) - if "generate" in supported_tasks or "render" in supported_tasks: from vllm.entrypoints.serve.render.api_router import ( attach_router as attach_render_router, @@ -402,18 +396,12 @@ async def init_app_state( ) if "generate" in supported_tasks: - from vllm.entrypoints.openai.generate.api_router import init_generate_state + from vllm.entrypoints.generate.api_router import init_generate_state await init_generate_state( engine_client, state, args, request_logger, supported_tasks ) - from vllm.entrypoints.openai.generative_scoring.api_router import ( - init_generative_scoring_state, - ) - - await init_generative_scoring_state(engine_client, state, args, request_logger) - if "transcription" in supported_tasks or "realtime" in supported_tasks: from vllm.entrypoints.speech_to_text.factories import init_speech_to_text_state From a0d92c77207f00ebbc03781b5f291bfa1df2dff6 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 1 Jun 2026 11:36:14 +0800 Subject: [PATCH 2/7] refine Signed-off-by: wang.yuqi --- vllm/entrypoints/generate/generative_scoring/api_router.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/entrypoints/generate/generative_scoring/api_router.py b/vllm/entrypoints/generate/generative_scoring/api_router.py index 2d3570252e35..e6918b7f03b0 100644 --- a/vllm/entrypoints/generate/generative_scoring/api_router.py +++ b/vllm/entrypoints/generate/generative_scoring/api_router.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from http import HTTPStatus -from typing import TYPE_CHECKING from fastapi import APIRouter, Depends, FastAPI, Request from fastapi.responses import JSONResponse @@ -15,9 +14,6 @@ from vllm.entrypoints.utils import load_aware_call, with_cancellation from vllm.logger import init_logger -if TYPE_CHECKING: - pass - router = APIRouter() logger = init_logger(__name__) From cf8d93aa28497cbf1f109683bb6d701e81eb599d Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 1 Jun 2026 12:42:49 +0800 Subject: [PATCH 3/7] update test-amd.yaml Signed-off-by: wang.yuqi --- .buildkite/test-amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 620e5b66c62e..ee43e1b357e4 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1370,7 +1370,7 @@ steps: - vllm/platforms/rocm.py commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling --ignore=entrypoints/speech_to_text --ignore=tests/entrypoints/generate - label: OpenAI API correctness # TBD timeout_in_minutes: 180 From 3f61ff4eef3453e9f6d70bfc46c4f97dfb47165e Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 1 Jun 2026 12:43:24 +0800 Subject: [PATCH 4/7] refine Signed-off-by: wang.yuqi --- vllm/entrypoints/generate/api_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/generate/api_router.py b/vllm/entrypoints/generate/api_router.py index 55f9493ac477..099a7f649d31 100644 --- a/vllm/entrypoints/generate/api_router.py +++ b/vllm/entrypoints/generate/api_router.py @@ -41,7 +41,7 @@ def register_generate_api_routers(app: FastAPI): register_anthropic_api_router(app) - from generative_scoring.api_router import register_generative_scoring_api_router + from .generative_scoring.api_router import register_generative_scoring_api_router register_generative_scoring_api_router(app) From df83e231fc98636408e43804f923b107f21f2d35 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 1 Jun 2026 13:13:57 +0800 Subject: [PATCH 5/7] fix Signed-off-by: wang.yuqi --- vllm/entrypoints/sagemaker/api_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/sagemaker/api_router.py b/vllm/entrypoints/sagemaker/api_router.py index b3b11cd07b4c..00dd7db28181 100644 --- a/vllm/entrypoints/sagemaker/api_router.py +++ b/vllm/entrypoints/sagemaker/api_router.py @@ -11,9 +11,9 @@ from fastapi.responses import JSONResponse, Response from vllm.config import ModelConfig +from vllm.entrypoints.generate.factories import get_generate_invocation_types from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.serving import OpenAIServing -from vllm.entrypoints.openai.generate.factories import get_generate_invocation_types from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.pooling.base.serving import PoolingServingBase from vllm.entrypoints.pooling.factories import get_pooling_invocation_types From 73ca9247c245264add1913f81971822ac6edfb99 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 1 Jun 2026 13:24:57 +0800 Subject: [PATCH 6/7] fix Signed-off-by: wang.yuqi --- vllm/entrypoints/generate/api_router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/generate/api_router.py b/vllm/entrypoints/generate/api_router.py index 099a7f649d31..713e2566bc56 100644 --- a/vllm/entrypoints/generate/api_router.py +++ b/vllm/entrypoints/generate/api_router.py @@ -192,7 +192,7 @@ async def init_generate_state( from .generative_scoring.serving import ServingGenerativeScoring - state.generative_scoring = ServingGenerativeScoring( + state.serving_generative_scoring = ServingGenerativeScoring( engine_client, state.openai_serving_models, request_logger=request_logger, From efd6e06b57ef7b2dfff97f879dc2ba6437dd3b38 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 1 Jun 2026 14:06:57 +0800 Subject: [PATCH 7/7] fix Signed-off-by: wang.yuqi --- .buildkite/test-amd.yaml | 4 ++-- .buildkite/test_areas/entrypoints.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index ee43e1b357e4..a04e88b3d7e6 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1280,7 +1280,7 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/test_chat_utils.py - - pytest -v -s tests/entrypoints/generate + - pytest -v -s entrypoints/generate - label: Entrypoints Integration (API Server openai - Part 3) # TBD timeout_in_minutes: 180 @@ -2789,7 +2789,7 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/test_chat_utils.py - - pytest -v -s tests/entrypoints/generate + - pytest -v -s entrypoints/generate - label: Entrypoints Integration (API Server openai - Part 3) # TBD timeout_in_minutes: 180 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index b9f6806809f8..ebaec9954a33 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -64,7 +64,7 @@ steps: commands: - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/test_chat_utils.py - - pytest -v -s tests/entrypoints/generate + - pytest -v -s entrypoints/generate mirror: amd: device: mi325_1