diff --git a/tests/entrypoints/openai/responses/test_errors.py b/tests/entrypoints/openai/responses/test_errors.py index f8ea17828883..7daa3d1fb58f 100644 --- a/tests/entrypoints/openai/responses/test_errors.py +++ b/tests/entrypoints/openai/responses/test_errors.py @@ -6,8 +6,8 @@ import pytest -from vllm.entrypoints.openai.protocol import ErrorResponse -from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing +from vllm.entrypoints.openai.engine.protocol import ErrorResponse +from vllm.entrypoints.openai.engine.serving import GenerationError, OpenAIServing @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/responses/test_function_call_parsing.py b/tests/entrypoints/openai/responses/test_function_call_parsing.py index 3c5a11c867eb..d487759c1af2 100644 --- a/tests/entrypoints/openai/responses/test_function_call_parsing.py +++ b/tests/entrypoints/openai/responses/test_function_call_parsing.py @@ -7,7 +7,7 @@ import pytest from openai.types.responses import ResponseFunctionToolCall -from vllm.entrypoints.openai.protocol import ResponsesRequest +from vllm.entrypoints.openai.engine.protocol import ResponsesRequest def test_function_call_dict_converted_to_object(): @@ -253,7 +253,7 @@ def test_function_call_validation_failure_logs_debug(caplog): } # Mock the logger to verify debug was called - with patch("vllm.entrypoints.openai.protocol.logger") as mock_logger: + with patch("vllm.entrypoints.openai.engine.protocol.logger") as mock_logger: with pytest.raises(ValueError): ResponsesRequest(**request_data) diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py index 4af4dd88b08f..c5e82d147f7b 100644 --- a/tests/entrypoints/openai/test_chat_error.py +++ b/tests/entrypoints/openai/test_chat_error.py @@ -9,8 +9,9 @@ import pytest from vllm.config.multimodal import MultiModalConfig -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat +from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.outputs import CompletionOutput, RequestOutput from vllm.tokenizers import get_tokenizer diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index 77087ac21ea8..961ad40ca2c3 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -5,7 +5,7 @@ from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template -from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest from vllm.tokenizers import get_tokenizer from ...models.registry import HF_EXAMPLE_MODELS diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py index e1eb6d2e3024..9b4539d4766b 100644 --- a/tests/entrypoints/openai/test_completion_error.py +++ b/tests/entrypoints/openai/test_completion_error.py @@ -9,7 +9,7 @@ import pytest from vllm.config.multimodal import MultiModalConfig -from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse +from vllm.entrypoints.openai.engine.protocol import CompletionRequest, ErrorResponse from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.outputs import CompletionOutput, RequestOutput diff --git a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py index fbfae4f268d5..d8ee91f77834 100644 --- a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py +++ b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py @@ -8,7 +8,7 @@ import pytest -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( StructuredOutputsParams, ) from vllm.entrypoints.tool_server import ToolServer diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index ced230aff029..f740e7968ac8 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -9,7 +9,7 @@ import pytest from vllm.config.multimodal import MultiModalConfig -from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse +from vllm.entrypoints.openai.engine.protocol import CompletionRequest, ErrorResponse from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.lora.request import LoRARequest diff --git a/tests/entrypoints/openai/test_protocol.py b/tests/entrypoints/openai/test_protocol.py index e9b1cfb58b50..c4baf2f7784c 100644 --- a/tests/entrypoints/openai/test_protocol.py +++ b/tests/entrypoints/openai/test_protocol.py @@ -4,7 +4,10 @@ Message, ) -from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages +from vllm.entrypoints.openai.engine.protocol import ( + serialize_message, + serialize_messages, +) def test_serialize_message() -> None: diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index d8a296e5f09d..9e4ce94a110f 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -11,14 +11,16 @@ from openai import OpenAI from vllm.config.multimodal import MultiModalConfig -from vllm.entrypoints.openai.parser.harmony_utils import get_encoding -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionResponse, +) +from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, RequestResponseMetadata, ) -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.parser.harmony_utils import get_encoding from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.outputs import CompletionOutput, RequestOutput from vllm.tokenizers import get_tokenizer @@ -1517,12 +1519,12 @@ class TestCreateRemainingArgsDelta: def test_preserves_id_type_name(self): """Test that id, type, and name are preserved from original delta.""" - from vllm.entrypoints.openai.protocol import ( + from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat + from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, ) - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat original_delta = DeltaMessage( tool_calls=[ @@ -1552,12 +1554,12 @@ def test_preserves_id_type_name(self): def test_matches_by_index(self): """Test that the correct tool call is matched by index.""" - from vllm.entrypoints.openai.protocol import ( + from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat + from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, ) - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat original_delta = DeltaMessage( tool_calls=[ @@ -1588,12 +1590,12 @@ def test_matches_by_index(self): def test_no_matching_tool_call(self): """Test graceful handling when no matching tool call is found.""" - from vllm.entrypoints.openai.protocol import ( + from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat + from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, ) - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat original_delta = DeltaMessage( tool_calls=[ @@ -1620,8 +1622,8 @@ def test_no_matching_tool_call(self): def test_function_is_none(self): """Test handling when original tool call has no function.""" - from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall - from vllm.entrypoints.openai.serving_chat import OpenAIServingChat + from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat + from vllm.entrypoints.openai.engine.protocol import DeltaMessage, DeltaToolCall original_delta = DeltaMessage( tool_calls=[ diff --git a/tests/entrypoints/openai/test_serving_chat_stream_harmony.py b/tests/entrypoints/openai/test_serving_chat_stream_harmony.py index 1934d43d5cfb..b5483a2dff31 100644 --- a/tests/entrypoints/openai/test_serving_chat_stream_harmony.py +++ b/tests/entrypoints/openai/test_serving_chat_stream_harmony.py @@ -9,7 +9,7 @@ import pytest -from vllm.entrypoints.openai.serving_chat_stream_harmony import ( +from vllm.entrypoints.openai.chat_completion.stream_harmony import ( extract_harmony_streaming_delta, ) @@ -82,7 +82,7 @@ def test_analysis_channel_reasoning(self, include_reasoning, expected_has_messag assert tools_streamed is False @pytest.mark.parametrize("channel", ["commentary", "analysis"]) - @patch("vllm.entrypoints.openai.serving_chat_stream_harmony.make_tool_call_id") + @patch("vllm.entrypoints.openai.chat_completion.stream_harmony.make_tool_call_id") def test_new_tool_call(self, mock_make_tool_call_id, channel): """Test new tool call creation when recipient changes.""" mock_make_tool_call_id.return_value = "call_test123" diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py index 192c7cafb749..c2bc82514362 100644 --- a/tests/entrypoints/openai/test_serving_engine.py +++ b/tests/entrypoints/openai/test_serving_engine.py @@ -8,7 +8,7 @@ import pytest from vllm.config import ModelConfig -from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.tokenizers.mistral import MistralTokenizer diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index b585835a0667..e596b32ba69e 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -8,7 +8,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, LoadLoRAAdapterRequest, UnloadLoRAAdapterRequest, diff --git a/tests/entrypoints/openai/test_serving_responses.py b/tests/entrypoints/openai/test_serving_responses.py index 7d03dccec30d..a79ee66a9e3c 100644 --- a/tests/entrypoints/openai/test_serving_responses.py +++ b/tests/entrypoints/openai/test_serving_responses.py @@ -14,7 +14,7 @@ ) from vllm.entrypoints.context import ConversationContext -from vllm.entrypoints.openai.protocol import ErrorResponse, ResponsesRequest +from vllm.entrypoints.openai.engine.protocol import ErrorResponse, ResponsesRequest from vllm.entrypoints.openai.serving_responses import ( OpenAIServingResponses, _extract_allowed_tools_from_mcp_requests, diff --git a/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py index 6ac48317e8bc..634ec421f1c8 100644 --- a/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py @@ -9,7 +9,7 @@ run_tool_extraction, run_tool_extraction_streaming, ) -from vllm.entrypoints.openai.protocol import FunctionCall +from vllm.entrypoints.openai.engine.protocol import FunctionCall from vllm.tokenizers import TokenizerLike from vllm.tool_parsers import ToolParser, ToolParserManager diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py index 8600aaf63943..626d845e1b44 100644 --- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py @@ -5,7 +5,7 @@ import pytest -from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser diff --git a/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py index 394457532139..89c91c2ec63f 100644 --- a/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py @@ -11,7 +11,7 @@ run_tool_extraction, run_tool_extraction_streaming, ) -from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall +from vllm.entrypoints.openai.engine.protocol import FunctionCall, ToolCall from vllm.tool_parsers import ToolParser, ToolParserManager diff --git a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py index 3ce7801b4597..53948d577c15 100644 --- a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py @@ -5,7 +5,7 @@ import pytest -from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation +from vllm.entrypoints.openai.engine.protocol import ExtractedToolCallInformation from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.llama_tool_parser import Llama3JsonToolParser diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py index 3bd1ca7f528d..a0576db02ff7 100644 --- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py @@ -9,7 +9,7 @@ run_tool_extraction, run_tool_extraction_streaming, ) -from vllm.entrypoints.openai.protocol import FunctionCall +from vllm.entrypoints.openai.engine.protocol import FunctionCall from vllm.tokenizers import TokenizerLike from vllm.tool_parsers import ToolParser, ToolParserManager diff --git a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py index 3774b3d1833e..dbd7e1d483c7 100644 --- a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py @@ -9,7 +9,7 @@ run_tool_extraction, run_tool_extraction_streaming, ) -from vllm.entrypoints.openai.protocol import FunctionCall +from vllm.entrypoints.openai.engine.protocol import FunctionCall from vllm.tokenizers import TokenizerLike from vllm.tool_parsers import ToolParser, ToolParserManager diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py index c4cad17fd2d0..8ab4c5a5a2d2 100644 --- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -9,7 +9,7 @@ run_tool_extraction, run_tool_extraction_streaming, ) -from vllm.entrypoints.openai.protocol import FunctionCall +from vllm.entrypoints.openai.engine.protocol import FunctionCall from vllm.tokenizers import TokenizerLike from vllm.tool_parsers import ToolParser, ToolParserManager diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py index 0b32e5f899ff..c7dfdc461632 100644 --- a/tests/entrypoints/openai/tool_parsers/utils.py +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -3,8 +3,8 @@ from collections.abc import Iterable -from vllm.entrypoints.openai.protocol import ( - ChatCompletionRequest, +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ExtractedToolCallInformation, FunctionCall, diff --git a/tests/entrypoints/openai/utils.py b/tests/entrypoints/openai/utils.py index 501f6dcc9154..da65b8ad50bd 100644 --- a/tests/entrypoints/openai/utils.py +++ b/tests/entrypoints/openai/utils.py @@ -4,11 +4,13 @@ from collections.abc import AsyncGenerator from typing import Any -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionStreamResponse, ChatMessage, +) +from vllm.entrypoints.openai.engine.protocol import ( UsageInfo, ) diff --git a/tests/reasoning/test_base_thinking_reasoning_parser.py b/tests/reasoning/test_base_thinking_reasoning_parser.py index 165e91a2c79f..8c69f75a3bbc 100644 --- a/tests/reasoning/test_base_thinking_reasoning_parser.py +++ b/tests/reasoning/test_base_thinking_reasoning_parser.py @@ -5,7 +5,7 @@ from transformers import AutoTokenizer from tests.reasoning.utils import run_reasoning_extraction -from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser diff --git a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py index 874fdef77811..4b0938d15520 100644 --- a/tests/reasoning/test_deepseekv3_reasoning_parser.py +++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py @@ -4,7 +4,8 @@ import pytest from transformers import AutoTokenizer -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.engine.protocol import DeltaMessage from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser from vllm.reasoning.deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py index a020fb8e9716..39ba52bc78f5 100644 --- a/tests/reasoning/utils.py +++ b/tests/reasoning/utils.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.engine.protocol import DeltaMessage from vllm.reasoning import ReasoningParser from vllm.tokenizers.mistral import MistralTokenizer diff --git a/tests/tool_parsers/test_ernie45_moe_tool_parser.py b/tests/tool_parsers/test_ernie45_moe_tool_parser.py index 533bd1ec3dff..a00e43894767 100644 --- a/tests/tool_parsers/test_ernie45_moe_tool_parser.py +++ b/tests/tool_parsers/test_ernie45_moe_tool_parser.py @@ -7,8 +7,8 @@ import pytest -from vllm.entrypoints.openai.protocol import ( - ChatCompletionRequest, +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, FunctionCall, ToolCall, diff --git a/tests/tool_parsers/test_functiongemma_tool_parser.py b/tests/tool_parsers/test_functiongemma_tool_parser.py index a5a0a5a19131..d32aba3085e5 100644 --- a/tests/tool_parsers/test_functiongemma_tool_parser.py +++ b/tests/tool_parsers/test_functiongemma_tool_parser.py @@ -5,7 +5,7 @@ import pytest -from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest from vllm.tool_parsers.functiongemma_tool_parser import FunctionGemmaToolParser diff --git a/tests/tool_parsers/test_glm4_moe_tool_parser.py b/tests/tool_parsers/test_glm4_moe_tool_parser.py index 52f5a9198e9b..d9d88b8444d7 100644 --- a/tests/tool_parsers/test_glm4_moe_tool_parser.py +++ b/tests/tool_parsers/test_glm4_moe_tool_parser.py @@ -6,7 +6,7 @@ import pytest -from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall +from vllm.entrypoints.openai.engine.protocol import FunctionCall, ToolCall from vllm.tokenizers import get_tokenizer from vllm.tool_parsers.glm4_moe_tool_parser import ( Glm4MoeModelToolParser, diff --git a/tests/tool_parsers/test_jamba_tool_parser.py b/tests/tool_parsers/test_jamba_tool_parser.py index ccad16ae2f6b..f0e7899c8aaf 100644 --- a/tests/tool_parsers/test_jamba_tool_parser.py +++ b/tests/tool_parsers/test_jamba_tool_parser.py @@ -8,7 +8,7 @@ import pytest from partial_json_parser.core.options import Allow -from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall +from vllm.entrypoints.openai.engine.protocol import DeltaMessage, FunctionCall, ToolCall from vllm.tokenizers import TokenizerLike, get_tokenizer from vllm.tokenizers.detokenizer_utils import detokenize_incrementally from vllm.tool_parsers.jamba_tool_parser import JambaToolParser diff --git a/tests/tool_parsers/test_kimi_k2_tool_parser.py b/tests/tool_parsers/test_kimi_k2_tool_parser.py index dc6140374d53..21b3d5adfde1 100644 --- a/tests/tool_parsers/test_kimi_k2_tool_parser.py +++ b/tests/tool_parsers/test_kimi_k2_tool_parser.py @@ -6,7 +6,7 @@ import pytest -from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall +from vllm.entrypoints.openai.engine.protocol import FunctionCall, ToolCall from vllm.tokenizers import get_tokenizer from vllm.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser diff --git a/tests/tool_parsers/test_minimax_tool_parser.py b/tests/tool_parsers/test_minimax_tool_parser.py index 28cfc4ea7a17..08b2104277b8 100644 --- a/tests/tool_parsers/test_minimax_tool_parser.py +++ b/tests/tool_parsers/test_minimax_tool_parser.py @@ -7,8 +7,10 @@ import pytest -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionToolsParam, +) +from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, ToolCall, ) diff --git a/tests/tool_parsers/test_mistral_tool_parser.py b/tests/tool_parsers/test_mistral_tool_parser.py index d2502079d0de..bf2fba8a8655 100644 --- a/tests/tool_parsers/test_mistral_tool_parser.py +++ b/tests/tool_parsers/test_mistral_tool_parser.py @@ -11,7 +11,7 @@ from mistral_common.protocol.instruct.tool_calls import FunctionCall, ToolCall from partial_json_parser.core.options import Allow -from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall +from vllm.entrypoints.openai.engine.protocol import DeltaMessage, DeltaToolCall from vllm.tokenizers import TokenizerLike, get_tokenizer from vllm.tokenizers.detokenizer_utils import detokenize_incrementally from vllm.tokenizers.mistral import MistralTokenizer diff --git a/tests/tool_parsers/test_openai_tool_parser.py b/tests/tool_parsers/test_openai_tool_parser.py index 44b8c92745e9..e9e39ef4c029 100644 --- a/tests/tool_parsers/test_openai_tool_parser.py +++ b/tests/tool_parsers/test_openai_tool_parser.py @@ -14,7 +14,7 @@ load_harmony_encoding, ) -from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall +from vllm.entrypoints.openai.engine.protocol import FunctionCall, ToolCall from vllm.tokenizers import get_tokenizer from vllm.tool_parsers.openai_tool_parser import OpenAIToolParser diff --git a/tests/tool_parsers/test_qwen3coder_tool_parser.py b/tests/tool_parsers/test_qwen3coder_tool_parser.py index 3a0a612d7fbf..3d46f73de612 100644 --- a/tests/tool_parsers/test_qwen3coder_tool_parser.py +++ b/tests/tool_parsers/test_qwen3coder_tool_parser.py @@ -6,9 +6,11 @@ import pytest -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionToolsParam, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, FunctionCall, ToolCall, diff --git a/tests/tool_parsers/test_seed_oss_tool_parser.py b/tests/tool_parsers/test_seed_oss_tool_parser.py index c7f595830f34..88cc736f67a6 100644 --- a/tests/tool_parsers/test_seed_oss_tool_parser.py +++ b/tests/tool_parsers/test_seed_oss_tool_parser.py @@ -7,9 +7,11 @@ import pytest -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionToolsParam, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, FunctionCall, ToolCall, diff --git a/tests/tool_parsers/test_xlam_tool_parser.py b/tests/tool_parsers/test_xlam_tool_parser.py index 380792a9926a..a5cab218f72b 100644 --- a/tests/tool_parsers/test_xlam_tool_parser.py +++ b/tests/tool_parsers/test_xlam_tool_parser.py @@ -6,8 +6,8 @@ import pytest -from vllm.entrypoints.openai.protocol import ( - ChatCompletionRequest, +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, FunctionCall, ToolCall, diff --git a/tests/tool_use/test_chat_completion_request_validations.py b/tests/tool_use/test_chat_completion_request_validations.py index 50cd9e4279b2..69846f9adb12 100644 --- a/tests/tool_use/test_chat_completion_request_validations.py +++ b/tests/tool_use/test_chat_completion_request_validations.py @@ -3,7 +3,7 @@ import pytest -from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest def test_chat_completion_request_with_no_tools(): diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py index 6ff37255e48d..01c1360818eb 100644 --- a/tests/tool_use/test_tool_choice_required.py +++ b/tests/tool_use/test_tool_choice_required.py @@ -8,10 +8,10 @@ import regex as re from pydantic import TypeAdapter -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionToolsParam, ) -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat from vllm.tool_parsers.utils import get_json_schema_from_tools pytestmark = pytest.mark.cpu_test diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 11681cfcebca..270092faf8ce 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -11,12 +11,14 @@ from vllm.assets.image import ImageAsset from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionResponse, +) +from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, ) -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.inputs import PromptType from vllm.outputs import RequestOutput diff --git a/vllm/entrypoints/anthropic/serving_messages.py b/vllm/entrypoints/anthropic/serving_messages.py index 25c2d88a2c7a..5177d50f7c00 100644 --- a/vllm/entrypoints/anthropic/serving_messages.py +++ b/vllm/entrypoints/anthropic/serving_messages.py @@ -25,16 +25,18 @@ ) from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, ChatCompletionResponse, ChatCompletionStreamResponse, ChatCompletionToolsParam, +) +from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, StreamOptions, ) -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_models import OpenAIServingModels logger = logging.getLogger(__name__) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index c9bece08f188..6f61a0114597 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -22,6 +22,12 @@ ChatTemplateContentFormatOption, ) from vllm.entrypoints.constants import MCP_PREFIX +from vllm.entrypoints.openai.engine.protocol import ( + FunctionCall, + ResponseInputOutputItem, + ResponseRawMessageAndToken, + ResponsesRequest, +) from vllm.entrypoints.openai.parser.harmony_utils import ( get_encoding, get_streamable_parser_for_assistant, @@ -30,12 +36,6 @@ from vllm.entrypoints.openai.parser.responses_parser import ( get_responses_parser_for_simple_context, ) -from vllm.entrypoints.openai.protocol import ( - FunctionCall, - ResponseInputOutputItem, - ResponseRawMessageAndToken, - ResponsesRequest, -) from vllm.entrypoints.responses_utils import construct_tool_dicts from vllm.entrypoints.tool import Tool from vllm.entrypoints.tool_server import ToolServer diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index fad989284ca5..86942f22fbee 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -42,11 +42,9 @@ from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args -from vllm.entrypoints.openai.orca_metrics import metrics_header -from vllm.entrypoints.openai.protocol import ( - ChatCompletionRequest, - ChatCompletionResponse, +from vllm.entrypoints.openai.engine.protocol import ( CompletionRequest, CompletionResponse, ErrorInfo, @@ -59,9 +57,9 @@ TranslationRequest, TranslationResponseVariant, ) -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.engine.serving import OpenAIServing +from vllm.entrypoints.openai.orca_metrics import metrics_header from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion -from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import ( BaseModelPath, OpenAIServingModels, @@ -475,47 +473,6 @@ def translate_error_response(response: ErrorResponse) -> JSONResponse: return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post( - "/v1/chat/completions", - dependencies=[Depends(validate_json_request)], - responses={ - HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - }, -) -@with_cancellation -@load_aware_call -async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): - metrics_header_format = raw_request.headers.get( - ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, "" - ) - handler = chat(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Chat Completions API" - ) - try: - generator = await handler.create_chat_completion(request, raw_request) - except Exception as e: - raise HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) - ) from e - if isinstance(generator, ErrorResponse): - return JSONResponse( - content=generator.model_dump(), status_code=generator.error.code - ) - - elif isinstance(generator, ChatCompletionResponse): - return JSONResponse( - content=generator.model_dump(), - headers=metrics_header(metrics_header_format), - ) - - return StreamingResponse(content=generator, media_type="text/event-stream") - - @router.post( "/v1/completions", dependencies=[Depends(validate_json_request)], @@ -735,8 +692,10 @@ async def send_with_request_id(message: Message) -> None: def _extract_content_from_chunk(chunk_data: dict) -> str: """Extract content from a streaming response chunk.""" try: - from vllm.entrypoints.openai.protocol import ( + from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionStreamResponse, + ) + from vllm.entrypoints.openai.engine.protocol import ( CompletionStreamResponse, ) @@ -880,7 +839,11 @@ def build_app(args: Namespace) -> FastAPI: from vllm.entrypoints.serve import register_vllm_serve_api_routers register_vllm_serve_api_routers(app) + from vllm.entrypoints.openai.chat_completion.api_router import ( + attach_router as register_chat_api_router, + ) + register_chat_api_router(app) from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes register_sagemaker_routes(router) diff --git a/vllm/entrypoints/openai/chat_completion/__init__.py b/vllm/entrypoints/openai/chat_completion/__init__.py new file mode 100644 index 000000000000..208f01a7cb5e --- /dev/null +++ b/vllm/entrypoints/openai/chat_completion/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/entrypoints/openai/chat_completion/api_router.py b/vllm/entrypoints/openai/chat_completion/api_router.py new file mode 100644 index 000000000000..e4010c5256a0 --- /dev/null +++ b/vllm/entrypoints/openai/chat_completion/api_router.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from http import HTTPStatus + +from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request +from fastapi.responses import JSONResponse, StreamingResponse + +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, +) +from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat +from vllm.entrypoints.openai.engine.protocol import ErrorResponse +from vllm.entrypoints.openai.orca_metrics import metrics_header +from vllm.entrypoints.openai.utils import validate_json_request +from vllm.entrypoints.utils import ( + load_aware_call, + with_cancellation, +) +from vllm.logger import init_logger + +logger = init_logger(__name__) + +router = APIRouter() +ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL = "endpoint-load-metrics-format" + + +def chat(request: Request) -> OpenAIServingChat | None: + return request.app.state.openai_serving_chat + + +@router.post( + "/v1/chat/completions", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): + metrics_header_format = raw_request.headers.get( + ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, "" + ) + handler = chat(raw_request) + if handler is None: + base_server = raw_request.app.state.openai_serving_tokenization + return base_server.create_error_response( + message="The model does not support Chat Completions API" + ) + try: + generator = await handler.create_chat_completion(request, raw_request) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + + elif isinstance(generator, ChatCompletionResponse): + return JSONResponse( + content=generator.model_dump(), + headers=metrics_header(metrics_header_format), + ) + + return StreamingResponse(content=generator, media_type="text/event-stream") + + +def attach_router(app: FastAPI): + app.include_router(router) diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py new file mode 100644 index 000000000000..299069925bbd --- /dev/null +++ b/vllm/entrypoints/openai/chat_completion/protocol.py @@ -0,0 +1,654 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py +import json +import time +from typing import Annotated, Any, ClassVar, Literal + +import torch +from openai.types.chat.chat_completion_audio import ( + ChatCompletionAudio as OpenAIChatCompletionAudio, +) +from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation +from pydantic import ( + Field, + model_validator, +) + +from vllm.entrypoints.chat_utils import ChatCompletionMessageParam +from vllm.entrypoints.openai.engine.protocol import ( + AnyResponseFormat, + DeltaMessage, + FunctionCall, + FunctionDefinition, + LegacyStructuralTagResponseFormat, + LogitsProcessors, + OpenAIBaseModel, + StreamOptions, + StructuralTagResponseFormat, + ToolCall, + UsageInfo, + get_logits_processors, +) +from vllm.exceptions import VLLMValidationError +from vllm.logger import init_logger +from vllm.logprobs import Logprob +from vllm.sampling_params import ( + BeamSearchParams, + RequestOutputKind, + SamplingParams, + StructuredOutputsParams, +) +from vllm.utils import random_uuid + +logger = init_logger(__name__) + + +_LONG_INFO = torch.iinfo(torch.long) + + +class ChatMessage(OpenAIBaseModel): + role: str + content: str | None = None + refusal: str | None = None + annotations: OpenAIAnnotation | None = None + audio: OpenAIChatCompletionAudio | None = None + function_call: FunctionCall | None = None + tool_calls: list[ToolCall] = Field(default_factory=list) + + # vLLM-specific fields that are not in OpenAI spec + reasoning: str | None = None + reasoning_content: str | None = None + """Deprecated: use `reasoning` instead.""" + + @model_validator(mode="after") + def handle_deprecated_reasoning_content(self): + """Copy reasoning to reasoning_content for backward compatibility.""" + self.reasoning_content = self.reasoning + return self + + +class ChatCompletionLogProb(OpenAIBaseModel): + token: str + logprob: float = -9999.0 + bytes: list[int] | None = None + + +class ChatCompletionLogProbsContent(ChatCompletionLogProb): + # Workaround: redefine fields name cache so that it's not + # shared with the super class. + field_names: ClassVar[set[str] | None] = None + top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list) + + +class ChatCompletionLogProbs(OpenAIBaseModel): + content: list[ChatCompletionLogProbsContent] | None = None + + +class ChatCompletionResponseChoice(OpenAIBaseModel): + index: int + message: ChatMessage + logprobs: ChatCompletionLogProbs | None = None + # per OpenAI spec this is the default + finish_reason: str | None = "stop" + # not part of the OpenAI spec but included in vLLM for legacy reasons + stop_reason: int | str | None = None + # not part of the OpenAI spec but is useful for tracing the tokens + # in agent scenarios + token_ids: list[int] | None = None + + +class ChatCompletionResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") + object: Literal["chat.completion"] = "chat.completion" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[ChatCompletionResponseChoice] + service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None + system_fingerprint: str | None = None + usage: UsageInfo + + # vLLM-specific fields that are not in OpenAI spec + prompt_logprobs: list[dict[int, Logprob] | None] | None = None + prompt_token_ids: list[int] | None = None + kv_transfer_params: dict[str, Any] | None = Field( + default=None, description="KVTransfer parameters." + ) + + +class ChatCompletionResponseStreamChoice(OpenAIBaseModel): + index: int + delta: DeltaMessage + logprobs: ChatCompletionLogProbs | None = None + finish_reason: str | None = None + stop_reason: int | str | None = None + # not part of the OpenAI spec but for tracing the tokens + token_ids: list[int] | None = None + + +class ChatCompletionStreamResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") + object: Literal["chat.completion.chunk"] = "chat.completion.chunk" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[ChatCompletionResponseStreamChoice] + usage: UsageInfo | None = Field(default=None) + # not part of the OpenAI spec but for tracing the tokens + prompt_token_ids: list[int] | None = None + + +class ChatCompletionToolsParam(OpenAIBaseModel): + type: Literal["function"] = "function" + function: FunctionDefinition + + +class ChatCompletionNamedFunction(OpenAIBaseModel): + name: str + + +class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel): + function: ChatCompletionNamedFunction + type: Literal["function"] = "function" + + +class ChatCompletionRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/chat/create + messages: list[ChatCompletionMessageParam] + model: str | None = None + frequency_penalty: float | None = 0.0 + logit_bias: dict[str, float] | None = None + logprobs: bool | None = False + top_logprobs: int | None = 0 + max_tokens: int | None = Field( + default=None, + deprecated="max_tokens is deprecated in favor of " + "the max_completion_tokens field", + ) + max_completion_tokens: int | None = None + n: int | None = 1 + presence_penalty: float | None = 0.0 + response_format: AnyResponseFormat | None = None + seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) + stop: str | list[str] | None = [] + stream: bool | None = False + stream_options: StreamOptions | None = None + temperature: float | None = None + top_p: float | None = None + tools: list[ChatCompletionToolsParam] | None = None + tool_choice: ( + Literal["none"] + | Literal["auto"] + | Literal["required"] + | ChatCompletionNamedToolChoiceParam + | None + ) = "none" + reasoning_effort: Literal["low", "medium", "high"] | None = None + include_reasoning: bool = True + parallel_tool_calls: bool | None = True + + # NOTE this will be ignored by vLLM + user: str | None = None + + # --8<-- [start:chat-completion-sampling-params] + use_beam_search: bool = False + top_k: int | None = None + min_p: float | None = None + repetition_penalty: float | None = None + length_penalty: float = 1.0 + stop_token_ids: list[int] | None = [] + include_stop_str_in_output: bool = False + ignore_eos: bool = False + min_tokens: int = 0 + skip_special_tokens: bool = True + spaces_between_special_tokens: bool = True + truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = ( + None + ) + prompt_logprobs: int | None = None + allowed_token_ids: list[int] | None = None + bad_words: list[str] = Field(default_factory=list) + # --8<-- [end:chat-completion-sampling-params] + + # --8<-- [start:chat-completion-extra-params] + echo: bool = Field( + default=False, + description=( + "If true, the new message will be prepended with the last message " + "if they belong to the same role." + ), + ) + add_generation_prompt: bool = Field( + default=True, + description=( + "If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model." + ), + ) + continue_final_message: bool = Field( + default=False, + description=( + "If this is set, the chat will be formatted so that the final " + "message in the chat is open-ended, without any EOS tokens. The " + "model will continue this message rather than starting a new one. " + 'This allows you to "prefill" part of the model\'s response for it. ' + "Cannot be used at the same time as `add_generation_prompt`." + ), + ) + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)." + ), + ) + documents: list[dict[str, str]] | None = Field( + default=None, + description=( + "A list of dicts representing documents that will be accessible to " + "the model if it is performing RAG (retrieval-augmented generation)." + " If the template does not support RAG, this argument will have no " + "effect. We recommend that each document should be a dict containing " + '"title" and "text" keys.' + ), + ) + chat_template: str | None = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one." + ), + ) + chat_template_kwargs: dict[str, Any] | None = Field( + default=None, + description=( + "Additional keyword args to pass to the template renderer. " + "Will be accessible by the chat template." + ), + ) + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + structured_outputs: StructuredOutputsParams | None = Field( + default=None, + description="Additional kwargs for structured outputs", + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + request_id: str = Field( + default_factory=random_uuid, + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) + logits_processors: LogitsProcessors | None = Field( + default=None, + description=( + "A list of either qualified names of logits processors, or " + "constructor objects, to apply when sampling. A constructor is " + "a JSON object with a required 'qualname' field specifying the " + "qualified name of the processor class/factory, and optional " + "'args' and 'kwargs' fields containing positional and keyword " + "arguments. For example: {'qualname': " + "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': " + "{'param': 'value'}}." + ), + ) + return_tokens_as_token_ids: bool | None = Field( + default=None, + description=( + "If specified with 'logprobs', tokens are represented " + " as strings of the form 'token_id:{token_id}' so that tokens " + "that are not JSON-encodable can be identified." + ), + ) + return_token_ids: bool | None = Field( + default=None, + description=( + "If specified, the result will include token IDs alongside the " + "generated text. In streaming mode, prompt_token_ids is included " + "only in the first chunk, and token_ids contains the delta tokens " + "for each chunk. This is useful for debugging or when you " + "need to map generated text back to input tokens." + ), + ) + cache_salt: str | None = Field( + default=None, + description=( + "If specified, the prefix cache will be salted with the provided " + "string to prevent an attacker to guess prompts in multi-user " + "environments. The salt should be random, protected from " + "access by 3rd parties, and long enough to be " + "unpredictable (e.g., 43 characters base64-encoded, corresponding " + "to 256 bit)." + ), + ) + kv_transfer_params: dict[str, Any] | None = Field( + default=None, + description="KVTransfer parameters used for disaggregated serving.", + ) + + vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field( + default=None, + description=( + "Additional request parameters with (list of) string or " + "numeric values, used by custom extensions." + ), + ) + + # --8<-- [end:chat-completion-extra-params] + + # Default sampling parameters for chat completion requests + _DEFAULT_SAMPLING_PARAMS: dict = { + "repetition_penalty": 1.0, + "temperature": 1.0, + "top_p": 1.0, + "top_k": 0, + "min_p": 0.0, + } + + def to_beam_search_params( + self, max_tokens: int, default_sampling_params: dict + ) -> BeamSearchParams: + n = self.n if self.n is not None else 1 + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] + ) + + return BeamSearchParams( + beam_width=n, + max_tokens=max_tokens, + ignore_eos=self.ignore_eos, + temperature=temperature, + length_penalty=self.length_penalty, + include_stop_str_in_output=self.include_stop_str_in_output, + ) + + def to_sampling_params( + self, + max_tokens: int, + logits_processor_pattern: str | None, + default_sampling_params: dict, + ) -> SamplingParams: + # Default parameters + if (repetition_penalty := self.repetition_penalty) is None: + repetition_penalty = default_sampling_params.get( + "repetition_penalty", + self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"], + ) + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] + ) + if (top_p := self.top_p) is None: + top_p = default_sampling_params.get( + "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"] + ) + if (top_k := self.top_k) is None: + top_k = default_sampling_params.get( + "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"] + ) + if (min_p := self.min_p) is None: + min_p = default_sampling_params.get( + "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"] + ) + + prompt_logprobs = self.prompt_logprobs + if prompt_logprobs is None and self.echo: + prompt_logprobs = self.top_logprobs + + response_format = self.response_format + if response_format is not None: + # If structured outputs wasn't already enabled, + # we must enable it for these features to work + if self.structured_outputs is None: + self.structured_outputs = StructuredOutputsParams() + + # Set structured output params for response format + if response_format.type == "json_object": + self.structured_outputs.json_object = True + elif response_format.type == "json_schema": + json_schema = response_format.json_schema + assert json_schema is not None + self.structured_outputs.json = json_schema.json_schema + elif response_format.type == "structural_tag": + structural_tag = response_format + assert structural_tag is not None and isinstance( + structural_tag, + ( + LegacyStructuralTagResponseFormat, + StructuralTagResponseFormat, + ), + ) + s_tag_obj = structural_tag.model_dump(by_alias=True) + self.structured_outputs.structural_tag = json.dumps(s_tag_obj) + + extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} + if self.kv_transfer_params: + # Pass in kv_transfer_params via extra_args + extra_args["kv_transfer_params"] = self.kv_transfer_params + return SamplingParams.from_optional( + n=self.n, + presence_penalty=self.presence_penalty, + frequency_penalty=self.frequency_penalty, + repetition_penalty=repetition_penalty, + temperature=temperature, + top_p=top_p, + top_k=top_k, + min_p=min_p, + seed=self.seed, + stop=self.stop, + stop_token_ids=self.stop_token_ids, + logprobs=self.top_logprobs if self.logprobs else None, + prompt_logprobs=prompt_logprobs, + ignore_eos=self.ignore_eos, + max_tokens=max_tokens, + min_tokens=self.min_tokens, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self.spaces_between_special_tokens, + logits_processors=get_logits_processors( + self.logits_processors, logits_processor_pattern + ), + include_stop_str_in_output=self.include_stop_str_in_output, + truncate_prompt_tokens=self.truncate_prompt_tokens, + output_kind=RequestOutputKind.DELTA + if self.stream + else RequestOutputKind.FINAL_ONLY, + structured_outputs=self.structured_outputs, + logit_bias=self.logit_bias, + bad_words=self.bad_words, + allowed_token_ids=self.allowed_token_ids, + extra_args=extra_args or None, + skip_clone=True, # Created fresh per request, safe to skip clone + ) + + @model_validator(mode="before") + @classmethod + def validate_stream_options(cls, data): + if data.get("stream_options") and not data.get("stream"): + raise VLLMValidationError( + "Stream options can only be defined when `stream=True`.", + parameter="stream_options", + ) + + return data + + @model_validator(mode="before") + @classmethod + def check_logprobs(cls, data): + if (prompt_logprobs := data.get("prompt_logprobs")) is not None: + if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1): + raise VLLMValidationError( + "`prompt_logprobs` are not available when `stream=True`.", + parameter="prompt_logprobs", + ) + + if prompt_logprobs < 0 and prompt_logprobs != -1: + raise VLLMValidationError( + "`prompt_logprobs` must be a positive value or -1.", + parameter="prompt_logprobs", + value=prompt_logprobs, + ) + if (top_logprobs := data.get("top_logprobs")) is not None: + if top_logprobs < 0 and top_logprobs != -1: + raise VLLMValidationError( + "`top_logprobs` must be a positive value or -1.", + parameter="top_logprobs", + value=top_logprobs, + ) + + if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"): + raise VLLMValidationError( + "when using `top_logprobs`, `logprobs` must be set to true.", + parameter="top_logprobs", + ) + + return data + + @model_validator(mode="before") + @classmethod + def check_structured_outputs_count(cls, data): + if isinstance(data, ValueError): + raise data + + if data.get("structured_outputs", None) is None: + return data + + structured_outputs_kwargs = data["structured_outputs"] + count = sum( + structured_outputs_kwargs.get(k) is not None + for k in ("json", "regex", "choice") + ) + # you can only use one kind of constraints for structured outputs + if count > 1: + raise ValueError( + "You can only use one kind of constraints for structured " + "outputs ('json', 'regex' or 'choice')." + ) + # you can only either use structured outputs or tools, not both + if count > 1 and data.get("tool_choice", "none") not in ( + "none", + "auto", + "required", + ): + raise ValueError( + "You can only either use constraints for structured outputs " + "or tools, not both." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_tool_usage(cls, data): + # if "tool_choice" is not specified but tools are provided, + # default to "auto" tool_choice + if "tool_choice" not in data and data.get("tools"): + data["tool_choice"] = "auto" + + # if "tool_choice" is "none" -- no validation is needed for tools + if "tool_choice" in data and data["tool_choice"] == "none": + return data + + # if "tool_choice" is specified -- validation + if "tool_choice" in data and data["tool_choice"] is not None: + # ensure that if "tool choice" is specified, tools are present + if "tools" not in data or data["tools"] is None: + raise ValueError("When using `tool_choice`, `tools` must be set.") + + # make sure that tool choice is either a named tool + # OR that it's set to "auto" or "required" + if data["tool_choice"] not in ["auto", "required"] and not isinstance( + data["tool_choice"], dict + ): + raise ValueError( + f"Invalid value for `tool_choice`: {data['tool_choice']}! " + 'Only named tools, "none", "auto" or "required" ' + "are supported." + ) + + # if tool_choice is "required" but the "tools" list is empty, + # override the data to behave like "none" to align with + # OpenAI’s behavior. + if ( + data["tool_choice"] == "required" + and isinstance(data["tools"], list) + and len(data["tools"]) == 0 + ): + data["tool_choice"] = "none" + del data["tools"] + return data + + # ensure that if "tool_choice" is specified as an object, + # it matches a valid tool + correct_usage_message = ( + 'Correct usage: `{"type": "function",' + ' "function": {"name": "my_function"}}`' + ) + if isinstance(data["tool_choice"], dict): + valid_tool = False + function = data["tool_choice"].get("function") + if not isinstance(function, dict): + raise ValueError( + f"Invalid value for `function`: `{function}` in " + f"`tool_choice`! {correct_usage_message}" + ) + if "name" not in function: + raise ValueError( + f"Expected field `name` in `function` in " + f"`tool_choice`! {correct_usage_message}" + ) + function_name = function["name"] + if not isinstance(function_name, str) or len(function_name) == 0: + raise ValueError( + f"Invalid `name` in `function`: `{function_name}`" + f" in `tool_choice`! {correct_usage_message}" + ) + for tool in data["tools"]: + if tool["function"]["name"] == function_name: + valid_tool = True + break + if not valid_tool: + raise ValueError( + "The tool specified in `tool_choice` does not match any" + " of the specified `tools`" + ) + return data + + @model_validator(mode="before") + @classmethod + def check_generation_prompt(cls, data): + if data.get("continue_final_message") and data.get("add_generation_prompt"): + raise ValueError( + "Cannot set both `continue_final_message` and " + "`add_generation_prompt` to True." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_cache_salt_support(cls, data): + if data.get("cache_salt") is not None and ( + not isinstance(data["cache_salt"], str) or not data["cache_salt"] + ): + raise ValueError( + "Parameter 'cache_salt' must be a non-empty string if provided." + ) + return data diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/chat_completion/serving.py similarity index 99% rename from vllm/entrypoints/openai/serving_chat.py rename to vllm/entrypoints/openai/chat_completion/serving.py index e65dba2b893b..2b0d0101f81b 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -23,16 +23,7 @@ make_tool_call_id, ) from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.parser.harmony_utils import ( - get_developer_message, - get_stop_tokens_for_assistant_actions, - get_streamable_parser_for_assistant, - get_system_message, - parse_chat_inputs_to_harmony_messages, - parse_chat_output, - render_for_completion, -) -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, ChatCompletionLogProbsContent, @@ -43,6 +34,11 @@ ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, +) +from vllm.entrypoints.openai.chat_completion.stream_harmony import ( + extract_harmony_streaming_delta, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -52,14 +48,20 @@ ToolCall, UsageInfo, ) -from vllm.entrypoints.openai.serving_chat_stream_harmony import ( - extract_harmony_streaming_delta, -) -from vllm.entrypoints.openai.serving_engine import ( +from vllm.entrypoints.openai.engine.serving import ( GenerationError, OpenAIServing, clamp_prompt_logprobs, ) +from vllm.entrypoints.openai.parser.harmony_utils import ( + get_developer_message, + get_stop_tokens_for_assistant_actions, + get_streamable_parser_for_assistant, + get_system_message, + parse_chat_inputs_to_harmony_messages, + parse_chat_output, + render_for_completion, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls from vllm.entrypoints.utils import get_max_tokens, should_include_usage diff --git a/vllm/entrypoints/openai/serving_chat_stream_harmony.py b/vllm/entrypoints/openai/chat_completion/stream_harmony.py similarity index 98% rename from vllm/entrypoints/openai/serving_chat_stream_harmony.py rename to vllm/entrypoints/openai/chat_completion/stream_harmony.py index 1b5ae620651c..9a95bc6e1c4a 100644 --- a/vllm/entrypoints/openai/serving_chat_stream_harmony.py +++ b/vllm/entrypoints/openai/chat_completion/stream_harmony.py @@ -10,7 +10,7 @@ from openai_harmony import StreamableParser from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/entrypoints/openai/engine/__init__.py b/vllm/entrypoints/openai/engine/__init__.py new file mode 100644 index 000000000000..208f01a7cb5e --- /dev/null +++ b/vllm/entrypoints/openai/engine/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/engine/protocol.py similarity index 68% rename from vllm/entrypoints/openai/protocol.py rename to vllm/entrypoints/openai/engine/protocol.py index 845dae7c1bf1..9434e214f348 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/engine/protocol.py @@ -11,10 +11,6 @@ import regex as re import torch from fastapi import HTTPException, UploadFile -from openai.types.chat.chat_completion_audio import ( - ChatCompletionAudio as OpenAIChatCompletionAudio, -) -from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation from openai.types.responses import ( ResponseCodeInterpreterCallCodeDeltaEvent, ResponseCodeInterpreterCallCodeDoneEvent, @@ -234,20 +230,6 @@ class FunctionDefinition(OpenAIBaseModel): parameters: dict[str, Any] | None = None -class ChatCompletionToolsParam(OpenAIBaseModel): - type: Literal["function"] = "function" - function: FunctionDefinition - - -class ChatCompletionNamedFunction(OpenAIBaseModel): - name: str - - -class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel): - function: ChatCompletionNamedFunction - type: Literal["function"] = "function" - - # extra="forbid" is a workaround to have kwargs as a field, # see https://github.com/pydantic/pydantic/issues/3125 class LogitsProcessorConstructor(BaseModel): @@ -414,609 +396,66 @@ def to_sampling_params( ) if (top_k := self.top_k) is None: top_k = default_sampling_params.get( - "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"] - ) - stop_token_ids = default_sampling_params.get("stop_token_ids") - - # Structured output - structured_outputs = None - if self.text is not None and self.text.format is not None: - response_format = self.text.format - if ( - response_format.type == "json_schema" - and response_format.schema_ is not None - ): - structured_outputs = StructuredOutputsParams( - json=response_format.schema_ - ) - elif response_format.type == "json_object": - raise NotImplementedError("json_object is not supported") - - # TODO: add more parameters - return SamplingParams.from_optional( - temperature=temperature, - top_p=top_p, - top_k=top_k, - max_tokens=max_tokens, - logprobs=self.top_logprobs if self.is_include_output_logprobs() else None, - stop_token_ids=stop_token_ids, - output_kind=( - RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY - ), - structured_outputs=structured_outputs, - logit_bias=self.logit_bias, - skip_clone=True, # Created fresh per request, safe to skip clone - ) - - def is_include_output_logprobs(self) -> bool: - """Check if the request includes output logprobs.""" - if self.include is None: - return False - return ( - isinstance(self.include, list) - and "message.output_text.logprobs" in self.include - ) - - @model_validator(mode="before") - def validate_background(cls, data): - if not data.get("background"): - return data - if not data.get("store", True): - raise ValueError("background can only be used when `store` is true") - return data - - @model_validator(mode="before") - def validate_prompt(cls, data): - if data.get("prompt") is not None: - raise VLLMValidationError( - "prompt template is not supported", parameter="prompt" - ) - return data - - @model_validator(mode="before") - def check_cache_salt_support(cls, data): - if data.get("cache_salt") is not None and ( - not isinstance(data["cache_salt"], str) or not data["cache_salt"] - ): - raise ValueError( - "Parameter 'cache_salt' must be a non-empty string if provided." - ) - return data - - @model_validator(mode="before") - def function_call_parsing(cls, data): - """Parse function_call dictionaries into ResponseFunctionToolCall objects. - This ensures Pydantic can properly resolve union types in the input field. - Function calls provided as dicts are converted to ResponseFunctionToolCall - objects before validation, while invalid structures are left for Pydantic - to reject with appropriate error messages. - """ - - input_data = data.get("input") - - # Early return for None, strings, or bytes - # (strings are iterable but shouldn't be processed) - if input_data is None or isinstance(input_data, (str, bytes)): - return data - - # Convert iterators (like ValidatorIterator) to list - if not isinstance(input_data, list): - try: - input_data = list(input_data) - except TypeError: - # Not iterable, leave as-is for Pydantic to handle - return data - - processed_input = [] - for item in input_data: - if isinstance(item, dict) and item.get("type") == "function_call": - try: - processed_input.append(ResponseFunctionToolCall(**item)) - except ValidationError: - # Let Pydantic handle validation for malformed function calls - logger.debug( - "Failed to parse function_call to ResponseFunctionToolCall, " - "leaving for Pydantic validation" - ) - processed_input.append(item) - else: - processed_input.append(item) - - data["input"] = processed_input - return data - - -class ChatCompletionRequest(OpenAIBaseModel): - # Ordered by official OpenAI API documentation - # https://platform.openai.com/docs/api-reference/chat/create - messages: list[ChatCompletionMessageParam] - model: str | None = None - frequency_penalty: float | None = 0.0 - logit_bias: dict[str, float] | None = None - logprobs: bool | None = False - top_logprobs: int | None = 0 - max_tokens: int | None = Field( - default=None, - deprecated="max_tokens is deprecated in favor of " - "the max_completion_tokens field", - ) - max_completion_tokens: int | None = None - n: int | None = 1 - presence_penalty: float | None = 0.0 - response_format: AnyResponseFormat | None = None - seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - stop: str | list[str] | None = [] - stream: bool | None = False - stream_options: StreamOptions | None = None - temperature: float | None = None - top_p: float | None = None - tools: list[ChatCompletionToolsParam] | None = None - tool_choice: ( - Literal["none"] - | Literal["auto"] - | Literal["required"] - | ChatCompletionNamedToolChoiceParam - | None - ) = "none" - reasoning_effort: Literal["low", "medium", "high"] | None = None - include_reasoning: bool = True - parallel_tool_calls: bool | None = True - - # NOTE this will be ignored by vLLM - user: str | None = None - - # --8<-- [start:chat-completion-sampling-params] - use_beam_search: bool = False - top_k: int | None = None - min_p: float | None = None - repetition_penalty: float | None = None - length_penalty: float = 1.0 - stop_token_ids: list[int] | None = [] - include_stop_str_in_output: bool = False - ignore_eos: bool = False - min_tokens: int = 0 - skip_special_tokens: bool = True - spaces_between_special_tokens: bool = True - truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = ( - None - ) - prompt_logprobs: int | None = None - allowed_token_ids: list[int] | None = None - bad_words: list[str] = Field(default_factory=list) - # --8<-- [end:chat-completion-sampling-params] - - # --8<-- [start:chat-completion-extra-params] - echo: bool = Field( - default=False, - description=( - "If true, the new message will be prepended with the last message " - "if they belong to the same role." - ), - ) - add_generation_prompt: bool = Field( - default=True, - description=( - "If true, the generation prompt will be added to the chat template. " - "This is a parameter used by chat template in tokenizer config of the " - "model." - ), - ) - continue_final_message: bool = Field( - default=False, - description=( - "If this is set, the chat will be formatted so that the final " - "message in the chat is open-ended, without any EOS tokens. The " - "model will continue this message rather than starting a new one. " - 'This allows you to "prefill" part of the model\'s response for it. ' - "Cannot be used at the same time as `add_generation_prompt`." - ), - ) - add_special_tokens: bool = Field( - default=False, - description=( - "If true, special tokens (e.g. BOS) will be added to the prompt " - "on top of what is added by the chat template. " - "For most models, the chat template takes care of adding the " - "special tokens so this should be set to false (as is the " - "default)." - ), - ) - documents: list[dict[str, str]] | None = Field( - default=None, - description=( - "A list of dicts representing documents that will be accessible to " - "the model if it is performing RAG (retrieval-augmented generation)." - " If the template does not support RAG, this argument will have no " - "effect. We recommend that each document should be a dict containing " - '"title" and "text" keys.' - ), - ) - chat_template: str | None = Field( - default=None, - description=( - "A Jinja template to use for this conversion. " - "As of transformers v4.44, default chat template is no longer " - "allowed, so you must provide a chat template if the tokenizer " - "does not define one." - ), - ) - chat_template_kwargs: dict[str, Any] | None = Field( - default=None, - description=( - "Additional keyword args to pass to the template renderer. " - "Will be accessible by the chat template." - ), - ) - mm_processor_kwargs: dict[str, Any] | None = Field( - default=None, - description=("Additional kwargs to pass to the HF processor."), - ) - structured_outputs: StructuredOutputsParams | None = Field( - default=None, - description="Additional kwargs for structured outputs", - ) - priority: int = Field( - default=0, - description=( - "The priority of the request (lower means earlier handling; " - "default: 0). Any priority other than 0 will raise an error " - "if the served model does not use priority scheduling." - ), - ) - request_id: str = Field( - default_factory=random_uuid, - description=( - "The request_id related to this request. If the caller does " - "not set it, a random_uuid will be generated. This id is used " - "through out the inference process and return in response." - ), - ) - logits_processors: LogitsProcessors | None = Field( - default=None, - description=( - "A list of either qualified names of logits processors, or " - "constructor objects, to apply when sampling. A constructor is " - "a JSON object with a required 'qualname' field specifying the " - "qualified name of the processor class/factory, and optional " - "'args' and 'kwargs' fields containing positional and keyword " - "arguments. For example: {'qualname': " - "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': " - "{'param': 'value'}}." - ), - ) - return_tokens_as_token_ids: bool | None = Field( - default=None, - description=( - "If specified with 'logprobs', tokens are represented " - " as strings of the form 'token_id:{token_id}' so that tokens " - "that are not JSON-encodable can be identified." - ), - ) - return_token_ids: bool | None = Field( - default=None, - description=( - "If specified, the result will include token IDs alongside the " - "generated text. In streaming mode, prompt_token_ids is included " - "only in the first chunk, and token_ids contains the delta tokens " - "for each chunk. This is useful for debugging or when you " - "need to map generated text back to input tokens." - ), - ) - cache_salt: str | None = Field( - default=None, - description=( - "If specified, the prefix cache will be salted with the provided " - "string to prevent an attacker to guess prompts in multi-user " - "environments. The salt should be random, protected from " - "access by 3rd parties, and long enough to be " - "unpredictable (e.g., 43 characters base64-encoded, corresponding " - "to 256 bit)." - ), - ) - kv_transfer_params: dict[str, Any] | None = Field( - default=None, - description="KVTransfer parameters used for disaggregated serving.", - ) - - vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field( - default=None, - description=( - "Additional request parameters with (list of) string or " - "numeric values, used by custom extensions." - ), - ) - - # --8<-- [end:chat-completion-extra-params] - - # Default sampling parameters for chat completion requests - _DEFAULT_SAMPLING_PARAMS: dict = { - "repetition_penalty": 1.0, - "temperature": 1.0, - "top_p": 1.0, - "top_k": 0, - "min_p": 0.0, - } - - def to_beam_search_params( - self, max_tokens: int, default_sampling_params: dict - ) -> BeamSearchParams: - n = self.n if self.n is not None else 1 - if (temperature := self.temperature) is None: - temperature = default_sampling_params.get( - "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] - ) - - return BeamSearchParams( - beam_width=n, - max_tokens=max_tokens, - ignore_eos=self.ignore_eos, - temperature=temperature, - length_penalty=self.length_penalty, - include_stop_str_in_output=self.include_stop_str_in_output, - ) - - def to_sampling_params( - self, - max_tokens: int, - logits_processor_pattern: str | None, - default_sampling_params: dict, - ) -> SamplingParams: - # Default parameters - if (repetition_penalty := self.repetition_penalty) is None: - repetition_penalty = default_sampling_params.get( - "repetition_penalty", - self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"], - ) - if (temperature := self.temperature) is None: - temperature = default_sampling_params.get( - "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] - ) - if (top_p := self.top_p) is None: - top_p = default_sampling_params.get( - "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"] - ) - if (top_k := self.top_k) is None: - top_k = default_sampling_params.get( - "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"] - ) - if (min_p := self.min_p) is None: - min_p = default_sampling_params.get( - "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"] - ) - - prompt_logprobs = self.prompt_logprobs - if prompt_logprobs is None and self.echo: - prompt_logprobs = self.top_logprobs - - response_format = self.response_format - if response_format is not None: - # If structured outputs wasn't already enabled, - # we must enable it for these features to work - if self.structured_outputs is None: - self.structured_outputs = StructuredOutputsParams() - - # Set structured output params for response format - if response_format.type == "json_object": - self.structured_outputs.json_object = True - elif response_format.type == "json_schema": - json_schema = response_format.json_schema - assert json_schema is not None - self.structured_outputs.json = json_schema.json_schema - elif response_format.type == "structural_tag": - structural_tag = response_format - assert structural_tag is not None and isinstance( - structural_tag, - ( - LegacyStructuralTagResponseFormat, - StructuralTagResponseFormat, - ), - ) - s_tag_obj = structural_tag.model_dump(by_alias=True) - self.structured_outputs.structural_tag = json.dumps(s_tag_obj) - - extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} - if self.kv_transfer_params: - # Pass in kv_transfer_params via extra_args - extra_args["kv_transfer_params"] = self.kv_transfer_params - return SamplingParams.from_optional( - n=self.n, - presence_penalty=self.presence_penalty, - frequency_penalty=self.frequency_penalty, - repetition_penalty=repetition_penalty, - temperature=temperature, - top_p=top_p, - top_k=top_k, - min_p=min_p, - seed=self.seed, - stop=self.stop, - stop_token_ids=self.stop_token_ids, - logprobs=self.top_logprobs if self.logprobs else None, - prompt_logprobs=prompt_logprobs, - ignore_eos=self.ignore_eos, - max_tokens=max_tokens, - min_tokens=self.min_tokens, - skip_special_tokens=self.skip_special_tokens, - spaces_between_special_tokens=self.spaces_between_special_tokens, - logits_processors=get_logits_processors( - self.logits_processors, logits_processor_pattern - ), - include_stop_str_in_output=self.include_stop_str_in_output, - truncate_prompt_tokens=self.truncate_prompt_tokens, - output_kind=RequestOutputKind.DELTA - if self.stream - else RequestOutputKind.FINAL_ONLY, - structured_outputs=self.structured_outputs, - logit_bias=self.logit_bias, - bad_words=self.bad_words, - allowed_token_ids=self.allowed_token_ids, - extra_args=extra_args or None, - skip_clone=True, # Created fresh per request, safe to skip clone - ) - - @model_validator(mode="before") - @classmethod - def validate_stream_options(cls, data): - if data.get("stream_options") and not data.get("stream"): - raise VLLMValidationError( - "Stream options can only be defined when `stream=True`.", - parameter="stream_options", - ) - - return data - - @model_validator(mode="before") - @classmethod - def check_logprobs(cls, data): - if (prompt_logprobs := data.get("prompt_logprobs")) is not None: - if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1): - raise VLLMValidationError( - "`prompt_logprobs` are not available when `stream=True`.", - parameter="prompt_logprobs", - ) - - if prompt_logprobs < 0 and prompt_logprobs != -1: - raise VLLMValidationError( - "`prompt_logprobs` must be a positive value or -1.", - parameter="prompt_logprobs", - value=prompt_logprobs, - ) - if (top_logprobs := data.get("top_logprobs")) is not None: - if top_logprobs < 0 and top_logprobs != -1: - raise VLLMValidationError( - "`top_logprobs` must be a positive value or -1.", - parameter="top_logprobs", - value=top_logprobs, - ) - - if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"): - raise VLLMValidationError( - "when using `top_logprobs`, `logprobs` must be set to true.", - parameter="top_logprobs", - ) - - return data - - @model_validator(mode="before") - @classmethod - def check_structured_outputs_count(cls, data): - if isinstance(data, ValueError): - raise data - - if data.get("structured_outputs", None) is None: - return data - - structured_outputs_kwargs = data["structured_outputs"] - count = sum( - structured_outputs_kwargs.get(k) is not None - for k in ("json", "regex", "choice") - ) - # you can only use one kind of constraints for structured outputs - if count > 1: - raise ValueError( - "You can only use one kind of constraints for structured " - "outputs ('json', 'regex' or 'choice')." - ) - # you can only either use structured outputs or tools, not both - if count > 1 and data.get("tool_choice", "none") not in ( - "none", - "auto", - "required", - ): - raise ValueError( - "You can only either use constraints for structured outputs " - "or tools, not both." + "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"] ) - return data - - @model_validator(mode="before") - @classmethod - def check_tool_usage(cls, data): - # if "tool_choice" is not specified but tools are provided, - # default to "auto" tool_choice - if "tool_choice" not in data and data.get("tools"): - data["tool_choice"] = "auto" - - # if "tool_choice" is "none" -- no validation is needed for tools - if "tool_choice" in data and data["tool_choice"] == "none": - return data - - # if "tool_choice" is specified -- validation - if "tool_choice" in data and data["tool_choice"] is not None: - # ensure that if "tool choice" is specified, tools are present - if "tools" not in data or data["tools"] is None: - raise ValueError("When using `tool_choice`, `tools` must be set.") + stop_token_ids = default_sampling_params.get("stop_token_ids") - # make sure that tool choice is either a named tool - # OR that it's set to "auto" or "required" - if data["tool_choice"] not in ["auto", "required"] and not isinstance( - data["tool_choice"], dict + # Structured output + structured_outputs = None + if self.text is not None and self.text.format is not None: + response_format = self.text.format + if ( + response_format.type == "json_schema" + and response_format.schema_ is not None ): - raise ValueError( - f"Invalid value for `tool_choice`: {data['tool_choice']}! " - 'Only named tools, "none", "auto" or "required" ' - "are supported." + structured_outputs = StructuredOutputsParams( + json=response_format.schema_ ) + elif response_format.type == "json_object": + raise NotImplementedError("json_object is not supported") - # if tool_choice is "required" but the "tools" list is empty, - # override the data to behave like "none" to align with - # OpenAI’s behavior. - if ( - data["tool_choice"] == "required" - and isinstance(data["tools"], list) - and len(data["tools"]) == 0 - ): - data["tool_choice"] = "none" - del data["tools"] - return data + # TODO: add more parameters + return SamplingParams.from_optional( + temperature=temperature, + top_p=top_p, + top_k=top_k, + max_tokens=max_tokens, + logprobs=self.top_logprobs if self.is_include_output_logprobs() else None, + stop_token_ids=stop_token_ids, + output_kind=( + RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY + ), + structured_outputs=structured_outputs, + logit_bias=self.logit_bias, + skip_clone=True, # Created fresh per request, safe to skip clone + ) - # ensure that if "tool_choice" is specified as an object, - # it matches a valid tool - correct_usage_message = ( - 'Correct usage: `{"type": "function",' - ' "function": {"name": "my_function"}}`' - ) - if isinstance(data["tool_choice"], dict): - valid_tool = False - function = data["tool_choice"].get("function") - if not isinstance(function, dict): - raise ValueError( - f"Invalid value for `function`: `{function}` in " - f"`tool_choice`! {correct_usage_message}" - ) - if "name" not in function: - raise ValueError( - f"Expected field `name` in `function` in " - f"`tool_choice`! {correct_usage_message}" - ) - function_name = function["name"] - if not isinstance(function_name, str) or len(function_name) == 0: - raise ValueError( - f"Invalid `name` in `function`: `{function_name}`" - f" in `tool_choice`! {correct_usage_message}" - ) - for tool in data["tools"]: - if tool["function"]["name"] == function_name: - valid_tool = True - break - if not valid_tool: - raise ValueError( - "The tool specified in `tool_choice` does not match any" - " of the specified `tools`" - ) + def is_include_output_logprobs(self) -> bool: + """Check if the request includes output logprobs.""" + if self.include is None: + return False + return ( + isinstance(self.include, list) + and "message.output_text.logprobs" in self.include + ) + + @model_validator(mode="before") + def validate_background(cls, data): + if not data.get("background"): + return data + if not data.get("store", True): + raise ValueError("background can only be used when `store` is true") return data @model_validator(mode="before") - @classmethod - def check_generation_prompt(cls, data): - if data.get("continue_final_message") and data.get("add_generation_prompt"): - raise ValueError( - "Cannot set both `continue_final_message` and " - "`add_generation_prompt` to True." + def validate_prompt(cls, data): + if data.get("prompt") is not None: + raise VLLMValidationError( + "prompt template is not supported", parameter="prompt" ) return data @model_validator(mode="before") - @classmethod def check_cache_salt_support(cls, data): if data.get("cache_salt") is not None and ( not isinstance(data["cache_salt"], str) or not data["cache_salt"] @@ -1026,6 +465,48 @@ def check_cache_salt_support(cls, data): ) return data + @model_validator(mode="before") + def function_call_parsing(cls, data): + """Parse function_call dictionaries into ResponseFunctionToolCall objects. + This ensures Pydantic can properly resolve union types in the input field. + Function calls provided as dicts are converted to ResponseFunctionToolCall + objects before validation, while invalid structures are left for Pydantic + to reject with appropriate error messages. + """ + + input_data = data.get("input") + + # Early return for None, strings, or bytes + # (strings are iterable but shouldn't be processed) + if input_data is None or isinstance(input_data, (str, bytes)): + return data + + # Convert iterators (like ValidatorIterator) to list + if not isinstance(input_data, list): + try: + input_data = list(input_data) + except TypeError: + # Not iterable, leave as-is for Pydantic to handle + return data + + processed_input = [] + for item in input_data: + if isinstance(item, dict) and item.get("type") == "function_call": + try: + processed_input.append(ResponseFunctionToolCall(**item)) + except ValidationError: + # Let Pydantic handle validation for malformed function calls + logger.debug( + "Failed to parse function_call to ResponseFunctionToolCall, " + "leaving for Pydantic validation" + ) + processed_input.append(item) + else: + processed_input.append(item) + + data["input"] = processed_input + return data + class CompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation @@ -1486,75 +967,6 @@ class ExtractedToolCallInformation(BaseModel): content: str | None = None -class ChatMessage(OpenAIBaseModel): - role: str - content: str | None = None - refusal: str | None = None - annotations: OpenAIAnnotation | None = None - audio: OpenAIChatCompletionAudio | None = None - function_call: FunctionCall | None = None - tool_calls: list[ToolCall] = Field(default_factory=list) - - # vLLM-specific fields that are not in OpenAI spec - reasoning: str | None = None - reasoning_content: str | None = None - """Deprecated: use `reasoning` instead.""" - - @model_validator(mode="after") - def handle_deprecated_reasoning_content(self): - """Copy reasoning to reasoning_content for backward compatibility.""" - self.reasoning_content = self.reasoning - return self - - -class ChatCompletionLogProb(OpenAIBaseModel): - token: str - logprob: float = -9999.0 - bytes: list[int] | None = None - - -class ChatCompletionLogProbsContent(ChatCompletionLogProb): - # Workaround: redefine fields name cache so that it's not - # shared with the super class. - field_names: ClassVar[set[str] | None] = None - top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list) - - -class ChatCompletionLogProbs(OpenAIBaseModel): - content: list[ChatCompletionLogProbsContent] | None = None - - -class ChatCompletionResponseChoice(OpenAIBaseModel): - index: int - message: ChatMessage - logprobs: ChatCompletionLogProbs | None = None - # per OpenAI spec this is the default - finish_reason: str | None = "stop" - # not part of the OpenAI spec but included in vLLM for legacy reasons - stop_reason: int | str | None = None - # not part of the OpenAI spec but is useful for tracing the tokens - # in agent scenarios - token_ids: list[int] | None = None - - -class ChatCompletionResponse(OpenAIBaseModel): - id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") - object: Literal["chat.completion"] = "chat.completion" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - choices: list[ChatCompletionResponseChoice] - service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None - system_fingerprint: str | None = None - usage: UsageInfo - - # vLLM-specific fields that are not in OpenAI spec - prompt_logprobs: list[dict[int, Logprob] | None] | None = None - prompt_token_ids: list[int] | None = None - kv_transfer_params: dict[str, Any] | None = Field( - default=None, description="KVTransfer parameters." - ) - - class DeltaMessage(OpenAIBaseModel): role: str | None = None content: str | None = None @@ -1570,27 +982,6 @@ def handle_deprecated_reasoning_content(self): return self -class ChatCompletionResponseStreamChoice(OpenAIBaseModel): - index: int - delta: DeltaMessage - logprobs: ChatCompletionLogProbs | None = None - finish_reason: str | None = None - stop_reason: int | str | None = None - # not part of the OpenAI spec but for tracing the tokens - token_ids: list[int] | None = None - - -class ChatCompletionStreamResponse(OpenAIBaseModel): - id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") - object: Literal["chat.completion.chunk"] = "chat.completion.chunk" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - choices: list[ChatCompletionResponseStreamChoice] - usage: UsageInfo | None = Field(default=None) - # not part of the OpenAI spec but for tracing the tokens - prompt_token_ids: list[int] | None = None - - class TranscriptionResponseStreamChoice(OpenAIBaseModel): delta: DeltaMessage finish_reason: str | None = None @@ -1856,128 +1247,6 @@ class ResponseInProgressEvent(OpenAIResponseInProgressEvent): ) -class TokenizeCompletionRequest(OpenAIBaseModel): - model: str | None = None - prompt: str - - add_special_tokens: bool = Field( - default=True, - description=( - "If true (the default), special tokens (e.g. BOS) will be added to " - "the prompt." - ), - ) - return_token_strs: bool | None = Field( - default=False, - description=( - "If true, also return the token strings corresponding to the token ids." - ), - ) - - -class TokenizeChatRequest(OpenAIBaseModel): - model: str | None = None - messages: list[ChatCompletionMessageParam] - - add_generation_prompt: bool = Field( - default=True, - description=( - "If true, the generation prompt will be added to the chat template. " - "This is a parameter used by chat template in tokenizer config of the " - "model." - ), - ) - return_token_strs: bool | None = Field( - default=False, - description=( - "If true, also return the token strings corresponding to the token ids." - ), - ) - continue_final_message: bool = Field( - default=False, - description=( - "If this is set, the chat will be formatted so that the final " - "message in the chat is open-ended, without any EOS tokens. The " - "model will continue this message rather than starting a new one. " - 'This allows you to "prefill" part of the model\'s response for it. ' - "Cannot be used at the same time as `add_generation_prompt`." - ), - ) - add_special_tokens: bool = Field( - default=False, - description=( - "If true, special tokens (e.g. BOS) will be added to the prompt " - "on top of what is added by the chat template. " - "For most models, the chat template takes care of adding the " - "special tokens so this should be set to false (as is the " - "default)." - ), - ) - chat_template: str | None = Field( - default=None, - description=( - "A Jinja template to use for this conversion. " - "As of transformers v4.44, default chat template is no longer " - "allowed, so you must provide a chat template if the tokenizer " - "does not define one." - ), - ) - chat_template_kwargs: dict[str, Any] | None = Field( - default=None, - description=( - "Additional keyword args to pass to the template renderer. " - "Will be accessible by the chat template." - ), - ) - mm_processor_kwargs: dict[str, Any] | None = Field( - default=None, - description=("Additional kwargs to pass to the HF processor."), - ) - tools: list[ChatCompletionToolsParam] | None = Field( - default=None, - description=("A list of tools the model may call."), - ) - - @model_validator(mode="before") - @classmethod - def check_generation_prompt(cls, data): - if data.get("continue_final_message") and data.get("add_generation_prompt"): - raise ValueError( - "Cannot set both `continue_final_message` and " - "`add_generation_prompt` to True." - ) - return data - - -TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest - - -class TokenizeResponse(OpenAIBaseModel): - count: int - max_model_len: int - tokens: list[int] - token_strs: list[str] | None = None - - -class DetokenizeRequest(OpenAIBaseModel): - model: str | None = None - tokens: list[int] - - -class DetokenizeResponse(OpenAIBaseModel): - prompt: str - - -class TokenizerInfoResponse(OpenAIBaseModel): - """ - Response containing tokenizer configuration - equivalent to tokenizer_config.json - """ - - model_config = ConfigDict(extra="allow") - tokenizer_class: str - - class LoadLoRAAdapterRequest(BaseModel): lora_name: str lora_path: str @@ -2537,30 +1806,3 @@ class GenerateRequest(BaseModel): default=None, description="KVTransfer parameters used for disaggregated serving.", ) - - -class GenerateResponseChoice(BaseModel): - index: int - logprobs: ChatCompletionLogProbs | None = None - # per OpenAI spec this is the default - finish_reason: str | None = "stop" - token_ids: list[int] | None = None - - -class GenerateResponse(BaseModel): - request_id: str = Field( - default_factory=random_uuid, - description=( - "The request_id related to this request. If the caller does " - "not set it, a random_uuid will be generated. This id is used " - "through out the inference process and return in response." - ), - ) - choices: list[GenerateResponseChoice] - - prompt_logprobs: list[dict[int, Logprob] | None] | None = None - - kv_transfer_params: dict[str, Any] | None = Field( - default=None, - description="KVTransfer parameters used for disaggregated serving.", - ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/engine/serving.py similarity index 99% rename from vllm/entrypoints/openai/serving_engine.py rename to vllm/entrypoints/openai/engine/serving.py index fac4ced42bcb..666d83c21aea 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -38,22 +38,20 @@ StreamingHarmonyContext, ) from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, ChatCompletionResponse, +) +from vllm.entrypoints.openai.engine.protocol import ( CompletionRequest, CompletionResponse, - DetokenizeRequest, ErrorInfo, ErrorResponse, FunctionCall, FunctionDefinition, ResponseInputOutputItem, ResponsesRequest, - TokenizeChatRequest, - TokenizeCompletionRequest, - TokenizeResponse, TranscriptionRequest, TranscriptionResponse, TranslationRequest, @@ -86,6 +84,12 @@ construct_input_messages, ) from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse +from vllm.entrypoints.serve.tokenize.protocol import ( + DetokenizeRequest, + TokenizeChatRequest, + TokenizeCompletionRequest, + TokenizeResponse, +) from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.parse import ( diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py index 533286c5906f..a3959c873be4 100644 --- a/vllm/entrypoints/openai/parser/harmony_utils.py +++ b/vllm/entrypoints/openai/parser/harmony_utils.py @@ -43,8 +43,8 @@ from openai_harmony import Role as OpenAIHarmonyRole from vllm import envs -from vllm.entrypoints.openai.protocol import ( - ChatCompletionToolsParam, +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionToolsParam +from vllm.entrypoints.openai.engine.protocol import ( ResponseInputOutputItem, ResponsesRequest, ) diff --git a/vllm/entrypoints/openai/parser/responses_parser.py b/vllm/entrypoints/openai/parser/responses_parser.py index 14a6f5cb73e1..bda7e99520af 100644 --- a/vllm/entrypoints/openai/parser/responses_parser.py +++ b/vllm/entrypoints/openai/parser/responses_parser.py @@ -16,7 +16,10 @@ ) from vllm.entrypoints.constants import MCP_PREFIX -from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest +from vllm.entrypoints.openai.engine.protocol import ( + ResponseInputOutputItem, + ResponsesRequest, +) from vllm.outputs import CompletionOutput from vllm.reasoning.abs_reasoning_parsers import ReasoningParser from vllm.tokenizers import TokenizerLike diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 6bb6d0f3f97b..5b72dc663b89 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -19,13 +19,15 @@ from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionResponse, +) +from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, OpenAIBaseModel, ) -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest, EmbeddingResponse from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 6ef5ae3ef01c..187ccb64e9ba 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -12,7 +12,7 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( CompletionLogProbs, CompletionRequest, CompletionResponse, @@ -25,7 +25,7 @@ UsageInfo, VLLMValidationError, ) -from vllm.entrypoints.openai.serving_engine import ( +from vllm.entrypoints.openai.engine.serving import ( GenerationError, OpenAIServing, clamp_prompt_logprobs, diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 6b03fa72fc59..4e6d4d5e0545 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -7,7 +7,7 @@ from http import HTTPStatus from vllm.engine.protocol import EngineClient -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( ErrorInfo, ErrorResponse, LoadLoRAAdapterRequest, diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index f79dad8d9e5e..e870d6e92822 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -72,19 +72,7 @@ StreamingHarmonyContext, ) from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.parser.harmony_utils import ( - construct_harmony_previous_input_messages, - get_developer_message, - get_stop_tokens_for_assistant_actions, - get_system_message, - get_user_message, - has_custom_tools, - parse_output_message, - parse_remaining_state, - parse_response_input, - render_for_completion, -) -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ErrorResponse, InputTokensDetails, @@ -102,10 +90,22 @@ StreamingResponsesResponse, VLLMValidationError, ) -from vllm.entrypoints.openai.serving_engine import ( +from vllm.entrypoints.openai.engine.serving import ( GenerationError, OpenAIServing, ) +from vllm.entrypoints.openai.parser.harmony_utils import ( + construct_harmony_previous_input_messages, + get_developer_message, + get_stop_tokens_for_assistant_actions, + get_system_message, + get_user_message, + has_custom_tools, + parse_output_message, + parse_remaining_state, + parse_response_input, + render_for_completion, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.responses_utils import ( construct_input_messages, diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 189b532810b4..ef576153df96 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -6,7 +6,7 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, RequestResponseMetadata, TranscriptionRequest, diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index b6332d1941c1..6fdfea2f405d 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -15,7 +15,7 @@ import vllm.envs as envs from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ErrorResponse, RequestResponseMetadata, @@ -32,7 +32,7 @@ UsageInfo, VLLMValidationError, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest +from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.inputs.data import PromptType from vllm.logger import init_logger diff --git a/vllm/entrypoints/openai/utils.py b/vllm/entrypoints/openai/utils.py index 29db601af67f..55e59510f549 100644 --- a/vllm/entrypoints/openai/utils.py +++ b/vllm/entrypoints/openai/utils.py @@ -5,7 +5,7 @@ from fastapi import Request from fastapi.exceptions import RequestValidationError -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, diff --git a/vllm/entrypoints/pooling/classify/api_router.py b/vllm/entrypoints/pooling/classify/api_router.py index d6ced73c88eb..7bd170a9f144 100644 --- a/vllm/entrypoints/pooling/classify/api_router.py +++ b/vllm/entrypoints/pooling/classify/api_router.py @@ -6,7 +6,7 @@ from starlette.responses import JSONResponse from typing_extensions import assert_never -from vllm.entrypoints.openai.protocol import ErrorResponse +from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.pooling.classify.protocol import ( ClassificationRequest, diff --git a/vllm/entrypoints/pooling/classify/protocol.py b/vllm/entrypoints/pooling/classify/protocol.py index 273bdd29ee58..89e927fd7759 100644 --- a/vllm/entrypoints/pooling/classify/protocol.py +++ b/vllm/entrypoints/pooling/classify/protocol.py @@ -11,7 +11,7 @@ from vllm import PoolingParams from vllm.config.pooler import get_use_activation from vllm.entrypoints.chat_utils import ChatCompletionMessageParam -from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo +from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo from vllm.utils import random_uuid diff --git a/vllm/entrypoints/pooling/classify/serving.py b/vllm/entrypoints/pooling/classify/serving.py index e166405a6f05..446366880047 100644 --- a/vllm/entrypoints/pooling/classify/serving.py +++ b/vllm/entrypoints/pooling/classify/serving.py @@ -11,12 +11,14 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, UsageInfo, ) -from vllm.entrypoints.openai.serving_engine import ( +from vllm.entrypoints.openai.engine.serving import ( ClassificationServeContext, OpenAIServing, ServeContext, diff --git a/vllm/entrypoints/pooling/embed/api_router.py b/vllm/entrypoints/pooling/embed/api_router.py index 24b0c8c2b3cf..d8e5cf64127e 100644 --- a/vllm/entrypoints/pooling/embed/api_router.py +++ b/vllm/entrypoints/pooling/embed/api_router.py @@ -6,7 +6,7 @@ from fastapi.responses import JSONResponse, StreamingResponse from typing_extensions import assert_never -from vllm.entrypoints.openai.protocol import ErrorResponse +from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.pooling.embed.protocol import ( EmbeddingBytesResponse, diff --git a/vllm/entrypoints/pooling/embed/protocol.py b/vllm/entrypoints/pooling/embed/protocol.py index 3829a1a6a6c1..79c6d540d278 100644 --- a/vllm/entrypoints/pooling/embed/protocol.py +++ b/vllm/entrypoints/pooling/embed/protocol.py @@ -10,7 +10,7 @@ from vllm import PoolingParams from vllm.entrypoints.chat_utils import ChatCompletionMessageParam -from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo +from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo from vllm.utils import random_uuid from vllm.utils.serial_utils import EmbedDType, EncodingFormat, Endianness diff --git a/vllm/entrypoints/pooling/embed/serving.py b/vllm/entrypoints/pooling/embed/serving.py index e94b80043962..6e1381878e93 100644 --- a/vllm/entrypoints/pooling/embed/serving.py +++ b/vllm/entrypoints/pooling/embed/serving.py @@ -12,11 +12,11 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, UsageInfo, ) -from vllm.entrypoints.openai.serving_engine import ( +from vllm.entrypoints.openai.engine.serving import ( EmbeddingServeContext, OpenAIServing, ServeContext, diff --git a/vllm/entrypoints/pooling/pooling/api_router.py b/vllm/entrypoints/pooling/pooling/api_router.py index 4baaf8f30f6b..223d6e3b89be 100644 --- a/vllm/entrypoints/pooling/pooling/api_router.py +++ b/vllm/entrypoints/pooling/pooling/api_router.py @@ -6,7 +6,7 @@ from fastapi.responses import JSONResponse, StreamingResponse from typing_extensions import assert_never -from vllm.entrypoints.openai.protocol import ErrorResponse +from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.pooling.pooling.protocol import ( IOProcessorResponse, diff --git a/vllm/entrypoints/pooling/pooling/protocol.py b/vllm/entrypoints/pooling/pooling/protocol.py index 76b361b49b66..22f2bb18e9c2 100644 --- a/vllm/entrypoints/pooling/pooling/protocol.py +++ b/vllm/entrypoints/pooling/pooling/protocol.py @@ -9,7 +9,7 @@ from vllm import PoolingParams from vllm.config.pooler import get_use_activation -from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo +from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo from vllm.entrypoints.pooling.embed.protocol import ( EmbeddingChatRequest, EmbeddingCompletionRequest, diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index 4e1b326806ea..c27c9179ec94 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -14,11 +14,11 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, UsageInfo, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.pooling.pooling.protocol import ( IOProcessorRequest, diff --git a/vllm/entrypoints/pooling/score/api_router.py b/vllm/entrypoints/pooling/score/api_router.py index c7481ed9fa96..bd9b5c425b05 100644 --- a/vllm/entrypoints/pooling/score/api_router.py +++ b/vllm/entrypoints/pooling/score/api_router.py @@ -6,7 +6,7 @@ from fastapi.responses import JSONResponse from typing_extensions import assert_never -from vllm.entrypoints.openai.protocol import ErrorResponse +from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.pooling.score.protocol import ( RerankRequest, diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py index e81bda2eec3d..35dfd5402002 100644 --- a/vllm/entrypoints/pooling/score/protocol.py +++ b/vllm/entrypoints/pooling/score/protocol.py @@ -10,7 +10,7 @@ from vllm import PoolingParams from vllm.config.pooler import get_use_activation -from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo +from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo from vllm.entrypoints.score_utils import ScoreContentPartParam, ScoreMultiModalParam from vllm.utils import random_uuid diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index 9762b2363985..e44f15e66b32 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -9,11 +9,11 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, UsageInfo, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.pooling.score.protocol import ( RerankDocument, diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py index 5fd0cf43e687..dded2eea7efd 100644 --- a/vllm/entrypoints/responses_utils.py +++ b/vllm/entrypoints/responses_utils.py @@ -22,7 +22,7 @@ from vllm import envs from vllm.entrypoints.constants import MCP_PREFIX -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( ChatCompletionMessageParam, ResponseInputOutputItem, ) diff --git a/vllm/entrypoints/sagemaker/routes.py b/vllm/entrypoints/sagemaker/routes.py index ea88c0fc4b97..f2668baec896 100644 --- a/vllm/entrypoints/sagemaker/routes.py +++ b/vllm/entrypoints/sagemaker/routes.py @@ -14,16 +14,20 @@ base, chat, completion, - create_chat_completion, create_completion, validate_json_request, ) -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.api_router import ( + create_chat_completion, +) +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( CompletionRequest, ErrorResponse, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.pooling.classify.api_router import classify, create_classify from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest from vllm.entrypoints.pooling.embed.api_router import create_embedding, embedding diff --git a/vllm/entrypoints/serve/disagg/api_router.py b/vllm/entrypoints/serve/disagg/api_router.py index c38ede30dad1..6924dc83882f 100644 --- a/vllm/entrypoints/serve/disagg/api_router.py +++ b/vllm/entrypoints/serve/disagg/api_router.py @@ -11,7 +11,7 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.api_server import validate_json_request -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, ) from vllm.entrypoints.serve.disagg.protocol import ( diff --git a/vllm/entrypoints/serve/disagg/protocol.py b/vllm/entrypoints/serve/disagg/protocol.py index 251fcf12ed7d..68c39f9069a4 100644 --- a/vllm/entrypoints/serve/disagg/protocol.py +++ b/vllm/entrypoints/serve/disagg/protocol.py @@ -4,8 +4,8 @@ from pydantic import BaseModel, Field -from vllm.entrypoints.openai.protocol import ( - ChatCompletionLogProbs, +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionLogProbs +from vllm.entrypoints.openai.engine.protocol import ( Logprob, SamplingParams, StreamOptions, diff --git a/vllm/entrypoints/serve/disagg/serving.py b/vllm/entrypoints/serve/disagg/serving.py index 1798b174b141..8649bc6684bc 100644 --- a/vllm/entrypoints/serve/disagg/serving.py +++ b/vllm/entrypoints/serve/disagg/serving.py @@ -11,16 +11,18 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, ChatCompletionLogProbsContent, +) +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, PromptTokenUsageInfo, RequestResponseMetadata, UsageInfo, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs +from vllm.entrypoints.openai.engine.serving import OpenAIServing, clamp_prompt_logprobs from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.serve.disagg.protocol import ( GenerateRequest, diff --git a/vllm/entrypoints/serve/elastic_ep/api_router.py b/vllm/entrypoints/serve/elastic_ep/api_router.py index e5adb81051ff..1a3b57d4ca89 100644 --- a/vllm/entrypoints/serve/elastic_ep/api_router.py +++ b/vllm/entrypoints/serve/elastic_ep/api_router.py @@ -10,7 +10,7 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.api_server import validate_json_request -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, ) from vllm.entrypoints.serve.elastic_ep.middleware import ( diff --git a/vllm/entrypoints/serve/lora/api_router.py b/vllm/entrypoints/serve/lora/api_router.py index 6a57e73f334f..41ec354d750d 100644 --- a/vllm/entrypoints/serve/lora/api_router.py +++ b/vllm/entrypoints/serve/lora/api_router.py @@ -8,7 +8,7 @@ from vllm import envs from vllm.entrypoints.openai.api_server import models, validate_json_request -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, LoadLoRAAdapterRequest, UnloadLoRAAdapterRequest, diff --git a/vllm/entrypoints/serve/tokenize/api_router.py b/vllm/entrypoints/serve/tokenize/api_router.py index a10e78c8d28e..ec486cf410d6 100644 --- a/vllm/entrypoints/serve/tokenize/api_router.py +++ b/vllm/entrypoints/serve/tokenize/api_router.py @@ -10,10 +10,12 @@ from typing_extensions import assert_never from vllm.entrypoints.openai.api_server import validate_json_request -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( + ErrorResponse, +) +from vllm.entrypoints.serve.tokenize.protocol import ( DetokenizeRequest, DetokenizeResponse, - ErrorResponse, TokenizeRequest, TokenizeResponse, ) diff --git a/vllm/entrypoints/serve/tokenize/protocol.py b/vllm/entrypoints/serve/tokenize/protocol.py new file mode 100644 index 000000000000..66a85a8b61fb --- /dev/null +++ b/vllm/entrypoints/serve/tokenize/protocol.py @@ -0,0 +1,139 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from typing import Any, TypeAlias + +from pydantic import ConfigDict, Field, model_validator + +from vllm.entrypoints.chat_utils import ( + ChatCompletionMessageParam, +) +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionToolsParam, +) +from vllm.entrypoints.openai.engine.protocol import ( + OpenAIBaseModel, +) + + +class TokenizeCompletionRequest(OpenAIBaseModel): + model: str | None = None + prompt: str + + add_special_tokens: bool = Field( + default=True, + description=( + "If true (the default), special tokens (e.g. BOS) will be added to " + "the prompt." + ), + ) + return_token_strs: bool | None = Field( + default=False, + description=( + "If true, also return the token strings corresponding to the token ids." + ), + ) + + +class TokenizeChatRequest(OpenAIBaseModel): + model: str | None = None + messages: list[ChatCompletionMessageParam] + + add_generation_prompt: bool = Field( + default=True, + description=( + "If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model." + ), + ) + return_token_strs: bool | None = Field( + default=False, + description=( + "If true, also return the token strings corresponding to the token ids." + ), + ) + continue_final_message: bool = Field( + default=False, + description=( + "If this is set, the chat will be formatted so that the final " + "message in the chat is open-ended, without any EOS tokens. The " + "model will continue this message rather than starting a new one. " + 'This allows you to "prefill" part of the model\'s response for it. ' + "Cannot be used at the same time as `add_generation_prompt`." + ), + ) + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)." + ), + ) + chat_template: str | None = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one." + ), + ) + chat_template_kwargs: dict[str, Any] | None = Field( + default=None, + description=( + "Additional keyword args to pass to the template renderer. " + "Will be accessible by the chat template." + ), + ) + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + tools: list[ChatCompletionToolsParam] | None = Field( + default=None, + description=("A list of tools the model may call."), + ) + + @model_validator(mode="before") + @classmethod + def check_generation_prompt(cls, data): + if data.get("continue_final_message") and data.get("add_generation_prompt"): + raise ValueError( + "Cannot set both `continue_final_message` and " + "`add_generation_prompt` to True." + ) + return data + + +TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest + + +class TokenizeResponse(OpenAIBaseModel): + count: int + max_model_len: int + tokens: list[int] + token_strs: list[str] | None = None + + +class DetokenizeRequest(OpenAIBaseModel): + model: str | None = None + tokens: list[int] + + +class DetokenizeResponse(OpenAIBaseModel): + prompt: str + + +class TokenizerInfoResponse(OpenAIBaseModel): + """ + Response containing tokenizer configuration + equivalent to tokenizer_config.json + """ + + model_config = ConfigDict(extra="allow") + tokenizer_class: str diff --git a/vllm/entrypoints/serve/tokenize/serving.py b/vllm/entrypoints/serve/tokenize/serving.py index 0b07f0b18dfd..c80009eaa08f 100644 --- a/vllm/entrypoints/serve/tokenize/serving.py +++ b/vllm/entrypoints/serve/tokenize/serving.py @@ -9,18 +9,20 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.engine.protocol import ( + ErrorResponse, +) +from vllm.entrypoints.openai.engine.serving import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.renderer import RenderConfig +from vllm.entrypoints.serve.tokenize.protocol import ( DetokenizeRequest, DetokenizeResponse, - ErrorResponse, TokenizeChatRequest, TokenizeRequest, TokenizeResponse, TokenizerInfoResponse, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing -from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.entrypoints.renderer import RenderConfig from vllm.inputs import TokensPrompt from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 6f3e7c55f5bc..0d6fea36fd8a 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -22,9 +22,11 @@ resolve_hf_chat_template, resolve_mistral_chat_template, ) -from vllm.entrypoints.openai.cli_args import make_arg_parser -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.cli_args import make_arg_parser +from vllm.entrypoints.openai.engine.protocol import ( CompletionRequest, StreamOptions, ) diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index bf593ca4e52a..63ce096d09ab 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -14,8 +14,10 @@ from vllm.utils.import_utils import import_from_path if TYPE_CHECKING: - from vllm.entrypoints.openai.protocol import ( + from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, + ) + from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ResponsesRequest, ) diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py index 43067ca4afe0..ea0aa1dce1f8 100644 --- a/vllm/reasoning/basic_parsers.py +++ b/vllm/reasoning/basic_parsers.py @@ -5,13 +5,15 @@ from collections.abc import Sequence from typing import TYPE_CHECKING, Any -from vllm.entrypoints.openai.protocol import DeltaMessage +from vllm.entrypoints.openai.engine.protocol import DeltaMessage from vllm.reasoning.abs_reasoning_parsers import ReasoningParser from vllm.tokenizers import TokenizerLike if TYPE_CHECKING: - from vllm.entrypoints.openai.protocol import ( + from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, + ) + from vllm.entrypoints.openai.engine.protocol import ( ResponsesRequest, ) else: diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py index a91c8ceeb625..d24e331bb4a2 100644 --- a/vllm/reasoning/deepseek_r1_reasoning_parser.py +++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py @@ -3,7 +3,7 @@ from collections.abc import Sequence -from vllm.entrypoints.openai.protocol import DeltaMessage +from vllm.entrypoints.openai.engine.protocol import DeltaMessage from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py index efb080276e46..2db39e16c642 100644 --- a/vllm/reasoning/deepseek_v3_reasoning_parser.py +++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py @@ -5,7 +5,10 @@ from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import DeltaMessage from vllm.logger import init_logger from vllm.reasoning import ReasoningParser from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser diff --git a/vllm/reasoning/ernie45_reasoning_parser.py b/vllm/reasoning/ernie45_reasoning_parser.py index 3cdbf14858ec..6ff86488bb36 100644 --- a/vllm/reasoning/ernie45_reasoning_parser.py +++ b/vllm/reasoning/ernie45_reasoning_parser.py @@ -5,7 +5,10 @@ from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import DeltaMessage from vllm.logger import init_logger from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index e0920ef3160b..4c938bac6222 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -5,8 +5,11 @@ from transformers import PreTrainedTokenizerBase +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import DeltaMessage from vllm.entrypoints.openai.parser.harmony_utils import parse_chat_output -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage from vllm.entrypoints.tool_server import ToolServer from vllm.logger import init_logger from vllm.reasoning import ReasoningParser diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py index 484045d66a3c..5cae16f74ac3 100644 --- a/vllm/reasoning/granite_reasoning_parser.py +++ b/vllm/reasoning/granite_reasoning_parser.py @@ -6,7 +6,10 @@ import regex as re from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import DeltaMessage from vllm.logger import init_logger from vllm.reasoning import ReasoningParser diff --git a/vllm/reasoning/holo2_reasoning_parser.py b/vllm/reasoning/holo2_reasoning_parser.py index 3b5bfd838017..b0bda09794d2 100644 --- a/vllm/reasoning/holo2_reasoning_parser.py +++ b/vllm/reasoning/holo2_reasoning_parser.py @@ -3,7 +3,10 @@ from collections.abc import Sequence -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import DeltaMessage from vllm.logger import init_logger from vllm.reasoning import ( ReasoningParser, diff --git a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py index f297454f57ec..05e4d586268b 100644 --- a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py +++ b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py @@ -6,7 +6,10 @@ import regex as re from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import DeltaMessage from vllm.logger import init_logger from vllm.reasoning import ReasoningParser diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py index e998e071efcf..ad4e0c8ffab6 100644 --- a/vllm/reasoning/identity_reasoning_parser.py +++ b/vllm/reasoning/identity_reasoning_parser.py @@ -5,7 +5,10 @@ from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import DeltaMessage from vllm.logger import init_logger from vllm.reasoning import ReasoningParser diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py index a2b9224cb3bf..06b97d39eacc 100644 --- a/vllm/reasoning/minimax_m2_reasoning_parser.py +++ b/vllm/reasoning/minimax_m2_reasoning_parser.py @@ -3,8 +3,10 @@ from collections.abc import Sequence -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ResponsesRequest, ) diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py index 48a36b4c6634..05ff2bff0c3e 100644 --- a/vllm/reasoning/mistral_reasoning_parser.py +++ b/vllm/reasoning/mistral_reasoning_parser.py @@ -3,8 +3,10 @@ from functools import cached_property -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( ResponsesRequest, ) from vllm.logger import init_logger diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py index 2742a24a2c3e..9ee8fe37ecb0 100644 --- a/vllm/reasoning/olmo3_reasoning_parser.py +++ b/vllm/reasoning/olmo3_reasoning_parser.py @@ -10,9 +10,10 @@ if TYPE_CHECKING: from vllm.tokenizers import TokenizerLike - -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ResponsesRequest, ) diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py index ef7762bf0af5..9c2f7404b9de 100644 --- a/vllm/reasoning/qwen3_reasoning_parser.py +++ b/vllm/reasoning/qwen3_reasoning_parser.py @@ -1,8 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ResponsesRequest +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ResponsesRequest from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser diff --git a/vllm/reasoning/step3_reasoning_parser.py b/vllm/reasoning/step3_reasoning_parser.py index b85bc826572f..47131b2bccee 100644 --- a/vllm/reasoning/step3_reasoning_parser.py +++ b/vllm/reasoning/step3_reasoning_parser.py @@ -6,7 +6,10 @@ import regex as re from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import DeltaMessage from vllm.logger import init_logger from vllm.reasoning import ReasoningParser diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py index 35a11e95b8bd..bb85052dba8e 100644 --- a/vllm/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -19,7 +19,7 @@ from mistral_common.tokens.tokenizers.tekken import Tekkenizer from vllm.entrypoints.chat_utils import ChatCompletionMessageParam -from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest from vllm.logger import init_logger from .protocol import TokenizerLike diff --git a/vllm/tool_parsers/abstract_tool_parser.py b/vllm/tool_parsers/abstract_tool_parser.py index b7cac3454dab..feb3ce6f301f 100644 --- a/vllm/tool_parsers/abstract_tool_parser.py +++ b/vllm/tool_parsers/abstract_tool_parser.py @@ -10,8 +10,8 @@ ResponseFormatTextJSONSchemaConfig, ) -from vllm.entrypoints.openai.protocol import ( - ChatCompletionRequest, +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ExtractedToolCallInformation, ResponsesRequest, diff --git a/vllm/tool_parsers/deepseekv31_tool_parser.py b/vllm/tool_parsers/deepseekv31_tool_parser.py index 33383e1bc073..ad42bb7713c4 100644 --- a/vllm/tool_parsers/deepseekv31_tool_parser.py +++ b/vllm/tool_parsers/deepseekv31_tool_parser.py @@ -6,8 +6,10 @@ import regex as re from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/deepseekv32_tool_parser.py b/vllm/tool_parsers/deepseekv32_tool_parser.py index db081178fdea..49c9540d6b26 100644 --- a/vllm/tool_parsers/deepseekv32_tool_parser.py +++ b/vllm/tool_parsers/deepseekv32_tool_parser.py @@ -8,8 +8,10 @@ import regex as re -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/deepseekv3_tool_parser.py b/vllm/tool_parsers/deepseekv3_tool_parser.py index f8cf559f2284..83bba1c878e0 100644 --- a/vllm/tool_parsers/deepseekv3_tool_parser.py +++ b/vllm/tool_parsers/deepseekv3_tool_parser.py @@ -6,8 +6,10 @@ import regex as re from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/ernie45_tool_parser.py b/vllm/tool_parsers/ernie45_tool_parser.py index 79193787b3b3..d5dc7a3da3cc 100644 --- a/vllm/tool_parsers/ernie45_tool_parser.py +++ b/vllm/tool_parsers/ernie45_tool_parser.py @@ -6,8 +6,10 @@ import regex as re -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/functiongemma_tool_parser.py b/vllm/tool_parsers/functiongemma_tool_parser.py index 9be78b0a0691..22fa8d981f88 100644 --- a/vllm/tool_parsers/functiongemma_tool_parser.py +++ b/vllm/tool_parsers/functiongemma_tool_parser.py @@ -7,8 +7,10 @@ import regex as re from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/gigachat3_tool_parser.py b/vllm/tool_parsers/gigachat3_tool_parser.py index 27a6bc1a7bad..02cdad9edebe 100644 --- a/vllm/tool_parsers/gigachat3_tool_parser.py +++ b/vllm/tool_parsers/gigachat3_tool_parser.py @@ -7,8 +7,10 @@ import regex as re from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/glm4_moe_tool_parser.py b/vllm/tool_parsers/glm4_moe_tool_parser.py index 6ad7d7cb460c..522c67dc2e6a 100644 --- a/vllm/tool_parsers/glm4_moe_tool_parser.py +++ b/vllm/tool_parsers/glm4_moe_tool_parser.py @@ -8,9 +8,11 @@ import regex as re -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionToolsParam, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/tool_parsers/granite_20b_fc_tool_parser.py index d841fb57ac87..7fe3c39f70cf 100644 --- a/vllm/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/tool_parsers/granite_20b_fc_tool_parser.py @@ -10,8 +10,10 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/granite_tool_parser.py b/vllm/tool_parsers/granite_tool_parser.py index 7abfdf72849d..7cad01e16431 100644 --- a/vllm/tool_parsers/granite_tool_parser.py +++ b/vllm/tool_parsers/granite_tool_parser.py @@ -8,8 +8,10 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/hermes_tool_parser.py b/vllm/tool_parsers/hermes_tool_parser.py index 4b1dea7edf27..47dd2a24d251 100644 --- a/vllm/tool_parsers/hermes_tool_parser.py +++ b/vllm/tool_parsers/hermes_tool_parser.py @@ -9,8 +9,10 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/hunyuan_a13b_tool_parser.py b/vllm/tool_parsers/hunyuan_a13b_tool_parser.py index c73982136804..4f446bfcce95 100644 --- a/vllm/tool_parsers/hunyuan_a13b_tool_parser.py +++ b/vllm/tool_parsers/hunyuan_a13b_tool_parser.py @@ -8,8 +8,10 @@ import regex as re -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/internlm2_tool_parser.py b/vllm/tool_parsers/internlm2_tool_parser.py index e87efe3275a7..3b858f34c20a 100644 --- a/vllm/tool_parsers/internlm2_tool_parser.py +++ b/vllm/tool_parsers/internlm2_tool_parser.py @@ -8,8 +8,10 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/jamba_tool_parser.py b/vllm/tool_parsers/jamba_tool_parser.py index 7f3de0b38a33..937e28b17079 100644 --- a/vllm/tool_parsers/jamba_tool_parser.py +++ b/vllm/tool_parsers/jamba_tool_parser.py @@ -9,8 +9,10 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/kimi_k2_tool_parser.py b/vllm/tool_parsers/kimi_k2_tool_parser.py index 96630504f068..354ed412b701 100644 --- a/vllm/tool_parsers/kimi_k2_tool_parser.py +++ b/vllm/tool_parsers/kimi_k2_tool_parser.py @@ -6,8 +6,10 @@ import regex as re -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/tool_parsers/llama4_pythonic_tool_parser.py index 3c5409bbfaf4..707cdd6625c7 100644 --- a/vllm/tool_parsers/llama4_pythonic_tool_parser.py +++ b/vllm/tool_parsers/llama4_pythonic_tool_parser.py @@ -9,8 +9,10 @@ from transformers import PreTrainedTokenizerBase import vllm.envs as envs -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/llama_tool_parser.py b/vllm/tool_parsers/llama_tool_parser.py index b0dfe05c8e55..527d3f7358e8 100644 --- a/vllm/tool_parsers/llama_tool_parser.py +++ b/vllm/tool_parsers/llama_tool_parser.py @@ -11,8 +11,10 @@ import vllm.envs as envs from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/minimax_m2_tool_parser.py b/vllm/tool_parsers/minimax_m2_tool_parser.py index 67bd0e61620d..eb5c46c327e5 100644 --- a/vllm/tool_parsers/minimax_m2_tool_parser.py +++ b/vllm/tool_parsers/minimax_m2_tool_parser.py @@ -8,8 +8,10 @@ import regex as re -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/minimax_tool_parser.py b/vllm/tool_parsers/minimax_tool_parser.py index 86e1433c6e36..cb5610fc7503 100644 --- a/vllm/tool_parsers/minimax_tool_parser.py +++ b/vllm/tool_parsers/minimax_tool_parser.py @@ -8,8 +8,10 @@ import regex as re from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/mistral_tool_parser.py b/vllm/tool_parsers/mistral_tool_parser.py index 35b853b0ad7e..67f6345bf589 100644 --- a/vllm/tool_parsers/mistral_tool_parser.py +++ b/vllm/tool_parsers/mistral_tool_parser.py @@ -12,8 +12,10 @@ import regex as re from pydantic import Field -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/olmo3_tool_parser.py b/vllm/tool_parsers/olmo3_tool_parser.py index 8cd6a84a9f6b..7b0d609d51df 100644 --- a/vllm/tool_parsers/olmo3_tool_parser.py +++ b/vllm/tool_parsers/olmo3_tool_parser.py @@ -9,8 +9,10 @@ from transformers import PreTrainedTokenizerBase import vllm.envs as envs -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/openai_tool_parser.py b/vllm/tool_parsers/openai_tool_parser.py index da1a9c773f78..76f7a49dfaea 100644 --- a/vllm/tool_parsers/openai_tool_parser.py +++ b/vllm/tool_parsers/openai_tool_parser.py @@ -4,14 +4,16 @@ from collections.abc import Sequence from typing import TYPE_CHECKING -from vllm.entrypoints.openai.parser.harmony_utils import parse_output_into_messages -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ExtractedToolCallInformation, FunctionCall, ToolCall, ) +from vllm.entrypoints.openai.parser.harmony_utils import parse_output_into_messages from vllm.logger import init_logger from vllm.tool_parsers.abstract_tool_parser import ( ToolParser, diff --git a/vllm/tool_parsers/phi4mini_tool_parser.py b/vllm/tool_parsers/phi4mini_tool_parser.py index 9003429d8c6f..f222cffd61d3 100644 --- a/vllm/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/tool_parsers/phi4mini_tool_parser.py @@ -9,8 +9,10 @@ from transformers import PreTrainedTokenizerBase from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ExtractedToolCallInformation, FunctionCall, diff --git a/vllm/tool_parsers/pythonic_tool_parser.py b/vllm/tool_parsers/pythonic_tool_parser.py index 476a62d5f527..dc9926608e60 100644 --- a/vllm/tool_parsers/pythonic_tool_parser.py +++ b/vllm/tool_parsers/pythonic_tool_parser.py @@ -10,8 +10,10 @@ from transformers import PreTrainedTokenizerBase import vllm.envs as envs -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/qwen3coder_tool_parser.py b/vllm/tool_parsers/qwen3coder_tool_parser.py index d1a3cbeaafc7..a3c79f865b15 100644 --- a/vllm/tool_parsers/qwen3coder_tool_parser.py +++ b/vllm/tool_parsers/qwen3coder_tool_parser.py @@ -8,9 +8,11 @@ import regex as re -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionToolsParam, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/qwen3xml_tool_parser.py b/vllm/tool_parsers/qwen3xml_tool_parser.py index 107f791654a1..f7dcf20abb7c 100644 --- a/vllm/tool_parsers/qwen3xml_tool_parser.py +++ b/vllm/tool_parsers/qwen3xml_tool_parser.py @@ -9,9 +9,11 @@ import regex as re from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionToolsParam, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/seed_oss_tool_parser.py b/vllm/tool_parsers/seed_oss_tool_parser.py index 206072e65b10..6927071c3846 100644 --- a/vllm/tool_parsers/seed_oss_tool_parser.py +++ b/vllm/tool_parsers/seed_oss_tool_parser.py @@ -11,9 +11,11 @@ import regex as re -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionToolsParam, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/step3_tool_parser.py b/vllm/tool_parsers/step3_tool_parser.py index acd99bf56d0b..8e6f27907c96 100644 --- a/vllm/tool_parsers/step3_tool_parser.py +++ b/vllm/tool_parsers/step3_tool_parser.py @@ -8,8 +8,10 @@ import regex as re -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, diff --git a/vllm/tool_parsers/utils.py b/vllm/tool_parsers/utils.py index 570eb447a467..cbbf5b545538 100644 --- a/vllm/tool_parsers/utils.py +++ b/vllm/tool_parsers/utils.py @@ -13,7 +13,7 @@ from openai.types.responses.tool import Tool from partial_json_parser.core.options import Allow -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionNamedToolChoiceParam, ChatCompletionToolsParam, ) diff --git a/vllm/tool_parsers/xlam_tool_parser.py b/vllm/tool_parsers/xlam_tool_parser.py index 9c2b585fe9fd..d0d191ad28a2 100644 --- a/vllm/tool_parsers/xlam_tool_parser.py +++ b/vllm/tool_parsers/xlam_tool_parser.py @@ -6,10 +6,11 @@ from typing import Any, Optional, Union import regex as re - -from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, +) +from vllm.entrypoints.chat_utils import make_tool_call_id +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall,