diff --git a/tests/entrypoints/openai/responses/test_errors.py b/tests/entrypoints/openai/responses/test_errors.py
index f8ea17828883..7daa3d1fb58f 100644
--- a/tests/entrypoints/openai/responses/test_errors.py
+++ b/tests/entrypoints/openai/responses/test_errors.py
@@ -6,8 +6,8 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import ErrorResponse
-from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
+from vllm.entrypoints.openai.engine.serving import GenerationError, OpenAIServing
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/responses/test_function_call_parsing.py b/tests/entrypoints/openai/responses/test_function_call_parsing.py
index 3c5a11c867eb..d487759c1af2 100644
--- a/tests/entrypoints/openai/responses/test_function_call_parsing.py
+++ b/tests/entrypoints/openai/responses/test_function_call_parsing.py
@@ -7,7 +7,7 @@
 import pytest
 from openai.types.responses import ResponseFunctionToolCall
 
-from vllm.entrypoints.openai.protocol import ResponsesRequest
+from vllm.entrypoints.openai.engine.protocol import ResponsesRequest
 
 
 def test_function_call_dict_converted_to_object():
@@ -253,7 +253,7 @@ def test_function_call_validation_failure_logs_debug(caplog):
     }
 
     # Mock the logger to verify debug was called
-    with patch("vllm.entrypoints.openai.protocol.logger") as mock_logger:
+    with patch("vllm.entrypoints.openai.engine.protocol.logger") as mock_logger:
         with pytest.raises(ValueError):
             ResponsesRequest(**request_data)
 
diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py
index 4af4dd88b08f..c5e82d147f7b 100644
--- a/tests/entrypoints/openai/test_chat_error.py
+++ b/tests/entrypoints/openai/test_chat_error.py
@@ -9,8 +9,9 @@
 import pytest
 
 from vllm.config.multimodal import MultiModalConfig
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.tokenizers import get_tokenizer
diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py
index 77087ac21ea8..961ad40ca2c3 100644
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -5,7 +5,7 @@
 
 from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.tokenizers import get_tokenizer
 
 from ...models.registry import HF_EXAMPLE_MODELS
diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py
index e1eb6d2e3024..9b4539d4766b 100644
--- a/tests/entrypoints/openai/test_completion_error.py
+++ b/tests/entrypoints/openai/test_completion_error.py
@@ -9,7 +9,7 @@
 import pytest
 
 from vllm.config.multimodal import MultiModalConfig
-from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
diff --git a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py
index fbfae4f268d5..d8ee91f77834 100644
--- a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py
+++ b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py
@@ -8,7 +8,7 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     StructuredOutputsParams,
 )
 from vllm.entrypoints.tool_server import ToolServer
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index ced230aff029..f740e7968ac8 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -9,7 +9,7 @@
 import pytest
 
 from vllm.config.multimodal import MultiModalConfig
-from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.lora.request import LoRARequest
diff --git a/tests/entrypoints/openai/test_protocol.py b/tests/entrypoints/openai/test_protocol.py
index e9b1cfb58b50..c4baf2f7784c 100644
--- a/tests/entrypoints/openai/test_protocol.py
+++ b/tests/entrypoints/openai/test_protocol.py
@@ -4,7 +4,10 @@
     Message,
 )
 
-from vllm.entrypoints.openai.protocol import serialize_message, serialize_messages
+from vllm.entrypoints.openai.engine.protocol import (
+    serialize_message,
+    serialize_messages,
+)
 
 
 def test_serialize_message() -> None:
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index d8a296e5f09d..9e4ce94a110f 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -11,14 +11,16 @@
 from openai import OpenAI
 
 from vllm.config.multimodal import MultiModalConfig
-from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
+)
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     RequestResponseMetadata,
 )
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.parser.harmony_utils import get_encoding
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.tokenizers import get_tokenizer
@@ -1517,12 +1519,12 @@ class TestCreateRemainingArgsDelta:
 
     def test_preserves_id_type_name(self):
         """Test that id, type, and name are preserved from original delta."""
-        from vllm.entrypoints.openai.protocol import (
+        from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+        from vllm.entrypoints.openai.engine.protocol import (
             DeltaFunctionCall,
             DeltaMessage,
             DeltaToolCall,
         )
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 
         original_delta = DeltaMessage(
             tool_calls=[
@@ -1552,12 +1554,12 @@ def test_preserves_id_type_name(self):
 
     def test_matches_by_index(self):
         """Test that the correct tool call is matched by index."""
-        from vllm.entrypoints.openai.protocol import (
+        from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+        from vllm.entrypoints.openai.engine.protocol import (
             DeltaFunctionCall,
             DeltaMessage,
             DeltaToolCall,
         )
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 
         original_delta = DeltaMessage(
             tool_calls=[
@@ -1588,12 +1590,12 @@ def test_matches_by_index(self):
 
     def test_no_matching_tool_call(self):
         """Test graceful handling when no matching tool call is found."""
-        from vllm.entrypoints.openai.protocol import (
+        from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+        from vllm.entrypoints.openai.engine.protocol import (
             DeltaFunctionCall,
             DeltaMessage,
             DeltaToolCall,
         )
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 
         original_delta = DeltaMessage(
             tool_calls=[
@@ -1620,8 +1622,8 @@ def test_no_matching_tool_call(self):
 
     def test_function_is_none(self):
         """Test handling when original tool call has no function."""
-        from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
-        from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+        from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+        from vllm.entrypoints.openai.engine.protocol import DeltaMessage, DeltaToolCall
 
         original_delta = DeltaMessage(
             tool_calls=[
diff --git a/tests/entrypoints/openai/test_serving_chat_stream_harmony.py b/tests/entrypoints/openai/test_serving_chat_stream_harmony.py
index 1934d43d5cfb..b5483a2dff31 100644
--- a/tests/entrypoints/openai/test_serving_chat_stream_harmony.py
+++ b/tests/entrypoints/openai/test_serving_chat_stream_harmony.py
@@ -9,7 +9,7 @@
 
 import pytest
 
-from vllm.entrypoints.openai.serving_chat_stream_harmony import (
+from vllm.entrypoints.openai.chat_completion.stream_harmony import (
     extract_harmony_streaming_delta,
 )
 
@@ -82,7 +82,7 @@ def test_analysis_channel_reasoning(self, include_reasoning, expected_has_messag
         assert tools_streamed is False
 
     @pytest.mark.parametrize("channel", ["commentary", "analysis"])
-    @patch("vllm.entrypoints.openai.serving_chat_stream_harmony.make_tool_call_id")
+    @patch("vllm.entrypoints.openai.chat_completion.stream_harmony.make_tool_call_id")
     def test_new_tool_call(self, mock_make_tool_call_id, channel):
         """Test new tool call creation when recipient changes."""
         mock_make_tool_call_id.return_value = "call_test123"
diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py
index 192c7cafb749..c2bc82514362 100644
--- a/tests/entrypoints/openai/test_serving_engine.py
+++ b/tests/entrypoints/openai/test_serving_engine.py
@@ -8,7 +8,7 @@
 import pytest
 
 from vllm.config import ModelConfig
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.engine.serving import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.tokenizers.mistral import MistralTokenizer
 
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index b585835a0667..e596b32ba69e 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -8,7 +8,7 @@
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     LoadLoRAAdapterRequest,
     UnloadLoRAAdapterRequest,
diff --git a/tests/entrypoints/openai/test_serving_responses.py b/tests/entrypoints/openai/test_serving_responses.py
index 7d03dccec30d..a79ee66a9e3c 100644
--- a/tests/entrypoints/openai/test_serving_responses.py
+++ b/tests/entrypoints/openai/test_serving_responses.py
@@ -14,7 +14,7 @@
 )
 
 from vllm.entrypoints.context import ConversationContext
-from vllm.entrypoints.openai.protocol import ErrorResponse, ResponsesRequest
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse, ResponsesRequest
 from vllm.entrypoints.openai.serving_responses import (
     OpenAIServingResponses,
     _extract_allowed_tools_from_mcp_requests,
diff --git a/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
index 6ac48317e8bc..634ec421f1c8 100644
--- a/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_gigachat3_tool_parser.py
@@ -9,7 +9,7 @@
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager
 
diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
index 8600aaf63943..626d845e1b44 100644
--- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
 
diff --git a/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
index 394457532139..89c91c2ec63f 100644
--- a/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_hunyuan_a13b_tool_parser.py
@@ -11,7 +11,7 @@
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall, ToolCall
 from vllm.tool_parsers import ToolParser, ToolParserManager
 
 
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
index 3ce7801b4597..53948d577c15 100644
--- a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
+from vllm.entrypoints.openai.engine.protocol import ExtractedToolCallInformation
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.llama_tool_parser import Llama3JsonToolParser
 
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
index 3bd1ca7f528d..a0576db02ff7 100644
--- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
@@ -9,7 +9,7 @@
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager
 
diff --git a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
index 3774b3d1833e..dbd7e1d483c7 100644
--- a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
@@ -9,7 +9,7 @@
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager
 
diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
index c4cad17fd2d0..8ab4c5a5a2d2 100644
--- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
@@ -9,7 +9,7 @@
     run_tool_extraction,
     run_tool_extraction_streaming,
 )
-from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers import ToolParser, ToolParserManager
 
diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py
index 0b32e5f899ff..c7dfdc461632 100644
--- a/tests/entrypoints/openai/tool_parsers/utils.py
+++ b/tests/entrypoints/openai/tool_parsers/utils.py
@@ -3,8 +3,8 @@
 
 from collections.abc import Iterable
 
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     ExtractedToolCallInformation,
     FunctionCall,
diff --git a/tests/entrypoints/openai/utils.py b/tests/entrypoints/openai/utils.py
index 501f6dcc9154..da65b8ad50bd 100644
--- a/tests/entrypoints/openai/utils.py
+++ b/tests/entrypoints/openai/utils.py
@@ -4,11 +4,13 @@
 from collections.abc import AsyncGenerator
 from typing import Any
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionResponse,
     ChatCompletionResponseChoice,
     ChatCompletionStreamResponse,
     ChatMessage,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     UsageInfo,
 )
 
diff --git a/tests/reasoning/test_base_thinking_reasoning_parser.py b/tests/reasoning/test_base_thinking_reasoning_parser.py
index 165e91a2c79f..8c69f75a3bbc 100644
--- a/tests/reasoning/test_base_thinking_reasoning_parser.py
+++ b/tests/reasoning/test_base_thinking_reasoning_parser.py
@@ -5,7 +5,7 @@
 from transformers import AutoTokenizer
 
 from tests.reasoning.utils import run_reasoning_extraction
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 
 
diff --git a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py
index 874fdef77811..4b0938d15520 100644
--- a/tests/reasoning/test_deepseekv3_reasoning_parser.py
+++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py
@@ -4,7 +4,8 @@
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
 from vllm.reasoning.deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser
 from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py
index a020fb8e9716..39ba52bc78f5 100644
--- a/tests/reasoning/utils.py
+++ b/tests/reasoning/utils.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.reasoning import ReasoningParser
 from vllm.tokenizers.mistral import MistralTokenizer
 
diff --git a/tests/tool_parsers/test_ernie45_moe_tool_parser.py b/tests/tool_parsers/test_ernie45_moe_tool_parser.py
index 533bd1ec3dff..a00e43894767 100644
--- a/tests/tool_parsers/test_ernie45_moe_tool_parser.py
+++ b/tests/tool_parsers/test_ernie45_moe_tool_parser.py
@@ -7,8 +7,8 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     FunctionCall,
     ToolCall,
diff --git a/tests/tool_parsers/test_functiongemma_tool_parser.py b/tests/tool_parsers/test_functiongemma_tool_parser.py
index a5a0a5a19131..d32aba3085e5 100644
--- a/tests/tool_parsers/test_functiongemma_tool_parser.py
+++ b/tests/tool_parsers/test_functiongemma_tool_parser.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.tool_parsers.functiongemma_tool_parser import FunctionGemmaToolParser
 
 
diff --git a/tests/tool_parsers/test_glm4_moe_tool_parser.py b/tests/tool_parsers/test_glm4_moe_tool_parser.py
index 52f5a9198e9b..d9d88b8444d7 100644
--- a/tests/tool_parsers/test_glm4_moe_tool_parser.py
+++ b/tests/tool_parsers/test_glm4_moe_tool_parser.py
@@ -6,7 +6,7 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall, ToolCall
 from vllm.tokenizers import get_tokenizer
 from vllm.tool_parsers.glm4_moe_tool_parser import (
     Glm4MoeModelToolParser,
diff --git a/tests/tool_parsers/test_jamba_tool_parser.py b/tests/tool_parsers/test_jamba_tool_parser.py
index ccad16ae2f6b..f0e7899c8aaf 100644
--- a/tests/tool_parsers/test_jamba_tool_parser.py
+++ b/tests/tool_parsers/test_jamba_tool_parser.py
@@ -8,7 +8,7 @@
 import pytest
 from partial_json_parser.core.options import Allow
 
-from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage, FunctionCall, ToolCall
 from vllm.tokenizers import TokenizerLike, get_tokenizer
 from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
 from vllm.tool_parsers.jamba_tool_parser import JambaToolParser
diff --git a/tests/tool_parsers/test_kimi_k2_tool_parser.py b/tests/tool_parsers/test_kimi_k2_tool_parser.py
index dc6140374d53..21b3d5adfde1 100644
--- a/tests/tool_parsers/test_kimi_k2_tool_parser.py
+++ b/tests/tool_parsers/test_kimi_k2_tool_parser.py
@@ -6,7 +6,7 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall, ToolCall
 from vllm.tokenizers import get_tokenizer
 from vllm.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser
 
diff --git a/tests/tool_parsers/test_minimax_tool_parser.py b/tests/tool_parsers/test_minimax_tool_parser.py
index 28cfc4ea7a17..08b2104277b8 100644
--- a/tests/tool_parsers/test_minimax_tool_parser.py
+++ b/tests/tool_parsers/test_minimax_tool_parser.py
@@ -7,8 +7,10 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionToolsParam,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     FunctionCall,
     ToolCall,
 )
diff --git a/tests/tool_parsers/test_mistral_tool_parser.py b/tests/tool_parsers/test_mistral_tool_parser.py
index d2502079d0de..bf2fba8a8655 100644
--- a/tests/tool_parsers/test_mistral_tool_parser.py
+++ b/tests/tool_parsers/test_mistral_tool_parser.py
@@ -11,7 +11,7 @@
 from mistral_common.protocol.instruct.tool_calls import FunctionCall, ToolCall
 from partial_json_parser.core.options import Allow
 
-from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage, DeltaToolCall
 from vllm.tokenizers import TokenizerLike, get_tokenizer
 from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
 from vllm.tokenizers.mistral import MistralTokenizer
diff --git a/tests/tool_parsers/test_openai_tool_parser.py b/tests/tool_parsers/test_openai_tool_parser.py
index 44b8c92745e9..e9e39ef4c029 100644
--- a/tests/tool_parsers/test_openai_tool_parser.py
+++ b/tests/tool_parsers/test_openai_tool_parser.py
@@ -14,7 +14,7 @@
     load_harmony_encoding,
 )
 
-from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
+from vllm.entrypoints.openai.engine.protocol import FunctionCall, ToolCall
 from vllm.tokenizers import get_tokenizer
 from vllm.tool_parsers.openai_tool_parser import OpenAIToolParser
 
diff --git a/tests/tool_parsers/test_qwen3coder_tool_parser.py b/tests/tool_parsers/test_qwen3coder_tool_parser.py
index 3a0a612d7fbf..3d46f73de612 100644
--- a/tests/tool_parsers/test_qwen3coder_tool_parser.py
+++ b/tests/tool_parsers/test_qwen3coder_tool_parser.py
@@ -6,9 +6,11 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
     ChatCompletionToolsParam,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     FunctionCall,
     ToolCall,
diff --git a/tests/tool_parsers/test_seed_oss_tool_parser.py b/tests/tool_parsers/test_seed_oss_tool_parser.py
index c7f595830f34..88cc736f67a6 100644
--- a/tests/tool_parsers/test_seed_oss_tool_parser.py
+++ b/tests/tool_parsers/test_seed_oss_tool_parser.py
@@ -7,9 +7,11 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
     ChatCompletionToolsParam,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     FunctionCall,
     ToolCall,
diff --git a/tests/tool_parsers/test_xlam_tool_parser.py b/tests/tool_parsers/test_xlam_tool_parser.py
index 380792a9926a..a5cab218f72b 100644
--- a/tests/tool_parsers/test_xlam_tool_parser.py
+++ b/tests/tool_parsers/test_xlam_tool_parser.py
@@ -6,8 +6,8 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     FunctionCall,
     ToolCall,
diff --git a/tests/tool_use/test_chat_completion_request_validations.py b/tests/tool_use/test_chat_completion_request_validations.py
index 50cd9e4279b2..69846f9adb12 100644
--- a/tests/tool_use/test_chat_completion_request_validations.py
+++ b/tests/tool_use/test_chat_completion_request_validations.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 
 
 def test_chat_completion_request_with_no_tools():
diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py
index 6ff37255e48d..01c1360818eb 100644
--- a/tests/tool_use/test_tool_choice_required.py
+++ b/tests/tool_use/test_tool_choice_required.py
@@ -8,10 +8,10 @@
 import regex as re
 from pydantic import TypeAdapter
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionToolsParam,
 )
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
 from vllm.tool_parsers.utils import get_json_schema_from_tools
 
 pytestmark = pytest.mark.cpu_test
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 11681cfcebca..270092faf8ce 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -11,12 +11,14 @@
 from vllm.assets.image import ImageAsset
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
+)
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
 )
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.inputs import PromptType
 from vllm.outputs import RequestOutput
diff --git a/vllm/entrypoints/anthropic/serving_messages.py b/vllm/entrypoints/anthropic/serving_messages.py
index 25c2d88a2c7a..5177d50f7c00 100644
--- a/vllm/entrypoints/anthropic/serving_messages.py
+++ b/vllm/entrypoints/anthropic/serving_messages.py
@@ -25,16 +25,18 @@
 )
 from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionNamedToolChoiceParam,
     ChatCompletionRequest,
     ChatCompletionResponse,
     ChatCompletionStreamResponse,
     ChatCompletionToolsParam,
+)
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     StreamOptions,
 )
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 
 logger = logging.getLogger(__name__)
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index c9bece08f188..6f61a0114597 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -22,6 +22,12 @@
     ChatTemplateContentFormatOption,
 )
 from vllm.entrypoints.constants import MCP_PREFIX
+from vllm.entrypoints.openai.engine.protocol import (
+    FunctionCall,
+    ResponseInputOutputItem,
+    ResponseRawMessageAndToken,
+    ResponsesRequest,
+)
 from vllm.entrypoints.openai.parser.harmony_utils import (
     get_encoding,
     get_streamable_parser_for_assistant,
@@ -30,12 +36,6 @@
 from vllm.entrypoints.openai.parser.responses_parser import (
     get_responses_parser_for_simple_context,
 )
-from vllm.entrypoints.openai.protocol import (
-    FunctionCall,
-    ResponseInputOutputItem,
-    ResponseRawMessageAndToken,
-    ResponsesRequest,
-)
 from vllm.entrypoints.responses_utils import construct_tool_dicts
 from vllm.entrypoints.tool import Tool
 from vllm.entrypoints.tool_server import ToolServer
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index fad989284ca5..86942f22fbee 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -42,11 +42,9 @@
 from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages
 from vllm.entrypoints.launcher import serve_http
 from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
 from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
-from vllm.entrypoints.openai.orca_metrics import metrics_header
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
-    ChatCompletionResponse,
+from vllm.entrypoints.openai.engine.protocol import (
     CompletionRequest,
     CompletionResponse,
     ErrorInfo,
@@ -59,9 +57,9 @@
     TranslationRequest,
     TranslationResponseVariant,
 )
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.engine.serving import OpenAIServing
+from vllm.entrypoints.openai.orca_metrics import metrics_header
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import (
     BaseModelPath,
     OpenAIServingModels,
@@ -475,47 +473,6 @@ def translate_error_response(response: ErrorResponse) -> JSONResponse:
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
-@router.post(
-    "/v1/chat/completions",
-    dependencies=[Depends(validate_json_request)],
-    responses={
-        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
-        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
-        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
-        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
-    },
-)
-@with_cancellation
-@load_aware_call
-async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
-    metrics_header_format = raw_request.headers.get(
-        ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
-    )
-    handler = chat(raw_request)
-    if handler is None:
-        return base(raw_request).create_error_response(
-            message="The model does not support Chat Completions API"
-        )
-    try:
-        generator = await handler.create_chat_completion(request, raw_request)
-    except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
-    if isinstance(generator, ErrorResponse):
-        return JSONResponse(
-            content=generator.model_dump(), status_code=generator.error.code
-        )
-
-    elif isinstance(generator, ChatCompletionResponse):
-        return JSONResponse(
-            content=generator.model_dump(),
-            headers=metrics_header(metrics_header_format),
-        )
-
-    return StreamingResponse(content=generator, media_type="text/event-stream")
-
-
 @router.post(
     "/v1/completions",
     dependencies=[Depends(validate_json_request)],
@@ -735,8 +692,10 @@ async def send_with_request_id(message: Message) -> None:
 def _extract_content_from_chunk(chunk_data: dict) -> str:
     """Extract content from a streaming response chunk."""
     try:
-        from vllm.entrypoints.openai.protocol import (
+        from vllm.entrypoints.openai.chat_completion.protocol import (
             ChatCompletionStreamResponse,
+        )
+        from vllm.entrypoints.openai.engine.protocol import (
             CompletionStreamResponse,
         )
 
@@ -880,7 +839,11 @@ def build_app(args: Namespace) -> FastAPI:
     from vllm.entrypoints.serve import register_vllm_serve_api_routers
 
     register_vllm_serve_api_routers(app)
+    from vllm.entrypoints.openai.chat_completion.api_router import (
+        attach_router as register_chat_api_router,
+    )
 
+    register_chat_api_router(app)
     from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes
 
     register_sagemaker_routes(router)
diff --git a/vllm/entrypoints/openai/chat_completion/__init__.py b/vllm/entrypoints/openai/chat_completion/__init__.py
new file mode 100644
index 000000000000..208f01a7cb5e
--- /dev/null
+++ b/vllm/entrypoints/openai/chat_completion/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/vllm/entrypoints/openai/chat_completion/api_router.py b/vllm/entrypoints/openai/chat_completion/api_router.py
new file mode 100644
index 000000000000..e4010c5256a0
--- /dev/null
+++ b/vllm/entrypoints/openai/chat_completion/api_router.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from http import HTTPStatus
+
+from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+)
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
+from vllm.entrypoints.openai.orca_metrics import metrics_header
+from vllm.entrypoints.openai.utils import validate_json_request
+from vllm.entrypoints.utils import (
+    load_aware_call,
+    with_cancellation,
+)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+router = APIRouter()
+ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL = "endpoint-load-metrics-format"
+
+
+def chat(request: Request) -> OpenAIServingChat | None:
+    return request.app.state.openai_serving_chat
+
+
+@router.post(
+    "/v1/chat/completions",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
+    metrics_header_format = raw_request.headers.get(
+        ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
+    )
+    handler = chat(raw_request)
+    if handler is None:
+        base_server = raw_request.app.state.openai_serving_tokenization
+        return base_server.create_error_response(
+            message="The model does not support Chat Completions API"
+        )
+    try:
+        generator = await handler.create_chat_completion(request, raw_request)
+    except Exception as e:
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
+
+    elif isinstance(generator, ChatCompletionResponse):
+        return JSONResponse(
+            content=generator.model_dump(),
+            headers=metrics_header(metrics_header_format),
+        )
+
+    return StreamingResponse(content=generator, media_type="text/event-stream")
+
+
+def attach_router(app: FastAPI):
+    app.include_router(router)
diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
new file mode 100644
index 000000000000..299069925bbd
--- /dev/null
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
@@ -0,0 +1,654 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+import json
+import time
+from typing import Annotated, Any, ClassVar, Literal
+
+import torch
+from openai.types.chat.chat_completion_audio import (
+    ChatCompletionAudio as OpenAIChatCompletionAudio,
+)
+from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
+from pydantic import (
+    Field,
+    model_validator,
+)
+
+from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+from vllm.entrypoints.openai.engine.protocol import (
+    AnyResponseFormat,
+    DeltaMessage,
+    FunctionCall,
+    FunctionDefinition,
+    LegacyStructuralTagResponseFormat,
+    LogitsProcessors,
+    OpenAIBaseModel,
+    StreamOptions,
+    StructuralTagResponseFormat,
+    ToolCall,
+    UsageInfo,
+    get_logits_processors,
+)
+from vllm.exceptions import VLLMValidationError
+from vllm.logger import init_logger
+from vllm.logprobs import Logprob
+from vllm.sampling_params import (
+    BeamSearchParams,
+    RequestOutputKind,
+    SamplingParams,
+    StructuredOutputsParams,
+)
+from vllm.utils import random_uuid
+
+logger = init_logger(__name__)
+
+
+_LONG_INFO = torch.iinfo(torch.long)
+
+
+class ChatMessage(OpenAIBaseModel):
+    role: str
+    content: str | None = None
+    refusal: str | None = None
+    annotations: OpenAIAnnotation | None = None
+    audio: OpenAIChatCompletionAudio | None = None
+    function_call: FunctionCall | None = None
+    tool_calls: list[ToolCall] = Field(default_factory=list)
+
+    # vLLM-specific fields that are not in OpenAI spec
+    reasoning: str | None = None
+    reasoning_content: str | None = None
+    """Deprecated: use `reasoning` instead."""
+
+    @model_validator(mode="after")
+    def handle_deprecated_reasoning_content(self):
+        """Copy reasoning to reasoning_content for backward compatibility."""
+        self.reasoning_content = self.reasoning
+        return self
+
+
+class ChatCompletionLogProb(OpenAIBaseModel):
+    token: str
+    logprob: float = -9999.0
+    bytes: list[int] | None = None
+
+
+class ChatCompletionLogProbsContent(ChatCompletionLogProb):
+    # Workaround: redefine fields name cache so that it's not
+    # shared with the super class.
+    field_names: ClassVar[set[str] | None] = None
+    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
+
+
+class ChatCompletionLogProbs(OpenAIBaseModel):
+    content: list[ChatCompletionLogProbsContent] | None = None
+
+
+class ChatCompletionResponseChoice(OpenAIBaseModel):
+    index: int
+    message: ChatMessage
+    logprobs: ChatCompletionLogProbs | None = None
+    # per OpenAI spec this is the default
+    finish_reason: str | None = "stop"
+    # not part of the OpenAI spec but included in vLLM for legacy reasons
+    stop_reason: int | str | None = None
+    # not part of the OpenAI spec but is useful for tracing the tokens
+    # in agent scenarios
+    token_ids: list[int] | None = None
+
+
+class ChatCompletionResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion"] = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseChoice]
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
+    system_fingerprint: str | None = None
+    usage: UsageInfo
+
+    # vLLM-specific fields that are not in OpenAI spec
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+    prompt_token_ids: list[int] | None = None
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None, description="KVTransfer parameters."
+    )
+
+
+class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
+    index: int
+    delta: DeltaMessage
+    logprobs: ChatCompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
+    # not part of the OpenAI spec but for tracing the tokens
+    token_ids: list[int] | None = None
+
+
+class ChatCompletionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseStreamChoice]
+    usage: UsageInfo | None = Field(default=None)
+    # not part of the OpenAI spec but for tracing the tokens
+    prompt_token_ids: list[int] | None = None
+
+
+class ChatCompletionToolsParam(OpenAIBaseModel):
+    type: Literal["function"] = "function"
+    function: FunctionDefinition
+
+
+class ChatCompletionNamedFunction(OpenAIBaseModel):
+    name: str
+
+
+class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
+    function: ChatCompletionNamedFunction
+    type: Literal["function"] = "function"
+
+
+class ChatCompletionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/chat/create
+    messages: list[ChatCompletionMessageParam]
+    model: str | None = None
+    frequency_penalty: float | None = 0.0
+    logit_bias: dict[str, float] | None = None
+    logprobs: bool | None = False
+    top_logprobs: int | None = 0
+    max_tokens: int | None = Field(
+        default=None,
+        deprecated="max_tokens is deprecated in favor of "
+        "the max_completion_tokens field",
+    )
+    max_completion_tokens: int | None = None
+    n: int | None = 1
+    presence_penalty: float | None = 0.0
+    response_format: AnyResponseFormat | None = None
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    stop: str | list[str] | None = []
+    stream: bool | None = False
+    stream_options: StreamOptions | None = None
+    temperature: float | None = None
+    top_p: float | None = None
+    tools: list[ChatCompletionToolsParam] | None = None
+    tool_choice: (
+        Literal["none"]
+        | Literal["auto"]
+        | Literal["required"]
+        | ChatCompletionNamedToolChoiceParam
+        | None
+    ) = "none"
+    reasoning_effort: Literal["low", "medium", "high"] | None = None
+    include_reasoning: bool = True
+    parallel_tool_calls: bool | None = True
+
+    # NOTE this will be ignored by vLLM
+    user: str | None = None
+
+    # --8<-- [start:chat-completion-sampling-params]
+    use_beam_search: bool = False
+    top_k: int | None = None
+    min_p: float | None = None
+    repetition_penalty: float | None = None
+    length_penalty: float = 1.0
+    stop_token_ids: list[int] | None = []
+    include_stop_str_in_output: bool = False
+    ignore_eos: bool = False
+    min_tokens: int = 0
+    skip_special_tokens: bool = True
+    spaces_between_special_tokens: bool = True
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
+        None
+    )
+    prompt_logprobs: int | None = None
+    allowed_token_ids: list[int] | None = None
+    bad_words: list[str] = Field(default_factory=list)
+    # --8<-- [end:chat-completion-sampling-params]
+
+    # --8<-- [start:chat-completion-extra-params]
+    echo: bool = Field(
+        default=False,
+        description=(
+            "If true, the new message will be prepended with the last message "
+            "if they belong to the same role."
+        ),
+    )
+    add_generation_prompt: bool = Field(
+        default=True,
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by chat template in tokenizer config of the "
+            "model."
+        ),
+    )
+    continue_final_message: bool = Field(
+        default=False,
+        description=(
+            "If this is set, the chat will be formatted so that the final "
+            "message in the chat is open-ended, without any EOS tokens. The "
+            "model will continue this message rather than starting a new one. "
+            'This allows you to "prefill" part of the model\'s response for it. '
+            "Cannot be used at the same time as `add_generation_prompt`."
+        ),
+    )
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."
+        ),
+    )
+    documents: list[dict[str, str]] | None = Field(
+        default=None,
+        description=(
+            "A list of dicts representing documents that will be accessible to "
+            "the model if it is performing RAG (retrieval-augmented generation)."
+            " If the template does not support RAG, this argument will have no "
+            "effect. We recommend that each document should be a dict containing "
+            '"title" and "text" keys.'
+        ),
+    )
+    chat_template: str | None = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."
+        ),
+    )
+    chat_template_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "Additional keyword args to pass to the template renderer. "
+            "Will be accessible by the chat template."
+        ),
+    )
+    mm_processor_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+    structured_outputs: StructuredOutputsParams | None = Field(
+        default=None,
+        description="Additional kwargs for structured outputs",
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+    request_id: str = Field(
+        default_factory=random_uuid,
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    logits_processors: LogitsProcessors | None = Field(
+        default=None,
+        description=(
+            "A list of either qualified names of logits processors, or "
+            "constructor objects, to apply when sampling. A constructor is "
+            "a JSON object with a required 'qualname' field specifying the "
+            "qualified name of the processor class/factory, and optional "
+            "'args' and 'kwargs' fields containing positional and keyword "
+            "arguments. For example: {'qualname': "
+            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+            "{'param': 'value'}}."
+        ),
+    )
+    return_tokens_as_token_ids: bool | None = Field(
+        default=None,
+        description=(
+            "If specified with 'logprobs', tokens are represented "
+            " as strings of the form 'token_id:{token_id}' so that tokens "
+            "that are not JSON-encodable can be identified."
+        ),
+    )
+    return_token_ids: bool | None = Field(
+        default=None,
+        description=(
+            "If specified, the result will include token IDs alongside the "
+            "generated text. In streaming mode, prompt_token_ids is included "
+            "only in the first chunk, and token_ids contains the delta tokens "
+            "for each chunk. This is useful for debugging or when you "
+            "need to map generated text back to input tokens."
+        ),
+    )
+    cache_salt: str | None = Field(
+        default=None,
+        description=(
+            "If specified, the prefix cache will be salted with the provided "
+            "string to prevent an attacker to guess prompts in multi-user "
+            "environments. The salt should be random, protected from "
+            "access by 3rd parties, and long enough to be "
+            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+            "to 256 bit)."
+        ),
+    )
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
+
+    vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
+        default=None,
+        description=(
+            "Additional request parameters with (list of) string or "
+            "numeric values, used by custom extensions."
+        ),
+    )
+
+    # --8<-- [end:chat-completion-extra-params]
+
+    # Default sampling parameters for chat completion requests
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 0,
+        "min_p": 0.0,
+    }
+
+    def to_beam_search_params(
+        self, max_tokens: int, default_sampling_params: dict
+    ) -> BeamSearchParams:
+        n = self.n if self.n is not None else 1
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        return BeamSearchParams(
+            beam_width=n,
+            max_tokens=max_tokens,
+            ignore_eos=self.ignore_eos,
+            temperature=temperature,
+            length_penalty=self.length_penalty,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+        )
+
+    def to_sampling_params(
+        self,
+        max_tokens: int,
+        logits_processor_pattern: str | None,
+        default_sampling_params: dict,
+    ) -> SamplingParams:
+        """Build `SamplingParams`; unset knobs fall back to the supplied defaults."""
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+            )
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+            )
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+            )
+
+        prompt_logprobs = self.prompt_logprobs
+        if prompt_logprobs is None and self.echo:
+            prompt_logprobs = self.top_logprobs
+
+        response_format = self.response_format
+        if response_format is not None:
+            # If structured outputs wasn't already enabled,
+            # we must enable it for these features to work
+            if self.structured_outputs is None:
+                self.structured_outputs = StructuredOutputsParams()
+
+            # Set structured output params for response format
+            if response_format.type == "json_object":
+                self.structured_outputs.json_object = True
+            elif response_format.type == "json_schema":
+                json_schema = response_format.json_schema
+                assert json_schema is not None
+                self.structured_outputs.json = json_schema.json_schema
+            elif response_format.type == "structural_tag":
+                structural_tag = response_format
+                assert structural_tag is not None and isinstance(
+                    structural_tag,
+                    (
+                        LegacyStructuralTagResponseFormat,
+                        StructuralTagResponseFormat,
+                    ),
+                )
+                s_tag_obj = structural_tag.model_dump(by_alias=True)
+                self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
+
+        # Copy first: writing kv_transfer_params must not mutate self.vllm_xargs.
+        extra_args: dict[str, Any] = dict(self.vllm_xargs or {})
+        if self.kv_transfer_params:
+            extra_args["kv_transfer_params"] = self.kv_transfer_params
+        return SamplingParams.from_optional(
+            n=self.n,
+            presence_penalty=self.presence_penalty,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            seed=self.seed,
+            stop=self.stop,
+            stop_token_ids=self.stop_token_ids,
+            logprobs=self.top_logprobs if self.logprobs else None,
+            prompt_logprobs=prompt_logprobs,
+            ignore_eos=self.ignore_eos,
+            max_tokens=max_tokens,
+            min_tokens=self.min_tokens,
+            skip_special_tokens=self.skip_special_tokens,
+            spaces_between_special_tokens=self.spaces_between_special_tokens,
+            logits_processors=get_logits_processors(
+                self.logits_processors, logits_processor_pattern
+            ),
+            include_stop_str_in_output=self.include_stop_str_in_output,
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+            structured_outputs=self.structured_outputs,
+            logit_bias=self.logit_bias,
+            bad_words=self.bad_words,
+            allowed_token_ids=self.allowed_token_ids,
+            extra_args=extra_args or None,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
+        )
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        if data.get("stream_options") and not data.get("stream"):
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter="stream_options",
+            )
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_logprobs(cls, data):
+        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+            if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
+                raise VLLMValidationError(
+                    "`prompt_logprobs` are not available when `stream=True`.",
+                    parameter="prompt_logprobs",
+                )
+
+            if prompt_logprobs < 0 and prompt_logprobs != -1:
+                raise VLLMValidationError(
+                    "`prompt_logprobs` must be a positive value or -1.",
+                    parameter="prompt_logprobs",
+                    value=prompt_logprobs,
+                )
+        if (top_logprobs := data.get("top_logprobs")) is not None:
+            if top_logprobs < 0 and top_logprobs != -1:
+                raise VLLMValidationError(
+                    "`top_logprobs` must be a positive value or -1.",
+                    parameter="top_logprobs",
+                    value=top_logprobs,
+                )
+
+            if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
+                raise VLLMValidationError(
+                    "when using `top_logprobs`, `logprobs` must be set to true.",
+                    parameter="top_logprobs",
+                )
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_structured_outputs_count(cls, data):
+        """Reject raw payloads whose structured-output constraints conflict.
+
+        Raises ValueError if more than one of 'json', 'regex', 'choice' is set,
+        or if one is set while `tool_choice` is not None/none/auto/required.
+        """
+        if isinstance(data, ValueError):
+            raise data
+        if data.get("structured_outputs", None) is None:
+            return data
+
+        structured_outputs_kwargs = data["structured_outputs"]
+        count = sum(
+            structured_outputs_kwargs.get(k) is not None
+            for k in ("json", "regex", "choice")
+        )
+        if count > 1:
+            raise ValueError(
+                "You can only use one kind of constraints for structured "
+                "outputs ('json', 'regex' or 'choice')."
+            )
+        # This guard used `count > 1`, which the raise above made unreachable.
+        tool_choice = data.get("tool_choice", "none")
+        if count >= 1 and tool_choice not in (None, "none", "auto", "required"):
+            raise ValueError(
+                "You can only either use constraints for structured outputs "
+                "or tools, not both."
+            )
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_tool_usage(cls, data):
+        """Default and validate `tool_choice` against `tools` in the raw payload."""
+        # default to "auto" when tools are given without a "tool_choice"
+        if "tool_choice" not in data and data.get("tools"):
+            data["tool_choice"] = "auto"
+
+        # if "tool_choice" is "none" -- no validation is needed for tools
+        if "tool_choice" in data and data["tool_choice"] == "none":
+            return data
+
+        # if "tool_choice" is specified -- validation
+        if "tool_choice" in data and data["tool_choice"] is not None:
+            # ensure that if "tool choice" is specified, tools are present
+            if "tools" not in data or data["tools"] is None:
+                raise ValueError("When using `tool_choice`, `tools` must be set.")
+
+            # tool choice must be a named tool, "auto" or "required"
+            if data["tool_choice"] not in ["auto", "required"] and not isinstance(
+                data["tool_choice"], dict
+            ):
+                raise ValueError(
+                    f"Invalid value for `tool_choice`: {data['tool_choice']}! "
+                    'Only named tools, "none", "auto" or "required" '
+                    "are supported."
+                )
+
+            # "required" with an empty "tools" list behaves like "none",
+            # to align with OpenAI's behavior.
+            if (
+                data["tool_choice"] == "required"
+                and isinstance(data["tools"], list)
+                and len(data["tools"]) == 0
+            ):
+                data["tool_choice"] = "none"
+                del data["tools"]
+                return data
+
+            # an object "tool_choice" must match one of the given tools
+            correct_usage_message = (
+                'Correct usage: `{"type": "function",'
+                ' "function": {"name": "my_function"}}`'
+            )
+            if isinstance(data["tool_choice"], dict):
+                function = data["tool_choice"].get("function")
+                if not isinstance(function, dict):
+                    raise ValueError(
+                        f"Invalid value for `function`: `{function}` in "
+                        f"`tool_choice`! {correct_usage_message}"
+                    )
+                if "name" not in function:
+                    raise ValueError(
+                        f"Expected field `name` in `function` in "
+                        f"`tool_choice`! {correct_usage_message}"
+                    )
+                function_name = function["name"]
+                if not isinstance(function_name, str) or len(function_name) == 0:
+                    raise ValueError(
+                        f"Invalid `name` in `function`: `{function_name}`"
+                        f" in `tool_choice`! {correct_usage_message}"
+                    )
+                # Skip malformed tool entries: a KeyError/TypeError raised here
+                # is not a ValueError, so it would not become a validation error.
+                valid_tool = any(
+                    isinstance(tool, dict)
+                    and isinstance(tool.get("function"), dict)
+                    and tool["function"].get("name") == function_name
+                    for tool in data["tools"]
+                )
+                if not valid_tool:
+                    raise ValueError(
+                        "The tool specified in `tool_choice` does not match any"
+                        " of the specified `tools`"
+                    )
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get("add_generation_prompt"):
+            raise ValueError(
+                "Cannot set both `continue_final_message` and "
+                "`add_generation_prompt` to True."
+            )
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_cache_salt_support(cls, data):
+        if data.get("cache_salt") is not None and (
+            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+        ):
+            raise ValueError(
+                "Parameter 'cache_salt' must be a non-empty string if provided."
+            )
+        return data
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/chat_completion/serving.py
similarity index 99%
rename from vllm/entrypoints/openai/serving_chat.py
rename to vllm/entrypoints/openai/chat_completion/serving.py
index e65dba2b893b..2b0d0101f81b 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -23,16 +23,7 @@
     make_tool_call_id,
 )
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.parser.harmony_utils import (
-    get_developer_message,
-    get_stop_tokens_for_assistant_actions,
-    get_streamable_parser_for_assistant,
-    get_system_message,
-    parse_chat_inputs_to_harmony_messages,
-    parse_chat_output,
-    render_for_completion,
-)
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionLogProb,
     ChatCompletionLogProbs,
     ChatCompletionLogProbsContent,
@@ -43,6 +34,11 @@
     ChatCompletionResponseStreamChoice,
     ChatCompletionStreamResponse,
     ChatMessage,
+)
+from vllm.entrypoints.openai.chat_completion.stream_harmony import (
+    extract_harmony_streaming_delta,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
@@ -52,14 +48,20 @@
     ToolCall,
     UsageInfo,
 )
-from vllm.entrypoints.openai.serving_chat_stream_harmony import (
-    extract_harmony_streaming_delta,
-)
-from vllm.entrypoints.openai.serving_engine import (
+from vllm.entrypoints.openai.engine.serving import (
     GenerationError,
     OpenAIServing,
     clamp_prompt_logprobs,
 )
+from vllm.entrypoints.openai.parser.harmony_utils import (
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_streamable_parser_for_assistant,
+    get_system_message,
+    parse_chat_inputs_to_harmony_messages,
+    parse_chat_output,
+    render_for_completion,
+)
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
 from vllm.entrypoints.utils import get_max_tokens, should_include_usage
diff --git a/vllm/entrypoints/openai/serving_chat_stream_harmony.py b/vllm/entrypoints/openai/chat_completion/stream_harmony.py
similarity index 98%
rename from vllm/entrypoints/openai/serving_chat_stream_harmony.py
rename to vllm/entrypoints/openai/chat_completion/stream_harmony.py
index 1b5ae620651c..9a95bc6e1c4a 100644
--- a/vllm/entrypoints/openai/serving_chat_stream_harmony.py
+++ b/vllm/entrypoints/openai/chat_completion/stream_harmony.py
@@ -10,7 +10,7 @@
 from openai_harmony import StreamableParser
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/entrypoints/openai/engine/__init__.py b/vllm/entrypoints/openai/engine/__init__.py
new file mode 100644
index 000000000000..208f01a7cb5e
--- /dev/null
+++ b/vllm/entrypoints/openai/engine/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/engine/protocol.py
similarity index 68%
rename from vllm/entrypoints/openai/protocol.py
rename to vllm/entrypoints/openai/engine/protocol.py
index 845dae7c1bf1..9434e214f348 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/engine/protocol.py
@@ -11,10 +11,6 @@
 import regex as re
 import torch
 from fastapi import HTTPException, UploadFile
-from openai.types.chat.chat_completion_audio import (
-    ChatCompletionAudio as OpenAIChatCompletionAudio,
-)
-from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
 from openai.types.responses import (
     ResponseCodeInterpreterCallCodeDeltaEvent,
     ResponseCodeInterpreterCallCodeDoneEvent,
@@ -234,20 +230,6 @@ class FunctionDefinition(OpenAIBaseModel):
     parameters: dict[str, Any] | None = None
 
 
-class ChatCompletionToolsParam(OpenAIBaseModel):
-    type: Literal["function"] = "function"
-    function: FunctionDefinition
-
-
-class ChatCompletionNamedFunction(OpenAIBaseModel):
-    name: str
-
-
-class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
-    function: ChatCompletionNamedFunction
-    type: Literal["function"] = "function"
-
-
 # extra="forbid" is a workaround to have kwargs as a field,
 # see https://github.com/pydantic/pydantic/issues/3125
 class LogitsProcessorConstructor(BaseModel):
@@ -414,609 +396,66 @@ def to_sampling_params(
             )
         if (top_k := self.top_k) is None:
             top_k = default_sampling_params.get(
-                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
-            )
-        stop_token_ids = default_sampling_params.get("stop_token_ids")
-
-        # Structured output
-        structured_outputs = None
-        if self.text is not None and self.text.format is not None:
-            response_format = self.text.format
-            if (
-                response_format.type == "json_schema"
-                and response_format.schema_ is not None
-            ):
-                structured_outputs = StructuredOutputsParams(
-                    json=response_format.schema_
-                )
-            elif response_format.type == "json_object":
-                raise NotImplementedError("json_object is not supported")
-
-        # TODO: add more parameters
-        return SamplingParams.from_optional(
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            max_tokens=max_tokens,
-            logprobs=self.top_logprobs if self.is_include_output_logprobs() else None,
-            stop_token_ids=stop_token_ids,
-            output_kind=(
-                RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY
-            ),
-            structured_outputs=structured_outputs,
-            logit_bias=self.logit_bias,
-            skip_clone=True,  # Created fresh per request, safe to skip clone
-        )
-
-    def is_include_output_logprobs(self) -> bool:
-        """Check if the request includes output logprobs."""
-        if self.include is None:
-            return False
-        return (
-            isinstance(self.include, list)
-            and "message.output_text.logprobs" in self.include
-        )
-
-    @model_validator(mode="before")
-    def validate_background(cls, data):
-        if not data.get("background"):
-            return data
-        if not data.get("store", True):
-            raise ValueError("background can only be used when `store` is true")
-        return data
-
-    @model_validator(mode="before")
-    def validate_prompt(cls, data):
-        if data.get("prompt") is not None:
-            raise VLLMValidationError(
-                "prompt template is not supported", parameter="prompt"
-            )
-        return data
-
-    @model_validator(mode="before")
-    def check_cache_salt_support(cls, data):
-        if data.get("cache_salt") is not None and (
-            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
-        ):
-            raise ValueError(
-                "Parameter 'cache_salt' must be a non-empty string if provided."
-            )
-        return data
-
-    @model_validator(mode="before")
-    def function_call_parsing(cls, data):
-        """Parse function_call dictionaries into ResponseFunctionToolCall objects.
-        This ensures Pydantic can properly resolve union types in the input field.
-        Function calls provided as dicts are converted to ResponseFunctionToolCall
-        objects before validation, while invalid structures are left for Pydantic
-        to reject with appropriate error messages.
-        """
-
-        input_data = data.get("input")
-
-        # Early return for None, strings, or bytes
-        # (strings are iterable but shouldn't be processed)
-        if input_data is None or isinstance(input_data, (str, bytes)):
-            return data
-
-        # Convert iterators (like ValidatorIterator) to list
-        if not isinstance(input_data, list):
-            try:
-                input_data = list(input_data)
-            except TypeError:
-                # Not iterable, leave as-is for Pydantic to handle
-                return data
-
-        processed_input = []
-        for item in input_data:
-            if isinstance(item, dict) and item.get("type") == "function_call":
-                try:
-                    processed_input.append(ResponseFunctionToolCall(**item))
-                except ValidationError:
-                    # Let Pydantic handle validation for malformed function calls
-                    logger.debug(
-                        "Failed to parse function_call to ResponseFunctionToolCall, "
-                        "leaving for Pydantic validation"
-                    )
-                    processed_input.append(item)
-            else:
-                processed_input.append(item)
-
-        data["input"] = processed_input
-        return data
-
-
-class ChatCompletionRequest(OpenAIBaseModel):
-    # Ordered by official OpenAI API documentation
-    # https://platform.openai.com/docs/api-reference/chat/create
-    messages: list[ChatCompletionMessageParam]
-    model: str | None = None
-    frequency_penalty: float | None = 0.0
-    logit_bias: dict[str, float] | None = None
-    logprobs: bool | None = False
-    top_logprobs: int | None = 0
-    max_tokens: int | None = Field(
-        default=None,
-        deprecated="max_tokens is deprecated in favor of "
-        "the max_completion_tokens field",
-    )
-    max_completion_tokens: int | None = None
-    n: int | None = 1
-    presence_penalty: float | None = 0.0
-    response_format: AnyResponseFormat | None = None
-    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
-    stop: str | list[str] | None = []
-    stream: bool | None = False
-    stream_options: StreamOptions | None = None
-    temperature: float | None = None
-    top_p: float | None = None
-    tools: list[ChatCompletionToolsParam] | None = None
-    tool_choice: (
-        Literal["none"]
-        | Literal["auto"]
-        | Literal["required"]
-        | ChatCompletionNamedToolChoiceParam
-        | None
-    ) = "none"
-    reasoning_effort: Literal["low", "medium", "high"] | None = None
-    include_reasoning: bool = True
-    parallel_tool_calls: bool | None = True
-
-    # NOTE this will be ignored by vLLM
-    user: str | None = None
-
-    # --8<-- [start:chat-completion-sampling-params]
-    use_beam_search: bool = False
-    top_k: int | None = None
-    min_p: float | None = None
-    repetition_penalty: float | None = None
-    length_penalty: float = 1.0
-    stop_token_ids: list[int] | None = []
-    include_stop_str_in_output: bool = False
-    ignore_eos: bool = False
-    min_tokens: int = 0
-    skip_special_tokens: bool = True
-    spaces_between_special_tokens: bool = True
-    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
-        None
-    )
-    prompt_logprobs: int | None = None
-    allowed_token_ids: list[int] | None = None
-    bad_words: list[str] = Field(default_factory=list)
-    # --8<-- [end:chat-completion-sampling-params]
-
-    # --8<-- [start:chat-completion-extra-params]
-    echo: bool = Field(
-        default=False,
-        description=(
-            "If true, the new message will be prepended with the last message "
-            "if they belong to the same role."
-        ),
-    )
-    add_generation_prompt: bool = Field(
-        default=True,
-        description=(
-            "If true, the generation prompt will be added to the chat template. "
-            "This is a parameter used by chat template in tokenizer config of the "
-            "model."
-        ),
-    )
-    continue_final_message: bool = Field(
-        default=False,
-        description=(
-            "If this is set, the chat will be formatted so that the final "
-            "message in the chat is open-ended, without any EOS tokens. The "
-            "model will continue this message rather than starting a new one. "
-            'This allows you to "prefill" part of the model\'s response for it. '
-            "Cannot be used at the same time as `add_generation_prompt`."
-        ),
-    )
-    add_special_tokens: bool = Field(
-        default=False,
-        description=(
-            "If true, special tokens (e.g. BOS) will be added to the prompt "
-            "on top of what is added by the chat template. "
-            "For most models, the chat template takes care of adding the "
-            "special tokens so this should be set to false (as is the "
-            "default)."
-        ),
-    )
-    documents: list[dict[str, str]] | None = Field(
-        default=None,
-        description=(
-            "A list of dicts representing documents that will be accessible to "
-            "the model if it is performing RAG (retrieval-augmented generation)."
-            " If the template does not support RAG, this argument will have no "
-            "effect. We recommend that each document should be a dict containing "
-            '"title" and "text" keys.'
-        ),
-    )
-    chat_template: str | None = Field(
-        default=None,
-        description=(
-            "A Jinja template to use for this conversion. "
-            "As of transformers v4.44, default chat template is no longer "
-            "allowed, so you must provide a chat template if the tokenizer "
-            "does not define one."
-        ),
-    )
-    chat_template_kwargs: dict[str, Any] | None = Field(
-        default=None,
-        description=(
-            "Additional keyword args to pass to the template renderer. "
-            "Will be accessible by the chat template."
-        ),
-    )
-    mm_processor_kwargs: dict[str, Any] | None = Field(
-        default=None,
-        description=("Additional kwargs to pass to the HF processor."),
-    )
-    structured_outputs: StructuredOutputsParams | None = Field(
-        default=None,
-        description="Additional kwargs for structured outputs",
-    )
-    priority: int = Field(
-        default=0,
-        description=(
-            "The priority of the request (lower means earlier handling; "
-            "default: 0). Any priority other than 0 will raise an error "
-            "if the served model does not use priority scheduling."
-        ),
-    )
-    request_id: str = Field(
-        default_factory=random_uuid,
-        description=(
-            "The request_id related to this request. If the caller does "
-            "not set it, a random_uuid will be generated. This id is used "
-            "through out the inference process and return in response."
-        ),
-    )
-    logits_processors: LogitsProcessors | None = Field(
-        default=None,
-        description=(
-            "A list of either qualified names of logits processors, or "
-            "constructor objects, to apply when sampling. A constructor is "
-            "a JSON object with a required 'qualname' field specifying the "
-            "qualified name of the processor class/factory, and optional "
-            "'args' and 'kwargs' fields containing positional and keyword "
-            "arguments. For example: {'qualname': "
-            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
-            "{'param': 'value'}}."
-        ),
-    )
-    return_tokens_as_token_ids: bool | None = Field(
-        default=None,
-        description=(
-            "If specified with 'logprobs', tokens are represented "
-            " as strings of the form 'token_id:{token_id}' so that tokens "
-            "that are not JSON-encodable can be identified."
-        ),
-    )
-    return_token_ids: bool | None = Field(
-        default=None,
-        description=(
-            "If specified, the result will include token IDs alongside the "
-            "generated text. In streaming mode, prompt_token_ids is included "
-            "only in the first chunk, and token_ids contains the delta tokens "
-            "for each chunk. This is useful for debugging or when you "
-            "need to map generated text back to input tokens."
-        ),
-    )
-    cache_salt: str | None = Field(
-        default=None,
-        description=(
-            "If specified, the prefix cache will be salted with the provided "
-            "string to prevent an attacker to guess prompts in multi-user "
-            "environments. The salt should be random, protected from "
-            "access by 3rd parties, and long enough to be "
-            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
-            "to 256 bit)."
-        ),
-    )
-    kv_transfer_params: dict[str, Any] | None = Field(
-        default=None,
-        description="KVTransfer parameters used for disaggregated serving.",
-    )
-
-    vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
-        default=None,
-        description=(
-            "Additional request parameters with (list of) string or "
-            "numeric values, used by custom extensions."
-        ),
-    )
-
-    # --8<-- [end:chat-completion-extra-params]
-
-    # Default sampling parameters for chat completion requests
-    _DEFAULT_SAMPLING_PARAMS: dict = {
-        "repetition_penalty": 1.0,
-        "temperature": 1.0,
-        "top_p": 1.0,
-        "top_k": 0,
-        "min_p": 0.0,
-    }
-
-    def to_beam_search_params(
-        self, max_tokens: int, default_sampling_params: dict
-    ) -> BeamSearchParams:
-        n = self.n if self.n is not None else 1
-        if (temperature := self.temperature) is None:
-            temperature = default_sampling_params.get(
-                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
-            )
-
-        return BeamSearchParams(
-            beam_width=n,
-            max_tokens=max_tokens,
-            ignore_eos=self.ignore_eos,
-            temperature=temperature,
-            length_penalty=self.length_penalty,
-            include_stop_str_in_output=self.include_stop_str_in_output,
-        )
-
-    def to_sampling_params(
-        self,
-        max_tokens: int,
-        logits_processor_pattern: str | None,
-        default_sampling_params: dict,
-    ) -> SamplingParams:
-        # Default parameters
-        if (repetition_penalty := self.repetition_penalty) is None:
-            repetition_penalty = default_sampling_params.get(
-                "repetition_penalty",
-                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
-            )
-        if (temperature := self.temperature) is None:
-            temperature = default_sampling_params.get(
-                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
-            )
-        if (top_p := self.top_p) is None:
-            top_p = default_sampling_params.get(
-                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
-            )
-        if (top_k := self.top_k) is None:
-            top_k = default_sampling_params.get(
-                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
-            )
-        if (min_p := self.min_p) is None:
-            min_p = default_sampling_params.get(
-                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
-            )
-
-        prompt_logprobs = self.prompt_logprobs
-        if prompt_logprobs is None and self.echo:
-            prompt_logprobs = self.top_logprobs
-
-        response_format = self.response_format
-        if response_format is not None:
-            # If structured outputs wasn't already enabled,
-            # we must enable it for these features to work
-            if self.structured_outputs is None:
-                self.structured_outputs = StructuredOutputsParams()
-
-            # Set structured output params for response format
-            if response_format.type == "json_object":
-                self.structured_outputs.json_object = True
-            elif response_format.type == "json_schema":
-                json_schema = response_format.json_schema
-                assert json_schema is not None
-                self.structured_outputs.json = json_schema.json_schema
-            elif response_format.type == "structural_tag":
-                structural_tag = response_format
-                assert structural_tag is not None and isinstance(
-                    structural_tag,
-                    (
-                        LegacyStructuralTagResponseFormat,
-                        StructuralTagResponseFormat,
-                    ),
-                )
-                s_tag_obj = structural_tag.model_dump(by_alias=True)
-                self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
-
-        extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
-        if self.kv_transfer_params:
-            # Pass in kv_transfer_params via extra_args
-            extra_args["kv_transfer_params"] = self.kv_transfer_params
-        return SamplingParams.from_optional(
-            n=self.n,
-            presence_penalty=self.presence_penalty,
-            frequency_penalty=self.frequency_penalty,
-            repetition_penalty=repetition_penalty,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            min_p=min_p,
-            seed=self.seed,
-            stop=self.stop,
-            stop_token_ids=self.stop_token_ids,
-            logprobs=self.top_logprobs if self.logprobs else None,
-            prompt_logprobs=prompt_logprobs,
-            ignore_eos=self.ignore_eos,
-            max_tokens=max_tokens,
-            min_tokens=self.min_tokens,
-            skip_special_tokens=self.skip_special_tokens,
-            spaces_between_special_tokens=self.spaces_between_special_tokens,
-            logits_processors=get_logits_processors(
-                self.logits_processors, logits_processor_pattern
-            ),
-            include_stop_str_in_output=self.include_stop_str_in_output,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
-            output_kind=RequestOutputKind.DELTA
-            if self.stream
-            else RequestOutputKind.FINAL_ONLY,
-            structured_outputs=self.structured_outputs,
-            logit_bias=self.logit_bias,
-            bad_words=self.bad_words,
-            allowed_token_ids=self.allowed_token_ids,
-            extra_args=extra_args or None,
-            skip_clone=True,  # Created fresh per request, safe to skip clone
-        )
-
-    @model_validator(mode="before")
-    @classmethod
-    def validate_stream_options(cls, data):
-        if data.get("stream_options") and not data.get("stream"):
-            raise VLLMValidationError(
-                "Stream options can only be defined when `stream=True`.",
-                parameter="stream_options",
-            )
-
-        return data
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_logprobs(cls, data):
-        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
-            if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
-                raise VLLMValidationError(
-                    "`prompt_logprobs` are not available when `stream=True`.",
-                    parameter="prompt_logprobs",
-                )
-
-            if prompt_logprobs < 0 and prompt_logprobs != -1:
-                raise VLLMValidationError(
-                    "`prompt_logprobs` must be a positive value or -1.",
-                    parameter="prompt_logprobs",
-                    value=prompt_logprobs,
-                )
-        if (top_logprobs := data.get("top_logprobs")) is not None:
-            if top_logprobs < 0 and top_logprobs != -1:
-                raise VLLMValidationError(
-                    "`top_logprobs` must be a positive value or -1.",
-                    parameter="top_logprobs",
-                    value=top_logprobs,
-                )
-
-            if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
-                raise VLLMValidationError(
-                    "when using `top_logprobs`, `logprobs` must be set to true.",
-                    parameter="top_logprobs",
-                )
-
-        return data
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_structured_outputs_count(cls, data):
-        if isinstance(data, ValueError):
-            raise data
-
-        if data.get("structured_outputs", None) is None:
-            return data
-
-        structured_outputs_kwargs = data["structured_outputs"]
-        count = sum(
-            structured_outputs_kwargs.get(k) is not None
-            for k in ("json", "regex", "choice")
-        )
-        # you can only use one kind of constraints for structured outputs
-        if count > 1:
-            raise ValueError(
-                "You can only use one kind of constraints for structured "
-                "outputs ('json', 'regex' or 'choice')."
-            )
-        # you can only either use structured outputs or tools, not both
-        if count > 1 and data.get("tool_choice", "none") not in (
-            "none",
-            "auto",
-            "required",
-        ):
-            raise ValueError(
-                "You can only either use constraints for structured outputs "
-                "or tools, not both."
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
             )
-        return data
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_tool_usage(cls, data):
-        # if "tool_choice" is not specified but tools are provided,
-        # default to "auto" tool_choice
-        if "tool_choice" not in data and data.get("tools"):
-            data["tool_choice"] = "auto"
-
-        # if "tool_choice" is "none" -- no validation is needed for tools
-        if "tool_choice" in data and data["tool_choice"] == "none":
-            return data
-
-        # if "tool_choice" is specified -- validation
-        if "tool_choice" in data and data["tool_choice"] is not None:
-            # ensure that if "tool choice" is specified, tools are present
-            if "tools" not in data or data["tools"] is None:
-                raise ValueError("When using `tool_choice`, `tools` must be set.")
+        stop_token_ids = default_sampling_params.get("stop_token_ids")
 
-            # make sure that tool choice is either a named tool
-            # OR that it's set to "auto" or "required"
-            if data["tool_choice"] not in ["auto", "required"] and not isinstance(
-                data["tool_choice"], dict
+        # Structured output
+        structured_outputs = None
+        if self.text is not None and self.text.format is not None:
+            response_format = self.text.format
+            if (
+                response_format.type == "json_schema"
+                and response_format.schema_ is not None
             ):
-                raise ValueError(
-                    f"Invalid value for `tool_choice`: {data['tool_choice']}! "
-                    'Only named tools, "none", "auto" or "required" '
-                    "are supported."
+                structured_outputs = StructuredOutputsParams(
+                    json=response_format.schema_
                 )
+            elif response_format.type == "json_object":
+                raise NotImplementedError("json_object is not supported")
 
-            # if tool_choice is "required" but the "tools" list is empty,
-            # override the data to behave like "none" to align with
-            # OpenAI’s behavior.
-            if (
-                data["tool_choice"] == "required"
-                and isinstance(data["tools"], list)
-                and len(data["tools"]) == 0
-            ):
-                data["tool_choice"] = "none"
-                del data["tools"]
-                return data
+        # TODO: add more parameters
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            max_tokens=max_tokens,
+            logprobs=self.top_logprobs if self.is_include_output_logprobs() else None,
+            stop_token_ids=stop_token_ids,
+            output_kind=(
+                RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY
+            ),
+            structured_outputs=structured_outputs,
+            logit_bias=self.logit_bias,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
+        )
 
-            # ensure that if "tool_choice" is specified as an object,
-            # it matches a valid tool
-            correct_usage_message = (
-                'Correct usage: `{"type": "function",'
-                ' "function": {"name": "my_function"}}`'
-            )
-            if isinstance(data["tool_choice"], dict):
-                valid_tool = False
-                function = data["tool_choice"].get("function")
-                if not isinstance(function, dict):
-                    raise ValueError(
-                        f"Invalid value for `function`: `{function}` in "
-                        f"`tool_choice`! {correct_usage_message}"
-                    )
-                if "name" not in function:
-                    raise ValueError(
-                        f"Expected field `name` in `function` in "
-                        f"`tool_choice`! {correct_usage_message}"
-                    )
-                function_name = function["name"]
-                if not isinstance(function_name, str) or len(function_name) == 0:
-                    raise ValueError(
-                        f"Invalid `name` in `function`: `{function_name}`"
-                        f" in `tool_choice`! {correct_usage_message}"
-                    )
-                for tool in data["tools"]:
-                    if tool["function"]["name"] == function_name:
-                        valid_tool = True
-                        break
-                if not valid_tool:
-                    raise ValueError(
-                        "The tool specified in `tool_choice` does not match any"
-                        " of the specified `tools`"
-                    )
+    def is_include_output_logprobs(self) -> bool:
+        """Tell whether `include` is a list that requests output logprobs."""
+        requested = self.include
+        if not isinstance(requested, list):
+            # None and every other non-list value mean "not requested".
+            return False
+        logprobs_key = "message.output_text.logprobs"
+        return logprobs_key in requested
+
+    @model_validator(mode="before")
+    def validate_background(cls, data):
+        """Reject background requests whose `store` flag is falsy."""
+        background_requested = data.get("background")
+        if background_requested and not data.get("store", True):
+            raise ValueError("background can only be used when `store` is true")
         return data
 
     @model_validator(mode="before")
-    @classmethod
-    def check_generation_prompt(cls, data):
-        if data.get("continue_final_message") and data.get("add_generation_prompt"):
-            raise ValueError(
-                "Cannot set both `continue_final_message` and "
-                "`add_generation_prompt` to True."
+    def validate_prompt(cls, data):
+        if data.get("prompt") is not None:
+            raise VLLMValidationError(
+                "prompt template is not supported", parameter="prompt"
             )
         return data
 
     @model_validator(mode="before")
-    @classmethod
     def check_cache_salt_support(cls, data):
         if data.get("cache_salt") is not None and (
             not isinstance(data["cache_salt"], str) or not data["cache_salt"]
@@ -1026,6 +465,48 @@ def check_cache_salt_support(cls, data):
             )
         return data
 
+    @model_validator(mode="before")
+    def function_call_parsing(cls, data):
+        """Parse function_call dictionaries into ResponseFunctionToolCall objects.
+        This ensures Pydantic can properly resolve union types in the input field.
+        Function calls provided as dicts are converted to ResponseFunctionToolCall
+        objects before validation, while invalid structures are left for Pydantic
+        to reject with appropriate error messages.
+        """
+
+        input_data = data.get("input")
+
+        # Early return for None, strings, or bytes
+        # (strings are iterable but shouldn't be processed)
+        if input_data is None or isinstance(input_data, (str, bytes)):
+            return data
+
+        # Convert iterators (like ValidatorIterator) to list
+        if not isinstance(input_data, list):
+            try:
+                input_data = list(input_data)
+            except TypeError:
+                # Not iterable, leave as-is for Pydantic to handle
+                return data
+
+        processed_input = []
+        for item in input_data:
+            if isinstance(item, dict) and item.get("type") == "function_call":
+                try:
+                    processed_input.append(ResponseFunctionToolCall(**item))
+                except ValidationError:
+                    # Let Pydantic handle validation for malformed function calls
+                    logger.debug(
+                        "Failed to parse function_call to ResponseFunctionToolCall, "
+                        "leaving for Pydantic validation"
+                    )
+                    processed_input.append(item)
+            else:
+                processed_input.append(item)
+
+        data["input"] = processed_input
+        return data
+
 
 class CompletionRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
@@ -1486,75 +967,6 @@ class ExtractedToolCallInformation(BaseModel):
     content: str | None = None
 
 
-class ChatMessage(OpenAIBaseModel):
-    role: str
-    content: str | None = None
-    refusal: str | None = None
-    annotations: OpenAIAnnotation | None = None
-    audio: OpenAIChatCompletionAudio | None = None
-    function_call: FunctionCall | None = None
-    tool_calls: list[ToolCall] = Field(default_factory=list)
-
-    # vLLM-specific fields that are not in OpenAI spec
-    reasoning: str | None = None
-    reasoning_content: str | None = None
-    """Deprecated: use `reasoning` instead."""
-
-    @model_validator(mode="after")
-    def handle_deprecated_reasoning_content(self):
-        """Copy reasoning to reasoning_content for backward compatibility."""
-        self.reasoning_content = self.reasoning
-        return self
-
-
-class ChatCompletionLogProb(OpenAIBaseModel):
-    token: str
-    logprob: float = -9999.0
-    bytes: list[int] | None = None
-
-
-class ChatCompletionLogProbsContent(ChatCompletionLogProb):
-    # Workaround: redefine fields name cache so that it's not
-    # shared with the super class.
-    field_names: ClassVar[set[str] | None] = None
-    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
-
-
-class ChatCompletionLogProbs(OpenAIBaseModel):
-    content: list[ChatCompletionLogProbsContent] | None = None
-
-
-class ChatCompletionResponseChoice(OpenAIBaseModel):
-    index: int
-    message: ChatMessage
-    logprobs: ChatCompletionLogProbs | None = None
-    # per OpenAI spec this is the default
-    finish_reason: str | None = "stop"
-    # not part of the OpenAI spec but included in vLLM for legacy reasons
-    stop_reason: int | str | None = None
-    # not part of the OpenAI spec but is useful for tracing the tokens
-    # in agent scenarios
-    token_ids: list[int] | None = None
-
-
-class ChatCompletionResponse(OpenAIBaseModel):
-    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
-    object: Literal["chat.completion"] = "chat.completion"
-    created: int = Field(default_factory=lambda: int(time.time()))
-    model: str
-    choices: list[ChatCompletionResponseChoice]
-    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
-    system_fingerprint: str | None = None
-    usage: UsageInfo
-
-    # vLLM-specific fields that are not in OpenAI spec
-    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
-    prompt_token_ids: list[int] | None = None
-    kv_transfer_params: dict[str, Any] | None = Field(
-        default=None, description="KVTransfer parameters."
-    )
-
-
 class DeltaMessage(OpenAIBaseModel):
     role: str | None = None
     content: str | None = None
@@ -1570,27 +982,6 @@ def handle_deprecated_reasoning_content(self):
         return self
 
 
-class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
-    index: int
-    delta: DeltaMessage
-    logprobs: ChatCompletionLogProbs | None = None
-    finish_reason: str | None = None
-    stop_reason: int | str | None = None
-    # not part of the OpenAI spec but for tracing the tokens
-    token_ids: list[int] | None = None
-
-
-class ChatCompletionStreamResponse(OpenAIBaseModel):
-    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
-    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
-    created: int = Field(default_factory=lambda: int(time.time()))
-    model: str
-    choices: list[ChatCompletionResponseStreamChoice]
-    usage: UsageInfo | None = Field(default=None)
-    # not part of the OpenAI spec but for tracing the tokens
-    prompt_token_ids: list[int] | None = None
-
-
 class TranscriptionResponseStreamChoice(OpenAIBaseModel):
     delta: DeltaMessage
     finish_reason: str | None = None
@@ -1856,128 +1247,6 @@ class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
 )
 
 
-class TokenizeCompletionRequest(OpenAIBaseModel):
-    model: str | None = None
-    prompt: str
-
-    add_special_tokens: bool = Field(
-        default=True,
-        description=(
-            "If true (the default), special tokens (e.g. BOS) will be added to "
-            "the prompt."
-        ),
-    )
-    return_token_strs: bool | None = Field(
-        default=False,
-        description=(
-            "If true, also return the token strings corresponding to the token ids."
-        ),
-    )
-
-
-class TokenizeChatRequest(OpenAIBaseModel):
-    model: str | None = None
-    messages: list[ChatCompletionMessageParam]
-
-    add_generation_prompt: bool = Field(
-        default=True,
-        description=(
-            "If true, the generation prompt will be added to the chat template. "
-            "This is a parameter used by chat template in tokenizer config of the "
-            "model."
-        ),
-    )
-    return_token_strs: bool | None = Field(
-        default=False,
-        description=(
-            "If true, also return the token strings corresponding to the token ids."
-        ),
-    )
-    continue_final_message: bool = Field(
-        default=False,
-        description=(
-            "If this is set, the chat will be formatted so that the final "
-            "message in the chat is open-ended, without any EOS tokens. The "
-            "model will continue this message rather than starting a new one. "
-            'This allows you to "prefill" part of the model\'s response for it. '
-            "Cannot be used at the same time as `add_generation_prompt`."
-        ),
-    )
-    add_special_tokens: bool = Field(
-        default=False,
-        description=(
-            "If true, special tokens (e.g. BOS) will be added to the prompt "
-            "on top of what is added by the chat template. "
-            "For most models, the chat template takes care of adding the "
-            "special tokens so this should be set to false (as is the "
-            "default)."
-        ),
-    )
-    chat_template: str | None = Field(
-        default=None,
-        description=(
-            "A Jinja template to use for this conversion. "
-            "As of transformers v4.44, default chat template is no longer "
-            "allowed, so you must provide a chat template if the tokenizer "
-            "does not define one."
-        ),
-    )
-    chat_template_kwargs: dict[str, Any] | None = Field(
-        default=None,
-        description=(
-            "Additional keyword args to pass to the template renderer. "
-            "Will be accessible by the chat template."
-        ),
-    )
-    mm_processor_kwargs: dict[str, Any] | None = Field(
-        default=None,
-        description=("Additional kwargs to pass to the HF processor."),
-    )
-    tools: list[ChatCompletionToolsParam] | None = Field(
-        default=None,
-        description=("A list of tools the model may call."),
-    )
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_generation_prompt(cls, data):
-        if data.get("continue_final_message") and data.get("add_generation_prompt"):
-            raise ValueError(
-                "Cannot set both `continue_final_message` and "
-                "`add_generation_prompt` to True."
-            )
-        return data
-
-
-TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest
-
-
-class TokenizeResponse(OpenAIBaseModel):
-    count: int
-    max_model_len: int
-    tokens: list[int]
-    token_strs: list[str] | None = None
-
-
-class DetokenizeRequest(OpenAIBaseModel):
-    model: str | None = None
-    tokens: list[int]
-
-
-class DetokenizeResponse(OpenAIBaseModel):
-    prompt: str
-
-
-class TokenizerInfoResponse(OpenAIBaseModel):
-    """
-    Response containing tokenizer configuration
-    equivalent to tokenizer_config.json
-    """
-
-    model_config = ConfigDict(extra="allow")
-    tokenizer_class: str
-
-
 class LoadLoRAAdapterRequest(BaseModel):
     lora_name: str
     lora_path: str
@@ -2537,30 +1806,3 @@ class GenerateRequest(BaseModel):
         default=None,
         description="KVTransfer parameters used for disaggregated serving.",
     )
-
-
-class GenerateResponseChoice(BaseModel):
-    index: int
-    logprobs: ChatCompletionLogProbs | None = None
-    # per OpenAI spec this is the default
-    finish_reason: str | None = "stop"
-    token_ids: list[int] | None = None
-
-
-class GenerateResponse(BaseModel):
-    request_id: str = Field(
-        default_factory=random_uuid,
-        description=(
-            "The request_id related to this request. If the caller does "
-            "not set it, a random_uuid will be generated. This id is used "
-            "through out the inference process and return in response."
-        ),
-    )
-    choices: list[GenerateResponseChoice]
-
-    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
-
-    kv_transfer_params: dict[str, Any] | None = Field(
-        default=None,
-        description="KVTransfer parameters used for disaggregated serving.",
-    )
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/engine/serving.py
similarity index 99%
rename from vllm/entrypoints/openai/serving_engine.py
rename to vllm/entrypoints/openai/engine/serving.py
index fac4ced42bcb..666d83c21aea 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/engine/serving.py
@@ -38,22 +38,20 @@
     StreamingHarmonyContext,
 )
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionNamedToolChoiceParam,
     ChatCompletionRequest,
     ChatCompletionResponse,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     CompletionRequest,
     CompletionResponse,
-    DetokenizeRequest,
     ErrorInfo,
     ErrorResponse,
     FunctionCall,
     FunctionDefinition,
     ResponseInputOutputItem,
     ResponsesRequest,
-    TokenizeChatRequest,
-    TokenizeCompletionRequest,
-    TokenizeResponse,
     TranscriptionRequest,
     TranscriptionResponse,
     TranslationRequest,
@@ -86,6 +84,12 @@
     construct_input_messages,
 )
 from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
+from vllm.entrypoints.serve.tokenize.protocol import (
+    DetokenizeRequest,
+    TokenizeChatRequest,
+    TokenizeCompletionRequest,
+    TokenizeResponse,
+)
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.inputs.data import PromptType, TokensPrompt
 from vllm.inputs.parse import (
diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py
index 533286c5906f..a3959c873be4 100644
--- a/vllm/entrypoints/openai/parser/harmony_utils.py
+++ b/vllm/entrypoints/openai/parser/harmony_utils.py
@@ -43,8 +43,8 @@
 from openai_harmony import Role as OpenAIHarmonyRole
 
 from vllm import envs
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionToolsParam,
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionToolsParam
+from vllm.entrypoints.openai.engine.protocol import (
     ResponseInputOutputItem,
     ResponsesRequest,
 )
diff --git a/vllm/entrypoints/openai/parser/responses_parser.py b/vllm/entrypoints/openai/parser/responses_parser.py
index 14a6f5cb73e1..bda7e99520af 100644
--- a/vllm/entrypoints/openai/parser/responses_parser.py
+++ b/vllm/entrypoints/openai/parser/responses_parser.py
@@ -16,7 +16,10 @@
 )
 
 from vllm.entrypoints.constants import MCP_PREFIX
-from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest
+from vllm.entrypoints.openai.engine.protocol import (
+    ResponseInputOutputItem,
+    ResponsesRequest,
+)
 from vllm.outputs import CompletionOutput
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
 from vllm.tokenizers import TokenizerLike
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 6bb6d0f3f97b..5b72dc663b89 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -19,13 +19,15 @@
 from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
+)
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     OpenAIBaseModel,
 )
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest, EmbeddingResponse
 from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 6ef5ae3ef01c..187ccb64e9ba 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -12,7 +12,7 @@
 
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     CompletionLogProbs,
     CompletionRequest,
     CompletionResponse,
@@ -25,7 +25,7 @@
     UsageInfo,
     VLLMValidationError,
 )
-from vllm.entrypoints.openai.serving_engine import (
+from vllm.entrypoints.openai.engine.serving import (
     GenerationError,
     OpenAIServing,
     clamp_prompt_logprobs,
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 6b03fa72fc59..4e6d4d5e0545 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -7,7 +7,7 @@
 from http import HTTPStatus
 
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorInfo,
     ErrorResponse,
     LoadLoRAAdapterRequest,
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index f79dad8d9e5e..e870d6e92822 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -72,19 +72,7 @@
     StreamingHarmonyContext,
 )
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.parser.harmony_utils import (
-    construct_harmony_previous_input_messages,
-    get_developer_message,
-    get_stop_tokens_for_assistant_actions,
-    get_system_message,
-    get_user_message,
-    has_custom_tools,
-    parse_output_message,
-    parse_remaining_state,
-    parse_response_input,
-    render_for_completion,
-)
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     ErrorResponse,
     InputTokensDetails,
@@ -102,10 +90,22 @@
     StreamingResponsesResponse,
     VLLMValidationError,
 )
-from vllm.entrypoints.openai.serving_engine import (
+from vllm.entrypoints.openai.engine.serving import (
     GenerationError,
     OpenAIServing,
 )
+from vllm.entrypoints.openai.parser.harmony_utils import (
+    construct_harmony_previous_input_messages,
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_system_message,
+    get_user_message,
+    has_custom_tools,
+    parse_output_message,
+    parse_remaining_state,
+    parse_response_input,
+    render_for_completion,
+)
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.responses_utils import (
     construct_input_messages,
diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py
index 189b532810b4..ef576153df96 100644
--- a/vllm/entrypoints/openai/serving_transcription.py
+++ b/vllm/entrypoints/openai/serving_transcription.py
@@ -6,7 +6,7 @@
 
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     RequestResponseMetadata,
     TranscriptionRequest,
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index b6332d1941c1..6fdfea2f405d 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -15,7 +15,7 @@
 import vllm.envs as envs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     ErrorResponse,
     RequestResponseMetadata,
@@ -32,7 +32,7 @@
     UsageInfo,
     VLLMValidationError,
 )
-from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest
+from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
diff --git a/vllm/entrypoints/openai/utils.py b/vllm/entrypoints/openai/utils.py
index 29db601af67f..55e59510f549 100644
--- a/vllm/entrypoints/openai/utils.py
+++ b/vllm/entrypoints/openai/utils.py
@@ -5,7 +5,7 @@
 from fastapi import Request
 from fastapi.exceptions import RequestValidationError
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponseChoice,
     ChatCompletionResponseStreamChoice,
diff --git a/vllm/entrypoints/pooling/classify/api_router.py b/vllm/entrypoints/pooling/classify/api_router.py
index d6ced73c88eb..7bd170a9f144 100644
--- a/vllm/entrypoints/pooling/classify/api_router.py
+++ b/vllm/entrypoints/pooling/classify/api_router.py
@@ -6,7 +6,7 @@
 from starlette.responses import JSONResponse
 from typing_extensions import assert_never
 
-from vllm.entrypoints.openai.protocol import ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.utils import validate_json_request
 from vllm.entrypoints.pooling.classify.protocol import (
     ClassificationRequest,
diff --git a/vllm/entrypoints/pooling/classify/protocol.py b/vllm/entrypoints/pooling/classify/protocol.py
index 273bdd29ee58..89e927fd7759 100644
--- a/vllm/entrypoints/pooling/classify/protocol.py
+++ b/vllm/entrypoints/pooling/classify/protocol.py
@@ -11,7 +11,7 @@
 from vllm import PoolingParams
 from vllm.config.pooler import get_use_activation
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
-from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo
+from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo
 from vllm.utils import random_uuid
 
 
diff --git a/vllm/entrypoints/pooling/classify/serving.py b/vllm/entrypoints/pooling/classify/serving.py
index e166405a6f05..446366880047 100644
--- a/vllm/entrypoints/pooling/classify/serving.py
+++ b/vllm/entrypoints/pooling/classify/serving.py
@@ -11,12 +11,14 @@
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     UsageInfo,
 )
-from vllm.entrypoints.openai.serving_engine import (
+from vllm.entrypoints.openai.engine.serving import (
     ClassificationServeContext,
     OpenAIServing,
     ServeContext,
diff --git a/vllm/entrypoints/pooling/embed/api_router.py b/vllm/entrypoints/pooling/embed/api_router.py
index 24b0c8c2b3cf..d8e5cf64127e 100644
--- a/vllm/entrypoints/pooling/embed/api_router.py
+++ b/vllm/entrypoints/pooling/embed/api_router.py
@@ -6,7 +6,7 @@
 from fastapi.responses import JSONResponse, StreamingResponse
 from typing_extensions import assert_never
 
-from vllm.entrypoints.openai.protocol import ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.utils import validate_json_request
 from vllm.entrypoints.pooling.embed.protocol import (
     EmbeddingBytesResponse,
diff --git a/vllm/entrypoints/pooling/embed/protocol.py b/vllm/entrypoints/pooling/embed/protocol.py
index 3829a1a6a6c1..79c6d540d278 100644
--- a/vllm/entrypoints/pooling/embed/protocol.py
+++ b/vllm/entrypoints/pooling/embed/protocol.py
@@ -10,7 +10,7 @@
 
 from vllm import PoolingParams
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
-from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo
+from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo
 from vllm.utils import random_uuid
 from vllm.utils.serial_utils import EmbedDType, EncodingFormat, Endianness
 
diff --git a/vllm/entrypoints/pooling/embed/serving.py b/vllm/entrypoints/pooling/embed/serving.py
index e94b80043962..6e1381878e93 100644
--- a/vllm/entrypoints/pooling/embed/serving.py
+++ b/vllm/entrypoints/pooling/embed/serving.py
@@ -12,11 +12,11 @@
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     UsageInfo,
 )
-from vllm.entrypoints.openai.serving_engine import (
+from vllm.entrypoints.openai.engine.serving import (
     EmbeddingServeContext,
     OpenAIServing,
     ServeContext,
diff --git a/vllm/entrypoints/pooling/pooling/api_router.py b/vllm/entrypoints/pooling/pooling/api_router.py
index 4baaf8f30f6b..223d6e3b89be 100644
--- a/vllm/entrypoints/pooling/pooling/api_router.py
+++ b/vllm/entrypoints/pooling/pooling/api_router.py
@@ -6,7 +6,7 @@
 from fastapi.responses import JSONResponse, StreamingResponse
 from typing_extensions import assert_never
 
-from vllm.entrypoints.openai.protocol import ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.utils import validate_json_request
 from vllm.entrypoints.pooling.pooling.protocol import (
     IOProcessorResponse,
diff --git a/vllm/entrypoints/pooling/pooling/protocol.py b/vllm/entrypoints/pooling/pooling/protocol.py
index 76b361b49b66..22f2bb18e9c2 100644
--- a/vllm/entrypoints/pooling/pooling/protocol.py
+++ b/vllm/entrypoints/pooling/pooling/protocol.py
@@ -9,7 +9,7 @@
 
 from vllm import PoolingParams
 from vllm.config.pooler import get_use_activation
-from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo
+from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo
 from vllm.entrypoints.pooling.embed.protocol import (
     EmbeddingChatRequest,
     EmbeddingCompletionRequest,
diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py
index 4e1b326806ea..c27c9179ec94 100644
--- a/vllm/entrypoints/pooling/pooling/serving.py
+++ b/vllm/entrypoints/pooling/pooling/serving.py
@@ -14,11 +14,11 @@
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     UsageInfo,
 )
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.engine.serving import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.pooling.pooling.protocol import (
     IOProcessorRequest,
diff --git a/vllm/entrypoints/pooling/score/api_router.py b/vllm/entrypoints/pooling/score/api_router.py
index c7481ed9fa96..bd9b5c425b05 100644
--- a/vllm/entrypoints/pooling/score/api_router.py
+++ b/vllm/entrypoints/pooling/score/api_router.py
@@ -6,7 +6,7 @@
 from fastapi.responses import JSONResponse
 from typing_extensions import assert_never
 
-from vllm.entrypoints.openai.protocol import ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.utils import validate_json_request
 from vllm.entrypoints.pooling.score.protocol import (
     RerankRequest,
diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py
index e81bda2eec3d..35dfd5402002 100644
--- a/vllm/entrypoints/pooling/score/protocol.py
+++ b/vllm/entrypoints/pooling/score/protocol.py
@@ -10,7 +10,7 @@
 
 from vllm import PoolingParams
 from vllm.config.pooler import get_use_activation
-from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo
+from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo
 from vllm.entrypoints.score_utils import ScoreContentPartParam, ScoreMultiModalParam
 from vllm.utils import random_uuid
 
diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py
index 9762b2363985..e44f15e66b32 100644
--- a/vllm/entrypoints/pooling/score/serving.py
+++ b/vllm/entrypoints/pooling/score/serving.py
@@ -9,11 +9,11 @@
 
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     UsageInfo,
 )
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.engine.serving import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.pooling.score.protocol import (
     RerankDocument,
diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py
index 5fd0cf43e687..dded2eea7efd 100644
--- a/vllm/entrypoints/responses_utils.py
+++ b/vllm/entrypoints/responses_utils.py
@@ -22,7 +22,7 @@
 
 from vllm import envs
 from vllm.entrypoints.constants import MCP_PREFIX
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     ChatCompletionMessageParam,
     ResponseInputOutputItem,
 )
diff --git a/vllm/entrypoints/sagemaker/routes.py b/vllm/entrypoints/sagemaker/routes.py
index ea88c0fc4b97..f2668baec896 100644
--- a/vllm/entrypoints/sagemaker/routes.py
+++ b/vllm/entrypoints/sagemaker/routes.py
@@ -14,16 +14,20 @@
     base,
     chat,
     completion,
-    create_chat_completion,
     create_completion,
     validate_json_request,
 )
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.api_router import (
+    create_chat_completion,
+)
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     CompletionRequest,
     ErrorResponse,
 )
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.engine.serving import OpenAIServing
 from vllm.entrypoints.pooling.classify.api_router import classify, create_classify
 from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest
 from vllm.entrypoints.pooling.embed.api_router import create_embedding, embedding
diff --git a/vllm/entrypoints/serve/disagg/api_router.py b/vllm/entrypoints/serve/disagg/api_router.py
index c38ede30dad1..6924dc83882f 100644
--- a/vllm/entrypoints/serve/disagg/api_router.py
+++ b/vllm/entrypoints/serve/disagg/api_router.py
@@ -11,7 +11,7 @@
 
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.api_server import validate_json_request
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
 )
 from vllm.entrypoints.serve.disagg.protocol import (
diff --git a/vllm/entrypoints/serve/disagg/protocol.py b/vllm/entrypoints/serve/disagg/protocol.py
index 251fcf12ed7d..68c39f9069a4 100644
--- a/vllm/entrypoints/serve/disagg/protocol.py
+++ b/vllm/entrypoints/serve/disagg/protocol.py
@@ -4,8 +4,8 @@
 
 from pydantic import BaseModel, Field
 
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionLogProbs,
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionLogProbs
+from vllm.entrypoints.openai.engine.protocol import (
     Logprob,
     SamplingParams,
     StreamOptions,
diff --git a/vllm/entrypoints/serve/disagg/serving.py b/vllm/entrypoints/serve/disagg/serving.py
index 1798b174b141..8649bc6684bc 100644
--- a/vllm/entrypoints/serve/disagg/serving.py
+++ b/vllm/entrypoints/serve/disagg/serving.py
@@ -11,16 +11,18 @@
 
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionLogProb,
     ChatCompletionLogProbs,
     ChatCompletionLogProbsContent,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     PromptTokenUsageInfo,
     RequestResponseMetadata,
     UsageInfo,
 )
-from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
+from vllm.entrypoints.openai.engine.serving import OpenAIServing, clamp_prompt_logprobs
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.serve.disagg.protocol import (
     GenerateRequest,
diff --git a/vllm/entrypoints/serve/elastic_ep/api_router.py b/vllm/entrypoints/serve/elastic_ep/api_router.py
index e5adb81051ff..1a3b57d4ca89 100644
--- a/vllm/entrypoints/serve/elastic_ep/api_router.py
+++ b/vllm/entrypoints/serve/elastic_ep/api_router.py
@@ -10,7 +10,7 @@
 
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.api_server import validate_json_request
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
 )
 from vllm.entrypoints.serve.elastic_ep.middleware import (
diff --git a/vllm/entrypoints/serve/lora/api_router.py b/vllm/entrypoints/serve/lora/api_router.py
index 6a57e73f334f..41ec354d750d 100644
--- a/vllm/entrypoints/serve/lora/api_router.py
+++ b/vllm/entrypoints/serve/lora/api_router.py
@@ -8,7 +8,7 @@
 
 from vllm import envs
 from vllm.entrypoints.openai.api_server import models, validate_json_request
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
     ErrorResponse,
     LoadLoRAAdapterRequest,
     UnloadLoRAAdapterRequest,
diff --git a/vllm/entrypoints/serve/tokenize/api_router.py b/vllm/entrypoints/serve/tokenize/api_router.py
index a10e78c8d28e..ec486cf410d6 100644
--- a/vllm/entrypoints/serve/tokenize/api_router.py
+++ b/vllm/entrypoints/serve/tokenize/api_router.py
@@ -10,10 +10,12 @@
 from typing_extensions import assert_never
 
 from vllm.entrypoints.openai.api_server import validate_json_request
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
+    ErrorResponse,
+)
+from vllm.entrypoints.serve.tokenize.protocol import (
     DetokenizeRequest,
     DetokenizeResponse,
-    ErrorResponse,
     TokenizeRequest,
     TokenizeResponse,
 )
diff --git a/vllm/entrypoints/serve/tokenize/protocol.py b/vllm/entrypoints/serve/tokenize/protocol.py
new file mode 100644
index 000000000000..66a85a8b61fb
--- /dev/null
+++ b/vllm/entrypoints/serve/tokenize/protocol.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from typing import Any, TypeAlias
+
+from pydantic import ConfigDict, Field, model_validator
+
+from vllm.entrypoints.chat_utils import (
+    ChatCompletionMessageParam,
+)
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionToolsParam,
+)
+from vllm.entrypoints.openai.engine.protocol import (
+    OpenAIBaseModel,
+)
+
+
+class TokenizeCompletionRequest(OpenAIBaseModel):  # tokenize a plain-text prompt
+    model: str | None = None  # model name; may be omitted
+    prompt: str  # raw text to tokenize
+
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."
+        ),
+    )
+    return_token_strs: bool | None = Field(  # None is accepted besides a bool
+        default=False,
+        description=(
+            "If true, also return the token strings corresponding to the token ids."
+        ),
+    )
+
+
+class TokenizeChatRequest(OpenAIBaseModel):  # tokenize a templated chat conversation
+    model: str | None = None  # model name; may be omitted
+    messages: list[ChatCompletionMessageParam]  # the chat to tokenize
+
+    add_generation_prompt: bool = Field(
+        default=True,
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by chat template in tokenizer config of the "
+            "model."
+        ),
+    )
+    return_token_strs: bool | None = Field(  # None is accepted besides a bool
+        default=False,
+        description=(
+            "If true, also return the token strings corresponding to the token ids."
+        ),
+    )
+    continue_final_message: bool = Field(
+        default=False,
+        description=(
+            "If this is set, the chat will be formatted so that the final "
+            "message in the chat is open-ended, without any EOS tokens. The "
+            "model will continue this message rather than starting a new one. "
+            'This allows you to "prefill" part of the model\'s response for it. '
+            "Cannot be used at the same time as `add_generation_prompt`."
+        ),
+    )
+    add_special_tokens: bool = Field(  # defaults to False for chat requests
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."
+        ),
+    )
+    chat_template: str | None = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."
+        ),
+    )
+    chat_template_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "Additional keyword args to pass to the template renderer. "
+            "Will be accessible by the chat template."
+        ),
+    )
+    mm_processor_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+    tools: list[ChatCompletionToolsParam] | None = Field(
+        default=None,
+        description=("A list of tools the model may call."),
+    )
+
+    @model_validator(mode="before")  # sees raw input; field defaults not yet applied
+    @classmethod
+    def check_generation_prompt(cls, data):  # assumes dict-like `data` (uses .get)
+        if data.get("continue_final_message") and data.get("add_generation_prompt"):
+            raise ValueError(  # only fires when both keys are explicitly truthy
+                "Cannot set both `continue_final_message` and "
+                "`add_generation_prompt` to True."
+            )
+        return data  # input is passed on unmodified
+
+
+TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest
+
+
+class TokenizeResponse(OpenAIBaseModel):  # result of a tokenize request
+    count: int  # presumably len(tokens); confirm against the serving code
+    max_model_len: int  # presumably the model's context limit; confirm
+    tokens: list[int]  # token ids
+    token_strs: list[str] | None = None  # presumably set via `return_token_strs`
+
+
+class DetokenizeRequest(OpenAIBaseModel):  # convert token ids back to text
+    model: str | None = None  # model name; may be omitted
+    tokens: list[int]  # token ids to detokenize
+
+
+class DetokenizeResponse(OpenAIBaseModel):  # result of a detokenize request
+    prompt: str  # presumably the text decoded from the request tokens; confirm
+
+
+class TokenizerInfoResponse(OpenAIBaseModel):
+    """
+    Response containing the tokenizer configuration,
+    equivalent to the contents of tokenizer_config.json.
+    """
+
+    model_config = ConfigDict(extra="allow")  # undeclared fields are kept, not rejected
+    tokenizer_class: str  # the only field declared explicitly
diff --git a/vllm/entrypoints/serve/tokenize/serving.py b/vllm/entrypoints/serve/tokenize/serving.py
index 0b07f0b18dfd..c80009eaa08f 100644
--- a/vllm/entrypoints/serve/tokenize/serving.py
+++ b/vllm/entrypoints/serve/tokenize/serving.py
@@ -9,18 +9,20 @@
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
+    ErrorResponse,
+)
+from vllm.entrypoints.openai.engine.serving import OpenAIServing
+from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.renderer import RenderConfig
+from vllm.entrypoints.serve.tokenize.protocol import (
     DetokenizeRequest,
     DetokenizeResponse,
-    ErrorResponse,
     TokenizeChatRequest,
     TokenizeRequest,
     TokenizeResponse,
     TokenizerInfoResponse,
 )
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
-from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.entrypoints.renderer import RenderConfig
 from vllm.inputs import TokensPrompt
 from vllm.logger import init_logger
 from vllm.tokenizers import TokenizerLike
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index 6f3e7c55f5bc..0d6fea36fd8a 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -22,9 +22,11 @@
     resolve_hf_chat_template,
     resolve_mistral_chat_template,
 )
-from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.cli_args import make_arg_parser
+from vllm.entrypoints.openai.engine.protocol import (
     CompletionRequest,
     StreamOptions,
 )
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index bf593ca4e52a..63ce096d09ab 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -14,8 +14,10 @@
 from vllm.utils.import_utils import import_from_path
 
 if TYPE_CHECKING:
-    from vllm.entrypoints.openai.protocol import (
+    from vllm.entrypoints.openai.chat_completion.protocol import (
         ChatCompletionRequest,
+    )
+    from vllm.entrypoints.openai.engine.protocol import (
         DeltaMessage,
         ResponsesRequest,
     )
diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py
index 43067ca4afe0..ea0aa1dce1f8 100644
--- a/vllm/reasoning/basic_parsers.py
+++ b/vllm/reasoning/basic_parsers.py
@@ -5,13 +5,15 @@
 from collections.abc import Sequence
 from typing import TYPE_CHECKING, Any
 
-from vllm.entrypoints.openai.protocol import DeltaMessage
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
 from vllm.tokenizers import TokenizerLike
 
 if TYPE_CHECKING:
-    from vllm.entrypoints.openai.protocol import (
+    from vllm.entrypoints.openai.chat_completion.protocol import (
         ChatCompletionRequest,
+    )
+    from vllm.entrypoints.openai.engine.protocol import (
         ResponsesRequest,
     )
 else:
diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py
index a91c8ceeb625..d24e331bb4a2 100644
--- a/vllm/reasoning/deepseek_r1_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py
@@ -3,7 +3,7 @@
 
 from collections.abc import Sequence
 
-from vllm.entrypoints.openai.protocol import DeltaMessage
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 
 
diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py
index efb080276e46..2db39e16c642 100644
--- a/vllm/reasoning/deepseek_v3_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -5,7 +5,10 @@
 
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
diff --git a/vllm/reasoning/ernie45_reasoning_parser.py b/vllm/reasoning/ernie45_reasoning_parser.py
index 3cdbf14858ec..6ff86488bb36 100644
--- a/vllm/reasoning/ernie45_reasoning_parser.py
+++ b/vllm/reasoning/ernie45_reasoning_parser.py
@@ -5,7 +5,10 @@
 
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 
diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py
index e0920ef3160b..4c938bac6222 100644
--- a/vllm/reasoning/gptoss_reasoning_parser.py
+++ b/vllm/reasoning/gptoss_reasoning_parser.py
@@ -5,8 +5,11 @@
 
 from transformers import PreTrainedTokenizerBase
 
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.entrypoints.openai.parser.harmony_utils import parse_chat_output
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py
index 484045d66a3c..5cae16f74ac3 100644
--- a/vllm/reasoning/granite_reasoning_parser.py
+++ b/vllm/reasoning/granite_reasoning_parser.py
@@ -6,7 +6,10 @@
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 
diff --git a/vllm/reasoning/holo2_reasoning_parser.py b/vllm/reasoning/holo2_reasoning_parser.py
index 3b5bfd838017..b0bda09794d2 100644
--- a/vllm/reasoning/holo2_reasoning_parser.py
+++ b/vllm/reasoning/holo2_reasoning_parser.py
@@ -3,7 +3,10 @@
 
 from collections.abc import Sequence
 
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning import (
     ReasoningParser,
diff --git a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
index f297454f57ec..05e4d586268b 100644
--- a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
+++ b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
@@ -6,7 +6,10 @@
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 
diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py
index e998e071efcf..ad4e0c8ffab6 100644
--- a/vllm/reasoning/identity_reasoning_parser.py
+++ b/vllm/reasoning/identity_reasoning_parser.py
@@ -5,7 +5,10 @@
 
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 
diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py
index a2b9224cb3bf..06b97d39eacc 100644
--- a/vllm/reasoning/minimax_m2_reasoning_parser.py
+++ b/vllm/reasoning/minimax_m2_reasoning_parser.py
@@ -3,8 +3,10 @@
 
 from collections.abc import Sequence
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     ResponsesRequest,
 )
diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py
index 48a36b4c6634..05ff2bff0c3e 100644
--- a/vllm/reasoning/mistral_reasoning_parser.py
+++ b/vllm/reasoning/mistral_reasoning_parser.py
@@ -3,8 +3,10 @@
 
 from functools import cached_property
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     ResponsesRequest,
 )
 from vllm.logger import init_logger
diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py
index 2742a24a2c3e..9ee8fe37ecb0 100644
--- a/vllm/reasoning/olmo3_reasoning_parser.py
+++ b/vllm/reasoning/olmo3_reasoning_parser.py
@@ -10,9 +10,10 @@
 
 if TYPE_CHECKING:
     from vllm.tokenizers import TokenizerLike
-
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     ResponsesRequest,
 )
diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py
index ef7762bf0af5..9c2f7404b9de 100644
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -1,8 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ResponsesRequest
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import ResponsesRequest
 from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
 
 
diff --git a/vllm/reasoning/step3_reasoning_parser.py b/vllm/reasoning/step3_reasoning_parser.py
index b85bc826572f..47131b2bccee 100644
--- a/vllm/reasoning/step3_reasoning_parser.py
+++ b/vllm/reasoning/step3_reasoning_parser.py
@@ -6,7 +6,10 @@
 import regex as re
 from transformers import PreTrainedTokenizerBase
 
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser
 
diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py
index 35a11e95b8bd..bb85052dba8e 100644
--- a/vllm/tokenizers/mistral.py
+++ b/vllm/tokenizers/mistral.py
@@ -19,7 +19,7 @@
 from mistral_common.tokens.tokenizers.tekken import Tekkenizer
 
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
 from vllm.logger import init_logger
 
 from .protocol import TokenizerLike
diff --git a/vllm/tool_parsers/abstract_tool_parser.py b/vllm/tool_parsers/abstract_tool_parser.py
index b7cac3454dab..feb3ce6f301f 100644
--- a/vllm/tool_parsers/abstract_tool_parser.py
+++ b/vllm/tool_parsers/abstract_tool_parser.py
@@ -10,8 +10,8 @@
     ResponseFormatTextJSONSchemaConfig,
 )
 
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     ExtractedToolCallInformation,
     ResponsesRequest,
diff --git a/vllm/tool_parsers/deepseekv31_tool_parser.py b/vllm/tool_parsers/deepseekv31_tool_parser.py
index 33383e1bc073..ad42bb7713c4 100644
--- a/vllm/tool_parsers/deepseekv31_tool_parser.py
+++ b/vllm/tool_parsers/deepseekv31_tool_parser.py
@@ -6,8 +6,10 @@
 import regex as re
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/deepseekv32_tool_parser.py b/vllm/tool_parsers/deepseekv32_tool_parser.py
index db081178fdea..49c9540d6b26 100644
--- a/vllm/tool_parsers/deepseekv32_tool_parser.py
+++ b/vllm/tool_parsers/deepseekv32_tool_parser.py
@@ -8,8 +8,10 @@
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/deepseekv3_tool_parser.py b/vllm/tool_parsers/deepseekv3_tool_parser.py
index f8cf559f2284..83bba1c878e0 100644
--- a/vllm/tool_parsers/deepseekv3_tool_parser.py
+++ b/vllm/tool_parsers/deepseekv3_tool_parser.py
@@ -6,8 +6,10 @@
 import regex as re
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/ernie45_tool_parser.py b/vllm/tool_parsers/ernie45_tool_parser.py
index 79193787b3b3..d5dc7a3da3cc 100644
--- a/vllm/tool_parsers/ernie45_tool_parser.py
+++ b/vllm/tool_parsers/ernie45_tool_parser.py
@@ -6,8 +6,10 @@
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/functiongemma_tool_parser.py b/vllm/tool_parsers/functiongemma_tool_parser.py
index 9be78b0a0691..22fa8d981f88 100644
--- a/vllm/tool_parsers/functiongemma_tool_parser.py
+++ b/vllm/tool_parsers/functiongemma_tool_parser.py
@@ -7,8 +7,10 @@
 import regex as re
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/gigachat3_tool_parser.py b/vllm/tool_parsers/gigachat3_tool_parser.py
index 27a6bc1a7bad..02cdad9edebe 100644
--- a/vllm/tool_parsers/gigachat3_tool_parser.py
+++ b/vllm/tool_parsers/gigachat3_tool_parser.py
@@ -7,8 +7,10 @@
 import regex as re
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/glm4_moe_tool_parser.py b/vllm/tool_parsers/glm4_moe_tool_parser.py
index 6ad7d7cb460c..522c67dc2e6a 100644
--- a/vllm/tool_parsers/glm4_moe_tool_parser.py
+++ b/vllm/tool_parsers/glm4_moe_tool_parser.py
@@ -8,9 +8,11 @@
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
     ChatCompletionToolsParam,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/tool_parsers/granite_20b_fc_tool_parser.py
index d841fb57ac87..7fe3c39f70cf 100644
--- a/vllm/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/tool_parsers/granite_20b_fc_tool_parser.py
@@ -10,8 +10,10 @@
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/granite_tool_parser.py b/vllm/tool_parsers/granite_tool_parser.py
index 7abfdf72849d..7cad01e16431 100644
--- a/vllm/tool_parsers/granite_tool_parser.py
+++ b/vllm/tool_parsers/granite_tool_parser.py
@@ -8,8 +8,10 @@
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/hermes_tool_parser.py b/vllm/tool_parsers/hermes_tool_parser.py
index 4b1dea7edf27..47dd2a24d251 100644
--- a/vllm/tool_parsers/hermes_tool_parser.py
+++ b/vllm/tool_parsers/hermes_tool_parser.py
@@ -9,8 +9,10 @@
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/hunyuan_a13b_tool_parser.py b/vllm/tool_parsers/hunyuan_a13b_tool_parser.py
index c73982136804..4f446bfcce95 100644
--- a/vllm/tool_parsers/hunyuan_a13b_tool_parser.py
+++ b/vllm/tool_parsers/hunyuan_a13b_tool_parser.py
@@ -8,8 +8,10 @@
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/internlm2_tool_parser.py b/vllm/tool_parsers/internlm2_tool_parser.py
index e87efe3275a7..3b858f34c20a 100644
--- a/vllm/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/tool_parsers/internlm2_tool_parser.py
@@ -8,8 +8,10 @@
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/jamba_tool_parser.py b/vllm/tool_parsers/jamba_tool_parser.py
index 7f3de0b38a33..937e28b17079 100644
--- a/vllm/tool_parsers/jamba_tool_parser.py
+++ b/vllm/tool_parsers/jamba_tool_parser.py
@@ -9,8 +9,10 @@
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/kimi_k2_tool_parser.py b/vllm/tool_parsers/kimi_k2_tool_parser.py
index 96630504f068..354ed412b701 100644
--- a/vllm/tool_parsers/kimi_k2_tool_parser.py
+++ b/vllm/tool_parsers/kimi_k2_tool_parser.py
@@ -6,8 +6,10 @@
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/tool_parsers/llama4_pythonic_tool_parser.py
index 3c5409bbfaf4..707cdd6625c7 100644
--- a/vllm/tool_parsers/llama4_pythonic_tool_parser.py
+++ b/vllm/tool_parsers/llama4_pythonic_tool_parser.py
@@ -9,8 +9,10 @@
 from transformers import PreTrainedTokenizerBase
 
 import vllm.envs as envs
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/llama_tool_parser.py b/vllm/tool_parsers/llama_tool_parser.py
index b0dfe05c8e55..527d3f7358e8 100644
--- a/vllm/tool_parsers/llama_tool_parser.py
+++ b/vllm/tool_parsers/llama_tool_parser.py
@@ -11,8 +11,10 @@
 
 import vllm.envs as envs
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/minimax_m2_tool_parser.py b/vllm/tool_parsers/minimax_m2_tool_parser.py
index 67bd0e61620d..eb5c46c327e5 100644
--- a/vllm/tool_parsers/minimax_m2_tool_parser.py
+++ b/vllm/tool_parsers/minimax_m2_tool_parser.py
@@ -8,8 +8,10 @@
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/minimax_tool_parser.py b/vllm/tool_parsers/minimax_tool_parser.py
index 86e1433c6e36..cb5610fc7503 100644
--- a/vllm/tool_parsers/minimax_tool_parser.py
+++ b/vllm/tool_parsers/minimax_tool_parser.py
@@ -8,8 +8,10 @@
 import regex as re
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/mistral_tool_parser.py b/vllm/tool_parsers/mistral_tool_parser.py
index 35b853b0ad7e..67f6345bf589 100644
--- a/vllm/tool_parsers/mistral_tool_parser.py
+++ b/vllm/tool_parsers/mistral_tool_parser.py
@@ -12,8 +12,10 @@
 import regex as re
 from pydantic import Field
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/olmo3_tool_parser.py b/vllm/tool_parsers/olmo3_tool_parser.py
index 8cd6a84a9f6b..7b0d609d51df 100644
--- a/vllm/tool_parsers/olmo3_tool_parser.py
+++ b/vllm/tool_parsers/olmo3_tool_parser.py
@@ -9,8 +9,10 @@
 from transformers import PreTrainedTokenizerBase
 
 import vllm.envs as envs
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/openai_tool_parser.py b/vllm/tool_parsers/openai_tool_parser.py
index da1a9c773f78..76f7a49dfaea 100644
--- a/vllm/tool_parsers/openai_tool_parser.py
+++ b/vllm/tool_parsers/openai_tool_parser.py
@@ -4,14 +4,16 @@
 from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
-from vllm.entrypoints.openai.parser.harmony_utils import parse_output_into_messages
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     ExtractedToolCallInformation,
     FunctionCall,
     ToolCall,
 )
+from vllm.entrypoints.openai.parser.harmony_utils import parse_output_into_messages
 from vllm.logger import init_logger
 from vllm.tool_parsers.abstract_tool_parser import (
     ToolParser,
diff --git a/vllm/tool_parsers/phi4mini_tool_parser.py b/vllm/tool_parsers/phi4mini_tool_parser.py
index 9003429d8c6f..f222cffd61d3 100644
--- a/vllm/tool_parsers/phi4mini_tool_parser.py
+++ b/vllm/tool_parsers/phi4mini_tool_parser.py
@@ -9,8 +9,10 @@
 from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaMessage,
     ExtractedToolCallInformation,
     FunctionCall,
diff --git a/vllm/tool_parsers/pythonic_tool_parser.py b/vllm/tool_parsers/pythonic_tool_parser.py
index 476a62d5f527..dc9926608e60 100644
--- a/vllm/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/tool_parsers/pythonic_tool_parser.py
@@ -10,8 +10,10 @@
 from transformers import PreTrainedTokenizerBase
 
 import vllm.envs as envs
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/qwen3coder_tool_parser.py b/vllm/tool_parsers/qwen3coder_tool_parser.py
index d1a3cbeaafc7..a3c79f865b15 100644
--- a/vllm/tool_parsers/qwen3coder_tool_parser.py
+++ b/vllm/tool_parsers/qwen3coder_tool_parser.py
@@ -8,9 +8,11 @@
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
     ChatCompletionToolsParam,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/qwen3xml_tool_parser.py b/vllm/tool_parsers/qwen3xml_tool_parser.py
index 107f791654a1..f7dcf20abb7c 100644
--- a/vllm/tool_parsers/qwen3xml_tool_parser.py
+++ b/vllm/tool_parsers/qwen3xml_tool_parser.py
@@ -9,9 +9,11 @@
 import regex as re
 
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
     ChatCompletionToolsParam,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/seed_oss_tool_parser.py b/vllm/tool_parsers/seed_oss_tool_parser.py
index 206072e65b10..6927071c3846 100644
--- a/vllm/tool_parsers/seed_oss_tool_parser.py
+++ b/vllm/tool_parsers/seed_oss_tool_parser.py
@@ -11,9 +11,11 @@
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
     ChatCompletionToolsParam,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/step3_tool_parser.py b/vllm/tool_parsers/step3_tool_parser.py
index acd99bf56d0b..8e6f27907c96 100644
--- a/vllm/tool_parsers/step3_tool_parser.py
+++ b/vllm/tool_parsers/step3_tool_parser.py
@@ -8,8 +8,10 @@
 
 import regex as re
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,
diff --git a/vllm/tool_parsers/utils.py b/vllm/tool_parsers/utils.py
index 570eb447a467..cbbf5b545538 100644
--- a/vllm/tool_parsers/utils.py
+++ b/vllm/tool_parsers/utils.py
@@ -13,7 +13,7 @@
 from openai.types.responses.tool import Tool
 from partial_json_parser.core.options import Allow
 
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionNamedToolChoiceParam,
     ChatCompletionToolsParam,
 )
diff --git a/vllm/tool_parsers/xlam_tool_parser.py b/vllm/tool_parsers/xlam_tool_parser.py
index 9c2b585fe9fd..d0d191ad28a2 100644
--- a/vllm/tool_parsers/xlam_tool_parser.py
+++ b/vllm/tool_parsers/xlam_tool_parser.py
@@ -6,10 +6,11 @@
 from typing import Any, Optional, Union
 
 import regex as re
-
-from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
     ChatCompletionRequest,
+)
+from vllm.entrypoints.chat_utils import make_tool_call_id
+from vllm.entrypoints.openai.engine.protocol import (
     DeltaFunctionCall,
     DeltaMessage,
     DeltaToolCall,