Don't use sentence types (Sentence, Author, Role, TextContent)

Andrew Xia · Andrew Xia · commit c7e56ac0a952 · 2025-11-15T16:36:29.000-08:00
Signed-off-by: Andrew Xia &lt;axia@fb.com&gt;
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
@@ -36,6 +36,9 @@
 )
 from openai.types.chat.chat_completion_content_part_input_audio_param import InputAudio
 from openai.types.responses import ResponseInputImageParam
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent,
+)
 from openai_harmony import Message as OpenAIHarmonyMessage
 from PIL import Image
 from pydantic import BaseModel, ConfigDict, TypeAdapter
@@ -216,6 +219,7 @@ class CustomThinkCompletionContentParam(TypedDict, total=False):
     | CustomChatCompletionContentSimpleVideoParam
     | str
     | CustomThinkCompletionContentParam
+    | ResponseReasoningTextContent
 )
 
 
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
@@ -223,6 +223,9 @@ def append_output(self, output) -> None:
         for token_id in output_token_ids:
             self.parser.process(token_id)
 
+    def append_tool_output(self, output) -> None:
+        raise NotImplementedError("Should not be called.")
+
     def need_builtin_tool_call(self) -> bool:
         return False
 
diff --git a/vllm/entrypoints/openai/parser/parser.py b/vllm/entrypoints/openai/parser/parser.py
@@ -2,7 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import logging
 
-from vllm.entrypoints.openai.parser.sentence import Author, Role, Sentence, TextContent
+from openai.types.chat.chat_completion_content_part_text_param import (
+    ChatCompletionContentPartTextParam,
+)
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent,
+)
+
+from vllm.entrypoints.chat_utils import CustomChatCompletionMessageParam
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
 
 logger = logging.getLogger(__name__)
@@ -12,17 +19,20 @@ class StreamableParser:
     """Incremental parser over completion tokens with reasoning support."""
 
     def __init__(self, *, tokenizer, reasoning_parser: ReasoningParser):
-        self.sentences: list[Sentence] = []
+        self.chat_completion_messages: list[CustomChatCompletionMessageParam] = []
         self.tokens: list[int] = []
         self.tokenizer = tokenizer
 
         # Initialize reasoning parser instance if provided
         self.reasoning_parser_instance = reasoning_parser(tokenizer)
 
         # start like this
-        self.current_role = Role.ASSISTANT
-        self.current_sentence = Sentence(
-            author=Author(role=self.current_role), content=[]
+        self.current_role = "assistant"
+        # self.current_sentence = Sentence(
+        #     author=Author(role=self.current_role), content=[]
+        # )
+        self.current_chat_completion_message = CustomChatCompletionMessageParam(
+            role=self.current_role, content=[]
         )
         self.current_channel = "think"
         self.current_text = ""
@@ -38,20 +48,27 @@ def process(self, token: int) -> "StreamableParser":
         self.tokens.append(token)
         decoded = self.tokenizer.decode(token)
         if self.reasoning_parser_instance.is_reasoning_end([token]):
-            new_content = TextContent(
-                text=self.current_text, channel=self.current_channel
+            # TODO: how to capture reasoning?
+            # new_content = {
+            #     "role": "assistant",
+            #     "reasoning_content": self.current_text
+            # }
+
+            new_content = ResponseReasoningTextContent(
+                text=self.current_text, type="reasoning_text"
             )
-            self.current_sentence.content.append(new_content)
+
+            self.current_chat_completion_message["content"].append(new_content)
 
             self.current_text = ""
             self.current_channel = "final"
         elif token == self.tokenizer.eos_token_id:
             # end of sentence
-            new_content = TextContent(
-                text=self.current_text, channel=self.current_channel
+            new_content = ChatCompletionContentPartTextParam(
+                text=self.current_text, type="text"
             )
-            self.current_sentence.content.append(new_content)
-            self.sentences.append(self.current_sentence)
+            self.current_chat_completion_message["content"].append(new_content)
+            self.chat_completion_messages.append(self.current_chat_completion_message)
 
             self.current_text = ""
             self.current_channel = None
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
@@ -55,6 +55,7 @@
 from vllm.entrypoints.chat_utils import (
     ChatCompletionMessageParam,
     ChatTemplateContentFormatOption,
+    CustomChatCompletionMessageParam,
 )
 from vllm.entrypoints.context import (
     ConversationContext,
@@ -76,7 +77,6 @@
     render_for_completion,
 )
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.parser.sentence import Sentence
 from vllm.entrypoints.openai.protocol import (
     DeltaMessage,
     ErrorResponse,
@@ -271,7 +271,9 @@ async def create_responses(
         | ErrorResponse
     ):
         error_check_ret = await self._check_model(request)
-        import fbvscode; fbvscode.set_trace()
+        import fbvscode
+
+        fbvscode.set_trace()
         if error_check_ret is not None:
             logger.error("Error with model %s", error_check_ret)
             return error_check_ret
@@ -611,9 +613,9 @@ async def responses_full_generator(
             else:
                 status = "incomplete"
         elif isinstance(context, ParsableContext):
-            sentences = context.parser.sentences
+            chat_completion_messages = context.parser.chat_completion_messages
             output = self._make_response_output_items_from_parsable_context(
-                request, sentences
+                request, chat_completion_messages
             )
 
             # TODO: context for non-gptoss models doesn't use messages
@@ -784,7 +786,9 @@ def _create_stream_response_logprobs(
         ]
 
     def _make_response_output_items_from_parsable_context(
-        self, request: ResponsesRequest, sentences: list[Sentence]
+        self,
+        request: ResponsesRequest,
+        chat_completion_messages: list[CustomChatCompletionMessageParam],
     ) -> list[ResponseOutputItem]:
         """Given a list of sentences, construct ResponseOutput Items.
 
@@ -795,29 +799,25 @@ def _make_response_output_items_from_parsable_context(
         """
         output_items: list[ResponseOutputItem] = []
 
-        for sentence in sentences:
-            for text_content in sentence.content:
-                channel = text_content.channel
-                text = text_content.text
-
-                if channel == "think" or channel == "analysis":
+        for sentence in chat_completion_messages:
+            for text_content in sentence["content"]:
+                if isinstance(text_content, ResponseReasoningTextContent):
                     # Reasoning content
                     reasoning_item = ResponseReasoningItem(
                         id=f"rs_{random_uuid()}",
                         summary=[],
                         type="reasoning",
-                        content=[
-                            ResponseReasoningTextContent(
-                                text=text, type="reasoning_text"
-                            )
-                        ],
+                        content=[text_content],
                         status="completed",
                     )
                     output_items.append(reasoning_item)
-                elif channel == "final":
+                elif (
+                    isinstance(text_content, dict)
+                    and text_content.get("type") == "text"
+                ):
                     # Final output content
                     output_text = ResponseOutputText(
-                        text=text,
+                        text=text_content["text"],
                         annotations=[],
                         type="output_text",
                         logprobs=None,  # Not available from parser