Commit 69022c4

Author: Andrew Xia (committed)

working mcp

Signed-off-by: Andrew Xia <[email protected]>

1 parent: aa4ecbc

File tree: 4 files changed (+142, -17 lines)

vllm/entrypoints/context.py

Lines changed: 28 additions & 5 deletions

@@ -15,7 +15,10 @@
 from openai_harmony import Author, Message, Role, StreamState, TextContent

 from vllm import envs
-from vllm.entrypoints.chat_utils import CustomChatCompletionMessageParam
+from vllm.entrypoints.chat_utils import (
+    ChatTemplateContentFormatOption,
+    CustomChatCompletionMessageParam,
+)
 from vllm.entrypoints.harmony_utils import (
     get_encoding,
     get_streamable_parser_for_assistant,
@@ -193,6 +196,9 @@ def __init__(
         request: ResponsesRequest,
         available_tools: list[str] | None,
         tool_parser_cls,
+        chat_template: str | None,
+        chat_template_content_format: ChatTemplateContentFormatOption,
+        tool_dicts: list[dict] | None = None,
     ):
         self.last_output = None
         self.num_prompt_tokens = 0
@@ -210,6 +216,8 @@ def __init__(
             request=request,
             tool_parser_cls=tool_parser_cls,
         )
+        self.tool_parser_cls = tool_parser_cls
+        self.request = request
         self.tokenizer = tokenizer
         self.reasoning_parser = reasoning_parser

@@ -220,6 +228,10 @@ def __init__(
         self._tool_sessions: dict[str, ClientSession | Tool] = {}
         self.called_tools: set[str] = set()

+        self.chat_template = chat_template
+        self.chat_template_content_format = chat_template_content_format
+        self.tool_dicts = tool_dicts
+
     def append_output(
         self, output: RequestOutput | list[CustomChatCompletionMessageParam]
     ) -> None:
@@ -252,8 +264,9 @@ async def call_python_tool(
         self.called_tools.add("python")
         if isinstance(tool_session, Tool):
             return await tool_session.get_result(self)
+        args = json.loads(last_msg.arguments)
         param = {
-            "code": last_msg.arguments,
+            "code": args['code'],
         }
         result = await tool_session.call_tool("python", param)
         result_str = result.content[0].text
@@ -263,7 +276,9 @@

         message = CustomChatCompletionMessageParam(
             role="tool",
-            content=[ChatCompletionContentPartTextParam(text=content, type="text")],
+            content=[
+                ChatCompletionContentPartTextParam(text=content, type="text")
+            ],  # TODO: why is this nested?
         )

         return [message]
@@ -281,8 +296,16 @@ async def call_tool(self) -> list[CustomChatCompletionMessageParam]:
         # if recipient is not None and recipient.startswith("python"):
         #     return await self.call_python_tool(self._tool_sessions["python"], last_tool_request)

-    def render_for_completion(self) -> list[int]:
-        raise NotImplementedError("Should not be called.")
+    def render_for_completion(self):
+        return [
+            self.request,
+            self.tokenizer,
+            self.parser.chat_completion_messages,
+            self.tool_dicts,
+            self.tool_parser_cls,
+            self.chat_template,
+            self.chat_template_content_format,
+        ]

     async def init_tool_sessions(
         self,
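
Two behavioral changes in this file are easy to miss. First, call_python_tool now parses the model's tool arguments as a JSON object and extracts its "code" field, where it previously forwarded the raw arguments string as the code itself. A minimal sketch of the new argument handling, with a hypothetical arguments payload (the json import is assumed to already exist in context.py):

import json

# Hypothetical arguments string as a model would emit it for the MCP python
# tool: a JSON object wrapping the source, not bare code.
last_msg_arguments = '{"code": "print(1 + 1)"}'

args = json.loads(last_msg_arguments)
param = {"code": args["code"]}  # what tool_session.call_tool("python", param) now receives

Second, render_for_completion() no longer raises NotImplementedError: it packs the request, tokenizer, parsed messages, tool dicts, tool parser class, and chat-template settings into a list that the serving layer unpacks positionally (see the serving_engine.py hunks below).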

vllm/entrypoints/openai/parser/parser.py

Lines changed: 6 additions & 6 deletions

@@ -5,9 +5,6 @@
 from openai.types.chat.chat_completion_content_part_text_param import (
     ChatCompletionContentPartTextParam,
 )
-from openai.types.responses.response_reasoning_item import (
-    Content as ResponseReasoningTextContent,
-)

 from vllm.entrypoints.chat_utils import CustomChatCompletionMessageParam
 from vllm.entrypoints.openai.protocol import FunctionCall, ResponsesRequest
@@ -63,7 +60,7 @@ def process(self, output: CompletionOutput) -> "StreamableParser":
             )
         )
         if reasoning_content:
-            new_content = ResponseReasoningTextContent(
+            new_content = ChatCompletionContentPartTextParam(
                 text=reasoning_content, type="reasoning_text"
             )

@@ -77,6 +74,7 @@ def process(self, output: CompletionOutput) -> "StreamableParser":
         if tool_call_info is not None and tool_call_info.tools_called:
             # extract_tool_calls() returns a list of tool calls.
             function_calls.extend(
+                # TODO: this should be a TypedDict
                 FunctionCall(
                     name=tool_call.function.name,
                     arguments=tool_call.function.arguments,
@@ -92,8 +90,10 @@ def process(self, output: CompletionOutput) -> "StreamableParser":
             self.current_chat_completion_message["content"].extend(function_calls)

         self.chat_completion_messages.append(self.current_chat_completion_message)
-        # if len(function_calls) > 0:
-        # TODO: add a tool call to the parser
+
+        self.current_chat_completion_message = CustomChatCompletionMessageParam(
+            role=self.current_role, content=[]
+        )

         return self
vllm/entrypoints/openai/serving_engine.py

Lines changed: 81 additions & 4 deletions

@@ -1179,6 +1179,61 @@ async def _process_inputs(
         )
         return engine_request, tokenization_kwargs

+    async def _render_next_turn(
+        self,
+        request,
+        tokenizer,
+        messages,
+        tool_dicts,
+        tool_parser,
+        chat_template,
+        chat_template_content_format,
+    ):
+        new_messages = []
+        for item in messages:
+            if item["role"] == "user" or item["role"] == "tool":
+                new_messages.append(item)
+            elif item["role"] == "assistant":
+                for content in item["content"]:
+                    if isinstance(content, FunctionCall):
+                        new_msg = {
+                            "role": "assistant",
+                            "tool_calls": [
+                                {
+                                    "id": "dafsdfdsa",
+                                    "type": "function",
+                                    "function": {
+                                        "name": content.name,
+                                        "arguments": content.arguments,
+                                    },
+                                }
+                            ],
+                        }
+                        new_messages.append(new_msg)
+                    elif content["type"] == "text":
+                        new_messages.append(
+                            {"role": "assistant", "content": content["text"]}
+                        )
+                    elif content["type"] == "reasoning_text":
+                        reasoning_content = content["text"]
+                        new_messages.append(
+                            {
+                                "role": "assistant",
+                                "content": "<think>" + reasoning_content + "</think>",
+                            }
+                        )
+
+        _, request_prompts, engine_prompts = await self._preprocess_chat(
+            request,
+            tokenizer,
+            new_messages,
+            tool_dicts=tool_dicts,
+            tool_parser=tool_parser,
+            chat_template=chat_template,
+            chat_template_content_format=chat_template_content_format,
+        )
+        return request_prompts, engine_prompts
+
     async def _generate_with_builtin_tools(
         self,
         request_id: str,
@@ -1238,11 +1293,33 @@ async def _generate_with_builtin_tools(

             # Create inputs for the next turn.
             # Render the next prompt token ids.
-            prompt_token_ids = context.render_for_completion()
-            engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
-            request_prompt = prompt_token_ids
+            [
+                request,
+                tokenizer,
+                messages,
+                tool_dicts,
+                tool_parser,
+                chat_template,
+                chat_template_content_format,
+            ] = context.render_for_completion()
+
+            # HACK
+            request_prompts, engine_prompts = await self._render_next_turn(
+                request,
+                tokenizer,
+                messages,
+                tool_dicts,
+                tool_parser,
+                chat_template,
+                chat_template_content_format,
+            )
+            engine_prompt = engine_prompts[0]
+            request_prompt = request_prompts[0]
+
+            # engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
+            # request_prompt = prompt_token_ids
             # Update the sampling params.
-            sampling_params.max_tokens = self.max_model_len - len(prompt_token_ids)
+            sampling_params.max_tokens = self.max_model_len - len(engine_prompt)
             # OPTIMIZATION
             priority = orig_priority - 1
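
The new _render_next_turn flattens the messages accumulated by the parser back into standard chat-completion messages before re-rendering the prompt via _preprocess_chat: FunctionCall parts become assistant tool_calls entries (with the hard-coded placeholder id from the diff), text parts become plain assistant messages, and reasoning parts are re-wrapped in <think> tags. A sketch of that transformation on invented inputs:

from vllm.entrypoints.openai.protocol import FunctionCall

# Invented parser output for one turn: a user message plus an assistant
# message whose content mixes a reasoning part and a tool call.
parsed = [
    {"role": "user", "content": "What is 2 + 2?"},
    {
        "role": "assistant",
        "content": [
            {"type": "reasoning_text", "text": "I should run this in python."},
            FunctionCall(name="python", arguments='{"code": "print(2 + 2)"}'),
        ],
    },
]

# What the branches in _render_next_turn would produce, part by part:
rendered = [
    {"role": "user", "content": "What is 2 + 2?"},
    {"role": "assistant", "content": "<think>I should run this in python.</think>"},
    {
        "role": "assistant",
        "tool_calls": [
            {
                "id": "dafsdfdsa",  # placeholder id hard-coded in the diff
                "type": "function",
                "function": {
                    "name": "python",
                    "arguments": '{"code": "print(2 + 2)"}',
                },
            }
        ],
    },
]

Note the retained comment still says "Render the next prompt token ids", but the turn is now re-rendered from messages through the chat template rather than from harmony token ids.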

vllm/entrypoints/openai/serving_responses.py

Lines changed: 27 additions & 2 deletions

@@ -390,6 +390,14 @@ async def create_responses(
                 request=request,
                 tool_parser_cls=self.tool_parser,
                 available_tools=available_tools,
+                tool_dicts=[
+                    convert_tool_responses_to_completions_format(
+                        tool.model_dump()
+                    )
+                    for tool in request.tools
+                ],
+                chat_template=self.chat_template,
+                chat_template_content_format=self.chat_template_content_format,
             )
         else:
             context = SimpleContext()
@@ -804,6 +812,16 @@ def _make_response_output_items_from_parsable_context(
         output_items: list[ResponseOutputItem] = []

         for sentence in chat_completion_messages:
+            # if sentence['role'] == 'tool':
+            # TODO: this should be a McpCall type
+            # function_call_output = ResponseFunctionToolCallOutputItem(
+            #     id=f"fc_{random_uuid()}",
+            #     call_id=f"call_{random_uuid()}",
+            #     type="function_call_output",
+            #     status="completed",
+            #     output=sentence['content'][0]['text'].text,
+            # )
+            # output_items.append(function_call_output)
             if sentence["role"] != "assistant":
                 # This could be a system/user message, and
                 # This is a message from a tool to the assistant (e.g., search result).
@@ -812,13 +830,20 @@
                 continue

             for content in sentence["content"]:
-                if isinstance(content, ResponseReasoningTextContent):
+                if (
+                    isinstance(content, dict)
+                    and content.get("type") == "reasoning_text"
+                ):
                     # Reasoning content
                     reasoning_item = ResponseReasoningItem(
                         id=f"rs_{random_uuid()}",
                         summary=[],
                         type="reasoning",
-                        content=[content],
+                        content=[
+                            ResponseReasoningTextContent(
+                                text=content["text"], type="reasoning_text"
+                            )
+                        ],
                         status="completed",
                     )
                     output_items.append(reasoning_item)
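
Since the parser now stores reasoning parts as plain dicts, this function detects them by shape (a dict whose "type" is "reasoning_text") and rebuilds the typed Responses-API object only at the output boundary. A sketch of just that conversion, with an invented text value (ResponseReasoningTextContent is the Content alias from openai.types.responses.response_reasoning_item, the same import parser.py dropped):

from openai.types.responses.response_reasoning_item import (
    Content as ResponseReasoningTextContent,
)

# Invented reasoning part in the parser's dict form.
content = {"type": "reasoning_text", "text": "First, check the units..."}

# Rebuild the typed content object for the ResponseReasoningItem payload.
part = ResponseReasoningTextContent(text=content["text"], type="reasoning_text")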
