Commit 1584cb0

Author: Andrew Xia
Commit message: kimi k2 fixes
Signed-off-by: Andrew Xia <[email protected]>
1 parent 62061ee commit 1584cb0

File tree

3 files changed: +14 -4 lines changed


vllm/entrypoints/chat_utils.py

Lines changed: 9 additions & 1 deletion
@@ -1495,6 +1495,12 @@ def _parse_chat_message_content(
     role = message["role"]
     content = message.get("content")
     reasoning = message.get("reasoning") or message.get("reasoning_content")
+    # TODO: get from reasoning_content?
+
+    # HACK
+    if role == "tool":
+        content_format = "openai"
+
     if content is None:
         content = []
     elif isinstance(content, str):
@@ -1503,7 +1509,9 @@ def _parse_chat_message_content(
         role,
         content,  # type: ignore
         mm_tracker,
-        wrap_dicts=(content_format == "openai"),
+        wrap_dicts=(
+            content_format == "openai"
+        ),  # kimik2 thinks this is string, breaks on tool
         interleave_strings=interleave_strings,
     )

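For context, this change forces the "openai" content format for tool-role messages so their string output is wrapped into a list of content-part dicts; per the in-diff comment, Kimi K2's template otherwise receives a bare string and breaks on tool messages. A minimal sketch of what that wrapping amounts to is below; the helper name wrap_tool_content is hypothetical, not vLLM's actual API.

# Sketch only: illustrates wrapping tool output into "openai"-style
# content parts. `wrap_tool_content` is a made-up name.
def wrap_tool_content(content):
    if isinstance(content, str):
        # A bare string becomes a single text part, so downstream code
        # that expects a list of dicts does not break on a tool message.
        return [{"type": "text", "text": content}]
    # Already a list of parts; pass through unchanged.
    return content

print(wrap_tool_content("42"))  # [{'type': 'text', 'text': '42'}]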
vllm/entrypoints/context.py

Lines changed: 2 additions & 2 deletions
@@ -263,7 +263,7 @@ def need_builtin_tool_call(self) -> bool:
         last_message = self.parser.chat_completion_messages[-1]["content"][-1]
         if isinstance(last_message, FunctionCall):
             # HACK: figure out which tools are MCP tools
-            if last_message.name == "code_interpreter":
+            if last_message.name == "code_interpreter" or last_message.name == "python":
                 return True

         return False
@@ -276,7 +276,7 @@ async def call_python_tool(
             return await tool_session.get_result(self)
         args = json.loads(last_msg.arguments)
         param = {
-            "code": args['code'],
+            "code": args["code"],
         }
         result = await tool_session.call_tool("python", param)
         result_str = result.content[0].text

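The first hunk widens the built-in tool check so a function call named either "code_interpreter" or "python" is routed to the built-in Python tool, presumably because Kimi K2 emits the code tool under the name "python". A small sketch of an equivalent membership test follows; BUILTIN_PYTHON_TOOL_NAMES is a hypothetical constant, not part of vLLM.

# Sketch only: the diff hard-codes the two names inline; a set
# membership test expresses the same check.
BUILTIN_PYTHON_TOOL_NAMES = {"code_interpreter", "python"}

def is_builtin_python_tool(name: str) -> bool:
    return name in BUILTIN_PYTHON_TOOL_NAMES

assert is_builtin_python_tool("python")
assert is_builtin_python_tool("code_interpreter")
assert not is_builtin_python_tool("get_weather")  # made-up tool name

The second hunk only normalizes the quoting of args["code"] to double quotes; behavior is unchanged.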
vllm/entrypoints/openai/serving_engine.py

Lines changed: 3 additions & 1 deletion
@@ -1334,7 +1334,9 @@ async def _generate_with_builtin_tools(
         # engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
         # request_prompt = prompt_token_ids
         # Update the sampling params.
-        sampling_params.max_tokens = self.max_model_len - len(engine_prompt)
+        sampling_params.max_tokens = self.max_model_len - len(
+            engine_prompt["prompt_token_ids"]
+        )
         # OPTIMIZATION
         priority = orig_priority - 1

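This fix recomputes the remaining token budget from the number of prompt token IDs rather than from len(engine_prompt), which for a dict-shaped prompt counts its keys instead of its tokens. A minimal sketch of the difference, with made-up values:

# Sketch with made-up values: len() must be taken from the token-ID
# list, not from the prompt dict itself.
max_model_len = 8192
engine_prompt = {"prompt_token_ids": [101, 2023, 2003, 1037, 3231, 102]}

wrong_budget = max_model_len - len(engine_prompt)  # len of dict == 1 key
max_tokens = max_model_len - len(engine_prompt["prompt_token_ids"])  # 6 tokens

print(wrong_budget, max_tokens)  # 8191 8186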