From a477e4f74f9a65ed7be5ff5fbd14e520b4831aee Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Thu, 12 Mar 2026 11:19:23 +0800 Subject: [PATCH 1/5] [Bugfix] Fix crash when tool_choice=required exceeds max_tokens Signed-off-by: chaunceyjiang --- tests/tool_use/test_chat_completions.py | 31 +++++++++++++++++++ .../openai/chat_completion/serving.py | 1 - vllm/entrypoints/openai/engine/serving.py | 10 ++++-- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index 07b7933f65c0..1e6d5f37e9f3 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -193,3 +193,34 @@ async def test_response_format_with_tool_choice_required( assert choice.finish_reason == "tool_calls" assert choice.message.tool_calls is not None assert len(choice.message.tool_calls) > 0 + + +@pytest.mark.asyncio +@pytest.mark.timeout(120) +async def test_max_tokens_with_tool_choice_required( + client: openai.AsyncOpenAI, server_config: ServerConfig +): + """ """ + models = await client.models.list() + model_name: str = models.data[0].id + + # This combination previously crashed the engine + chat_completion = await client.chat.completions.create( + messages=ensure_system_prompt( + [{"role": "user", "content": "What is the weather in Dallas, Texas?"}], + server_config, + ), + temperature=0, + max_completion_tokens=150, + model=model_name, + tools=[WEATHER_TOOL], + tool_choice="required", + max_tokens=5, + ) + # When `tool_choice="required"` and the tokens of `tools` exceed `max_tokens`, + # both `tool_calls` and `content` should be empty. + # This behavior should be consistent with OpenAI. 
+ choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert len(choice.message.tool_calls) == 0 + assert choice.message.content == "" diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index eb39e649a7e4..9af597847fcc 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -1507,7 +1507,6 @@ async def chat_completion_full_generator( elif request.tool_choice and request.tool_choice == "required": tool_call_class_items = [] - assert tool_calls is not None and len(tool_calls) > 0 for idx, tool_call in enumerate(tool_calls): # Use native ID if available, # otherwise generate ID with correct id_type diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index fad2a7f8c2eb..2ef08893ed62 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import contextlib import json import time from collections.abc import AsyncGenerator, Callable, Mapping, Sequence @@ -13,7 +14,7 @@ from openai.types.responses import ( ToolChoiceFunction, ) -from pydantic import ConfigDict, TypeAdapter +from pydantic import ConfigDict, TypeAdapter, ValidationError from starlette.datastructures import Headers import vllm.envs as envs @@ -1125,8 +1126,11 @@ def _parse_tool_calls_from_content( ) content = None # Clear content since tool is called. 
elif request.tool_choice == "required": - assert content is not None - tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(content) + tool_calls = [] + with contextlib.suppress(ValidationError): + tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json( + content + ) function_calls.extend( [ FunctionCall( From 1be8bdfab2ea7014fa66147a7eacad9de3d28d85 Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Thu, 12 Mar 2026 11:25:35 +0800 Subject: [PATCH 2/5] [Bugfix] Fix crash when tool_choice=required exceeds max_tokens Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/engine/serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 2ef08893ed62..668526e3c430 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -1126,8 +1126,8 @@ def _parse_tool_calls_from_content( ) content = None # Clear content since tool is called. 
elif request.tool_choice == "required": - tool_calls = [] with contextlib.suppress(ValidationError): + content = content or "[]" tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json( content ) From 377798e55e646132e5d5fe2d5bc85dec04c7af86 Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Thu, 12 Mar 2026 11:32:57 +0800 Subject: [PATCH 3/5] [Bugfix] Fix crash when tool_choice=required exceeds max_tokens Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/chat_completion/serving.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index 9af597847fcc..da934c05d5a7 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -1507,6 +1507,7 @@ async def chat_completion_full_generator( elif request.tool_choice and request.tool_choice == "required": tool_call_class_items = [] + tool_calls = tool_calls or [] for idx, tool_call in enumerate(tool_calls): # Use native ID if available, # otherwise generate ID with correct id_type From 63078763662693d1141d194f4f8cabfca3f6f41b Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Thu, 12 Mar 2026 14:15:48 +0800 Subject: [PATCH 4/5] [Bugfix] Fix crash when tool_choice=required exceeds max_tokens Signed-off-by: chaunceyjiang --- tests/tool_use/test_chat_completions.py | 2 +- vllm/entrypoints/openai/engine/serving.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index 1e6d5f37e9f3..fb72f0aab819 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -215,7 +215,7 @@ async def test_max_tokens_with_tool_choice_required( model=model_name, tools=[WEATHER_TOOL], tool_choice="required", - max_tokens=5, + max_tokens=1, ) # When `tool_choice="required"` and the tokens of `tools` exceed `max_tokens`, # 
both `tool_calls` and `content` should be empty. diff --git a/vllm/entrypoints/openai/engine/serving.py index 668526e3c430..2049b3adfd3c 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -1126,20 +1126,19 @@ def _parse_tool_calls_from_content( ) content = None # Clear content since tool is called. elif request.tool_choice == "required": + tool_calls = [] with contextlib.suppress(ValidationError): - content = content or "[]" + content = content or "" tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json( content ) - function_calls.extend( - [ + for tool_call in tool_calls: + function_calls.append( FunctionCall( name=tool_call.name, arguments=json.dumps(tool_call.parameters, ensure_ascii=False), ) - for tool_call in tool_calls - ] - ) + ) content = None # Clear content since tool is called. elif ( tool_parser_cls From d2533751cb5975418416798f69bb962dff3a445a Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Thu, 12 Mar 2026 14:22:08 +0800 Subject: [PATCH 5/5] [Bugfix] Fix crash when tool_choice=required exceeds max_tokens Signed-off-by: chaunceyjiang --- .../test_completion_with_function_calling.py | 24 ++++++++++++++ tests/tool_use/test_chat_completions.py | 31 ------------------- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 15a2fb85f489..39ab13213134 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -514,3 +514,27 @@ async def test_inconsistent_tool_choice_and_tools( ], tool_choice={}, ) + + +@pytest.mark.asyncio +async def test_max_tokens_with_tool_choice_required(client: openai.AsyncOpenAI): + """`tool_choice="required"` truncated by `max_tokens` must not crash the engine.""" + models = await client.models.list() + model_name: str = models.data[0].id + + # This
combination previously crashed the engine + chat_completion = await client.chat.completions.create( + messages=messages, + temperature=0, + max_completion_tokens=1, + model=model_name, + tools=tools, + tool_choice="required", + ) + # When `tool_choice="required"` and the tokens of `tools` exceed `max_tokens`, + # both `tool_calls` and `content` should be empty. + # This behavior should be consistent with OpenAI. + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert len(choice.message.tool_calls) == 0 + assert choice.message.content == "" diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index fb72f0aab819..07b7933f65c0 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -193,34 +193,3 @@ async def test_response_format_with_tool_choice_required( assert choice.finish_reason == "tool_calls" assert choice.message.tool_calls is not None assert len(choice.message.tool_calls) > 0 - - -@pytest.mark.asyncio -@pytest.mark.timeout(120) -async def test_max_tokens_with_tool_choice_required( - client: openai.AsyncOpenAI, server_config: ServerConfig -): - """ """ - models = await client.models.list() - model_name: str = models.data[0].id - - # This combination previously crashed the engine - chat_completion = await client.chat.completions.create( - messages=ensure_system_prompt( - [{"role": "user", "content": "What is the weather in Dallas, Texas?"}], - server_config, - ), - temperature=0, - max_completion_tokens=150, - model=model_name, - tools=[WEATHER_TOOL], - tool_choice="required", - max_tokens=1, - ) - # When `tool_choice="required"` and the tokens of `tools` exceed `max_tokens`, - # both `tool_calls` and `content` should be empty. - # This behavior should be consistent with OpenAI. - choice = chat_completion.choices[0] - assert choice.finish_reason == "length" - assert len(choice.message.tool_calls) == 0 - assert choice.message.content == ""