From a477e4f74f9a65ed7be5ff5fbd14e520b4831aee Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Thu, 12 Mar 2026 11:19:23 +0800 Subject: [PATCH 1/5] [Bugfix] Fix crash when tool_choice=required exceeds max_tokens Signed-off-by: chaunceyjiang --- tests/tool_use/test_chat_completions.py | 31 +++++++++++++++++++ .../openai/chat_completion/serving.py | 1 - vllm/entrypoints/openai/engine/serving.py | 10 ++++-- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index 07b7933f65c0..1e6d5f37e9f3 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -193,3 +193,34 @@ async def test_response_format_with_tool_choice_required( assert choice.finish_reason == "tool_calls" assert choice.message.tool_calls is not None assert len(choice.message.tool_calls) > 0 + + +@pytest.mark.asyncio +@pytest.mark.timeout(120) +async def test_max_tokens_with_tool_choice_required( + client: openai.AsyncOpenAI, server_config: ServerConfig +): + """ """ + models = await client.models.list() + model_name: str = models.data[0].id + + # This combination previously crashed the engine + chat_completion = await client.chat.completions.create( + messages=ensure_system_prompt( + [{"role": "user", "content": "What is the weather in Dallas, Texas?"}], + server_config, + ), + temperature=0, + max_completion_tokens=150, + model=model_name, + tools=[WEATHER_TOOL], + tool_choice="required", + max_tokens=5, + ) + # When `tool_choice="required"` and the tokens of `tools` exceed `max_tokens`, + # both `tool_calls` and `content` should be empty. + # This behavior should be consistent with OpenAI. 
+ choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert len(choice.message.tool_calls) == 0 + assert choice.message.content == "" diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index eb39e649a7e4..9af597847fcc 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -1507,7 +1507,6 @@ async def chat_completion_full_generator( elif request.tool_choice and request.tool_choice == "required": tool_call_class_items = [] - assert tool_calls is not None and len(tool_calls) > 0 for idx, tool_call in enumerate(tool_calls): # Use native ID if available, # otherwise generate ID with correct id_type diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index fad2a7f8c2eb..2ef08893ed62 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import contextlib import json import time from collections.abc import AsyncGenerator, Callable, Mapping, Sequence @@ -13,7 +14,7 @@ from openai.types.responses import ( ToolChoiceFunction, ) -from pydantic import ConfigDict, TypeAdapter +from pydantic import ConfigDict, TypeAdapter, ValidationError from starlette.datastructures import Headers import vllm.envs as envs @@ -1125,8 +1126,11 @@ def _parse_tool_calls_from_content( ) content = None # Clear content since tool is called. 
elif request.tool_choice == "required": - assert content is not None - tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(content) + tool_calls = [] + with contextlib.suppress(ValidationError): + tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json( + content + ) function_calls.extend( [ FunctionCall( From 1be8bdfab2ea7014fa66147a7eacad9de3d28d85 Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Thu, 12 Mar 2026 11:25:35 +0800 Subject: [PATCH 2/5] [Bugfix] Fix crash when tool_choice=required exceeds max_tokens Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/engine/serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 2ef08893ed62..668526e3c430 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -1126,8 +1126,8 @@ def _parse_tool_calls_from_content( ) content = None # Clear content since tool is called. 
elif request.tool_choice == "required": - tool_calls = [] with contextlib.suppress(ValidationError): + content = content or "[]" tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json( content ) From 377798e55e646132e5d5fe2d5bc85dec04c7af86 Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Thu, 12 Mar 2026 11:32:57 +0800 Subject: [PATCH 3/5] [Bugfix] Fix crash when tool_choice=required exceeds max_tokens Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/chat_completion/serving.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index 9af597847fcc..da934c05d5a7 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -1507,6 +1507,7 @@ async def chat_completion_full_generator( elif request.tool_choice and request.tool_choice == "required": tool_call_class_items = [] + tool_calls = tool_calls or [] for idx, tool_call in enumerate(tool_calls): # Use native ID if available, # otherwise generate ID with correct id_type From 63078763662693d1141d194f4f8cabfca3f6f41b Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Thu, 12 Mar 2026 14:15:48 +0800 Subject: [PATCH 4/5] [Bugfix] Fix crash when tool_choice=required exceeds max_tokens Signed-off-by: chaunceyjiang --- tests/tool_use/test_chat_completions.py | 2 +- vllm/entrypoints/openai/engine/serving.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index 1e6d5f37e9f3..fb72f0aab819 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -215,7 +215,7 @@ async def test_max_tokens_with_tool_choice_required( model=model_name, tools=[WEATHER_TOOL], tool_choice="required", - max_tokens=5, + max_tokens=1, ) # When `tool_choice="required"` and the tokens of `tools` exceed `max_tokens`, # 
both `tool_calls` and `content` should be empty. diff --git a/vllm/entrypoints/openai/engine/serving.py index 668526e3c430..2049b3adfd3c 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -1126,20 +1126,19 @@ def _parse_tool_calls_from_content( ) content = None # Clear content since tool is called. elif request.tool_choice == "required": + tool_calls = [] with contextlib.suppress(ValidationError): - content = content or "[]" + content = content or "" tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json( content ) - function_calls.extend( - [ + for tool_call in tool_calls: + function_calls.append( FunctionCall( name=tool_call.name, arguments=json.dumps(tool_call.parameters, ensure_ascii=False), ) - for tool_call in tool_calls - ] - ) + ) content = None # Clear content since tool is called. elif ( tool_parser_cls From d2533751cb5975418416798f69bb962dff3a445a Mon Sep 17 00:00:00 2001 From: chaunceyjiang Date: Thu, 12 Mar 2026 14:22:08 +0800 Subject: [PATCH 5/5] [Bugfix] Fix crash when tool_choice=required exceeds max_tokens Signed-off-by: chaunceyjiang --- .../test_completion_with_function_calling.py | 24 ++++++++++++++ tests/tool_use/test_chat_completions.py | 31 ------------------- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 15a2fb85f489..39ab13213134 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -514,3 +514,27 @@ async def test_inconsistent_tool_choice_and_tools( ], tool_choice={}, ) + + +@pytest.mark.asyncio +async def test_max_tokens_with_tool_choice_required(client: openai.AsyncOpenAI): + """`tool_choice="required"` truncated by `max_tokens` must not crash the engine.""" + models = await client.models.list() + model_name: str = models.data[0].id + + # This
combination previously crashed the engine + chat_completion = await client.chat.completions.create( + messages=messages, + temperature=0, + max_completion_tokens=1, + model=model_name, + tools=tools, + tool_choice="required", + ) + # When `tool_choice="required"` and the tokens of `tools` exceed `max_tokens`, + # both `tool_calls` and `content` should be empty. + # This behavior should be consistent with OpenAI. + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert len(choice.message.tool_calls) == 0 + assert choice.message.content == "" diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index fb72f0aab819..07b7933f65c0 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -193,34 +193,3 @@ async def test_response_format_with_tool_choice_required( assert choice.finish_reason == "tool_calls" assert choice.message.tool_calls is not None assert len(choice.message.tool_calls) > 0 - - -@pytest.mark.asyncio -@pytest.mark.timeout(120) -async def test_max_tokens_with_tool_choice_required( - client: openai.AsyncOpenAI, server_config: ServerConfig -): - """ """ - models = await client.models.list() - model_name: str = models.data[0].id - - # This combination previously crashed the engine - chat_completion = await client.chat.completions.create( - messages=ensure_system_prompt( - [{"role": "user", "content": "What is the weather in Dallas, Texas?"}], - server_config, - ), - temperature=0, - max_completion_tokens=150, - model=model_name, - tools=[WEATHER_TOOL], - tool_choice="required", - max_tokens=1, - ) - # When `tool_choice="required"` and the tokens of `tools` exceed `max_tokens`, - # both `tool_calls` and `content` should be empty. - # This behavior should be consistent with OpenAI. - choice = chat_completion.choices[0] - assert choice.finish_reason == "length" - assert len(choice.message.tool_calls) == 0 - assert choice.message.content == ""