Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
bfe136a
update the logic of tool parser.
Seven-Streams Apr 28, 2026
76899e1
finish the test.
Seven-Streams Apr 5, 2026
5a984a1
update the qwen_coder.
Seven-Streams Apr 28, 2026
95e64e7
update the logic of get stag.
Seven-Streams Apr 29, 2026
7a3bbd0
update the test.
Seven-Streams Apr 29, 2026
3f5e0f3
fix the tool_choice type.
Seven-Streams Apr 29, 2026
db9ccc6
fix the test.
Seven-Streams Apr 29, 2026
8b246f9
Revert "fix the test."
Seven-Streams Apr 29, 2026
a5a5277
fix the validation.
Seven-Streams Apr 29, 2026
2de7bbd
fix the test.
Seven-Streams Apr 29, 2026
e70a720
update the version of xgr.
Seven-Streams Apr 29, 2026
7dfbd4d
fix the tool type.
Seven-Streams Apr 29, 2026
f06ccda
fix the test.
Seven-Streams Apr 29, 2026
f7c8c91
fix the import.
Seven-Streams Apr 29, 2026
5fbb503
fix the import.
Seven-Streams Apr 29, 2026
8a09479
update the api.
Seven-Streams Apr 29, 2026
098b80c
add v4 tests.
Seven-Streams Apr 29, 2026
15c99cb
update.
Seven-Streams Apr 29, 2026
93fc4b4
update hte priority.
Seven-Streams Apr 29, 2026
cbc745e
fix the test.
Seven-Streams Apr 29, 2026
894871f
fix the import.
Seven-Streams Apr 29, 2026
b3bf271
update the version of xgr.
Seven-Streams Apr 29, 2026
2e9478b
Merge branch 'main' into main-dev/2026-03-25/new_stag
mgoin May 1, 2026
1ecff43
Lint
mgoin May 1, 2026
6ca893e
Move structural tag builders into vLLM
Ubospica May 3, 2026
9ae5478
Drop non-target structural tag changes
Ubospica May 3, 2026
d962b80
Centralize structural tag xgrammar imports
Ubospica May 3, 2026
7d90832
Rename Qwen structural tag key
Ubospica May 3, 2026
760e5af
Inline structural tag builders
Ubospica May 3, 2026
4bd7d72
Stop Qwen 3.5 structural tag after first tool call
Ubospica May 3, 2026
1e94b99
Fix Qwen structural tag parsing
Ubospica May 3, 2026
deaf07a
Normalize tool schemas for Qwen structural tags
Ubospica May 3, 2026
e4285c7
Allow multiple Qwen structural tool calls
Ubospica May 3, 2026
e6ec236
format.
Seven-Streams May 3, 2026
260dea2
add the requirement.
Seven-Streams May 3, 2026
c6f98c5
avoid overwriting user's setting.
Seven-Streams May 3, 2026
306d220
a
Ubospica May 4, 2026
5fb2f35
format.
Seven-Streams May 4, 2026
e6fa4e3
set the flag off as default.
Seven-Streams May 4, 2026
dfda37c
update and fix bug
Ubospica May 4, 2026
07eb072
update
Ubospica May 4, 2026
376b84e
update
Ubospica May 4, 2026
45d43b6
format.
Ubospica May 4, 2026
ad4395a
Fix failing qwen3coder test
sfeng33 May 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements/common.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ outlines_core == 0.2.14
# required for outlines backend disk cache
diskcache == 5.6.3
lark == 1.2.2
xgrammar >= 0.1.32, < 1.0.0; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
xgrammar >= 0.2.0, < 1.0.0; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
Expand Down
5 changes: 4 additions & 1 deletion requirements/test/rocm.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ anyio==4.13.0
# sse-starlette
# starlette
# watchfiles
apache-tvm-ffi==0.1.10
# via xgrammar
arctic-inference==0.1.1
# via -r requirements/test/rocm.in
argcomplete==3.6.3
Expand Down Expand Up @@ -1264,6 +1266,7 @@ typing-extensions==4.15.0
# alembic
# anthropic
# anyio
# apache-tvm-ffi
# azure-core
# azure-identity
# azure-storage-blob
Expand Down Expand Up @@ -1345,7 +1348,7 @@ word2number==1.1
# via lm-eval
wrapt==2.1.2
# via smart-open
xgrammar==0.1.33
xgrammar==0.2.0
# via
# -c requirements/common.txt
# -r requirements/test/../common.txt
Expand Down
82 changes: 82 additions & 0 deletions tests/tool_parsers/test_deepseekv4_tool_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,15 @@
import json
from unittest.mock import MagicMock

import pytest
from xgrammar import StructuralTag

from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionNamedFunction,
ChatCompletionNamedToolChoiceParam,
ChatCompletionRequest,
ChatCompletionToolsParam,
)
from vllm.tool_parsers import ToolParserManager
from vllm.tool_parsers.deepseekv4_tool_parser import DeepSeekV4ToolParser

Expand All @@ -20,6 +29,43 @@
PARAM_END = "</|DSML|parameter>"


@pytest.fixture
def sample_tools() -> list[ChatCompletionToolsParam]:
    """Provide the two function tools shared by the structural-tag tests.

    ``get_current_weather`` declares a ``required`` list and an enum-valued
    parameter; ``calculate_area`` declares no ``required`` list.
    """
    weather_function = {
        "name": "get_current_weather",
        "description": "Get the current weather",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string", "description": "The city name"},
                "state": {"type": "string", "description": "The state code"},
                "unit": {"type": "string", "enum": ["fahrenheit", "celsius"]},
            },
            "required": ["city", "state"],
        },
    }
    area_function = {
        "name": "calculate_area",
        "description": "Calculate area of a shape",
        "parameters": {
            "type": "object",
            "properties": {
                "shape": {"type": "string"},
                "dimensions": {"type": "object"},
                "precision": {"type": "integer"},
            },
        },
    }
    # Order matters: tests pick the first tool for the named tool choice.
    return [
        ChatCompletionToolsParam(type="function", function=function_spec)
        for function_spec in (weather_function, area_function)
    ]


def make_parser(tools=None) -> DeepSeekV4ToolParser:
    """Build a ``DeepSeekV4ToolParser`` on the module's mock tokenizer.

    ``tools`` is forwarded unchanged as the parser's ``tools`` keyword.
    """
    parser = DeepSeekV4ToolParser(MOCK_TOKENIZER, tools=tools)
    return parser

Expand Down Expand Up @@ -121,3 +167,39 @@ def test_streaming_extracts_complete_invokes():
]
assert names == ["search"]
assert json.loads(reconstruct_args(deltas)) == {"query": "deepseek v4"}


def test_get_vllm_registry_structural_tag_returns_structural_tag(
    sample_tools: list[ChatCompletionToolsParam],
) -> None:
    """``get_structural_tag`` yields a ``StructuralTag`` for each tool choice.

    Covers ``"auto"``, ``"required"`` and a named-function tool choice.
    """
    parser = make_parser()

    # String tool choices are passed directly to the request constructor.
    for choice in ("auto", "required"):
        request = ChatCompletionRequest(
            messages=[],
            model="m",
            tools=sample_tools,
            tool_choice=choice,
        )
        assert isinstance(parser.get_structural_tag(request), StructuralTag)

    if not sample_tools:
        return

    # The named tool choice is assigned after construction and targets the
    # first tool in the fixture.
    first_tool = sample_tools[0]
    named_request = ChatCompletionRequest(
        messages=[],
        model="m",
        tools=sample_tools,
    )
    named_request.tool_choice = ChatCompletionNamedToolChoiceParam(
        function=ChatCompletionNamedFunction(name=first_tool.function.name)
    )
    assert isinstance(parser.get_structural_tag(named_request), StructuralTag)
109 changes: 109 additions & 0 deletions tests/tool_parsers/test_qwen3coder_tool_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@

import pytest
from openai.types.responses.function_tool import FunctionTool
from xgrammar import StructuralTag

from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionNamedFunction,
ChatCompletionNamedToolChoiceParam,
ChatCompletionRequest,
ChatCompletionToolsParam,
)
Expand Down Expand Up @@ -108,6 +111,27 @@ def sample_tools(request):
]


def _as_chat_completion_tools(
tools: list[ChatCompletionToolsParam | FunctionTool],
) -> list[ChatCompletionToolsParam]:
normalized: list[ChatCompletionToolsParam] = []
for tool in tools:
if isinstance(tool, ChatCompletionToolsParam):
normalized.append(tool)
else:
normalized.append(
ChatCompletionToolsParam(
type="function",
function={
"name": tool.name,
"description": tool.description,
"parameters": tool.parameters,
},
)
)
return normalized


def assert_tool_calls(
actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
):
Expand Down Expand Up @@ -1146,3 +1170,88 @@ def test_no_double_serialization_string_args(qwen3_tool_parser):
args = json.loads(raw_arguments)
assert args["message"] == "hello world"
assert '\\"hello world\\"' not in raw_arguments


def test_get_vllm_registry_structural_tag_returns_structural_tag(
    qwen3_tool_parser: Qwen3CoderToolParser,
    sample_tools: list[ChatCompletionToolsParam],
) -> None:
    """``get_structural_tag`` yields a ``StructuralTag`` for each tool choice.

    Covers ``"auto"``, ``"required"`` and a named-function tool choice, using
    the fixture tools normalized to ``ChatCompletionToolsParam``.
    """
    request_tools = _as_chat_completion_tools(sample_tools)

    # String tool choices are passed directly to the request constructor.
    for choice in ("auto", "required"):
        request = ChatCompletionRequest(
            messages=[],
            model="m",
            tools=request_tools,
            tool_choice=choice,
        )
        assert isinstance(
            qwen3_tool_parser.get_structural_tag(request), StructuralTag
        )

    if not request_tools:
        return

    # The named tool choice is assigned after construction and targets the
    # first normalized tool.
    first_tool = request_tools[0]
    named_request = ChatCompletionRequest(
        messages=[],
        model="m",
        tools=request_tools,
    )
    named_request.tool_choice = ChatCompletionNamedToolChoiceParam(
        function=ChatCompletionNamedFunction(name=first_tool.function.name)
    )
    assert isinstance(
        qwen3_tool_parser.get_structural_tag(named_request), StructuralTag
    )


@pytest.mark.parametrize("include_reasoning", [True, False])
def test_adjust_request_auto_uses_vllm_registry_structural_tag(
    monkeypatch: pytest.MonkeyPatch,
    qwen3_tool_parser: Qwen3CoderToolParser,
    sample_tools: list[ChatCompletionToolsParam],
    include_reasoning: bool,
) -> None:
    """With strict tool calling on, ``auto`` sets a JSON structural tag.

    The adjusted request must carry ``structured_outputs.structural_tag`` as
    a string that decodes to a JSON object, for both ``include_reasoning``
    values.
    """
    # Patch the name bound inside ``abstract_tool_parser`` so the strict
    # tool-calling branch is taken.
    monkeypatch.setattr(
        "vllm.tool_parsers.abstract_tool_parser.VLLM_ENFORCE_STRICT_TOOL_CALLING",
        True,
    )
    adjusted = qwen3_tool_parser.adjust_request(
        ChatCompletionRequest(
            messages=[],
            model="m",
            tools=_as_chat_completion_tools(sample_tools),
            tool_choice="auto",
            include_reasoning=include_reasoning,
        )
    )

    structured = adjusted.structured_outputs
    assert structured is not None
    raw_tag = structured.structural_tag
    assert raw_tag is not None
    assert isinstance(raw_tag, str)
    assert isinstance(json.loads(raw_tag), dict)


def test_adjust_request_required_prefers_structural_tag(
    monkeypatch: pytest.MonkeyPatch,
    qwen3_tool_parser: Qwen3CoderToolParser,
    sample_tools: list[ChatCompletionToolsParam],
) -> None:
    """With strict tool calling on, ``required`` sets a structural tag."""
    # Patch the name bound inside ``abstract_tool_parser`` so the strict
    # tool-calling branch is taken.
    monkeypatch.setattr(
        "vllm.tool_parsers.abstract_tool_parser.VLLM_ENFORCE_STRICT_TOOL_CALLING",
        True,
    )
    adjusted = qwen3_tool_parser.adjust_request(
        ChatCompletionRequest(
            messages=[],
            model="m",
            tools=_as_chat_completion_tools(sample_tools),
            tool_choice="required",
        )
    )

    structured = adjusted.structured_outputs
    assert structured is not None
    assert structured.structural_tag is not None
15 changes: 15 additions & 0 deletions vllm/entrypoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,21 @@ async def init_app_state(
supported_tasks: tuple["SupportedTask", ...] | None = None,
) -> None:
vllm_config = engine_client.vllm_config

# Propagate enable_in_reasoning to the API-server process. The engine core
# runs in a separate process, so the contextvar that backs
# `get_current_vllm_config_or_none()` is None on this stack. Tool parsers
# call `get_enable_structured_outputs_in_reasoning()` during request
# handling and need to see the real flag, otherwise they silently fall
# back to False and mismatch the engine-side bitmask gating.
from vllm.tool_parsers.structural_tag_registry import (
set_enable_structured_outputs_in_reasoning,
)

set_enable_structured_outputs_in_reasoning(
vllm_config.structured_outputs_config.enable_in_reasoning
)

if supported_tasks is None:
warnings.warn(
"The 'supported_tasks' parameter was not provided to "
Expand Down
7 changes: 7 additions & 0 deletions vllm/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@
VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
VLLM_SYSTEM_START_DATE: str | None = None
VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: bool = False
VLLM_ENFORCE_STRICT_TOOL_CALLING: bool = False
VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False
VLLM_NVTX_SCOPES_FOR_PROFILING: bool = False
VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True
Expand Down Expand Up @@ -1591,6 +1592,12 @@ def _get_or_set_default() -> str:
"VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY": lambda: bool(
int(os.getenv("VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY", "0"))
),
# When set to 1, model-specific structural tags are used to constrain the
# model output to the model's tool-calling format and schema.
# Default: 0 (off).
"VLLM_ENFORCE_STRICT_TOOL_CALLING": lambda: bool(
int(os.getenv("VLLM_ENFORCE_STRICT_TOOL_CALLING", "0"))
),
# Add optional custom scopes for profiling, disable to avoid overheads
"VLLM_CUSTOM_SCOPES_FOR_PROFILING": lambda: bool(
int(os.getenv("VLLM_CUSTOM_SCOPES_FOR_PROFILING", "0"))
Expand Down
40 changes: 36 additions & 4 deletions vllm/tool_parsers/abstract_tool_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import importlib
import json
import os
from collections.abc import Callable, Sequence
from functools import cached_property
Expand All @@ -13,6 +14,7 @@
from openai.types.responses.function_tool import FunctionTool

from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionNamedToolChoiceParam,
ChatCompletionRequest,
ChatCompletionToolsParam,
)
Expand All @@ -23,6 +25,7 @@
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.envs import VLLM_ENFORCE_STRICT_TOOL_CALLING
from vllm.logger import init_logger
from vllm.sampling_params import (
StructuredOutputsParams,
Expand Down Expand Up @@ -83,13 +86,39 @@ def vocab(self) -> dict[str, int]:
return self.model_tokenizer.get_vocab()

def adjust_request(
self, request: ChatCompletionRequest | ResponsesRequest
self,
request: ChatCompletionRequest | ResponsesRequest,
) -> ChatCompletionRequest | ResponsesRequest:
"""
Adjust the request parameters (e.g. structured-output constraints)
based on the request's tools and tool choice.
"""
# If there are no tools, return the request as is.
if not request.tools:
return request

# Step 1 (highest priority for ChatCompletionRequest): apply
# vLLM-owned structural tag support for model-specific tool formats.
if (
isinstance(request, ChatCompletionRequest)
and VLLM_ENFORCE_STRICT_TOOL_CALLING
):
need_tool_calling = (
request.tool_choice == "auto"
or request.tool_choice == "required"
or isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam)
)
if need_tool_calling:
structure_tag = self.get_structural_tag(request)
if structure_tag is not None:
if request.structured_outputs is None:
request.structured_outputs = StructuredOutputsParams(
structural_tag=json.dumps(structure_tag.model_dump()),
)
else:
request.structured_outputs.structural_tag = json.dumps(
structure_tag.model_dump()
)
return request

# Step 2: set structured output params when tool constraints are
# derived from the tool schema.
json_schema_from_tool = get_json_schema_from_tools(
tool_choice=request.tool_choice, tools=request.tools
)
Expand Down Expand Up @@ -121,6 +150,9 @@ def adjust_request(

return request

def get_structural_tag(self, request: ChatCompletionRequest):
    """Return the structural tag to apply to ``request``, if any.

    This implementation provides no structural tag and always returns
    ``None``. ``adjust_request`` only applies a structural tag when this
    method returns a non-``None`` value; parsers with a model-specific
    tool-call format are presumably expected to override it.
    """
    return None

def extract_tool_calls(
self, model_output: str, request: ChatCompletionRequest
) -> ExtractedToolCallInformation:
Expand Down
Loading
Loading