diff --git a/tests/tool_parsers/test_qwen3_xml_coder_shared.py b/tests/tool_parsers/test_qwen3_xml_coder_shared.py
new file mode 100644
index 000000000000..6c56c6d47063
--- /dev/null
+++ b/tests/tool_parsers/test_qwen3_xml_coder_shared.py
@@ -0,0 +1,2230 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Shared tests for the Qwen3 XML and Coder tool parsers.
+
+These tests cover behaviour that BOTH parsers must implement identically.
+Each test runs twice — once against ``Qwen3XMLToolParser`` and once against
+``Qwen3CoderToolParser`` — via the ``parser_cls`` fixture. Tests that
+target streaming-mode-specific quirks of one parser only stay in their
+parser-specific file (``test_qwen3xml_tool_parser.py`` or
+``test_qwen3coder_tool_parser.py``).
+"""
+
+import json
+from collections.abc import Generator
+
+import pytest
+from openai.types.responses.function_tool import FunctionTool
+from xgrammar import StructuralTag
+
+from tests.tool_parsers.utils import run_tool_extraction_streaming
+from vllm.entrypoints.openai.chat_completion.protocol import (
+ ChatCompletionNamedFunction,
+ ChatCompletionNamedToolChoiceParam,
+ ChatCompletionRequest,
+ ChatCompletionToolsParam,
+)
+from vllm.entrypoints.openai.engine.protocol import (
+ DeltaMessage,
+ FunctionCall,
+ ToolCall,
+)
+from vllm.tokenizers import TokenizerLike, get_tokenizer
+from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
+from vllm.tool_parsers.qwen3coder_tool_parser import Qwen3CoderToolParser
+from vllm.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser
+
+MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+
+
+@pytest.fixture(scope="module")
+def qwen3_tokenizer():
+    """Load the ``MODEL`` tokenizer once per test module."""
+    tokenizer = get_tokenizer(tokenizer_name=MODEL)
+    return tokenizer
+
+
+@pytest.fixture(
+    ids=["xml", "coder"],
+    params=[Qwen3XMLToolParser, Qwen3CoderToolParser],
+)
+def parser_cls(request):
+    """Return the parser class selected by the current parametrization."""
+    selected_cls = request.param
+    return selected_cls
+
+
+# JSON-schema ``parameters`` object for the ``get_current_weather`` sample
+# tool: ``city`` and ``state`` are required, ``unit`` is an optional enum.
+WEATHER_PARAMS = {
+ "type": "object",
+ "properties": {
+ "city": {"type": "string", "description": "The city name"},
+ "state": {"type": "string", "description": "The state code"},
+ "unit": {"type": "string", "enum": ["fahrenheit", "celsius"]},
+ },
+ "required": ["city", "state"],
+}
+
+# JSON-schema ``parameters`` object for the ``calculate_area`` sample tool;
+# it mixes string, object and integer properties and declares none required.
+AREA_PARAMS = {
+ "type": "object",
+ "properties": {
+ "shape": {"type": "string"},
+ "dimensions": {"type": "object"},
+ "precision": {"type": "integer"},
+ },
+}
+
+
+@pytest.fixture(params=["chat_completion", "responses_api"])
+def sample_tools(request):
+    """Return the weather and area tools in the requested API flavour.
+
+    ``chat_completion`` produces ``ChatCompletionToolsParam`` objects; any
+    other parameter value produces ``FunctionTool`` objects.
+    """
+    tool_specs = [
+        ("get_current_weather", "Get the current weather", WEATHER_PARAMS),
+        ("calculate_area", "Calculate area of a shape", AREA_PARAMS),
+    ]
+
+    if request.param != "chat_completion":
+        return [
+            FunctionTool(
+                type="function",
+                name=tool_name,
+                description=tool_description,
+                parameters=tool_parameters,
+            )
+            for tool_name, tool_description, tool_parameters in tool_specs
+        ]
+
+    return [
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": tool_name,
+                "description": tool_description,
+                "parameters": tool_parameters,
+            },
+        )
+        for tool_name, tool_description, tool_parameters in tool_specs
+    ]
+
+
+@pytest.fixture
+def parser(parser_cls, qwen3_tokenizer, sample_tools):
+    """Instantiate the parametrized parser class with the sample tools."""
+    instance = parser_cls(qwen3_tokenizer, tools=sample_tools)
+    return instance
+
+
+def _as_chat_completion_tools(
+    tools: list[ChatCompletionToolsParam | FunctionTool],
+) -> list[ChatCompletionToolsParam]:
+    """Convert a mixed tool list into ``ChatCompletionToolsParam`` objects.
+
+    Entries that already are ``ChatCompletionToolsParam`` pass through
+    unchanged; every other entry is rebuilt from its ``name``,
+    ``description`` and ``parameters`` attributes.
+    """
+
+    def _wrap(tool) -> ChatCompletionToolsParam:
+        if isinstance(tool, ChatCompletionToolsParam):
+            return tool
+        return ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": tool.name,
+                "description": tool.description,
+                "parameters": tool.parameters,
+            },
+        )
+
+    return [_wrap(tool) for tool in tools]
+
+
+def assert_tool_calls(
+    actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
+):
+    """Assert that two tool-call lists match pairwise.
+
+    The lists must have equal length; each actual call must have type
+    ``"function"``, the expected function name, and arguments that compare
+    equal after JSON decoding.
+    """
+    assert len(actual_tool_calls) == len(expected_tool_calls)
+    paired_calls = zip(actual_tool_calls, expected_tool_calls)
+    for got, want in paired_calls:
+        assert got.type == "function"
+        assert got.function.name == want.function.name
+        got_args = json.loads(got.function.arguments)
+        want_args = json.loads(want.function.arguments)
+        assert got_args == want_args
+
+
+def stream_delta_message_generator(
+    parser,
+    tokenizer: TokenizerLike,
+    model_output: str,
+    request: ChatCompletionRequest | None = None,
+) -> Generator[DeltaMessage, None, None]:
+    """Feed ``model_output`` to the parser one token at a time.
+
+    The text is encoded without special tokens; each token is incrementally
+    detokenized and passed to ``parser.extract_tool_calls_streaming``. Every
+    truthy delta message returned by the parser is yielded.
+    """
+    token_ids = tokenizer.encode(model_output, add_special_tokens=False)
+
+    text_so_far = ""
+    decoded_tokens = None
+    prefix_offset = 0
+    read_offset = 0
+    for position, token_id in enumerate(token_ids):
+        ids_before = token_ids[:position]
+        ids_through = token_ids[: position + 1]
+
+        new_tokens, delta_text, next_prefix_offset, next_read_offset = (
+            detokenize_incrementally(
+                tokenizer=tokenizer,
+                all_input_ids=ids_through,
+                prev_tokens=decoded_tokens,
+                prefix_offset=prefix_offset,
+                read_offset=read_offset,
+                skip_special_tokens=False,
+                spaces_between_special_tokens=True,
+            )
+        )
+        text_after = text_so_far + delta_text
+
+        delta_message = parser.extract_tool_calls_streaming(
+            text_so_far,
+            text_after,
+            delta_text,
+            ids_before,
+            ids_through,
+            [token_id],
+            request=request,
+        )
+        if delta_message:
+            yield delta_message
+
+        # Carry the detokenizer state forward for the next token.
+        text_so_far = text_after
+        if decoded_tokens:
+            decoded_tokens = decoded_tokens + new_tokens
+        else:
+            decoded_tokens = new_tokens
+        prefix_offset = next_prefix_offset
+        read_offset = next_read_offset
+
+
+# ---------------------------------------------------------------------------
+# Basic extraction
+# ---------------------------------------------------------------------------
+
+
+def test_extract_tool_calls_no_tools(parser):
+    """Plain text with no tool call comes back unchanged as content."""
+    plain_text = "This is a test response without any tool calls"
+    result = parser.extract_tool_calls(plain_text, request=None)
+    assert not result.tools_called
+    assert result.tool_calls == []
+    assert result.content == plain_text
+
+
+# Each case is ``(model_output, expected_tool_calls, expected_content)``.
+# NOTE(review): the tool-call markup inside the model outputs was rebuilt
+# from the expected calls and the one-tag-per-line layout of each literal;
+# confirm it against the upstream test module.
+_EXTRACT_CASES = [
+    (
+        """<tool_call>
+<function=get_current_weather>
+<parameter=city>
+Dallas
+</parameter>
+<parameter=state>
+TX
+</parameter>
+<parameter=unit>
+fahrenheit
+</parameter>
+</function>
+</tool_call>""",
+        [
+            ToolCall(
+                function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps(
+                        {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                    ),
+                )
+            )
+        ],
+        None,
+    ),
+    (
+        """Sure! Let me check the weather for you.<tool_call>
+<function=get_current_weather>
+<parameter=city>
+Dallas
+</parameter>
+<parameter=state>
+TX
+</parameter>
+<parameter=unit>
+fahrenheit
+</parameter>
+</function>
+</tool_call>""",
+        [
+            ToolCall(
+                function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps(
+                        {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                    ),
+                )
+            )
+        ],
+        "Sure! Let me check the weather for you.",
+    ),
+    (
+        """<tool_call>
+<function=calculate_area>
+<parameter=shape>
+rectangle
+</parameter>
+<parameter=dimensions>
+{"width": 10,
+ "height": 20}
+</parameter>
+<parameter=precision>
+2
+</parameter>
+</function>
+</tool_call>""",
+        [
+            ToolCall(
+                function=FunctionCall(
+                    name="calculate_area",
+                    arguments=json.dumps(
+                        {
+                            "shape": "rectangle",
+                            "dimensions": {"width": 10, "height": 20},
+                            "precision": 2,
+                        }
+                    ),
+                )
+            )
+        ],
+        None,
+    ),
+    (
+        """<tool_call>
+<function=get_current_weather>
+<parameter=city>
+Dallas
+</parameter>
+<parameter=state>
+TX
+</parameter>
+<parameter=unit>
+fahrenheit
+</parameter>
+</function>
+</tool_call>
+<tool_call>
+<function=get_current_weather>
+<parameter=city>
+Orlando
+</parameter>
+<parameter=state>
+FL
+</parameter>
+<parameter=unit>
+fahrenheit
+</parameter>
+</function>
+</tool_call>""",
+        [
+            ToolCall(
+                function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps(
+                        {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
+                    ),
+                )
+            ),
+            ToolCall(
+                function=FunctionCall(
+                    name="get_current_weather",
+                    arguments=json.dumps(
+                        {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}
+                    ),
+                )
+            ),
+        ],
+        # Only the newline separating the two tool-call blocks is left over.
+        "\n",
+    ),
+    (
+        """Let me calculate that area for you.<tool_call>
+<function=calculate_area>
+<parameter=shape>
+circle
+</parameter>
+<parameter=dimensions>
+{"radius": 15.5}
+</parameter>
+<parameter=precision>
+3
+</parameter>
+</function>
+</tool_call>""",
+        [
+            ToolCall(
+                function=FunctionCall(
+                    name="calculate_area",
+                    arguments=json.dumps(
+                        {
+                            "shape": "circle",
+                            "dimensions": {"radius": 15.5},
+                            "precision": 3,
+                        }
+                    ),
+                )
+            )
+        ],
+        "Let me calculate that area for you.",
+    ),
+]
+
+# Test ids, positionally aligned with ``_EXTRACT_CASES``.
+_EXTRACT_IDS = [
+    "single_tool",
+    "single_tool_with_content",
+    "single_tool_multiline_param",
+    "parallel_tools",
+    "tool_with_typed_params",
+]
+
+
+@pytest.mark.parametrize(
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=_EXTRACT_CASES,
+    ids=_EXTRACT_IDS,
+)
+def test_extract_tool_calls(
+    parser, model_output, expected_tool_calls, expected_content
+):
+    """Non-streaming extraction yields the expected calls and content."""
+    request = ChatCompletionRequest(model=MODEL, messages=[])
+    result = parser.extract_tool_calls(model_output, request=request)
+    assert result.tools_called
+    assert_tool_calls(result.tool_calls, expected_tool_calls)
+
+    # When only whitespace (or nothing) is expected, ``None`` and ``""`` are
+    # both accepted: the two parsers disagree on whether the newline between
+    # parallel tool-call blocks is kept as content.
+    content = result.content
+    expects_text = bool(expected_content and expected_content.strip())
+    if expects_text:
+        assert content == expected_content
+    else:
+        assert (content or "").strip() == (expected_content or "").strip()
+
+
+def test_extract_tool_calls_fallback_no_tags(parser):
+    """Test fallback parsing when XML tags are missing."""
+    # NOTE(review): markup rebuilt from the literal's line layout and the
+    # asserted function name; the outer tool-call wrapper is presumably the
+    # missing part — confirm against the upstream test.
+    model_output = """<function=get_current_weather>
+<parameter=city>
+Dallas
+</parameter>
+<parameter=state>
+TX
+</parameter>
+</function>"""
+    request = ChatCompletionRequest(model=MODEL, messages=[])
+    extracted_tool_calls = parser.extract_tool_calls(model_output, request=request)
+    assert extracted_tool_calls.tools_called
+    assert len(extracted_tool_calls.tool_calls) == 1
+    assert extracted_tool_calls.tool_calls[0].function.name == "get_current_weather"
+
+
+# ---------------------------------------------------------------------------
+# Type conversion
+# ---------------------------------------------------------------------------
+
+
+def test_extract_tool_calls_type_conversion(qwen3_tokenizer, parser_cls):
+    """Test parameter type conversion based on tool schema."""
+    tools = [
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "test_types",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "int_param": {"type": "integer"},
+                        "float_param": {"type": "float"},
+                        "bool_param": {"type": "boolean"},
+                        "str_param": {"type": "string"},
+                        "obj_param": {"type": "object"},
+                    },
+                },
+            },
+        )
+    ]
+
+    # NOTE(review): markup rebuilt from the schema's property names and the
+    # one-tag-per-line layout; confirm against the upstream test.
+    model_output = """<tool_call>
+<function=test_types>
+<parameter=int_param>
+42
+</parameter>
+<parameter=float_param>
+3.14
+</parameter>
+<parameter=bool_param>
+true
+</parameter>
+<parameter=str_param>
+hello world
+</parameter>
+<parameter=obj_param>
+{"key": "value"}
+</parameter>
+</function>
+</tool_call>"""
+
+    parser_inst = parser_cls(qwen3_tokenizer, tools=tools)
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
+    extracted_tool_calls = parser_inst.extract_tool_calls(model_output, request=request)
+
+    # Fail with a clear assertion instead of an IndexError when nothing parsed.
+    assert extracted_tool_calls.tools_called
+    args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments)
+    assert args["int_param"] == 42
+    assert args["float_param"] == 3.14
+    assert args["bool_param"] is True
+    assert args["str_param"] == "hello world"
+    assert args["obj_param"] == {"key": "value"}
+
+
+def test_extract_tool_calls_complex_type_with_single_quote(qwen3_tokenizer, parser_cls):
+    """Object parameter expressed as a Python repr (single quotes)."""
+    tools = [
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "test_types",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "int_param": {"type": "integer"},
+                        "float_param": {"type": "float"},
+                        "bool_param": {"type": "boolean"},
+                        "str_param": {"type": "string"},
+                        "obj_param": {"type": "object"},
+                    },
+                },
+            },
+        )
+    ]
+
+    # NOTE(review): markup rebuilt from the asserted parameter name and the
+    # one-tag-per-line layout; confirm against the upstream test.
+    model_output = """<tool_call>
+<function=test_types>
+<parameter=obj_param>
+{'key': 'value'}
+</parameter>
+</function>
+</tool_call>"""
+
+    parser_inst = parser_cls(qwen3_tokenizer, tools=tools)
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
+    extracted_tool_calls = parser_inst.extract_tool_calls(model_output, request=request)
+
+    # Fail with a clear assertion instead of an IndexError when nothing parsed.
+    assert extracted_tool_calls.tools_called
+    args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments)
+    assert args["obj_param"] == {"key": "value"}
+
+
+# ---------------------------------------------------------------------------
+# Streaming extraction
+# ---------------------------------------------------------------------------
+
+
+# The streaming matrix is a plain-text case followed by every extraction case.
+_STREAMING_CASES = [
+    ("This is a test without tools", [], "This is a test without tools"),
+    *_EXTRACT_CASES,
+]
+
+_STREAMING_IDS = ["no_tools", *_EXTRACT_IDS]
+
+
+@pytest.mark.parametrize(
+    argnames=["model_output", "expected_tool_calls", "expected_content"],
+    argvalues=_STREAMING_CASES,
+    ids=_STREAMING_IDS,
+)
+def test_extract_tool_calls_streaming(
+    parser,
+    qwen3_tokenizer,
+    model_output,
+    expected_tool_calls,
+    expected_content,
+):
+    """Token-by-token streaming reassembles the expected calls and content."""
+    request = ChatCompletionRequest(model=MODEL, messages=[])
+
+    streamed_content = ""
+    calls_by_index = {}
+
+    deltas = stream_delta_message_generator(
+        parser, qwen3_tokenizer, model_output, request
+    )
+    for delta in deltas:
+        assert not delta.role
+
+        if delta.content:
+            streamed_content += delta.content
+
+        for call in delta.tool_calls or []:
+            slot = calls_by_index.setdefault(
+                call.index,
+                {"id": None, "name": None, "arguments": "", "type": None},
+            )
+
+            if call.id:
+                slot["id"] = call.id
+
+            if call.type:
+                assert call.type == "function"
+                slot["type"] = call.type
+
+            fn = call.function
+            if not fn:
+                continue
+            if fn.name:
+                # A name may only arrive while none has been recorded yet.
+                assert slot["name"] is None
+                slot["name"] = fn.name
+            if fn.arguments is not None:
+                slot["arguments"] += fn.arguments
+
+    # Whitespace-only content between parallel tool calls is tolerated, for
+    # the same reason as in ``test_extract_tool_calls``.
+    wants_text = bool(expected_content and expected_content.strip())
+    if wants_text:
+        assert streamed_content == expected_content
+    else:
+        assert streamed_content.strip() == (expected_content or "").strip()
+    assert len(calls_by_index) == len(expected_tool_calls)
+    assert len(parser.prev_tool_call_arr) == len(expected_tool_calls)
+
+    for position, wanted in enumerate(expected_tool_calls):
+        slot = calls_by_index[position]
+        assert slot["id"] is not None
+        assert slot["type"] == "function"
+        assert slot["name"] == wanted.function.name
+
+        raw_arguments = slot["arguments"]
+        assert raw_arguments is not None
+        assert json.loads(raw_arguments) == json.loads(wanted.function.arguments)
+
+
+def test_extract_tool_calls_missing_closing_parameter_tag(parser):
+    """Test handling of missing closing tag."""
+    # NOTE(review): markup rebuilt from the literal's line layout; the
+    # ``city`` parameter is the one left without a closing tag — confirm
+    # against the upstream test.
+    model_output = """Let me check the weather for you:
+<tool_call>
+<function=get_current_weather>
+<parameter=city>
+Dallas
+<parameter=state>
+TX
+</parameter>
+<parameter=unit>
+fahrenheit
+</parameter>
+</function>
+</tool_call>"""
+
+    request = ChatCompletionRequest(model=MODEL, messages=[])
+    extracted_tool_calls = parser.extract_tool_calls(model_output, request=request)
+
+    assert extracted_tool_calls.tools_called
+    assert len(extracted_tool_calls.tool_calls) == 1
+    assert extracted_tool_calls.tool_calls[0].function.name == "get_current_weather"
+    args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments)
+    assert "city" in args
+    assert args["city"] == "Dallas"
+    assert args["state"] == "TX"
+    assert args["unit"] == "fahrenheit"
+    assert "Let me check the weather for you:" in extracted_tool_calls.content
+
+
+def test_extract_tool_calls_streaming_missing_closing_tag(parser, qwen3_tokenizer):
+    """Streaming with missing closing tag."""
+    # NOTE(review): markup rebuilt from the literal's line layout; the
+    # ``city`` parameter is the one left without a closing tag — confirm
+    # against the upstream test.
+    model_output = """Let me check the weather for you:
+<tool_call>
+<function=get_current_weather>
+<parameter=city>
+Dallas
+<parameter=state>
+TX
+</parameter>
+<parameter=unit>
+fahrenheit
+</parameter>
+</function>
+</tool_call>"""
+
+    request = ChatCompletionRequest(model=MODEL, messages=[])
+    other_content = ""
+    tool_states = {}
+
+    for delta_message in stream_delta_message_generator(
+        parser, qwen3_tokenizer, model_output, request
+    ):
+        if delta_message.content:
+            other_content += delta_message.content
+
+        if delta_message.tool_calls:
+            for tool_call in delta_message.tool_calls:
+                idx = tool_call.index
+                # Lazily create the accumulator for each tool-call index.
+                if idx not in tool_states:
+                    tool_states[idx] = {
+                        "id": None,
+                        "name": None,
+                        "arguments": "",
+                        "type": None,
+                    }
+                if tool_call.id:
+                    tool_states[idx]["id"] = tool_call.id
+                if tool_call.type:
+                    assert tool_call.type == "function"
+                    tool_states[idx]["type"] = tool_call.type
+                if tool_call.function:
+                    if tool_call.function.name:
+                        tool_states[idx]["name"] = tool_call.function.name
+                    if tool_call.function.arguments is not None:
+                        tool_states[idx]["arguments"] += tool_call.function.arguments
+
+    assert "Let me check the weather for you:" in other_content
+    assert len(tool_states) == 1
+    assert len(parser.prev_tool_call_arr) == 1
+
+    state = tool_states[0]
+    assert state["id"] is not None
+    assert state["type"] == "function"
+    assert state["name"] == "get_current_weather"
+    args = json.loads(state["arguments"])
+    assert args["city"] == "Dallas"
+    assert args["state"] == "TX"
+    assert args["unit"] == "fahrenheit"
+
+
+def test_extract_tool_calls_streaming_incremental(parser, qwen3_tokenizer):
+    """Test that streaming is truly incremental."""
+    # NOTE(review): markup rebuilt from the literal's line layout and the
+    # asserted function/parameter names; confirm against the upstream test.
+    model_output = """I'll check the weather.<tool_call>
+<function=get_current_weather>
+<parameter=city>
+Dallas
+</parameter>
+<parameter=state>
+TX
+</parameter>
+</function>
+</tool_call>"""
+
+    request = ChatCompletionRequest(model=MODEL, messages=[])
+    chunks = list(
+        stream_delta_message_generator(parser, qwen3_tokenizer, model_output, request)
+    )
+
+    assert len(chunks) > 3
+    assert chunks[0].content is not None
+    assert chunks[0].tool_calls is None or chunks[0].tool_calls == []
+
+    header_found = False
+    for chunk in chunks:
+        if chunk.tool_calls and chunk.tool_calls[0].id:
+            header_found = True
+            assert chunk.tool_calls[0].function.name == "get_current_weather"
+            assert chunk.tool_calls[0].type == "function"
+            # XML emits an empty arguments string with the header; Coder
+            # emits the opening "{" with the header. Both are valid.
+            assert chunk.tool_calls[0].function.arguments in ("", "{")
+            break
+    assert header_found
+
+    # Collect every non-empty argument fragment in arrival order.
+    arg_chunks = [
+        chunk.tool_calls[0].function.arguments
+        for chunk in chunks
+        if chunk.tool_calls and chunk.tool_calls[0].function.arguments
+    ]
+
+    assert len(arg_chunks) > 1
+    full_args = "".join(arg_chunks)
+    parsed_args = json.loads(full_args)
+    assert parsed_args["city"] == "Dallas"
+    assert parsed_args["state"] == "TX"
+
+
+# ---------------------------------------------------------------------------
+# Robustness regressions
+# ---------------------------------------------------------------------------
+
+
+def test_malformed_xml_no_gt_delimiter(parser):
+ """Regression: malformed XML without '>' must not crash (PR #36774)."""
+ # NOTE(review): the literal below shows no tool-call markup at all, so the
+ # malformed tag this test targets is not visible here — angle-bracket text
+ # looks stripped by extraction; confirm against the original file.
+ model_output = (
+ "\n"
+ "Dallas\n"
+ "\n"
+ ""
+ )
+ request = ChatCompletionRequest(model=MODEL, messages=[])
+ result = parser.extract_tool_calls(model_output, request=request)
+ # Only structural sanity is checked: a result object, a list of tool
+ # calls, and no ``None`` entries in that list.
+ assert result is not None
+ assert isinstance(result.tool_calls, list)
+ assert all(tc is not None for tc in result.tool_calls)
+
+
+def test_none_tool_calls_filtered(parser):
+ """Regression: None tool calls filtered from output (PR #36774)."""
+ # NOTE(review): the literal below shows no tool-call markup although the
+ # assertions expect one ``get_current_weather`` call — angle-bracket text
+ # looks stripped by extraction; confirm against the original file.
+ model_output = (
+ "\n"
+ "\n"
+ "\n"
+ "\n"
+ "\n"
+ "Dallas\n"
+ "TX\n"
+ "\n"
+ ""
+ )
+ request = ChatCompletionRequest(model=MODEL, messages=[])
+ result = parser.extract_tool_calls(model_output, request=request)
+ # No ``None`` placeholders may be present, and exactly one call remains.
+ assert all(tc is not None for tc in result.tool_calls)
+ assert result.tools_called
+ assert len(result.tool_calls) == 1
+ assert result.tool_calls[0].function.name == "get_current_weather"
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert args["city"] == "Dallas"
+ assert args["state"] == "TX"
+
+
+def test_streaming_multi_param_single_chunk(parser):
+ """Regression: speculative decode delivering multiple params at once
+ (PR #35615)."""
+ request = ChatCompletionRequest(model=MODEL, messages=[])
+
+ # NOTE(review): the deltas below show no tool-call markup although the
+ # assertions expect city/state/unit arguments — angle-bracket text looks
+ # stripped by extraction. Delta boundaries matter for this test, so
+ # confirm the exact chunks against the original file.
+ deltas = [
+ "",
+ "\n",
+ "\n",
+ # This single delta delivers all three parameters at once
+ "\nDallas\n"
+ "\n\nTX\n"
+ "\n\nfahrenheit\n",
+ "\n",
+ "\n",
+ ]
+
+ # ``assert_one_tool_per_delta`` is disabled for this run.
+ reconstructor = run_tool_extraction_streaming(
+ parser,
+ deltas,
+ request,
+ assert_one_tool_per_delta=False,
+ )
+
+ assert len(reconstructor.tool_calls) == 1
+ args = json.loads(reconstructor.tool_calls[0].function.arguments)
+ assert args["city"] == "Dallas"
+ assert args["state"] == "TX"
+ assert args["unit"] == "fahrenheit"
+
+
+def test_no_double_serialization_string_args(qwen3_tokenizer, parser_cls):
+    """Regression: string arguments must not be double-serialized
+    (PR #35615)."""
+    tools = [
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "greet",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "message": {"type": "string"},
+                    },
+                },
+            },
+        )
+    ]
+
+    # NOTE(review): markup rebuilt from the tool schema and the newline
+    # layout of the literal; confirm against the upstream test.
+    model_output = (
+        "<tool_call>\n"
+        "<function=greet>\n"
+        "<parameter=message>hello world</parameter>\n"
+        "</function>\n"
+        "</tool_call>"
+    )
+
+    parser_inst = parser_cls(qwen3_tokenizer, tools=tools)
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
+    result = parser_inst.extract_tool_calls(model_output, request=request)
+
+    assert result.tools_called
+    assert len(result.tool_calls) == 1
+    raw_arguments = result.tool_calls[0].function.arguments
+    args = json.loads(raw_arguments)
+    assert args["message"] == "hello world"
+    # An escaped-quote payload would mean the string was JSON-encoded twice.
+    assert '\\"hello world\\"' not in raw_arguments
+
+
+def test_extract_tool_calls_streaming_speculative_decode_loss(parser):
+ """If the parser hasn't started JSON yet and the delta contains the
+ parameters AND the end of the tool call, the parser should not just
+ return '{' and lose the parameters.
+ """
+ request = ChatCompletionRequest(model="test", messages=[])
+
+ # NOTE(review): ``text1`` and ``delta_str`` show no tool-call markup —
+ # angle-bracket text looks stripped by extraction; confirm the exact
+ # strings against the original file.
+ text1 = "\n\n"
+ # First call primes the parser; its return value is intentionally unused.
+ parser.extract_tool_calls_streaming("", text1, text1, [], [1], [1], request)
+
+ delta_str = "\nParis\n\n\n"
+ text2 = text1 + delta_str
+ delta2 = parser.extract_tool_calls_streaming(
+ text1, text2, delta_str, [1], [1, 2], [2], request
+ )
+
+ assert delta2 is not None
+ assert delta2.tool_calls is not None
+ assert len(delta2.tool_calls) == 1
+ args = delta2.tool_calls[0].function.arguments
+ assert "Paris" in args, f"Arguments lost! Got: {args}"
+
+
+# ---------------------------------------------------------------------------
+# Value conversion: string "null" must NOT become JSON null
+# ---------------------------------------------------------------------------
+
+
+def test_string_null_value_preserved(qwen3_tokenizer, parser_cls):
+    """A string-typed parameter with literal value "null" must be preserved
+    as the string "null" (not converted to Python None / JSON null).
+
+    Root cause: _convert_param_value must check the schema's ``string``
+    type BEFORE the "null" shortcut — otherwise any param whose raw text
+    is "null" becomes None regardless of declared type.
+    """
+    tools = [
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "search",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"query": {"type": "string"}},
+                },
+            },
+        )
+    ]
+    parser = parser_cls(qwen3_tokenizer, tools=tools)
+    # NOTE(review): markup rebuilt from the tool schema and the newline
+    # layout of the literal; confirm against the upstream test.
+    model_output = (
+        "<tool_call>\n"
+        "<function=search>\n"
+        "<parameter=query>null</parameter>\n"
+        "</function>\n"
+        "</tool_call>"
+    )
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
+    result = parser.extract_tool_calls(model_output, request=request)
+
+    assert result.tools_called
+    args = json.loads(result.tool_calls[0].function.arguments)
+    assert args["query"] == "null", (
+        f"String parameter 'null' was converted incorrectly. Got: {args.get('query')!r}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# anyOf nullable schema — type detection
+# ---------------------------------------------------------------------------
+
+
+def test_anyof_string_null_keeps_value_as_string(qwen3_tokenizer, parser_cls):
+    """anyOf [{type: string}, {type: null}] with a numeric-looking value
+    must keep the value as a string (the schema declares ``string``).
+
+    Root cause: anyOf was previously treated as ``object`` (for the Coder
+    parser) or fell back to ``string`` only when no object/array option
+    was present (for the XML parser). The correct behaviour is to pick
+    the FIRST non-null type from the anyOf list.
+    """
+    tools = [
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "set_code",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "code": {
+                            "anyOf": [{"type": "string"}, {"type": "null"}],
+                        },
+                    },
+                },
+            },
+        )
+    ]
+    parser = parser_cls(qwen3_tokenizer, tools=tools)
+    # NOTE(review): markup rebuilt from the tool schema and the newline
+    # layout of the literal; confirm against the upstream test.
+    model_output = (
+        "<tool_call>\n"
+        "<function=set_code>\n"
+        "<parameter=code>42</parameter>\n"
+        "</function>\n"
+        "</tool_call>"
+    )
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
+    result = parser.extract_tool_calls(model_output, request=request)
+
+    assert result.tools_called
+    args = json.loads(result.tool_calls[0].function.arguments)
+    assert args["code"] == "42", (
+        f"anyOf string|null param '42' was parsed as "
+        f"{type(args['code']).__name__}: {args['code']!r}"
+    )
+
+
+def test_anyof_integer_null_parses_as_int(qwen3_tokenizer, parser_cls):
+    """anyOf [{type: integer}, {type: null}] must parse a numeric value as
+    an int. Previously the XML parser ignored anyOf for non-container
+    types and silently treated the param as ``string``.
+    """
+    tools = [
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "set_count",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "count": {
+                            "anyOf": [{"type": "integer"}, {"type": "null"}],
+                        },
+                    },
+                },
+            },
+        )
+    ]
+    parser = parser_cls(qwen3_tokenizer, tools=tools)
+    # NOTE(review): markup rebuilt from the tool schema and the newline
+    # layout of the literal; confirm against the upstream test.
+    model_output = (
+        "<tool_call>\n"
+        "<function=set_count>\n"
+        "<parameter=count>42</parameter>\n"
+        "</function>\n"
+        "</tool_call>"
+    )
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
+    result = parser.extract_tool_calls(model_output, request=request)
+
+    assert result.tools_called
+    args = json.loads(result.tool_calls[0].function.arguments)
+    assert args["count"] == 42, (
+        f"anyOf integer|null: expected int 42, got {args['count']!r}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# anyOf object schema — value not double-encoded
+# ---------------------------------------------------------------------------
+
+# Single tool whose ``data`` argument is declared as object-or-null.
+_ANYOF_OBJECT_TOOLS = [
+    ChatCompletionToolsParam(
+        type="function",
+        function={
+            "name": "update_record",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "data": {
+                        "anyOf": [{"type": "object"}, {"type": "null"}],
+                    },
+                },
+            },
+        },
+    )
+]
+
+# NOTE(review): markup rebuilt from the tool schema above and the newline
+# layout of the literal; confirm against the upstream test module.
+_ANYOF_OBJECT_OUTPUT = (
+    "<tool_call>\n"
+    "<function=update_record>\n"
+    '<parameter=data>{"key": "value", "count": 42}</parameter>\n'
+    "</function>\n"
+    "</tool_call>"
+)
+
+
+def test_anyof_object_param_not_double_encoded_nonstreaming(
+    qwen3_tokenizer, parser_cls
+):
+    """Non-streaming: an ``anyOf`` object|null argument decodes to a dict."""
+    tool_parser = parser_cls(qwen3_tokenizer, tools=_ANYOF_OBJECT_TOOLS)
+    chat_request = ChatCompletionRequest(
+        model=MODEL, messages=[], tools=_ANYOF_OBJECT_TOOLS
+    )
+    extraction = tool_parser.extract_tool_calls(
+        _ANYOF_OBJECT_OUTPUT, request=chat_request
+    )
+
+    assert extraction.tools_called
+    decoded = json.loads(extraction.tool_calls[0].function.arguments)
+    data_value = decoded["data"]
+    assert isinstance(data_value, dict), (
+        f"anyOf object param was double-encoded: data={data_value!r}"
+    )
+    assert data_value == {"key": "value", "count": 42}
+
+
+def test_anyof_object_param_not_double_encoded_streaming(qwen3_tokenizer, parser_cls):
+ # Streaming counterpart of the non-streaming anyOf-object check: the
+ # reassembled ``data`` argument must decode to a dict.
+ parser = parser_cls(qwen3_tokenizer, tools=_ANYOF_OBJECT_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_ANYOF_OBJECT_TOOLS)
+ # NOTE(review): the deltas below show no tool-call markup — angle-bracket
+ # text looks stripped by extraction. Delta boundaries matter here, so
+ # confirm the exact chunks against the original file.
+ deltas = [
+ "",
+ "\n",
+ '\n{"key": "value", "count": 42}',
+ "\n",
+ "\n",
+ ]
+ reconstructor = run_tool_extraction_streaming(
+ parser, deltas, request, assert_one_tool_per_delta=False
+ )
+ assert len(reconstructor.tool_calls) == 1
+ args = json.loads(reconstructor.tool_calls[0].function.arguments)
+ assert isinstance(args["data"], dict), (
+ f"anyOf object param was double-encoded in streaming: data={args['data']!r}"
+ )
+
+
+# ---------------------------------------------------------------------------
+# anyOf / nullable (Pydantic v2 Optional[T]) type resolution.
+# Both parsers extract the first non-null type from the anyOf union.
+# ---------------------------------------------------------------------------
+
+# One tool covering several nullable-union spellings: ``anyOf`` with int,
+# str, array and object, the list-form ``type``, and a multi-type ``anyOf``.
+_ANYOF_TYPES_TOOLS = [
+    ChatCompletionToolsParam(
+        type="function",
+        function={
+            "name": "test_anyof",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "anyof_int": {
+                        "anyOf": [{"type": "integer"}, {"type": "null"}],
+                        "default": 5,
+                    },
+                    "anyof_str": {
+                        "anyOf": [{"type": "string"}, {"type": "null"}],
+                    },
+                    "anyof_array": {
+                        "anyOf": [
+                            {"type": "array", "items": {"type": "string"}},
+                            {"type": "null"},
+                        ],
+                    },
+                    "anyof_obj": {
+                        "anyOf": [{"type": "object"}, {"type": "null"}],
+                    },
+                    "type_as_array": {
+                        "type": ["integer", "null"],
+                    },
+                    "multi_non_null": {
+                        "anyOf": [
+                            {"type": "string"},
+                            {"type": "integer"},
+                            {"type": "null"},
+                        ],
+                    },
+                },
+            },
+        },
+    )
+]
+
+# NOTE(review): markup rebuilt from the schema's property order and the
+# newline layout of the literal; confirm against the upstream test module.
+_ANYOF_TYPES_OUTPUT = (
+    "<tool_call>\n"
+    "<function=test_anyof>\n"
+    "<parameter=anyof_int>5</parameter>\n"
+    "<parameter=anyof_str>hello</parameter>\n"
+    '<parameter=anyof_array>["a", "b", "c"]</parameter>\n'
+    '<parameter=anyof_obj>{"key": "value"}</parameter>\n'
+    "<parameter=type_as_array>42</parameter>\n"
+    "<parameter=multi_non_null>some text</parameter>\n"
+    "</function>\n"
+    "</tool_call>"
+)
+
+
+def test_extract_tool_calls_anyof_type_conversion(qwen3_tokenizer, parser_cls):
+    """Nullable ``anyOf`` schemas (Pydantic v2 ``Optional[T]``) resolve to the
+    first non-null type, and the value is converted accordingly: ``int()``,
+    list/dict via JSON, or string passthrough.
+    """
+    tool_parser = parser_cls(qwen3_tokenizer, tools=_ANYOF_TYPES_TOOLS)
+    chat_request = ChatCompletionRequest(
+        model=MODEL, messages=[], tools=_ANYOF_TYPES_TOOLS
+    )
+    extraction = tool_parser.extract_tool_calls(
+        _ANYOF_TYPES_OUTPUT, request=chat_request
+    )
+
+    assert extraction.tools_called
+    decoded = json.loads(extraction.tool_calls[0].function.arguments)
+
+    int_value = decoded["anyof_int"]
+    assert int_value == 5
+    assert isinstance(int_value, int)
+
+    str_value = decoded["anyof_str"]
+    assert str_value == "hello"
+    assert isinstance(str_value, str)
+
+    array_value = decoded["anyof_array"]
+    assert array_value == ["a", "b", "c"]
+    assert isinstance(array_value, list)
+
+    obj_value = decoded["anyof_obj"]
+    assert obj_value == {"key": "value"}
+    assert isinstance(obj_value, dict)
+
+    # JSON-Schema list-form type {"type": ["integer", "null"]} → int
+    list_form_value = decoded["type_as_array"]
+    assert list_form_value == 42
+    assert isinstance(list_form_value, int)
+
+    # anyOf[string, integer, null] → first non-null type is string
+    multi_value = decoded["multi_non_null"]
+    assert multi_value == "some text"
+    assert isinstance(multi_value, str)
+
+
+# Tool with nullable string, integer and boolean arguments for the streaming
+# anyOf test.
+_ANYOF_STREAMING_TOOLS = [
+    ChatCompletionToolsParam(
+        type="function",
+        function={
+            "name": "search_web",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "query": {
+                        "anyOf": [{"type": "string"}, {"type": "null"}],
+                    },
+                    "count": {
+                        "anyOf": [{"type": "integer"}, {"type": "null"}],
+                        "default": 5,
+                    },
+                    "verbose": {
+                        "anyOf": [{"type": "boolean"}, {"type": "null"}],
+                    },
+                },
+            },
+        },
+    )
+]
+
+# NOTE(review): markup rebuilt from the schema's property order and the
+# newline layout of the literal; confirm against the upstream test module.
+_ANYOF_STREAMING_OUTPUT = (
+    "<tool_call>\n"
+    "<function=search_web>\n"
+    "<parameter=query>vllm tool parser</parameter>\n"
+    "<parameter=count>10</parameter>\n"
+    "<parameter=verbose>true</parameter>\n"
+    "</function>\n"
+    "</tool_call>"
+)
+
+
+def test_extract_tool_calls_anyof_type_conversion_streaming(
+    qwen3_tokenizer, parser_cls
+):
+    """Streaming end-to-end check for nullable ``anyOf`` schemas: string,
+    integer and boolean arguments must come out with their declared types.
+    """
+    tool_parser = parser_cls(qwen3_tokenizer, tools=_ANYOF_STREAMING_TOOLS)
+    chat_request = ChatCompletionRequest(
+        model=MODEL, messages=[], tools=_ANYOF_STREAMING_TOOLS
+    )
+    reconstructor = run_tool_extraction_streaming(
+        tool_parser,
+        _ANYOF_STREAMING_OUTPUT,
+        chat_request,
+        assert_one_tool_per_delta=False,
+    )
+    assert len(reconstructor.tool_calls) == 1
+
+    only_call = reconstructor.tool_calls[0]
+    assert only_call.function.name == "search_web"
+    decoded = json.loads(only_call.function.arguments)
+
+    query_value = decoded["query"]
+    assert query_value == "vllm tool parser"
+    assert isinstance(query_value, str)
+
+    count_value = decoded["count"]
+    assert count_value == 10
+    assert isinstance(count_value, int)
+
+    verbose_value = decoded["verbose"]
+    assert verbose_value is True
+    assert isinstance(verbose_value, bool)
+
+
+# ---------------------------------------------------------------------------
+# Object param double-encoded as JSON-encoded Python repr
+# ---------------------------------------------------------------------------
+
+# Tool with one string argument and one object argument.
+_DOUBLE_ENCODED_TOOLS = [
+    ChatCompletionToolsParam(
+        type="function",
+        function={
+            "name": "process",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "name": {"type": "string"},
+                    "data": {"type": "object"},
+                },
+            },
+        },
+    )
+]
+
+# The ``data`` value is a JSON string wrapping a Python dict repr.
+# NOTE(review): markup rebuilt from the tool schema above and the newline
+# layout of the literal; confirm against the upstream test module.
+_DOUBLE_ENCODED_OUTPUT = (
+    "<tool_call>\n"
+    "<function=process>\n"
+    "<parameter=name>\nhello\n</parameter>\n"
+    "<parameter=data>\n\"{'key': 'value', 'n': 1}\"\n</parameter>\n"
+    "</function>\n"
+    "</tool_call>\n"
+)
+
+
+def test_double_encoded_object_param_nonstreaming(qwen3_tokenizer, parser_cls):
+    """Object arguments emitted as a JSON-encoded Python repr string (as a
+    buggy ``json.dumps(str(dict))`` template would produce) must be decoded
+    twice and come back as a dict.
+    """
+    tool_parser = parser_cls(qwen3_tokenizer, tools=_DOUBLE_ENCODED_TOOLS)
+    chat_request = ChatCompletionRequest(
+        model=MODEL, messages=[], tools=_DOUBLE_ENCODED_TOOLS
+    )
+    extraction = tool_parser.extract_tool_calls(
+        _DOUBLE_ENCODED_OUTPUT, request=chat_request
+    )
+
+    assert extraction.tools_called
+    decoded = json.loads(extraction.tool_calls[0].function.arguments)
+    assert decoded["name"] == "hello"
+    data_value = decoded["data"]
+    assert isinstance(data_value, dict), (
+        f"Expected dict, got {type(data_value)}: {data_value!r}"
+    )
+    assert data_value == {"key": "value", "n": 1}
+
+
+def test_double_encoded_object_param_streaming(qwen3_tokenizer, parser_cls):
+ parser = parser_cls(qwen3_tokenizer, tools=_DOUBLE_ENCODED_TOOLS)
+ request = ChatCompletionRequest(
+ model=MODEL, messages=[], tools=_DOUBLE_ENCODED_TOOLS
+ )
+ reconstructor = run_tool_extraction_streaming(
+ parser, _DOUBLE_ENCODED_OUTPUT, request, assert_one_tool_per_delta=False
+ )
+ assert len(reconstructor.tool_calls) == 1
+ args = json.loads(reconstructor.tool_calls[0].function.arguments)
+ assert args["name"] == "hello"
+ assert isinstance(args["data"], dict), (
+ f"Expected dict, got {type(args['data'])}: {args['data']!r}"
+ )
+ assert args["data"] == {"key": "value", "n": 1}
+
+
+# ---------------------------------------------------------------------------
+# Parameter value containing XML structural tags as literal text.
+# Expected: the value is preserved intact, no spurious extra parameters
+# are created from the embedded tags.
+# ---------------------------------------------------------------------------
+
+_WRITE_FILE_TOOLS = [
+ ChatCompletionToolsParam(
+ type="function",
+ function={
+ "name": "write_file",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "path": {"type": "string"},
+ "content": {"type": "string"},
+ },
+ },
+ },
+ )
+]
+
+_XML_TAGS_IN_CONTENT = (
+ "char_deltas = [\n"
+ ' "\\n",\n'
+ ' "\\n",\n'
+ ' "\\n\\n",\n'
+ ' "\\n",\n'
+ "]\n"
+)
+
+_WRITE_FILE_XML_TAGS_OUTPUT = (
+ "\n"
+ "\n"
+ "\ntest.py\n\n"
+ f"\n{_XML_TAGS_IN_CONTENT}\n"
+ "\n"
+ "\n"
+)
+
+
+def test_content_with_xml_structural_tags_nonstreaming(qwen3_tokenizer, parser_cls):
+ """Non-streaming: a string param whose value embeds XML structural
+ tags (the tool-call markup itself) as literal text must be
+ extracted intact, with no spurious extra params being created from
+ the embedded tags.
+ """
+ parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS)
+ result = parser.extract_tool_calls(_WRITE_FILE_XML_TAGS_OUTPUT, request=request)
+
+ assert result.tools_called
+ assert len(result.tool_calls) == 1
+ assert result.tool_calls[0].function.name == "write_file"
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert list(args.keys()) == ["path", "content"], (
+ f"Spurious params from embedded tags: {list(args.keys())}"
+ )
+ assert args["path"] == "test.py"
+ expected = _XML_TAGS_IN_CONTENT.rstrip("\n")
+ assert args["content"] == expected, (
+ f"content was truncated/corrupted. Got: {args.get('content')!r}"
+ )
+
+
+def test_content_with_xml_structural_tags_streaming(qwen3_tokenizer, parser_cls):
+ """Streaming variant: pre-formed chunks, full content in one delta."""
+ parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS)
+ char_deltas = [
+ "\n",
+ "\n",
+ "\ntest.py\n\n",
+ f"\n{_XML_TAGS_IN_CONTENT}\n",
+ "\n",
+ "\n",
+ ]
+ reconstructor = run_tool_extraction_streaming(
+ parser, char_deltas, request, assert_one_tool_per_delta=False
+ )
+ assert len(reconstructor.tool_calls) == 1
+ assert reconstructor.tool_calls[0].function.name == "write_file"
+ args = json.loads(reconstructor.tool_calls[0].function.arguments)
+ assert list(args.keys()) == ["path", "content"], (
+ f"Spurious params from embedded tags: {list(args.keys())}"
+ )
+ assert args["path"] == "test.py"
+ expected = _XML_TAGS_IN_CONTENT.rstrip("\n")
+ assert args["content"] == expected
+
+
+# ---------------------------------------------------------------------------
+# Parameter value containing ``<parameter=NAME>`` and ``</parameter>`` on
+# their OWN lines (Jinja2 templates, parser fixtures, etc.). Schema filtering
+# must prevent the unknown name from being treated as structural.
+# ---------------------------------------------------------------------------
+
+_CONTENT_WITH_PARAM_LIKE_LINES = (
+ 'TOOL_CALL_TEMPLATE = """\n'
+ "\n"
+ "\n"
+ "#!/usr/bin/env python3\n"
+ "\n"
+ '"""\n'
+)
+
+_WRITE_FILE_PARAM_LIKE_LINES_OUTPUT = (
+ "\n"
+ "\n"
+ "\ntest_template.py\n\n"
+ f"\n{_CONTENT_WITH_PARAM_LIKE_LINES}\n"
+ "\n"
+ "\n"
+)
+
+
+def test_content_with_param_like_lines_nonstreaming(qwen3_tokenizer, parser_cls):
+ """Non-streaming: ``<parameter=new_string>`` and ``</parameter>`` on their
+ own lines inside a string value must not terminate the parameter
+ early. Requires schema-based filtering so that ``new_string`` (not a
+ real parameter of write_file) is treated as literal text.
+ """
+ parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS)
+ result = parser.extract_tool_calls(
+ _WRITE_FILE_PARAM_LIKE_LINES_OUTPUT, request=request
+ )
+
+ assert result.tools_called
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert list(args.keys()) == ["path", "content"], (
+ f"Spurious params: {list(args.keys())}"
+ )
+ assert args["path"] == "test_template.py"
+ expected = _CONTENT_WITH_PARAM_LIKE_LINES.rstrip("\n")
+ assert args["content"] == expected, (
+ f"content truncated/wrong: {args.get('content')!r}"
+ )
+
+
+def test_content_with_param_like_lines_streaming(qwen3_tokenizer, parser_cls):
+ """Streaming variant: each structural-looking literal line arrives in
+ its own delta — the critical case is when ``</parameter>\\n`` appears
+ alone with empty lookahead, which must NOT be treated as a real
+ structural close.
+ """
+ parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS)
+ char_deltas = [
+ "\n",
+ "\n",
+ "\ntest_template.py\n\n",
+ '\nTOOL_CALL_TEMPLATE = """\n',
+ "\n", # literal close — alone in its delta
+ "\n", # literal new-param line
+ "#!/usr/bin/env python3\n",
+ "\n", # second literal close
+ '"""\n',
+ "\n", # REAL close of content
+ "\n",
+ "\n",
+ ]
+ reconstructor = run_tool_extraction_streaming(
+ parser, char_deltas, request, assert_one_tool_per_delta=False
+ )
+ assert len(reconstructor.tool_calls) == 1
+ args = json.loads(reconstructor.tool_calls[0].function.arguments)
+ assert list(args.keys()) == ["path", "content"], (
+ f"Spurious params: {list(args.keys())}"
+ )
+ assert args["path"] == "test_template.py"
+ expected = _CONTENT_WITH_PARAM_LIKE_LINES.rstrip("\n")
+ assert args["content"] == expected
+
+
+# ---------------------------------------------------------------------------
+# Array param containing JSON true/false/null
+# ---------------------------------------------------------------------------
+
+_ARRAY_TOOLS = [
+ ChatCompletionToolsParam(
+ type="function",
+ function={
+ "name": "pick",
+ "parameters": {
+ "type": "object",
+ "properties": {"items": {"type": "array"}},
+ },
+ },
+ )
+]
+
+_ARRAY_WITH_JSON_BOOL_OUTPUT = (
+ "\n\n"
+ '\n["a", "b", 1, true]\n\n'
+ "\n"
+)
+
+
+def test_array_with_json_bool(qwen3_tokenizer, parser_cls):
+ """An array param containing a JSON literal (``true``/``false``/``null``)
+ must be parsed as a real Python list, not wrapped as a string.
+
+ Root cause for the XML parser: the deferred path used
+ ``ast.literal_eval`` first, which doesn't understand JSON tokens.
+ Both parsers must try ``json.loads`` before falling back to
+ ``ast.literal_eval``.
+ """
+ parser = parser_cls(qwen3_tokenizer, tools=_ARRAY_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_ARRAY_TOOLS)
+ result = parser.extract_tool_calls(_ARRAY_WITH_JSON_BOOL_OUTPUT, request=request)
+
+ assert result.tools_called
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert isinstance(args["items"], list), (
+ f"Array with JSON bool was not parsed as list: "
+ f"{type(args['items']).__name__} = {args['items']!r}"
+ )
+ assert args["items"] == ["a", "b", 1, True]
+
+
+# ---------------------------------------------------------------------------
+# Speculative decoding: two complete tool calls in a single streaming delta.
+# Both parsers must emit both tool calls, not drop the second.
+# ---------------------------------------------------------------------------
+
+_WEATHER_TOOLS = [
+ ChatCompletionToolsParam(
+ type="function",
+ function={
+ "name": "get_weather",
+ "parameters": {
+ "type": "object",
+ "properties": {"city": {"type": "string"}},
+ },
+ },
+ )
+]
+
+_TWO_TOOL_CALLS_IN_ONE_CHUNK = (
+ "\n\n"
+ "\nParis\n\n"
+ "\n\n"
+ "\n\n"
+ "\nLondon\n\n"
+ "\n"
+)
+
+
+def test_two_tool_calls_in_one_streaming_chunk(qwen3_tokenizer, parser_cls):
+ """Speculative decoding flushes can deliver several full
+ ``<tool_call>...</tool_call>`` blocks in a single delta. Both must be
+ emitted; dropping the second one is a regression.
+ """
+ parser = parser_cls(qwen3_tokenizer, tools=_WEATHER_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WEATHER_TOOLS)
+ reconstructor = run_tool_extraction_streaming(
+ parser,
+ [_TWO_TOOL_CALLS_IN_ONE_CHUNK],
+ request,
+ assert_one_tool_per_delta=False,
+ )
+ assert len(reconstructor.tool_calls) == 2, (
+ f"Expected 2 tool calls in one delta, got {len(reconstructor.tool_calls)}"
+ )
+ args0 = json.loads(reconstructor.tool_calls[0].function.arguments)
+ args1 = json.loads(reconstructor.tool_calls[1].function.arguments)
+ assert args0 == {"city": "Paris"}
+ assert args1 == {"city": "London"}
+
+
+# ---------------------------------------------------------------------------
+# Python ``None`` for a nullable param, then trailing free text after the
+# LAST ``</tool_call>`` in the SAME delta (MTP / speculative decoding): the
+# text must be emitted as content; dropping it silently is a regression.
+# ---------------------------------------------------------------------------
+
+
+def test_python_none_value_for_nullable_int(qwen3_tokenizer, parser_cls):
+ """A Qwen3.5-trained model emits Python ``None`` (not ``null``) for a
+ nullable non-string parameter, because the Qwen3.5 chat template
+ renders ``args_value | string`` for non-container types — turning a
+ null arg from a previous tool call into the literal "None" in the
+ prompt. The model then learns to generate the same "None" verbatim.
+
+ The parser must recognise this and convert "None" to JSON null,
+ just like it already does for the literal "null" emitted by
+ Qwen3.6-trained models.
+ """
+ tools = [
+ ChatCompletionToolsParam(
+ type="function",
+ function={
+ "name": "set_count",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "count": {
+ "anyOf": [
+ {"type": "integer"},
+ {"type": "null"},
+ ],
+ },
+ },
+ },
+ },
+ )
+ ]
+ parser = parser_cls(qwen3_tokenizer, tools=tools)
+ model_output = (
+ "\n"
+ "\n"
+ "None\n"
+ "\n"
+ ""
+ )
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
+ result = parser.extract_tool_calls(model_output, request=request)
+
+ assert result.tools_called
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert args["count"] is None, (
+ f"Python repr None was not converted to JSON null. Got: {args['count']!r}"
+ )
+
+
+def test_streaming_two_tool_calls_plus_trailing_text_one_delta(
+ qwen3_tokenizer, parser_cls
+):
+ """MTP: a single delta delivers tool 1 + tool 2 + trailing free text.
+ Both tool calls must be emitted AND the trailing text must surface as
+ content in the same delta — not be silently dropped.
+ """
+ parser = parser_cls(qwen3_tokenizer, tools=_WEATHER_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WEATHER_TOOLS)
+ deltas = [
+ _TWO_TOOL_CALLS_IN_ONE_CHUNK + "\nAll done!",
+ ]
+ reconstructor = run_tool_extraction_streaming(
+ parser, deltas, request, assert_one_tool_per_delta=False
+ )
+ assert len(reconstructor.tool_calls) == 2, (
+ f"Expected 2 tool calls, got {len(reconstructor.tool_calls)}"
+ )
+ assert "All done!" in reconstructor.other_content, (
+ f"Trailing text after the second tool call was dropped. "
+ f"Got content: {reconstructor.other_content!r}"
+ )
+
+
+def test_streaming_trailing_text_with_final_close_in_same_delta(
+ qwen3_tokenizer, parser_cls
+):
+ """MTP / speculative decoding can deliver the closing ``</tool_call>``
+ together with trailing free text in a single delta. The text after
+ the close must be emitted as content rather than being silently
+ consumed by the parser's "advance to next tool" logic.
+ """
+ parser = parser_cls(qwen3_tokenizer, tools=_WEATHER_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WEATHER_TOOLS)
+ deltas = [
+ # Build up the tool call up to and including ``</parameter>``.
+ "\n\n"
+ "Paris\n",
+ # Then deliver ``</function></tool_call>`` + trailing text in ONE delta.
+ "\n\nI hope this helps!",
+ ]
+ reconstructor = run_tool_extraction_streaming(
+ parser, deltas, request, assert_one_tool_per_delta=False
+ )
+ assert len(reconstructor.tool_calls) == 1
+ assert "I hope this helps!" in reconstructor.other_content, (
+ f"Trailing text after was dropped. "
+ f"Got content: {reconstructor.other_content!r}"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Parameter value containing a literal ``<parameter=NAME>`` whose NAME IS
+# itself a real parameter of the same tool. The schema-based filter cannot
+# rule the literal out by name, so a stronger heuristic is required (e.g.
+# the literal does not pair with a structural ``</parameter>`` followed by
+# another structural delimiter). This is the exact pattern that breaks
+# qwen-code WriteFile when the file being written is itself a parser test
+# fixture.
+# ---------------------------------------------------------------------------
+
+_CONTENT_WITH_REAL_PARAM_NAME_LITERAL = (
+ 'doc = """\n\nliteral/value\n\n"""\n'
+)
+
+_REAL_PARAM_NAME_LITERAL_OUTPUT = (
+ "\n"
+ "\n"
+ "\nfixture.py\n\n"
+ f"\n{_CONTENT_WITH_REAL_PARAM_NAME_LITERAL}\n"
+ "\n"
+ ""
+)
+
+
+def test_content_with_real_param_name_literal_nonstreaming(qwen3_tokenizer, parser_cls):
+ """Non-streaming: parameter ``content`` value embeds
+ ``<parameter=path>...</parameter>`` where ``path`` IS the other real
+ parameter of the same ``write_file`` tool. Schema name filtering alone
+ cannot disambiguate — the parser must use a stronger rule (e.g. the
+ embedded ``</parameter>`` must be followed by a structural delimiter
+ that closes the OUTER param, not the inner literal).
+ """
+ parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS)
+ result = parser.extract_tool_calls(_REAL_PARAM_NAME_LITERAL_OUTPUT, request=request)
+
+ assert result.tools_called
+ assert len(result.tool_calls) == 1
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert list(args.keys()) == ["path", "content"], (
+ f"Spurious params from embedded same-name literal: {list(args.keys())}"
+ )
+ assert args["path"] == "fixture.py", (
+ f"Outer ``path`` was overwritten by embedded literal: {args.get('path')!r}"
+ )
+ expected = _CONTENT_WITH_REAL_PARAM_NAME_LITERAL.rstrip("\n")
+ assert args["content"] == expected, (
+ f"content was truncated at the embedded . "
+ f"Got: {args.get('content')!r}"
+ )
+
+
+def test_content_with_real_param_name_literal_streaming(qwen3_tokenizer, parser_cls):
+ """Streaming variant of the same case. Each meaningful structural-
+ looking line arrives in its own delta — the parser cannot wait for the
+ full text to disambiguate.
+ """
+ parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS)
+ char_deltas = [
+ "\n",
+ "\n",
+ "\nfixture.py\n\n",
+ '\ndoc = """\n',
+ "\n",
+ "literal/value\n",
+ "\n",
+ '"""\n',
+ "\n",
+ "\n",
+ "",
+ ]
+ reconstructor = run_tool_extraction_streaming(
+ parser, char_deltas, request, assert_one_tool_per_delta=False
+ )
+ assert len(reconstructor.tool_calls) == 1
+ args = json.loads(reconstructor.tool_calls[0].function.arguments)
+ assert list(args.keys()) == ["path", "content"], (
+ f"Spurious params from embedded same-name literal: {list(args.keys())}"
+ )
+ assert args["path"] == "fixture.py"
+ expected = _CONTENT_WITH_REAL_PARAM_NAME_LITERAL.rstrip("\n")
+ assert args["content"] == expected, (
+ f"content was truncated at the embedded . "
+ f"Got: {args.get('content')!r}"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Parameter value containing a COMPLETE nested tool_call (all structural tag
+# types: ``<tool_call>``, ``<function=NAME>``, ``<parameter=NAME>``,
+# ``</parameter>``, ``</function>``, ``</tool_call>``) — the qwen-code
+# WriteFile pattern when the file being written is itself a parser fixture
+# or a chat-template example. Every literal must stay inside the value; no
+# spurious extra tool calls or params should be generated.
+# ---------------------------------------------------------------------------
+
+_CONTENT_WITH_FULL_NESTED_CALL = (
+ 'doc = """\n'
+ "\n"
+ "\n"
+ "\n"
+ "literal/value.txt\n"
+ "\n"
+ "\n"
+ "hello\n"
+ "\n"
+ "\n"
+ "\n"
+ '"""\n'
+)
+
+_FULL_NESTED_CALL_OUTPUT = (
+ "\n"
+ "\n"
+ "\nfixture.py\n\n"
+ f"\n{_CONTENT_WITH_FULL_NESTED_CALL}\n"
+ "\n"
+ ""
+)
+
+
+def test_content_with_full_nested_tool_call_nonstreaming(qwen3_tokenizer, parser_cls):
+ """Non-streaming: parameter ``content`` contains a complete literal
+ ``<tool_call>...</tool_call>`` whose function/parameter names match
+ the OUTER tool's schema. Every literal must stay inside the value;
+ no extra tool call must be generated.
+ """
+ parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS)
+ result = parser.extract_tool_calls(_FULL_NESTED_CALL_OUTPUT, request=request)
+
+ assert result.tools_called
+ assert len(result.tool_calls) == 1, (
+ f"Expected 1 tool call (the outer one), got "
+ f"{len(result.tool_calls)} — embedded literal tool_call was "
+ f"incorrectly promoted to a real call."
+ )
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert list(args.keys()) == ["path", "content"]
+ assert args["path"] == "fixture.py"
+ expected = _CONTENT_WITH_FULL_NESTED_CALL.rstrip("\n")
+ assert args["content"] == expected, (
+ f"content truncated/corrupted: {args.get('content')!r}"
+ )
+
+
+def test_content_with_full_nested_tool_call_streaming(qwen3_tokenizer, parser_cls):
+ """Streaming variant: the literal nested ``<tool_call>...</tool_call>``
+ crosses many delta boundaries; the parser must not start a second
+ tool call.
+ """
+ parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS)
+ char_deltas = [
+ "\n",
+ "\n",
+ "\nfixture.py\n\n",
+ '\ndoc = """\n',
+ "\n",
+ "\n",
+ "\n",
+ "literal/value.txt\n",
+ "\n",
+ "\n",
+ "hello\n",
+ "\n",
+ "\n",
+ "\n",
+ '"""\n',
+ "\n",
+ "\n",
+ "",
+ ]
+ reconstructor = run_tool_extraction_streaming(
+ parser, char_deltas, request, assert_one_tool_per_delta=False
+ )
+ assert len(reconstructor.tool_calls) == 1, (
+ f"Expected 1 tool call, got {len(reconstructor.tool_calls)} — "
+ f"a literal nested was promoted to a real call."
+ )
+ args = json.loads(reconstructor.tool_calls[0].function.arguments)
+ assert list(args.keys()) == ["path", "content"]
+ assert args["path"] == "fixture.py"
+ expected = _CONTENT_WITH_FULL_NESTED_CALL.rstrip("\n")
+ assert args["content"] == expected, (
+ f"content truncated/corrupted: {args.get('content')!r}"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Two consecutive tool calls, where the SECOND embeds a literal nested
+# tool_call whose ``<parameter=NAME>`` uses a NAME that is NOT in the
+# OUTER tool's schema (e.g. a description of a different tool's format).
+# Reproduces the qwen-code Qwen 3.6 freeze scenario: the depth tracker
+# in ``_find_true_param_end`` filters opens by schema, so the literal
+# ``</parameter>`` that closes the unknown-NAME literal open appears
+# unmatched and matches the structural lookahead of the trailing
+# ``</function>``, truncating the OUTER content value.
+# ---------------------------------------------------------------------------
+
+_OUT_OF_SCHEMA_NESTED_CONTENT = (
+ 'template = """\n'
+ "\n\n"
+ "baz\n"
+ "\n\n"
+ '"""\n'
+)
+
+_TWO_TOOLS_OUT_OF_SCHEMA_NESTED_OUTPUT = (
+ "\n\n"
+ "baz\n"
+ "\n"
+ "\n\n"
+ "\n\n"
+ "\nfixture.py\n\n"
+ f"\n{_OUT_OF_SCHEMA_NESTED_CONTENT}\n"
+ "\n"
+)
+
+
+def test_two_tools_second_with_out_of_schema_nested_literal_nonstreaming(
+ qwen3_tokenizer, parser_cls
+):
+ """Two structural tool calls; the second's ``content`` value embeds a
+ literal nested ``<tool_call>`` block whose inner ``<parameter=NAME>``
+ uses a NAME not in the outer tool's schema (``write_file`` only knows
+ ``path`` and ``content``).
+
+ The walker must still match the outer ``</parameter>`` of ``content``,
+ not the literal ``</parameter>`` of the unknown-NAME nested open.
+ """
+ parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS)
+ result = parser.extract_tool_calls(
+ _TWO_TOOLS_OUT_OF_SCHEMA_NESTED_OUTPUT, request=request
+ )
+ assert result.tools_called
+ assert len(result.tool_calls) == 2, (
+ f"Expected 2 tool calls, got {len(result.tool_calls)}: "
+ f"{[tc.function.name for tc in result.tool_calls]}"
+ )
+ args0 = json.loads(result.tool_calls[0].function.arguments)
+ args1 = json.loads(result.tool_calls[1].function.arguments)
+ assert args0 == {"bar": "baz"}, f"first tool args wrong: {args0!r}"
+ assert result.tool_calls[1].function.name == "write_file"
+ assert list(args1.keys()) == ["path", "content"], (
+ f"Spurious params on outer tool: {list(args1.keys())}"
+ )
+ assert args1["path"] == "fixture.py"
+ expected = _OUT_OF_SCHEMA_NESTED_CONTENT.rstrip("\n")
+ assert args1["content"] == expected, (
+ f"outer content truncated at literal : {args1.get('content')!r}"
+ )
+
+
+def test_two_tools_second_with_out_of_schema_nested_literal_streaming(
+ qwen3_tokenizer, parser_cls
+):
+ """Streaming variant of the same scenario."""
+ parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS)
+ char_deltas = [
+ "\n\n",
+ "baz\n",
+ "\n",
+ "\n\n",
+ "\n\n",
+ "\nfixture.py\n\n",
+ '\ntemplate = """\n',
+ "\n\n",
+ "baz\n",
+ "\n\n",
+ '"""\n',
+ "\n",
+ "\n",
+ "",
+ ]
+ reconstructor = run_tool_extraction_streaming(
+ parser, char_deltas, request, assert_one_tool_per_delta=False
+ )
+ assert len(reconstructor.tool_calls) == 2, (
+ f"Expected 2 tool calls, got {len(reconstructor.tool_calls)}"
+ )
+ args0 = json.loads(reconstructor.tool_calls[0].function.arguments)
+ args1 = json.loads(reconstructor.tool_calls[1].function.arguments)
+ assert args0 == {"bar": "baz"}
+ assert reconstructor.tool_calls[1].function.name == "write_file"
+ assert list(args1.keys()) == ["path", "content"]
+ assert args1["path"] == "fixture.py"
+ expected = _OUT_OF_SCHEMA_NESTED_CONTENT.rstrip("\n")
+ assert args1["content"] == expected, (
+ f"outer content truncated/corrupted: {args1.get('content')!r}"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Phantom tool calls produced when the model writes an UNRENDERED Jinja
+# template literally in its response: ``<tool_call>\n<function={{ x }}>\n
+# ...``. The function name ``{{ x }}`` contains
+# template-syntax characters and CANNOT be a real function — the parser
+# must reject these tool calls (or render them as content) rather than
+# emit them as real ones, since the client will then raise "tool not
+# found" errors and cause the agent to loop.
+# ---------------------------------------------------------------------------
+
+_JINJA_PHANTOM_OUTPUT = (
+ "\n\n"
+ "\n{{ v }}\n\n"
+ "\n"
+ "\n\n"
+ "\n\n"
+ "\nout.txt\n\n"
+ "\nhello\n\n"
+ "\n"
+)
+
+
+def test_jinja_template_phantom_tool_call_is_rejected_nonstreaming(
+ qwen3_tokenizer, parser_cls
+):
+ """A ``<function={{ tc.name }}>`` block (unrendered Jinja) emits a
+ function name that is not a valid identifier. It must NOT be
+ surfaced as a real tool call — the client would fail with "tool not
+ found" and the agent would loop.
+ """
+ tools = [
+ ChatCompletionToolsParam(
+ type="function",
+ function={
+ "name": "write_file",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "path": {"type": "string"},
+ "content": {"type": "string"},
+ },
+ },
+ },
+ )
+ ]
+ parser = parser_cls(qwen3_tokenizer, tools=tools)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
+ result = parser.extract_tool_calls(_JINJA_PHANTOM_OUTPUT, request=request)
+ assert result.tools_called
+ names = [tc.function.name for tc in result.tool_calls]
+ assert "{{ tc.name }}" not in names, (
+ f"Phantom Jinja-template tool call surfaced as real: {names}"
+ )
+ assert names == ["write_file"], (
+ f"Expected only the real ``write_file`` tool call, got: {names}"
+ )
+
+
+# NOTE: a streaming counterpart of the above test is intentionally not
+# added. Filtering phantoms in streaming requires a separate
+# "client-visible index" counter (the existing ``current_tool_index`` is
+# also used for internal position bookkeeping). Until that refactor
+# lands, the streaming path may still surface phantoms and the client
+# is expected to drop unknown function names. The non-streaming path
+# is the one consumed by the offline tools-extraction code and by the
+# ``_parse_xml_function_call`` helper invoked at function-end during
+# streaming, so production users still see the filtered result for
+# completed tool calls.
+
+
+# ---------------------------------------------------------------------------
+# Inline empty ``<tool_call>...</tool_call>`` (no ``<function=...>``) before a
+# real tool call: the content text BETWEEN the inline literal and the real
+# tool call must be preserved. Previously the content was truncated at the
+# position of the FIRST ``<tool_call>`` token regardless of whether that
+# block contained a real ``<function=...>``.
+# ---------------------------------------------------------------------------
+
+
+def test_inline_empty_tool_call_preserves_content_before_real_call(
+ qwen3_tokenizer, parser_cls
+):
+ """A bare ``<tool_call>example</tool_call>`` in the model's narrative
+ text (no ``<function=...>`` inside) must NOT consume the surrounding
+ content; only the real ``<tool_call>`` block that contains a valid
+ function call should anchor ``content_index``.
+
+ The XML parser's SAX-based pipeline consumes the inline empty
+ block's body as XML text (so ``example`` is dropped), but the
+ surrounding narrative ("I'll show:" and "Now real:") must still be
+ preserved — both parsers are checked.
+ """
+ tools = [
+ ChatCompletionToolsParam(
+ type="function",
+ function={
+ "name": "log",
+ "parameters": {
+ "type": "object",
+ "properties": {"msg": {"type": "string"}},
+ },
+ },
+ )
+ ]
+ parser = parser_cls(qwen3_tokenizer, tools=tools)
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
+
+ text = (
+ "I'll show: example. Now real:\n"
+ "\n\n\nhi\n\n"
+ "\n"
+ )
+ result = parser.extract_tool_calls(text, request=request)
+ assert result.tools_called
+ assert len(result.tool_calls) == 1
+ assert result.tool_calls[0].function.name == "log"
+ # Content between the inline empty tool_call and the real one MUST be
+ # preserved — dropping it loses the model's contextual narrative.
+ assert result.content is not None
+ assert "I'll show:" in result.content, (
+ f"Pre-inline narrative lost from content: {result.content!r}"
+ )
+ assert "Now real:" in result.content, (
+ f"Content between inline literal and real tool_call lost: {result.content!r}"
+ )
+
+
+# ---------------------------------------------------------------------------
+# anyOf [{type: string}, {type: null}] with the literal "null" or "None"
+# value must convert to JSON null, NOT preserve as the string "null"/"None".
+# Observed against a real Qwen 3.6 server: the model emits ``None`` for a
+# nullable optional parameter and the parser kept it as the string "None",
+# breaking nullable-typed clients.
+# ---------------------------------------------------------------------------
+
+
+def test_anyof_string_null_with_null_literal_returns_none(qwen3_tokenizer, parser_cls):
+ """anyOf [{type: string}, {type: null}] with value "null" or "None"
+ must convert to JSON null. String-typed paths preserve the literal,
+ but a nullable schema MUST recognise the null sentinel — otherwise
+ the client receives the literal "null" / "None" string and downstream
+ type checks fail.
+ """
+ tools = [
+ ChatCompletionToolsParam(
+ type="function",
+ function={
+ "name": "set_value",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "optional": {
+ "anyOf": [{"type": "string"}, {"type": "null"}],
+ },
+ },
+ },
+ },
+ )
+ ]
+ request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
+ for literal in ("null", "None"):
+ parser = parser_cls(qwen3_tokenizer, tools=tools)
+ model_output = (
+ "\n"
+ "\n"
+ f"{literal}\n"
+ "\n"
+ ""
+ )
+ result = parser.extract_tool_calls(model_output, request=request)
+ assert result.tools_called
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert args["optional"] is None, (
+ f"anyOf string|null with value {literal!r} was kept as "
+ f"{type(args['optional']).__name__}: {args['optional']!r}"
+ )
+
+
+def test_get_vllm_registry_structural_tag_returns_structural_tag(
+ parser,
+ sample_tools: list[ChatCompletionToolsParam],
+) -> None:
+ request_tools = _as_chat_completion_tools(sample_tools)
+ req = ChatCompletionRequest(
+ messages=[],
+ model="m",
+ tools=request_tools,
+ tool_choice="auto",
+ )
+ tag = parser.get_structural_tag(req)
+ assert isinstance(tag, StructuralTag)
+
+ req = ChatCompletionRequest(
+ messages=[],
+ model="m",
+ tools=request_tools,
+ tool_choice="required",
+ )
+ tag = parser.get_structural_tag(req)
+ assert isinstance(tag, StructuralTag)
+
+ if request_tools:
+ tool = request_tools[0]
+ req = ChatCompletionRequest(
+ messages=[],
+ model="m",
+ tools=request_tools,
+ )
+ req.tool_choice = ChatCompletionNamedToolChoiceParam(
+ function=ChatCompletionNamedFunction(name=tool.function.name)
+ )
+ tag = parser.get_structural_tag(req)
+ assert isinstance(tag, StructuralTag)
+
+
+@pytest.mark.parametrize("include_reasoning", [True, False])
+def test_adjust_request_auto_uses_vllm_registry_structural_tag(
+ monkeypatch: pytest.MonkeyPatch,
+ parser,
+ sample_tools: list[ChatCompletionToolsParam],
+ include_reasoning: bool,
+) -> None:
+ monkeypatch.setattr(
+ "vllm.tool_parsers.abstract_tool_parser.VLLM_ENFORCE_STRICT_TOOL_CALLING",
+ True,
+ )
+ request_tools = _as_chat_completion_tools(sample_tools)
+ req = ChatCompletionRequest(
+ messages=[],
+ model="m",
+ tools=request_tools,
+ tool_choice="auto",
+ include_reasoning=include_reasoning,
+ )
+ out = parser.adjust_request(req)
+ assert out.structured_outputs is not None
+ assert out.structured_outputs.structural_tag is not None
+ assert isinstance(out.structured_outputs.structural_tag, str)
+ loaded = json.loads(out.structured_outputs.structural_tag)
+ assert isinstance(loaded, dict)
+
+
+def test_adjust_request_required_prefers_structural_tag(
+ monkeypatch: pytest.MonkeyPatch,
+ parser,
+ sample_tools: list[ChatCompletionToolsParam],
+) -> None:
+ monkeypatch.setattr(
+ "vllm.tool_parsers.abstract_tool_parser.VLLM_ENFORCE_STRICT_TOOL_CALLING",
+ True,
+ )
+ request_tools = _as_chat_completion_tools(sample_tools)
+ req = ChatCompletionRequest(
+ messages=[],
+ model="m",
+ tools=request_tools,
+ tool_choice="required",
+ )
+ out = parser.adjust_request(req)
+ assert out.structured_outputs is not None
+ assert out.structured_outputs.structural_tag is not None
diff --git a/tests/tool_parsers/test_qwen3coder_tool_parser.py b/tests/tool_parsers/test_qwen3coder_tool_parser.py
index defc6d23eff4..9ff5a933a515 100644
--- a/tests/tool_parsers/test_qwen3coder_tool_parser.py
+++ b/tests/tool_parsers/test_qwen3coder_tool_parser.py
@@ -1,30 +1,24 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Coder-parser-specific tests.
+
+Tests that exercise behaviour shared with the XML parser live in
+``tests/tool_parsers/test_qwen3_xml_coder_shared.py``. Only tests that
+depend on Coder-only API (e.g. ``is_tool_call_started``) or on Coder-only
+streaming behaviour (e.g. character-by-character chunking) belong here.
+"""
+
import json
-from collections.abc import Generator
import pytest
-from openai.types.responses.function_tool import FunctionTool
-from xgrammar import StructuralTag
from vllm.entrypoints.openai.chat_completion.protocol import (
- ChatCompletionNamedFunction,
- ChatCompletionNamedToolChoiceParam,
ChatCompletionRequest,
- ChatCompletionToolsParam,
-)
-from vllm.entrypoints.openai.engine.protocol import (
- DeltaMessage,
- FunctionCall,
- ToolCall,
-)
-from vllm.tokenizers import TokenizerLike, get_tokenizer
-from vllm.tokenizers.detokenizer_utils import detokenize_incrementally
-from vllm.tool_parsers.qwen3coder_tool_parser import (
- Qwen3CoderToolParser,
)
-from vllm.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser
+from vllm.tokenizers import get_tokenizer
+from vllm.tool_parsers.qwen3coder_tool_parser import Qwen3CoderToolParser
MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
@@ -35,1407 +29,432 @@ def qwen3_tokenizer():
@pytest.fixture
-def qwen3_tool_parser(qwen3_tokenizer, sample_tools):
- return Qwen3CoderToolParser(qwen3_tokenizer, tools=sample_tools)
-
-
-@pytest.fixture
-def qwen3_xml_tool_parser(qwen3_tokenizer, sample_tools):
- return Qwen3XMLToolParser(qwen3_tokenizer, tools=sample_tools)
-
-
-@pytest.fixture(params=["xml"])
-def qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser, request):
- """Parameterized fixture that provides both parser types for testing"""
- if request.param == "original":
- return qwen3_tool_parser
- else:
- return qwen3_xml_tool_parser
-
-
-WEATHER_PARAMS = {
- "type": "object",
- "properties": {
- "city": {"type": "string", "description": "The city name"},
- "state": {"type": "string", "description": "The state code"},
- "unit": {"type": "string", "enum": ["fahrenheit", "celsius"]},
- },
- "required": ["city", "state"],
-}
-
-AREA_PARAMS = {
- "type": "object",
- "properties": {
- "shape": {"type": "string"},
- "dimensions": {"type": "object"},
- "precision": {"type": "integer"},
- },
-}
-
-
-@pytest.fixture(params=["chat_completion", "responses_api"])
-def sample_tools(request):
- if request.param == "chat_completion":
- return [
- ChatCompletionToolsParam(
- type="function",
- function={
- "name": "get_current_weather",
- "description": "Get the current weather",
- "parameters": WEATHER_PARAMS,
- },
- ),
- ChatCompletionToolsParam(
- type="function",
- function={
- "name": "calculate_area",
- "description": "Calculate area of a shape",
- "parameters": AREA_PARAMS,
- },
- ),
- ]
- else:
- return [
- FunctionTool(
- type="function",
- name="get_current_weather",
- description="Get the current weather",
- parameters=WEATHER_PARAMS,
- ),
- FunctionTool(
- type="function",
- name="calculate_area",
- description="Calculate area of a shape",
- parameters=AREA_PARAMS,
- ),
- ]
-
-
-def _as_chat_completion_tools(
- tools: list[ChatCompletionToolsParam | FunctionTool],
-) -> list[ChatCompletionToolsParam]:
- normalized: list[ChatCompletionToolsParam] = []
- for tool in tools:
- if isinstance(tool, ChatCompletionToolsParam):
- normalized.append(tool)
- else:
- normalized.append(
- ChatCompletionToolsParam(
- type="function",
- function={
- "name": tool.name,
- "description": tool.description,
- "parameters": tool.parameters,
- },
- )
- )
- return normalized
-
-
-def assert_tool_calls(
- actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall]
-):
- assert len(actual_tool_calls) == len(expected_tool_calls)
-
- for actual_tool_call, expected_tool_call in zip(
- actual_tool_calls, expected_tool_calls
- ):
- # Qwen3 parser doesn't generate IDs during extraction
- assert actual_tool_call.type == "function"
- assert actual_tool_call.function.name == expected_tool_call.function.name
- assert json.loads(actual_tool_call.function.arguments) == json.loads(
- expected_tool_call.function.arguments
- )
-
+def qwen3_tool_parser(qwen3_tokenizer):
+    """Coder parser on the shared tokenizer, constructed with ``tools=None``."""
+    return Qwen3CoderToolParser(qwen3_tokenizer, tools=None)
-def stream_delta_message_generator(
- qwen3_tool_parser,
- qwen3_tokenizer: TokenizerLike,
- model_output: str,
- request: ChatCompletionRequest | None = None,
-) -> Generator[DeltaMessage, None, None]:
- all_token_ids = qwen3_tokenizer.encode(model_output, add_special_tokens=False)
- previous_text = ""
- previous_tokens = None
- prefix_offset = 0
- read_offset = 0
- for i, delta_token in enumerate(all_token_ids):
- delta_token_ids = [delta_token]
- previous_token_ids = all_token_ids[:i]
- current_token_ids = all_token_ids[: i + 1]
-
- (new_tokens, delta_text, new_prefix_offset, new_read_offset) = (
- detokenize_incrementally(
- tokenizer=qwen3_tokenizer,
- all_input_ids=current_token_ids,
- prev_tokens=previous_tokens,
- prefix_offset=prefix_offset,
- read_offset=read_offset,
- skip_special_tokens=False,
- spaces_between_special_tokens=True,
- )
- )
-
- current_text = previous_text + delta_text
-
- delta_message = qwen3_tool_parser.extract_tool_calls_streaming(
- previous_text,
- current_text,
- delta_text,
- previous_token_ids,
- current_token_ids,
- delta_token_ids,
- request=request,
- )
- if delta_message:
- yield delta_message
-
- previous_text = current_text
- previous_tokens = (
- previous_tokens + new_tokens if previous_tokens else new_tokens
- )
- prefix_offset = new_prefix_offset
- read_offset = new_read_offset
-
-
-def test_extract_tool_calls_no_tools(qwen3_tool_parser_parametrized):
- model_output = "This is a test response without any tool calls"
- extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls(
- model_output, request=None
- ) # type: ignore[arg-type]
- assert not extracted_tool_calls.tools_called
- assert extracted_tool_calls.tool_calls == []
- assert extracted_tool_calls.content == model_output
-
-
-@pytest.mark.parametrize(
- ids=[
- "single_tool",
- "single_tool_with_content",
- "single_tool_multiline_param",
- "parallel_tools",
- "tool_with_typed_params",
- ],
- argnames=["model_output", "expected_tool_calls", "expected_content"],
- argvalues=[
- (
- """
-
-
-Dallas
-
-
-TX
-
-
-fahrenheit
-
-
-""",
- [
- ToolCall(
- function=FunctionCall(
- name="get_current_weather",
- arguments=json.dumps(
- {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
- ),
- )
- )
- ],
- None,
- ),
- (
- """Sure! Let me check the weather for you.
-
-
-Dallas
-
-
-TX
-
-
-fahrenheit
-
-
-""",
- [
- ToolCall(
- function=FunctionCall(
- name="get_current_weather",
- arguments=json.dumps(
- {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
- ),
- )
- )
- ],
- "Sure! Let me check the weather for you.",
- ),
- (
- """
-
-
-rectangle
-
-
-{"width": 10,
- "height": 20}
-
-
-2
-
-
-""",
- [
- ToolCall(
- function=FunctionCall(
- name="calculate_area",
- arguments=json.dumps(
- {
- "shape": "rectangle",
- "dimensions": {"width": 10, "height": 20},
- "precision": 2,
- }
- ),
- )
- )
- ],
- None,
- ),
- (
- """
-
-
-Dallas
-
-
-TX
-
-
-fahrenheit
-
-
-
-
-
-
-Orlando
-
-
-FL
-
-
-fahrenheit
-
-
-""",
- [
- ToolCall(
- function=FunctionCall(
- name="get_current_weather",
- arguments=json.dumps(
- {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
- ),
- )
- ),
- ToolCall(
- function=FunctionCall(
- name="get_current_weather",
- arguments=json.dumps(
- {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}
- ),
- )
- ),
- ],
- None,
- ),
- (
- """Let me calculate that area for you.
-
-
-circle
-
-
-{"radius": 15.5}
-
-
-3
-
-
-""",
- [
- ToolCall(
- function=FunctionCall(
- name="calculate_area",
- arguments=json.dumps(
- {
- "shape": "circle",
- "dimensions": {"radius": 15.5},
- "precision": 3,
- }
- ),
- )
- )
- ],
- "Let me calculate that area for you.",
- ),
- ],
-)
-def test_extract_tool_calls(
- qwen3_tool_parser_parametrized,
- model_output,
- expected_tool_calls,
- expected_content,
-):
- request = ChatCompletionRequest(model=MODEL, messages=[])
- extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls(
- model_output, request=request
- )
- assert extracted_tool_calls.tools_called
-
- assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls)
-
- assert extracted_tool_calls.content == expected_content
-
-
-def test_extract_tool_calls_fallback_no_tags(
- qwen3_tool_parser_parametrized,
+def test_streaming_trailing_text_after_tool_with_literal_close_tag_in_value(
+ qwen3_tokenizer,
):
- """Test fallback parsing when XML tags are missing"""
- model_output = """
-
-Dallas
-
-
-TX
-
-"""
-
- request = ChatCompletionRequest(model=MODEL, messages=[])
- extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls(
- model_output, request=request
+    """A tool call's parameter value contains a literal ``</tool_call>``
+ string. After the real tool call closes, trailing free text must
+ still be emitted as content.
+
+ The naive ``current_text.count()`` and
+ ``current_text.find()`` used by the early-advance and
+ ``_advance_to_next_tool`` logic don't distinguish literal text from
+ structural delimiters. This can cause ``_sent_content_idx`` to land
+ INSIDE the tool's parameter value, after which the trailing text
+ fails to be emitted.
+ """
+ from vllm.entrypoints.openai.chat_completion.protocol import (
+ ChatCompletionToolsParam,
)
- assert extracted_tool_calls.tools_called
- assert len(extracted_tool_calls.tool_calls) == 1
- assert extracted_tool_calls.tool_calls[0].function.name == "get_current_weather"
-
-
-def test_extract_tool_calls_type_conversion(qwen3_tokenizer):
- """Test parameter type conversion based on tool schema"""
tools = [
ChatCompletionToolsParam(
type="function",
function={
- "name": "test_types",
+ "name": "write_file",
"parameters": {
"type": "object",
"properties": {
- "int_param": {"type": "integer"},
- "float_param": {"type": "float"},
- "bool_param": {"type": "boolean"},
- "str_param": {"type": "string"},
- "obj_param": {"type": "object"},
+ "path": {"type": "string"},
+ "content": {"type": "string"},
},
},
},
)
]
-
- model_output = """
-
-
-42
-
-
-3.14
-
-
-true
-
-
-hello world
-
-
-{"key": "value"}
-
-
-"""
-
- parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tools)
+ parser = Qwen3CoderToolParser(qwen3_tokenizer, tools=tools)
request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
- extracted_tool_calls = parser.extract_tool_calls(model_output, request=request)
- args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments)
- assert args["int_param"] == 42
- assert args["float_param"] == 3.14
- assert args["bool_param"] is True
- assert args["str_param"] == "hello world"
- assert args["obj_param"] == {"key": "value"}
+    # The parameter value contains a literal ``</tool_call>`` string.
+    # The real ``</tool_call>`` follows after ``</function>``.
+ delta_1 = (
+ "\n\n"
+ "foo.py\n"
+ "\n"
+ "doc = 'example'\n"
+ "\n\n"
+ )
+ parser.extract_tool_calls_streaming(
+ previous_text="",
+ current_text=delta_1,
+ delta_text=delta_1,
+ previous_token_ids=[],
+ current_token_ids=[1],
+ delta_token_ids=[1],
+ request=request,
+ )
+ delta_2 = "\nDone, file written!"
+ text2 = delta_1 + delta_2
+ msg2 = parser.extract_tool_calls_streaming(
+ previous_text=delta_1,
+ current_text=text2,
+ delta_text=delta_2,
+ previous_token_ids=[1],
+ current_token_ids=[1, 2],
+ delta_token_ids=[2],
+ request=request,
+ )
+ contents = []
+ if msg2 and msg2.content:
+ contents.append(msg2.content)
+ # EOS-style empty delta to flush
+ msg3 = parser.extract_tool_calls_streaming(
+ previous_text=text2,
+ current_text=text2,
+ delta_text="",
+ previous_token_ids=[1, 2],
+ current_token_ids=[1, 2, 3],
+ delta_token_ids=[3],
+ request=request,
+ )
+ if msg3 and msg3.content:
+ contents.append(msg3.content)
+
+ full = "".join(contents)
+ assert "Done, file written!" in full, (
+ f"Trailing text after a tool call whose parameter value contains "
+ f"a literal was dropped. Got content: {full!r}"
+ )
-def test_extract_tool_calls_anyof_type_conversion(qwen3_tokenizer):
- """Test type conversion for anyOf/oneOf nullable schemas (Pydantic v2).
- Pydantic v2 emits anyOf for Optional[T] fields, e.g.:
- Optional[int] -> {"anyOf": [{"type": "integer"}, {"type": "null"}]}
- The parser must extract the non-null type and apply the correct
- conversion (int(), float(), etc.) instead of returning a raw string.
+def test_streaming_second_tool_after_first_with_literal_close_tag_in_value(
+ qwen3_tokenizer,
+):
+    """A first tool call's parameter value contains a literal
+    ``</tool_call>``. A SECOND structural tool call follows after the
+    real ``</tool_call>``. Both tool calls and any inter-call content
+ must be emitted correctly.
"""
+ from vllm.entrypoints.openai.chat_completion.protocol import (
+ ChatCompletionToolsParam,
+ )
+
tools = [
ChatCompletionToolsParam(
type="function",
function={
- "name": "test_anyof",
+ "name": "write_file",
"parameters": {
"type": "object",
"properties": {
- "anyof_int": {
- "anyOf": [
- {"type": "integer"},
- {"type": "null"},
- ],
- "default": 5,
- },
- "anyof_str": {
- "anyOf": [
- {"type": "string"},
- {"type": "null"},
- ],
- },
- "anyof_array": {
- "anyOf": [
- {"type": "array", "items": {"type": "string"}},
- {"type": "null"},
- ],
- },
- "anyof_obj": {
- "anyOf": [
- {"type": "object"},
- {"type": "null"},
- ],
- },
- "type_as_array": {
- "type": ["integer", "null"],
- },
- "multi_non_null": {
- "anyOf": [
- {"type": "string"},
- {"type": "integer"},
- {"type": "null"},
- ],
- },
+ "path": {"type": "string"},
+ "content": {"type": "string"},
},
},
},
- )
- ]
-
- model_output = """
-
-
-5
-
-
-hello
-
-
-["a", "b", "c"]
-
-
-{"key": "value"}
-
-
-42
-
-
-some text
-
-
-"""
-
- parser = Qwen3CoderToolParser(qwen3_tokenizer, tools=tools)
- request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
- extracted = parser.extract_tool_calls(model_output, request=request)
-
- args = json.loads(extracted.tool_calls[0].function.arguments)
- assert args["anyof_int"] == 5
- assert isinstance(args["anyof_int"], int)
- assert args["anyof_str"] == "hello"
- assert isinstance(args["anyof_str"], str)
- assert args["anyof_array"] == ["a", "b", "c"]
- assert isinstance(args["anyof_array"], list)
- assert args["anyof_obj"] == {"key": "value"}
- assert isinstance(args["anyof_obj"], dict)
- assert args["type_as_array"] == 42
- assert isinstance(args["type_as_array"], int)
- # Multi non-null: anyOf[string, integer, null] → first non-null is string
- assert args["multi_non_null"] == "some text"
- assert isinstance(args["multi_non_null"], str)
-
-
-def test_extract_tool_calls_anyof_type_conversion_streaming(qwen3_tokenizer):
- """Test streaming e2e for anyOf/oneOf nullable schemas (Pydantic v2).
-
- Verifies that the full streaming pipeline — tokenize, incrementally
- decode, extract_tool_calls_streaming — correctly resolves types from
- anyOf schemas and produces valid JSON with properly typed values.
- """
- tools = [
+ ),
ChatCompletionToolsParam(
type="function",
function={
- "name": "search_web",
+ "name": "log",
"parameters": {
"type": "object",
- "properties": {
- "query": {
- "anyOf": [
- {"type": "string"},
- {"type": "null"},
- ],
- },
- "count": {
- "anyOf": [
- {"type": "integer"},
- {"type": "null"},
- ],
- "default": 5,
- },
- "verbose": {
- "anyOf": [
- {"type": "boolean"},
- {"type": "null"},
- ],
- },
- },
+ "properties": {"msg": {"type": "string"}},
},
},
- )
+ ),
]
-
- model_output = """
-
-
-vllm tool parser
-
-
-10
-
-
-true
-
-
-"""
-
parser = Qwen3CoderToolParser(qwen3_tokenizer, tools=tools)
request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
- tool_states = {}
- for delta_message in stream_delta_message_generator(
- parser, qwen3_tokenizer, model_output, request
- ):
- if delta_message.tool_calls:
- for tool_call in delta_message.tool_calls:
- idx = tool_call.index
- if idx not in tool_states:
- tool_states[idx] = {"name": None, "arguments": ""}
- if tool_call.function:
- if tool_call.function.name:
- tool_states[idx]["name"] = tool_call.function.name
- if tool_call.function.arguments is not None:
- tool_states[idx]["arguments"] += tool_call.function.arguments
-
- assert len(tool_states) == 1
- assert tool_states[0]["name"] == "search_web"
- assert tool_states[0]["arguments"] is not None
- args = json.loads(tool_states[0]["arguments"])
- assert args["query"] == "vllm tool parser"
- assert isinstance(args["query"], str)
- assert args["count"] == 10
- assert isinstance(args["count"], int)
- assert args["verbose"] is True
- assert isinstance(args["verbose"], bool)
-
-
-@pytest.mark.parametrize(
- ids=[
- "no_tools",
- "single_tool",
- "single_tool_with_content",
- "single_tool_multiline_param",
- "parallel_tools",
- "tool_with_typed_params", # Added this test case
- ],
- argnames=["model_output", "expected_tool_calls", "expected_content"],
- argvalues=[
- ("This is a test without tools", [], "This is a test without tools"),
- (
- """
-
-
-Dallas
-
-
-TX
-
-
-fahrenheit
-
-
-""",
- [
- ToolCall(
- function=FunctionCall(
- name="get_current_weather",
- arguments=json.dumps(
- {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
- ),
- )
- )
- ],
- None,
- ),
- (
- """Sure! Let me check the weather for you.
-
-
-Dallas
-
-
-TX
-
-
-fahrenheit
-
-
-""",
- [
- ToolCall(
- function=FunctionCall(
- name="get_current_weather",
- arguments=json.dumps(
- {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
- ),
- )
- )
- ],
- "Sure! Let me check the weather for you.",
- ),
- (
- """
-
-
-rectangle
-
-
-{"width": 10,
- "height": 20}
-
-
-2
-
-
-""",
- [
- ToolCall(
- function=FunctionCall(
- name="calculate_area",
- arguments=json.dumps(
- {
- "shape": "rectangle",
- "dimensions": {"width": 10, "height": 20},
- "precision": 2,
- }
- ),
- )
- )
- ],
- None,
- ),
- (
- """
-
-
-Dallas
-
-
-TX
-
-
-fahrenheit
-
-
-
-
-
-
-Orlando
-
-
-FL
-
-
-celsius
-
-
-""",
- [
- ToolCall(
- function=FunctionCall(
- name="get_current_weather",
- arguments=json.dumps(
- {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
- ),
- )
- ),
- ToolCall(
- function=FunctionCall(
- name="get_current_weather",
- arguments=json.dumps(
- {"city": "Orlando", "state": "FL", "unit": "celsius"}
- ),
- )
- ),
- ],
- None,
- ),
- # Added tool_with_typed_params test case
- (
- """Let me calculate that area for you.
-
-
-circle
-
-
-{"radius": 15.5}
-
-
-3
-
-
-""",
- [
- ToolCall(
- function=FunctionCall(
- name="calculate_area",
- arguments=json.dumps(
- {
- "shape": "circle",
- "dimensions": {"radius": 15.5},
- "precision": 3,
- }
- ),
- )
- )
- ],
- "Let me calculate that area for you.",
- ),
- ],
-)
-def test_extract_tool_calls_streaming(
- qwen3_tool_parser_parametrized,
- qwen3_tokenizer,
- model_output,
- expected_tool_calls,
- expected_content,
-):
- """Test incremental streaming behavior including typed parameters"""
- request = ChatCompletionRequest(model=MODEL, messages=[])
-
- other_content = ""
- tool_states = {} # Track state per tool index
-
- for delta_message in stream_delta_message_generator(
- qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request
- ):
- # role should never be streamed from tool parser
- assert not delta_message.role
-
- if delta_message.content:
- other_content += delta_message.content
-
- if delta_message.tool_calls:
- for tool_call in delta_message.tool_calls:
- idx = tool_call.index
-
- # Initialize state for new tool
- if idx not in tool_states:
- tool_states[idx] = {
- "id": None,
- "name": None,
- "arguments": "",
- "type": None,
- }
-
- # First chunk should have id, name, and type
- if tool_call.id:
- tool_states[idx]["id"] = tool_call.id
-
- if tool_call.type:
- assert tool_call.type == "function"
- tool_states[idx]["type"] = tool_call.type
-
- if tool_call.function:
- if tool_call.function.name:
- # Should only be set once
- assert tool_states[idx]["name"] is None
- tool_states[idx]["name"] = tool_call.function.name
-
- if tool_call.function.arguments is not None:
- # Accumulate arguments incrementally
- tool_states[idx]["arguments"] += tool_call.function.arguments
-
- # Verify final content
- assert other_content == (expected_content or "") # Handle None case
-
- # Verify we got all expected tool calls
- assert len(tool_states) == len(expected_tool_calls)
- assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == len(
- expected_tool_calls
+ full = (
+ "\n\n"
+ "foo.py\n"
+ "\n"
+ "doc = 'example'\n"
+ "\n\n"
+ "\n"
+ "\n\n"
+ "done\n"
+ "\n"
)
- # Verify each tool call
- for idx, expected_tool in enumerate(expected_tool_calls):
- state = tool_states[idx]
- assert state["id"] is not None
- assert state["type"] == "function"
- assert state["name"] == expected_tool.function.name
-
- # Parse accumulated arguments
- arguments_str = state["arguments"]
- assert arguments_str is not None
- actual_args = json.loads(arguments_str)
- expected_args = json.loads(expected_tool.function.arguments)
- assert actual_args == expected_args
-
-
-def test_extract_tool_calls_missing_closing_parameter_tag(
- qwen3_tool_parser_parametrized,
-):
- """Test handling of missing closing tag"""
- # Using get_current_weather from sample_tools but with malformed XML
- model_output = """Let me check the weather for you:
-
-
-
-Dallas
-
-TX
-
-
-fahrenheit
-
-
-"""
-
- request = ChatCompletionRequest(model=MODEL, messages=[])
- extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls(
- model_output, request=request
+ msg = parser.extract_tool_calls_streaming(
+ previous_text="",
+ current_text=full,
+ delta_text=full,
+ previous_token_ids=[],
+ current_token_ids=[1],
+ delta_token_ids=[1],
+ request=request,
)
-
- # The parser should handle the malformed XML gracefully
- assert extracted_tool_calls.tools_called
- assert len(extracted_tool_calls.tool_calls) == 1
-
- # Verify the function name is correct
- assert extracted_tool_calls.tool_calls[0].function.name == "get_current_weather"
-
- # Verify the arguments are parsed despite the missing closing tag
- args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments)
- assert "city" in args
- assert args["city"] == "Dallas"
- assert args["state"] == "TX"
- assert args["unit"] == "fahrenheit"
-
- # Check that content before the tool call is preserved
- assert "Let me check the weather for you:" in extracted_tool_calls.content
+ assert msg is not None
+ assert msg.tool_calls is not None
+ assert len(msg.tool_calls) == 2, (
+ f"Expected 2 tool calls, got {len(msg.tool_calls)}: {msg.tool_calls}"
+ )
+ names = [tc.function.name for tc in msg.tool_calls]
+ assert names == ["write_file", "log"], f"Wrong tool names: {names}"
-def test_extract_tool_calls_streaming_missing_closing_tag(
- qwen3_tool_parser_parametrized, qwen3_tokenizer
+def test_streaming_content_before_and_between_two_tool_calls_one_delta(
+ qwen3_tool_parser,
):
- """Test streaming with missing closing tag"""
- # Using get_current_weather from sample_tools but with malformed XML
- model_output = """Let me check the weather for you:
-
-
-
-Dallas
-
-TX
-
-
-fahrenheit
-
-
-"""
-
+ """MTP / spec-decode: a single delta delivers free text BEFORE tool 1
+ AND free text BETWEEN tool 1 and tool 2. Both content fragments must
+ be emitted; the recursion path used to drop the second one because of a
+ ``not result.content`` guard that discarded the recursion's content
+ when the outer call already had content of its own.
+ """
request = ChatCompletionRequest(model=MODEL, messages=[])
-
- other_content = ""
- tool_states = {}
-
- for delta_message in stream_delta_message_generator(
- qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request
- ):
- if delta_message.content:
- other_content += delta_message.content
-
- if delta_message.tool_calls:
- for tool_call in delta_message.tool_calls:
- idx = tool_call.index
-
- if idx not in tool_states:
- tool_states[idx] = {
- "id": None,
- "name": None,
- "arguments": "",
- "type": None,
- }
-
- if tool_call.id:
- tool_states[idx]["id"] = tool_call.id
-
- if tool_call.type:
- assert tool_call.type == "function"
- tool_states[idx]["type"] = tool_call.type
-
- if tool_call.function:
- if tool_call.function.name:
- tool_states[idx]["name"] = tool_call.function.name
-
- if tool_call.function.arguments is not None:
- tool_states[idx]["arguments"] += tool_call.function.arguments
-
- # Verify content was streamed
- assert "Let me check the weather for you:" in other_content
- # Verify we got the tool call
- assert len(tool_states) == 1
- assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1
-
- state = tool_states[0]
- assert state["id"] is not None
- assert state["type"] == "function"
- assert state["name"] == "get_current_weather"
-
- # Verify arguments were parsed correctly despite missing closing tag
- assert state["arguments"] is not None
- args = json.loads(state["arguments"])
- assert args["city"] == "Dallas"
- assert args["state"] == "TX"
- assert args["unit"] == "fahrenheit"
+ delta = (
+ "before text "
+ "\n\n"
+ "\n1\n\n"
+ "\n"
+ "between text "
+ "\n\n"
+ "\n2\n\n"
+ "\n"
+ )
+ msg = qwen3_tool_parser.extract_tool_calls_streaming(
+ previous_text="",
+ current_text=delta,
+ delta_text=delta,
+ previous_token_ids=[],
+ current_token_ids=[1],
+ delta_token_ids=[1],
+ request=request,
+ )
+ assert msg is not None
+ assert msg.content is not None, "outer content lost"
+ assert "before text " in msg.content, (
+ f"missing 'before text' content: {msg.content!r}"
+ )
+ assert "between text " in msg.content, (
+ f"recursion content 'between text' was dropped because the outer "
+ f"already had content. Got: {msg.content!r}"
+ )
-def test_extract_tool_calls_streaming_incremental(
- qwen3_tool_parser_parametrized, qwen3_tokenizer
-):
- """Test that streaming is truly incremental"""
- model_output = """I'll check the weather.
-
-
-Dallas
-
-
-TX
-
-
-"""
+def test_extract_tool_calls_streaming_split_tag(qwen3_tool_parser):
+    """``<tool_call>`` arrives split across two deltas (e.g. ``<tool_`` +
+    ``call>``). ``is_tool_call_started`` must flip to ``True`` once the
+ full tag exists in ``current_text``, and the partial tag must not leak
+ into ``DeltaMessage.content``.
+ This relies on the Coder parser's ``is_tool_call_started`` attribute,
+ which has no equivalent on the XML parser.
+ """
request = ChatCompletionRequest(model=MODEL, messages=[])
- chunks = []
- for delta_message in stream_delta_message_generator(
- qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request
- ):
- chunks.append(delta_message)
-
- # Should have multiple chunks
- assert len(chunks) > 3
+ prev_text_1 = "I will use a tool."
+ delta_text_1 = "" not in msg2.content
- # Arguments should be streamed incrementally
- assert len(arg_chunks) > 1
- # Concatenated arguments should form valid JSON
- full_args = "".join(arg_chunks)
- parsed_args = json.loads(full_args)
- assert parsed_args["city"] == "Dallas"
- assert parsed_args["state"] == "TX"
+def test_streaming_char_by_char_literal_balises_in_value(qwen3_tokenizer):
+ """Stress test: a WriteFile tool call whose ``content`` value embeds a
+    complete literal ``<tool_call>...</tool_call>`` block — including
+    ``<function=...>...</function>`` and ``<parameter=...>...</parameter>``
+    with names that match the OUTER tool's schema —
+ streamed one character at a time.
+ Reproduces the qwen-code scenario where the model writes a parser
+    fixture file: every literal ``<tool_call>``, ``<function=...>``,
+    ``<parameter=...>``, ``</parameter>``, ``</function>`` and
+    ``</tool_call>`` inside the ``content`` value must stay inside the
+ value; no spurious second tool call, no value truncation.
+ """
+ from vllm.entrypoints.openai.chat_completion.protocol import (
+ ChatCompletionToolsParam,
+ )
-def test_extract_tool_calls_complex_type_with_single_quote(
- qwen3_tokenizer,
-):
- """Test parameter type conversion based on tool schema"""
tools = [
ChatCompletionToolsParam(
type="function",
function={
- "name": "test_types",
+ "name": "write_file",
"parameters": {
"type": "object",
"properties": {
- "int_param": {"type": "integer"},
- "float_param": {"type": "float"},
- "bool_param": {"type": "boolean"},
- "str_param": {"type": "string"},
- "obj_param": {"type": "object"},
+ "path": {"type": "string"},
+ "content": {"type": "string"},
},
},
},
)
]
-
- model_output = """
-
-
-{'key': 'value'}
-
-
-"""
-
- parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tools)
+ parser = Qwen3CoderToolParser(qwen3_tokenizer, tools=tools)
request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
- extracted_tool_calls = parser.extract_tool_calls(model_output, request=request)
-
- args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments)
- assert args["obj_param"] == {"key": "value"}
-
-
-def test_extract_tool_calls_streaming_missing_opening_tag(
- qwen3_tool_parser_parametrized, qwen3_tokenizer
-):
- """Test streaming with missing opening tag
-
- This tests that the streaming parser correctly handles
- tool calls that start directly with
- """
- model_output = """I'll check the weather for you.
-
-
-
-Dallas
-
-
-TX
-
-
-fahrenheit
-
-
-"""
-
- request = ChatCompletionRequest(model=MODEL, messages=[])
-
- other_content = ""
- tool_states = {}
-
- for delta_message in stream_delta_message_generator(
- qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request
- ):
- if delta_message.content:
- other_content += delta_message.content
-
- if delta_message.tool_calls:
- for tool_call in delta_message.tool_calls:
- idx = tool_call.index
-
- if idx not in tool_states:
- tool_states[idx] = {
- "id": None,
- "name": None,
- "arguments": "",
- "type": None,
- }
-
- if tool_call.id:
- tool_states[idx]["id"] = tool_call.id
-
- if tool_call.type:
- assert tool_call.type == "function"
- tool_states[idx]["type"] = tool_call.type
-
- if tool_call.function:
- if tool_call.function.name:
- tool_states[idx]["name"] = tool_call.function.name
- if tool_call.function.arguments is not None:
- tool_states[idx]["arguments"] += tool_call.function.arguments
-
- # Verify content was streamed
- assert "I'll check the weather for you." in other_content
-
- # Verify we got the tool call
- assert len(tool_states) == 1
- assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1
-
- state = tool_states[0]
- assert state["id"] is not None
- assert state["type"] == "function"
- assert state["name"] == "get_current_weather"
-
- # Verify arguments were parsed correctly despite missing opening tag
- assert state["arguments"] is not None
- args = json.loads(state["arguments"])
- assert args["city"] == "Dallas"
- assert args["state"] == "TX"
- assert args["unit"] == "fahrenheit"
-
-
-def test_malformed_xml_no_gt_delimiter(qwen3_tool_parser):
- """Regression: malformed XML without '>' must not crash (PR #36774)."""
- model_output = (
+ nested_content = (
+ 'doc = """\n'
"\n"
- "Dallas\n"
- "\n"
- ""
- )
-
- request = ChatCompletionRequest(model=MODEL, messages=[])
- result = qwen3_tool_parser.extract_tool_calls(model_output, request=request)
- assert result is not None
- assert isinstance(result.tool_calls, list)
- assert all(tc is not None for tc in result.tool_calls)
-
-
-def test_none_tool_calls_filtered(qwen3_tool_parser):
- """Regression: None tool calls filtered from output (PR #36774)."""
- model_output = (
- "\n"
- "\n"
+ "\nliteral/value.txt\n\n"
+ "\nhello\n\n"
"\n"
"\n"
- "\n"
- "\n"
- "Dallas\n"
- "TX\n"
- "\n"
- ""
+ '"""\n'
)
- request = ChatCompletionRequest(model=MODEL, messages=[])
- result = qwen3_tool_parser.extract_tool_calls(model_output, request=request)
- assert all(tc is not None for tc in result.tool_calls)
- assert result.tools_called
- assert len(result.tool_calls) == 1
- assert result.tool_calls[0].function.name == "get_current_weather"
- args = json.loads(result.tool_calls[0].function.arguments)
- assert args["city"] == "Dallas"
- assert args["state"] == "TX"
-
-
-def test_anyof_parameter_not_double_encoded(qwen3_tokenizer):
- """Regression: anyOf parameters must not be double-encoded (PR #36032)."""
- tools = [
- ChatCompletionToolsParam(
- type="function",
- function={
- "name": "update_record",
- "parameters": {
- "type": "object",
- "properties": {
- "data": {
- "anyOf": [{"type": "object"}, {"type": "null"}],
- },
- },
- },
- },
- )
- ]
-
- parser = Qwen3CoderToolParser(qwen3_tokenizer, tools=tools)
-
- model_output = (
+ full_output = (
"\n"
- "\n"
- '{"key": "value", "count": 42}\n'
+ "\n"
+ "\nfixture.py\n\n"
+ f"\n{nested_content}\n"
"\n"
""
)
- request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
- result = parser.extract_tool_calls(model_output, request=request)
-
- assert result.tools_called
- assert len(result.tool_calls) == 1
- args = json.loads(result.tool_calls[0].function.arguments)
- assert isinstance(args["data"], dict)
- assert args["data"] == {"key": "value", "count": 42}
-
-
-def test_streaming_multi_param_single_chunk(qwen3_tool_parser, qwen3_tokenizer):
- """Regression: speculative decode delivering multiple params at once (PR #35615)."""
- request = ChatCompletionRequest(model=MODEL, messages=[])
-
- deltas = [
- "",
- "\n",
- "\n", # triggers json_started -> sends "{"
- # This single delta delivers all three parameters at once
- "\nDallas\n"
- "\n\nTX\n"
- "\n\nfahrenheit\n",
- "\n",
- "\n",
- ]
+ tool_states: dict[int, dict] = {}
+ current_text = ""
+ previous_text = ""
+ for ch in full_output:
+ previous_text = current_text
+ current_text += ch
+ delta_message = parser.extract_tool_calls_streaming(
+ previous_text=previous_text,
+ current_text=current_text,
+ delta_text=ch,
+ previous_token_ids=[],
+ current_token_ids=[],
+ delta_token_ids=[],
+ request=request,
+ )
+ if delta_message and delta_message.tool_calls:
+ for tool_call in delta_message.tool_calls:
+ idx = tool_call.index
+ state = tool_states.setdefault(
+ idx, {"id": None, "name": None, "arguments": ""}
+ )
+ if tool_call.id:
+ state["id"] = tool_call.id
+ if tool_call.function:
+ if tool_call.function.name:
+ state["name"] = tool_call.function.name
+ if tool_call.function.arguments is not None:
+ state["arguments"] += tool_call.function.arguments
- from tests.tool_parsers.utils import (
- run_tool_extraction_streaming,
+ assert list(tool_states.keys()) == [0], (
+ f"Expected exactly one tool call; got indices "
+ f"{list(tool_states.keys())} — a literal nested "
+ f"was promoted to a real call."
)
-
- reconstructor = run_tool_extraction_streaming(
- qwen3_tool_parser,
- deltas,
- request,
- assert_one_tool_per_delta=False,
+ state = tool_states[0]
+ assert state["name"] == "write_file"
+ args = json.loads(state["arguments"])
+ assert list(args.keys()) == ["path", "content"], (
+ f"Spurious params from embedded literals: {list(args.keys())}"
)
-
- assert len(reconstructor.tool_calls) == 1
- args = json.loads(reconstructor.tool_calls[0].function.arguments)
- assert args["city"] == "Dallas"
- assert args["state"] == "TX"
- assert args["unit"] == "fahrenheit"
-
-
-def test_no_double_serialization_string_args(qwen3_tool_parser):
- """Regression: string arguments must not be double-serialized (PR #35615)."""
- tools = [
- ChatCompletionToolsParam(
- type="function",
- function={
- "name": "greet",
- "parameters": {
- "type": "object",
- "properties": {
- "message": {"type": "string"},
- },
- },
- },
- )
- ]
-
- model_output = (
- "\n"
- "\n"
- "hello world\n"
- "\n"
- ""
+ assert args["path"] == "fixture.py"
+ assert args["content"] == nested_content.rstrip("\n"), (
+ f"content was truncated/corrupted: {args.get('content')!r}"
)
- request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools)
- result = qwen3_tool_parser.extract_tool_calls(model_output, request=request)
- assert result.tools_called
- assert len(result.tool_calls) == 1
- raw_arguments = result.tool_calls[0].function.arguments
- args = json.loads(raw_arguments)
- assert args["message"] == "hello world"
- assert '\\"hello world\\"' not in raw_arguments
+def test_extract_tool_calls_streaming_various_chunk_sizes(
+ qwen3_tokenizer,
+):
+ """Coder streaming must reconstruct arguments correctly even when the
+ deltas arrive a single character at a time.
+ The XML parser's SAX-based streaming cannot tolerate ``chunk_size=1``
+ by design (an XML tag is not parseable until ``>`` arrives), so this
+ robustness test stays Coder-only.
+ """
+ request = ChatCompletionRequest(model="test", messages=[])
-def test_get_vllm_registry_structural_tag_returns_structural_tag(
- qwen3_tool_parser: Qwen3CoderToolParser,
- sample_tools: list[ChatCompletionToolsParam],
-) -> None:
- request_tools = _as_chat_completion_tools(sample_tools)
- req = ChatCompletionRequest(
- messages=[],
- model="m",
- tools=request_tools,
- tool_choice="auto",
- )
- tag = qwen3_tool_parser.get_structural_tag(req)
- assert isinstance(tag, StructuralTag)
+ template_text = """
+
+
+value_1
+
+
+This is the value for the second parameter
+that can span
+multiple lines
+
+
+"""
- req = ChatCompletionRequest(
- messages=[],
- model="m",
- tools=request_tools,
- tool_choice="required",
- )
- tag = qwen3_tool_parser.get_structural_tag(req)
- assert isinstance(tag, StructuralTag)
+ # Replay the same text at several delta sizes, from one character per
+ # delta up to the whole text in a single delta. A fresh parser is built
+ # for every size.
+ for chunk_size in [1, 3, 15, len(template_text)]:
+ parser = Qwen3CoderToolParser(qwen3_tokenizer, tools=None)
+
+ tool_states = {}
+ current_text = ""
+ previous_text = ""
+ ptr = 0
+
+ while ptr < len(template_text):
+ delta = template_text[ptr : ptr + chunk_size]
+ previous_text = current_text
+ current_text += delta
+ ptr += chunk_size
+
+ delta_message = parser.extract_tool_calls_streaming(
+ previous_text=previous_text,
+ current_text=current_text,
+ delta_text=delta,
+ previous_token_ids=[],
+ current_token_ids=[],
+ delta_token_ids=[],
+ request=request,
+ )
- if request_tools:
- tool = request_tools[0]
- req = ChatCompletionRequest(
- messages=[],
- model="m",
- tools=request_tools,
+ # Merge this delta's tool-call fragments into the per-index state;
+ # argument fragments are concatenated in arrival order.
+ if delta_message and delta_message.tool_calls:
+ for tool_call in delta_message.tool_calls:
+ idx = tool_call.index
+ if idx not in tool_states:
+ tool_states[idx] = {
+ "id": None,
+ "name": None,
+ "arguments": "",
+ "type": None,
+ }
+ if tool_call.id:
+ tool_states[idx]["id"] = tool_call.id
+ if tool_call.type:
+ tool_states[idx]["type"] = tool_call.type
+ if tool_call.function:
+ if tool_call.function.name:
+ tool_states[idx]["name"] = tool_call.function.name
+ if tool_call.function.arguments is not None:
+ tool_states[idx]["arguments"] += (
+ tool_call.function.arguments
+ )
+
+ # The concatenated argument fragments must form valid JSON holding the
+ # expected parameter values.
+ assert 0 in tool_states, f"chunk_size={chunk_size}"
+ assert tool_states[0]["name"] == "example_function_name"
+ args = json.loads(tool_states[0]["arguments"])
+ assert args["example_parameter_1"] == "value_1"
+ assert args["example_parameter_2"] == (
+ "This is the value for the second parameter\nthat can span\nmultiple lines"
)
- req.tool_choice = ChatCompletionNamedToolChoiceParam(
- function=ChatCompletionNamedFunction(name=tool.function.name)
- )
- tag = qwen3_tool_parser.get_structural_tag(req)
- assert isinstance(tag, StructuralTag)
-
-
-@pytest.mark.parametrize("include_reasoning", [True, False])
-def test_adjust_request_auto_uses_vllm_registry_structural_tag(
- monkeypatch: pytest.MonkeyPatch,
- qwen3_tool_parser: Qwen3CoderToolParser,
- sample_tools: list[ChatCompletionToolsParam],
- include_reasoning: bool,
-) -> None:
- monkeypatch.setattr(
- "vllm.tool_parsers.abstract_tool_parser.VLLM_ENFORCE_STRICT_TOOL_CALLING",
- True,
- )
- request_tools = _as_chat_completion_tools(sample_tools)
- req = ChatCompletionRequest(
- messages=[],
- model="m",
- tools=request_tools,
- tool_choice="auto",
- include_reasoning=include_reasoning,
- )
- out = qwen3_tool_parser.adjust_request(req)
- assert out.structured_outputs is not None
- assert out.structured_outputs.structural_tag is not None
- assert isinstance(out.structured_outputs.structural_tag, str)
- loaded = json.loads(out.structured_outputs.structural_tag)
- assert isinstance(loaded, dict)
-
-
-def test_adjust_request_required_prefers_structural_tag(
- monkeypatch: pytest.MonkeyPatch,
- qwen3_tool_parser: Qwen3CoderToolParser,
- sample_tools: list[ChatCompletionToolsParam],
-) -> None:
- monkeypatch.setattr(
- "vllm.tool_parsers.abstract_tool_parser.VLLM_ENFORCE_STRICT_TOOL_CALLING",
- True,
- )
- request_tools = _as_chat_completion_tools(sample_tools)
- req = ChatCompletionRequest(
- messages=[],
- model="m",
- tools=request_tools,
- tool_choice="required",
- )
- out = qwen3_tool_parser.adjust_request(req)
- assert out.structured_outputs is not None
- assert out.structured_outputs.structural_tag is not None
diff --git a/tests/tool_parsers/test_qwen3xml_tool_parser.py b/tests/tool_parsers/test_qwen3xml_tool_parser.py
index 1ea9a1d65c04..c38268c62ec9 100644
--- a/tests/tool_parsers/test_qwen3xml_tool_parser.py
+++ b/tests/tool_parsers/test_qwen3xml_tool_parser.py
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
import pytest
@@ -8,6 +9,23 @@
ToolParserTestConfig,
ToolParserTests,
)
+from tests.tool_parsers.test_qwen3_xml_coder_shared import (
+ stream_delta_message_generator,
+)
+from tests.tool_parsers.utils import run_tool_extraction_streaming
+from vllm.entrypoints.openai.chat_completion.protocol import (
+ ChatCompletionRequest,
+ ChatCompletionToolsParam,
+)
+from vllm.tokenizers import get_tokenizer
+from vllm.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser
+
+MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+
+
+@pytest.fixture(scope="module")
+def qwen3_tokenizer():
+ """Return the tokenizer for ``MODEL``; module scope means it is built
+ once and shared by every test in this module."""
+ return get_tokenizer(tokenizer_name=MODEL)
class TestQwen3xmlToolParser(ToolParserTests):
@@ -54,19 +72,508 @@ def test_config(self) -> ToolParserTestConfig:
single_tool_call_expected_args={"city": "Tokyo"},
parallel_tool_calls_count=2,
parallel_tool_calls_names=["get_weather", "get_time"],
- # xfail markers - Qwen3XML has systematic streaming issues
- xfail_streaming={
- "test_single_tool_call_simple_args": (
- "Qwen3XML streaming has systematic issues"
- ),
- "test_parallel_tool_calls": "Qwen3XML streaming has systematic issues",
- "test_various_data_types": "Qwen3XML streaming has systematic issues",
- "test_empty_arguments": "Qwen3XML streaming has systematic issues",
- "test_surrounding_text": "Qwen3XML streaming has systematic issues",
- "test_escaped_strings": "Qwen3XML streaming has systematic issues",
- "test_streaming_reconstruction": (
- "Qwen3XML streaming reconstruction has known issues"
- ),
- },
supports_typed_arguments=False,
)
+
+    def test_qwen3xml_async_streaming_free_text(self, qwen3_tokenizer):
+        """Free text between two tool calls is streamed before the second
+        call's arguments.
+
+        The text is replayed one token at a time. At the point where the
+        in-between sentence has fully appeared as content, no tool-call
+        delta carrying "London" may have been emitted yet.
+        """
+        marker = "Next, I will check the weather for London"
+        # Layout of the payload: first tool call, free text, second tool call.
+        payload = (
+            "\n\nParis\n\n"
+            "\nNext, I will check the weather for London:\n"
+            "\n\nLondon\n\n"
+        )
+        xml_parser = Qwen3XMLToolParser(qwen3_tokenizer)
+        req = ChatCompletionRequest(messages=[], model="test")
+        all_ids = qwen3_tokenizer.encode(payload, add_special_tokens=False)
+
+        # Token-by-token replay; keep every non-empty delta message.
+        outputs = []
+        seen_text = ""
+        seen_ids = []
+        for end in range(1, len(all_ids) + 1):
+            ids_so_far = all_ids[:end]
+            text_so_far = qwen3_tokenizer.decode(ids_so_far)
+            message = xml_parser.extract_tool_calls_streaming(
+                seen_text,
+                text_so_far,
+                text_so_far[len(seen_text) :],
+                seen_ids,
+                ids_so_far,
+                ids_so_far[len(seen_ids) :],
+                req,
+            )
+            if message is not None:
+                outputs.append(message)
+            seen_text, seen_ids = text_so_far, ids_so_far
+
+        def london_args_seen(upto):
+            """True if any of the first ``upto`` messages carries "London"
+            in a tool call's arguments."""
+            return any(
+                call.function.arguments and "London" in call.function.arguments
+                for earlier in outputs[:upto]
+                if earlier.tool_calls
+                for call in earlier.tool_calls
+            )
+
+        # The free text must be complete while London's arguments are
+        # still unseen.
+        streamed = ""
+        found_early = False
+        for position, message in enumerate(outputs):
+            if message.content:
+                streamed += message.content
+            if marker not in streamed:
+                continue
+            if london_args_seen(position + 1):
+                continue
+            found_early = True
+            break
+
+        assert found_early, (
+            "Free text between tool calls should be emitted as soon as the "
+            "second tool call starts, not delayed."
+        )
+
+    def test_qwen3xml_streaming_text_after_tool_call(self, qwen3_tokenizer):
+        """Free text that follows the last tool call must reach the client.
+
+        The text is replayed one token at a time and the content of all
+        emitted deltas is concatenated before checking for the sentence.
+        """
+        # A single tool call followed by plain text.
+        payload = (
+            "\n\nParis\n\n"
+            "\nI hope this helps!"
+        )
+        xml_parser = Qwen3XMLToolParser(qwen3_tokenizer)
+        req = ChatCompletionRequest(messages=[], model="test")
+        all_ids = qwen3_tokenizer.encode(payload, add_special_tokens=False)
+
+        outputs = []
+        seen_text = ""
+        seen_ids = []
+        for end in range(1, len(all_ids) + 1):
+            ids_so_far = all_ids[:end]
+            text_so_far = qwen3_tokenizer.decode(ids_so_far)
+            message = xml_parser.extract_tool_calls_streaming(
+                seen_text,
+                text_so_far,
+                text_so_far[len(seen_text) :],
+                seen_ids,
+                ids_so_far,
+                ids_so_far[len(seen_ids) :],
+                req,
+            )
+            if message is not None:
+                outputs.append(message)
+            seen_text, seen_ids = text_so_far, ids_so_far
+
+        # Everything emitted as content, in order.
+        streamed_content = "".join(m.content for m in outputs if m.content)
+
+        assert "I hope this helps!" in streamed_content, (
+            "Free text after the last tool call should be emitted."
+        )
+
+
+def test_qwen3xml_streaming_trailing_text_after_literal_close_in_value(
+    qwen3_tokenizer,
+):
+    """XML parser: trailing free text survives a literal closing tag inside
+    a parameter value.
+
+    The tool call's ``content`` parameter embeds a literal closing tag.
+    Once the real closing tag has been streamed, the free text delivered
+    in the following delta must still be emitted as content.
+    """
+    write_file_tool = ChatCompletionToolsParam(
+        type="function",
+        function={
+            "name": "write_file",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "path": {"type": "string"},
+                    "content": {"type": "string"},
+                },
+            },
+        },
+    )
+    tool_list = [write_file_tool]
+    xml_parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tool_list)
+    req = ChatCompletionRequest(model=MODEL, messages=[], tools=tool_list)
+
+    # First delta: the whole tool call, literal included in 'content'.
+    call_delta = (
+        "\n\n"
+        "foo.py\n"
+        "\n"
+        "doc = 'example'\n"
+        "\n\n"
+    )
+    # Second delta: the trailing text on its own.
+    trailing_delta = "\nDone, file written!"
+
+    result = run_tool_extraction_streaming(
+        xml_parser,
+        [call_delta, trailing_delta],
+        req,
+        assert_one_tool_per_delta=False,
+    )
+    assert len(result.tool_calls) == 1, (
+        f"Expected 1 tool call, got {len(result.tool_calls)}"
+    )
+    assert "Done, file written!" in result.other_content, (
+        f"Trailing text after a tool with literal in its "
+        f"value was dropped. Got content: {result.other_content!r}"
+    )
+
+
+def test_qwen3xml_streaming_python_none_int_char_by_char(qwen3_tokenizer):
+    """A nullable integer streamed as "None" one character at a time must
+    still yield valid JSON.
+
+    The incremental path previously emitted "Non" and then an "l" delta
+    (the diff between "Non" and "null"), so the client accumulated the
+    invalid text "Nonl". Deferring int/float conversion until the
+    parameter closes, as is done for bool/object, avoids that.
+    """
+    nullable_int = {
+        "anyOf": [
+            {"type": "integer"},
+            {"type": "null"},
+        ],
+    }
+    tool_list = [
+        ChatCompletionToolsParam(
+            type="function",
+            function={
+                "name": "set_count",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"count": nullable_int},
+                },
+            },
+        )
+    ]
+    xml_parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tool_list)
+    req = ChatCompletionRequest(model=MODEL, messages=[], tools=tool_list)
+
+    # Worst-case slow streaming: the value arrives as N, o, n, e.
+    pieces = ["\n", "\n", "", "\n", *"None", "\n", "\n", "\n", ""]
+    result = run_tool_extraction_streaming(
+        xml_parser, pieces, req, assert_one_tool_per_delta=False
+    )
+    assert len(result.tool_calls) == 1
+    raw = result.tool_calls[0].function.arguments
+    # json.loads raises if the accumulated arguments are not valid JSON.
+    decoded = json.loads(raw)
+    assert decoded["count"] is None, (
+        f"streaming nullable int 'None' produced invalid JSON or wrong "
+        f"value. Raw: {raw!r}"
+    )
+
+
+def test_qwen36_xml_streaming_double_close_brace(qwen3_tokenizer):
+    """Streamed arguments must not end in a doubled closing brace.
+
+    The whole parameter arrives in one delta; the reconstructed argument
+    string has to be exactly one JSON object.
+    """
+    city_schema = {
+        "type": "object",
+        "properties": {"city": {"type": "string"}},
+    }
+    tool_list = [
+        ChatCompletionToolsParam(
+            type="function",
+            function={"name": "get_weather", "parameters": city_schema},
+        )
+    ]
+    xml_parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tool_list)
+    req = ChatCompletionRequest(model=MODEL, messages=[], tools=tool_list)
+
+    pieces = [
+        "",
+        "\n",
+        "\n\nDallas\n",
+        "\n",
+        "\n",
+    ]
+    result = run_tool_extraction_streaming(
+        xml_parser,
+        pieces,
+        req,
+        assert_one_tool_per_delta=False,
+    )
+
+    assert len(result.tool_calls) == 1
+    full_args = result.tool_calls[0].function.arguments
+
+    doubled = full_args.endswith("}}")
+    assert not doubled, (
+        f"XML streaming parser emitted double closing brace: {full_args!r}. "
+        "parse_single_streaming_chunks fallback called _end_element('function') twice."
+    )
+    assert json.loads(full_args) == {"city": "Dallas"}
+
+
+def test_xml_streaming_parallel_tool_calls_preformed_chunks(qwen3_tokenizer):
+    """Two consecutive tool calls delivered as pre-formed chunks must be
+    reconstructed as two separate calls with their own arguments.
+
+    Note: in normal token-by-token streaming this rarely triggers because
+    the tokenizer splits XML tags across multiple tokens. It CAN trigger
+    with speculative decoding multi-token flushes.
+    """
+    city_schema = {
+        "type": "object",
+        "properties": {"city": {"type": "string"}},
+    }
+    tool_list = [
+        ChatCompletionToolsParam(
+            type="function",
+            function={"name": "get_weather", "parameters": city_schema},
+        )
+    ]
+    xml_parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tool_list)
+    req = ChatCompletionRequest(model=MODEL, messages=[], tools=tool_list)
+
+    first_call = ["", "\n", "\nParis", "\n", "\n"]
+    second_call = ["", "\n", "\nLondon", "\n", "\n"]
+    result = run_tool_extraction_streaming(
+        xml_parser,
+        first_call + second_call,
+        req,
+        assert_one_tool_per_delta=False,
+    )
+
+    assert len(result.tool_calls) == 2, (
+        f"Expected 2 tool calls, got {len(result.tool_calls)}"
+    )
+
+    call0, call1 = result.tool_calls[0], result.tool_calls[1]
+    args0 = json.loads(call0.function.arguments)
+    args1 = json.loads(call1.function.arguments)
+
+    assert call0.function.name == "get_weather"
+    assert call1.function.name == "get_weather"
+    assert args0 == {"city": "Paris"}, f"First call args wrong: {args0!r}"
+    assert args1 == {"city": "London"}, f"Second call args wrong: {args1!r}"
+
+
+# ---------------------------------------------------------------------------
+# XML-specific streaming bugs (Coder parser is not affected)
+# ---------------------------------------------------------------------------
+
+
+def test_xml_streaming_boolean_true_not_false(qwen3_tokenizer):
+    """Bug B: a boolean "true" streamed character by character must not be
+    reported as "false".
+
+    Root cause being guarded against: converting the partial value "t" as
+    a boolean yields False, so "false" is emitted; the remaining
+    characters then produce no further delta and the accumulated
+    arguments keep "false" instead of "true".
+    """
+    flag_schema = {
+        "type": "object",
+        "properties": {
+            "enabled": {"type": "boolean"},
+        },
+    }
+    tool_list = [
+        ChatCompletionToolsParam(
+            type="function",
+            function={"name": "set_flag", "parameters": flag_schema},
+        )
+    ]
+    xml_parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tool_list)
+    req = ChatCompletionRequest(model=MODEL, messages=[], tools=tool_list)
+
+    # One delta per character of the value, emulating single-character
+    # tokens; the leading "t" is what used to trigger the early "false".
+    pieces = ["", "\n", "\n", *"true", "", "\n", "\n"]
+
+    result = run_tool_extraction_streaming(
+        xml_parser,
+        pieces,
+        req,
+        assert_one_tool_per_delta=False,
+    )
+
+    assert len(result.tool_calls) == 1
+    args = json.loads(result.tool_calls[0].function.arguments)
+
+    assert args["enabled"] is True, (
+        f"Boolean streaming bug: expected True, got {args['enabled']!r}. "
+        f"First char 't' emits 'false'; subsequent chars emit nothing; "
+        f"final value is 'false' even though the model said 'true'."
+    )
+
+
+def test_xml_streaming_string_null_last_char_not_dropped(qwen3_tokenizer):
+    """Bug A (streaming variant): a string parameter whose value is the
+    text "null" must keep its final 'l' when streamed one character at a
+    time.
+
+    Root cause being guarded against: 'n', 'u', 'l' stream correctly, but
+    once the value reads "null" it was converted to None, the JSON
+    streaming form of None for a string was "", and the resulting delta
+    was empty, so the closing quote followed "nul".
+    """
+    query_schema = {
+        "type": "object",
+        "properties": {
+            "query": {"type": "string"},
+        },
+    }
+    tool_list = [
+        ChatCompletionToolsParam(
+            type="function",
+            function={"name": "search", "parameters": query_schema},
+        )
+    ]
+    xml_parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tool_list)
+    req = ChatCompletionRequest(model=MODEL, messages=[], tools=tool_list)
+
+    # The fourth character completes "null", which used to be swallowed.
+    pieces = ["", "\n", "\n", *"null", "", "\n", "\n"]
+
+    result = run_tool_extraction_streaming(
+        xml_parser,
+        pieces,
+        req,
+        assert_one_tool_per_delta=False,
+    )
+
+    assert len(result.tool_calls) == 1
+    args = json.loads(result.tool_calls[0].function.arguments)
+
+    assert "query" in args
+    assert args["query"] == "null", (
+        f"String 'null' streaming bug: last 'l' was dropped. "
+        f"Got: {args['query']!r}. "
+        f"When full value reaches 'null', _convert_param_value returns None "
+        f"and _convert_for_json_streaming(None, 'string') returns '', "
+        f"so the final delta is empty and the 'l' is never emitted."
+    )
+
+
+def test_xml_streaming_missing_opening_tool_call_tag(qwen3_tokenizer):
+    """The XML streaming parser must recover when the model starts a tool
+    call without the leading opening tag, i.e. directly with the function
+    element. The Coder parser does not support this in streaming mode, so
+    this regression stays XML-specific.
+    """
+    xml_parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=None)
+
+    model_output = """I'll check the weather for you.
+
+
+
+Dallas
+
+
+TX
+
+
+fahrenheit
+
+
+"""
+
+    req = ChatCompletionRequest(model=MODEL, messages=[])
+    other_content = ""
+    tool_states: dict = {}
+
+    for message in stream_delta_message_generator(
+        xml_parser, qwen3_tokenizer, model_output, req
+    ):
+        if message.content:
+            other_content += message.content
+        # Fold each tool-call fragment into the state kept for its index.
+        for call in message.tool_calls or ():
+            slot = tool_states.setdefault(
+                call.index,
+                {"id": None, "name": None, "arguments": "", "type": None},
+            )
+            if call.id:
+                slot["id"] = call.id
+            if call.type:
+                assert call.type == "function"
+                slot["type"] = call.type
+            fn = call.function
+            if not fn:
+                continue
+            if fn.name:
+                slot["name"] = fn.name
+            if fn.arguments is not None:
+                slot["arguments"] += fn.arguments
+
+    assert "I'll check the weather for you." in other_content
+    assert len(tool_states) == 1
+    state = tool_states[0]
+    assert state["id"] is not None
+    assert state["type"] == "function"
+    assert state["name"] == "get_current_weather"
+    args = json.loads(state["arguments"])
+    for key, expected in (
+        ("city", "Dallas"),
+        ("state", "TX"),
+        ("unit", "fahrenheit"),
+    ):
+        assert args[key] == expected
diff --git a/vllm/tool_parsers/qwen3coder_tool_parser.py b/vllm/tool_parsers/qwen3coder_tool_parser.py
index 7457590c5ac0..a3875118861d 100644
--- a/vllm/tool_parsers/qwen3coder_tool_parser.py
+++ b/vllm/tool_parsers/qwen3coder_tool_parser.py
@@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ast
+import contextlib
import json
import uuid
from collections.abc import Sequence
@@ -29,11 +31,7 @@
get_enable_structured_outputs_in_reasoning,
get_model_structural_tag,
)
-from vllm.tool_parsers.utils import (
- coerce_to_schema_type,
- extract_types_from_schema,
- find_tool_properties,
-)
+from vllm.tool_parsers.utils import find_tool_properties, partial_tag_overlap
logger = init_logger(__name__)
@@ -119,16 +117,464 @@ def _reset_streaming_state(self):
# Store accumulated parameters for type conversion
self.accumulated_params = {}
self.streaming_request = None
+ self._sent_content_idx = 0
def _convert_param_value(
self, param_value: str, param_name: str, param_config: dict, func_name: str
) -> Any:
"""Convert parameter value based on its type in the schema."""
- if not isinstance(param_value, str):
+        # Conversion rules, applied in this order:
+        #   1. A parameter missing from ``param_config`` is returned as the
+        #      raw string.
+        #   2. The schema type is resolved from ``"type"`` (a single name or
+        #      a JSON-Schema list of names) or from the first non-null
+        #      ``anyOf`` option; anything else is treated as "string".
+        #   3. Nullable schemas map the literals "null"/"None" to ``None``.
+        #   4. String-like types return the raw text untouched.
+        #   5. Integer, number, boolean and container types are parsed; a
+        #      value that cannot be parsed degenerates to the raw string
+        #      (booleans degenerate to ``False``).
+        if param_name not in param_config:
+            if param_config != {}:
+                logger.debug(
+                    "Parsed parameter '%s' is not defined in the tool "
+                    "parameters for tool '%s', directly returning the "
+                    "string value.",
+                    param_name,
+                    func_name,
+                )
+            return param_value
+
+        # ``allows_null`` is True when the schema explicitly admits a
+        # null value (via ``"type": "null"``, a ``"type"`` list containing
+        # "null", or an ``anyOf`` union). A nullable parameter must convert
+        # the literal ``"null"`` / ``"None"`` to JSON null even when the
+        # primary type is ``string``; otherwise a Qwen3.5-trained model
+        # that emits the Python ``None`` literal leaves the client with
+        # the string ``"None"`` for a nullable optional.
+        schema = param_config[param_name]
+        allows_null = False
+        param_type = "string"
+        if isinstance(schema, dict) and "type" in schema:
+            declared = schema["type"]
+            if isinstance(declared, (list, tuple)):
+                # JSON Schema permits a list of type names, e.g.
+                # ["integer", "null"]. Use the first non-null entry as the
+                # primary type, mirroring the anyOf handling below;
+                # stringifying the list would match no known type.
+                picked = False
+                for entry in declared:
+                    entry_type = str(entry).strip().lower()
+                    if entry_type == "null":
+                        allows_null = True
+                    elif not picked:
+                        param_type = entry_type
+                        picked = True
+                if not picked and allows_null:
+                    param_type = "null"
+            else:
+                param_type = str(declared).strip().lower()
+                allows_null = param_type == "null"
+        elif isinstance(schema, dict) and "anyOf" in schema:
+            # Extract the first non-null type from the anyOf list so that
+            # nullable schemas like {"anyOf": [{"type": "string"},
+            # {"type": "null"}]} behave as "string", not "object".
+            picked = False
+            for option in schema["anyOf"]:
+                if isinstance(option, dict) and "type" in option:
+                    opt_type = str(option["type"]).strip().lower()
+                    if opt_type == "null":
+                        allows_null = True
+                    elif not picked:
+                        param_type = opt_type
+                        picked = True
+        # Nullable schemas: recognise "null" / "None" up front so a
+        # string-typed nullable still maps to JSON null.
+        if allows_null and param_value.lower() in ("null", "none"):
+            return None
+        # String type takes precedence: preserve the raw value (including
+        # the literal "null") rather than converting it to Python None.
+        if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
return param_value
- param_schema = param_config.get(param_name, {})
- param_types = extract_types_from_schema(param_schema)
- return coerce_to_schema_type(param_value, param_types)
+        # For non-string types, "null" maps to JSON null. Also accept
+        # the Python literal "None" so that Qwen3.5-trained models, whose
+        # chat template renders null args via ``| string`` (yielding the
+        # literal "None" in the prompt), round-trip nullable values
+        # correctly.
+        if param_value.lower() in ("null", "none"):
+            return None
+        if (
+            param_type.startswith("int")
+            or param_type.startswith("uint")
+            or param_type.startswith("long")
+            or param_type.startswith("short")
+            or param_type.startswith("unsigned")
+        ):
+            try:
+                return int(param_value)
+            except (ValueError, TypeError):
+                logger.debug(
+                    "Parsed value '%s' of parameter '%s' is not an "
+                    "integer in tool '%s', degenerating to string.",
+                    param_value,
+                    param_name,
+                    func_name,
+                )
+                return param_value
+        elif param_type.startswith("num") or param_type.startswith("float"):
+            try:
+                float_param_value = float(param_value)
+                return (
+                    float_param_value
+                    if float_param_value - int(float_param_value) != 0
+                    else int(float_param_value)
+                )
+            # OverflowError: float() accepts "inf" / "1e999", but int() of
+            # an infinity raises it; degrade to the raw string like "nan"
+            # (ValueError) instead of crashing the parser.
+            except (ValueError, TypeError, OverflowError):
+                logger.debug(
+                    "Parsed value '%s' of parameter '%s' is not a float "
+                    "in tool '%s', degenerating to string.",
+                    param_value,
+                    param_name,
+                    func_name,
+                )
+                return param_value
+        elif param_type in ["boolean", "bool", "binary"]:
+            param_value = param_value.lower()
+            if param_value not in ["true", "false"]:
+                logger.debug(
+                    "Parsed value '%s' of parameter '%s' is not a boolean "
+                    "(`true` or `false`) in tool '%s', degenerating to "
+                    "false.",
+                    param_value,
+                    param_name,
+                    func_name,
+                )
+            return param_value == "true"
+        else:
+            is_container_type = (
+                param_type in ["object", "array", "arr"]
+                or param_type.startswith("dict")
+                or param_type.startswith("list")
+            )
+            if is_container_type:
+                try:
+                    parsed = json.loads(param_value)
+                    # A model trained with a buggy template
+                    # (json.dumps(str(dict))) may output a JSON-encoded
+                    # Python repr like "{'k': 'v'}". json.loads returns a
+                    # string in that case, so try one more parse.
+                    if isinstance(parsed, str):
+                        with contextlib.suppress(ValueError, SyntaxError, TypeError):
+                            parsed = ast.literal_eval(parsed)
+                    return parsed
+                except (json.JSONDecodeError, TypeError, ValueError):
+                    logger.debug(
+                        "Parsed value '%s' of parameter '%s' cannot be "
+                        "parsed with json.loads in tool '%s', will try "
+                        "other methods to parse it.",
+                        param_value,
+                        param_name,
+                        func_name,
+                    )
+            try:
+                param_value = ast.literal_eval(param_value)  # safer
+                # Same double-decode for container types whose raw text
+                # had no JSON outer layer (e.g. bare Python repr
+                # "{'k': 'v'}").
+                if is_container_type and isinstance(param_value, str):
+                    with contextlib.suppress(ValueError, SyntaxError, TypeError):
+                        param_value = ast.literal_eval(param_value)
+            except (ValueError, SyntaxError, TypeError):
+                logger.debug(
+                    "Parsed value '%s' of parameter '%s' cannot be "
+                    "converted via Python `ast.literal_eval()` in tool "
+                    "'%s', degenerating to string.",
+                    param_value,
+                    param_name,
+                    func_name,
+                )
+            return param_value
+
+    def _next_structural_param_start(
+        self,
+        text: str,
+        start_pos: int = 0,
+        valid_param_names: set[str] | None = None,
+    ) -> int:
+        """Find the next structural parameter opening tag in ``text``.
+
+        An opening tag is ``self.parameter_prefix`` followed by a name and
+        a closing ``>``. It counts as structural when it sits at index 0
+        or directly after a ``\\n``.
+
+        Args:
+            text: Text to search.
+            start_pos: Index at which the search begins.
+            valid_param_names: When given, the tag's name must also be a
+                member of this set for the tag to be accepted.
+
+        Returns:
+            Index of the accepted tag's first character, or -1 if there
+            is none.
+        """
+        prefix = self.parameter_prefix
+        name_offset = len(prefix)
+        cursor = text.find(prefix, start_pos)
+        while cursor != -1:
+            at_line_start = cursor == 0 or text[cursor - 1] == "\n"
+            if at_line_start:
+                if valid_param_names is None:
+                    return cursor
+                name_close = text.find(">", cursor + name_offset)
+                if (
+                    name_close != -1
+                    and text[cursor + name_offset : name_close] in valid_param_names
+                ):
+                    return cursor
+            # Rejected candidate: resume one character further on.
+            cursor = text.find(prefix, cursor + 1)
+        return -1
+
+    def _find_true_function_end(self, text: str) -> int:
+        """Locate the structural function end token in ``text``.
+
+        An occurrence of ``self.function_end_token`` is structural when
+        the text after it, ignoring leading whitespace, is empty or
+        begins with ``self.tool_call_end_token``. Earlier occurrences
+        that fail this check, such as one quoted literally inside a
+        parameter value, are skipped.
+
+        Returns:
+            Index of the structural end token, or -1 if none is found.
+        """
+        closer = self.function_end_token
+        step = len(closer)
+        hit = text.find(closer)
+        while hit != -1:
+            tail = text[hit + step :].lstrip()
+            if not tail or tail.startswith(self.tool_call_end_token):
+                return hit
+            hit = text.find(closer, hit + step)
+        return -1
+
+    def _scan_to_structural_function_end(
+        self,
+        after_func_open: str,
+        valid_param_names: set[str] | None = None,
+    ) -> int:
+        """Walk a function body parameter by parameter and return the
+        index of its structural end token.
+
+        ``after_func_open`` is the text that follows the ``>`` closing the
+        function's opening tag. The scan consumes one parameter block at a
+        time, so an occurrence of ``self.function_end_token`` that lies
+        inside a parameter value is stepped over rather than matched. That
+        makes it more robust than ``_find_true_function_end``, whose
+        lookahead accepts a nested end token that happens to be followed
+        by the tool-call end token.
+
+        A parameter whose end token cannot be found is treated as
+        implicitly closed by the next structural parameter opening tag
+        (restricted, when ``valid_param_names`` is given, to names not yet
+        seen in this body).
+
+        Returns:
+            Index of the structural end token within ``after_func_open``,
+            or -1 if the body is incomplete or malformed.
+        """
+        body = after_func_open
+        total = len(body)
+        prefix = self.parameter_prefix
+        consumed: set[str] = set()
+        cursor = 0
+        while cursor < total:
+            # Step over whitespace separating parameter blocks.
+            while cursor < total and body[cursor] in " \t\n\r":
+                cursor += 1
+            if cursor >= total:
+                break
+            if body.startswith(self.function_end_token, cursor):
+                return cursor
+            if not body.startswith(prefix, cursor):
+                # Something other than a parameter precedes the end token;
+                # defer to the lookahead heuristic for the remaining text.
+                fallback = self._find_true_function_end(body[cursor:])
+                return -1 if fallback == -1 else cursor + fallback
+            name_close = body.find(">", cursor + len(prefix))
+            if name_close == -1:
+                return -1
+            name = body[cursor + len(prefix) : name_close]
+            value_at = name_close + 1
+            # A single newline directly after the opening tag is skipped
+            # before the value is scanned.
+            if value_at < total and body[value_at] == "\n":
+                value_at += 1
+            value_text = body[value_at:]
+            close_at = self._find_true_param_end(
+                value_text,
+                valid_param_names,
+                require_lookahead=True,
+            )
+            if close_at != -1:
+                advance = close_at + len(self.parameter_end_token)
+            else:
+                # No parameter end token: let the next structural opening
+                # tag with a not-yet-seen name act as the implicit end.
+                candidates: set[str] | None = (
+                    None
+                    if valid_param_names is None
+                    else valid_param_names - consumed - {name}
+                )
+                advance = self._next_structural_param_start(
+                    value_text, 0, candidates
+                )
+                if advance == -1:
+                    return -1
+            consumed.add(name)
+            cursor = value_at + advance
+        return -1
+
+    def _advance_to_next_tool(self, current_text: str) -> None:
+        """Move the streaming state on to the next tool call.
+
+        ``_sent_content_idx`` is pushed (never pulled back) to just past
+        the ``tool_call_end_token`` that closes the tool call at
+        ``current_tool_index``, provided that close is already known.
+        The close is taken from ``_structural_tool_call_end_positions``,
+        so an end token written literally inside a parameter value does
+        not move the index to the wrong place.
+
+        Afterwards ``current_tool_index`` is incremented and every
+        per-tool field is reset for the next invocation.
+        """
+        closes = self._structural_tool_call_end_positions(current_text)
+        finished = self.current_tool_index
+        if finished < len(closes):
+            resume_at = closes[finished] + len(self.tool_call_end_token)
+            self._sent_content_idx = max(self._sent_content_idx, resume_at)
+
+        # Fresh per-tool state.
+        self.current_tool_index = finished + 1
+        self.param_count = 0
+        self.accumulated_params = {}
+        self.header_sent = False
+        self.json_started = False
+        self.json_closed = False
+        self.is_tool_call_started = False
+
+    def _find_true_tool_call_end(self, text: str) -> int:
+        """Locate the structural ``tool_call_end_token`` in ``text``.
+
+        An occurrence is structural when only whitespace follows it, or
+        when the next non-whitespace text begins with
+        ``tool_call_start_token`` (another tool call opens). Any other
+        occurrence is skipped as literal text.
+
+        Returns the index of the first structural occurrence, or -1 when
+        there is none.
+        """
+        end_token = self.tool_call_end_token
+        step = len(end_token)
+        hit = text.find(end_token)
+        while hit != -1:
+            lookahead = text[hit + step :].lstrip()
+            if not lookahead or lookahead.startswith(self.tool_call_start_token):
+                return hit
+            hit = text.find(end_token, hit + step)
+        return -1
+
+    def _structural_tool_call_end_positions(self, text: str) -> list[int]:
+        """Return the index of every structural ``tool_call_end_token``.
+
+        Each top-level tool call is walked in order: find
+        ``tool_call_start_token``, then the function-open tag
+        (``tool_call_prefix`` ... ``>``), then scan the body with
+        ``_scan_to_structural_function_end`` so that delimiter strings
+        appearing literally inside parameter values are stepped over,
+        and finally require ``tool_call_end_token`` (after optional
+        whitespace) right behind the structural ``function_end_token``.
+
+        The walk stops at the first tool call that cannot be resolved
+        this way (no function tag, unterminated name, incomplete body,
+        or no closing token). Nothing past that point is reported, so a
+        literal end token inside an unfinished parameter is never
+        treated as structural.
+        """
+        ends: list[int] = []
+        cursor = 0
+        text_len = len(text)
+        while cursor < text_len:
+            call_open = text.find(self.tool_call_start_token, cursor)
+            if call_open == -1:
+                break
+            fn_open = text.find(
+                self.tool_call_prefix,
+                call_open + len(self.tool_call_start_token),
+            )
+            if fn_open == -1:
+                break
+            name_from = fn_open + len(self.tool_call_prefix)
+            name_stop = text.find(">", name_from)
+            if name_stop == -1:
+                break
+
+            # Schema parameter names, when the tool is known, help the
+            # scanner tell real parameters from literal look-alikes.
+            schema_names: set[str] | None = None
+            if self.tools:
+                props = find_tool_properties(self.tools, text[name_from:name_stop])
+                if props:
+                    schema_names = set(props.keys())
+
+            body_from = name_stop + 1
+            rel_end = self._scan_to_structural_function_end(
+                text[body_from:], schema_names
+            )
+            if rel_end == -1:
+                # Body incomplete: the structural close is not known yet.
+                break
+
+            # Skip whitespace between the function end and the call end.
+            probe = body_from + rel_end + len(self.function_end_token)
+            while probe < text_len and text[probe] in " \t\n\r":
+                probe += 1
+            if not text.startswith(self.tool_call_end_token, probe):
+                break
+            ends.append(probe)
+            cursor = probe + len(self.tool_call_end_token)
+        return ends
+
+    def _find_true_param_end(
+        self,
+        value_text: str,
+        valid_param_names: set[str] | None = None,
+        require_lookahead: bool = False,
+    ) -> int:
+        """Find the structural ``parameter_end_token`` closing ``value_text``.
+
+        A ``parameter_end_token`` at nesting depth zero is accepted when
+        the text after it (ignoring leading whitespace) is one of:
+
+        * another parameter open (``parameter_prefix``) whose name is in
+          ``valid_param_names``, or any parameter open when no schema
+          names are supplied;
+        * ``function_end_token`` or ``tool_call_end_token``;
+        * nothing at all, unless ``require_lookahead`` is set (callers
+          in this file set it while streaming, where more text may
+          still arrive).
+
+        Nesting depth is driven by ``_next_structural_param_start``
+        called WITHOUT a schema filter: every structural parameter open
+        found before the next close raises the depth, and its matching
+        close lowers it again. That keeps a literal nested parameter
+        block from having its close mistaken for the structural one.
+
+        Returns the index of the structural close within ``value_text``,
+        or -1 if none is found.
+        """
+        open_len = len(self.parameter_prefix)
+        close_len = len(self.parameter_end_token)
+        nesting = 0
+        cursor = 0
+        total = len(value_text)
+
+        while cursor < total:
+            open_at = self._next_structural_param_start(value_text, cursor, None)
+            close_at = value_text.find(self.parameter_end_token, cursor)
+            if close_at == -1:
+                return -1
+
+            if open_at != -1 and open_at < close_at:
+                # A nested open comes first: go one level deeper.
+                nesting += 1
+                cursor = open_at + open_len
+                continue
+
+            past_close = close_at + close_len
+            if nesting:
+                # This close pairs with a nested open seen earlier.
+                nesting -= 1
+                cursor = past_close
+                continue
+
+            lookahead = value_text[past_close:].lstrip()
+            if not lookahead.startswith(self.parameter_prefix):
+                next_is_param = False
+            elif valid_param_names is None:
+                next_is_param = True
+            else:
+                name_stop = lookahead.find(">", open_len)
+                next_is_param = (
+                    name_stop != -1
+                    and lookahead[open_len:name_stop] in valid_param_names
+                )
+
+            if (
+                (not lookahead and not require_lookahead)
+                or next_is_param
+                or lookahead.startswith(
+                    (self.function_end_token, self.tool_call_end_token)
+                )
+            ):
+                return close_at
+            cursor = past_close
+
+        return -1
+
+    @staticmethod
+    def _is_valid_function_name(name: str) -> bool:
+        """Tell whether ``name`` can plausibly be a real function name.
+
+        The empty string is rejected, as is any name containing one of
+        ``{``, ``}``, ``<``, ``>``, a single or double quote, a space, a
+        tab, a newline or a carriage return. Those characters indicate a
+        stray template token, a malformed tag or free-form text rather
+        than an identifier. Everything else is accepted, including
+        dashes (``max-retries``), dots (``user.name``), slashes
+        (``namespace/tool``) and Unicode letters.
+        """
+        if not name:
+            return False
+        # Valid exactly when the name shares no character with this list.
+        return set(name).isdisjoint("{}<>\"' \t\n\r")
def _parse_xml_function_call(self, function_call_str: str) -> ToolCall | None:
# Extract function name
@@ -137,13 +583,59 @@ def _parse_xml_function_call(self, function_call_str: str) -> ToolCall | None:
if end_index == -1:
return None
function_name = function_call_str[:end_index]
+ # Reject phantom tool calls produced when the model writes an
+ # unrendered Jinja template or pseudo-XML in its response (e.g.
+ # ````). Surfacing such names as real
+ # tool calls causes "tool not found" errors at the client and
+ # makes agents loop.
+ if not self._is_valid_function_name(function_name):
+ return None
param_config = find_tool_properties(self.tools, function_name)
+ valid_param_names: set[str] | None = (
+ set(param_config.keys()) if param_config else None
+ )
parameters = function_call_str[end_index + 1 :]
- param_dict = {}
- for match_text in self.tool_call_parameter_regex.findall(parameters):
- idx = match_text.index(">")
- param_name = match_text[:idx]
- param_value = str(match_text[idx + 1 :])
+ param_dict: dict = {}
+ pos = 0
+ while True:
+ # Find next structural at the top level. We
+ # do NOT filter the outer search by schema: callers may
+ # legitimately send a parameter whose name is not declared
+ # in the schema (e.g. renamed fields). Schema filtering is
+ # applied only when scanning INSIDE a parameter value, to
+ # disambiguate real nested delimiters from literal text.
+ param_start = self._next_structural_param_start(parameters, pos, None)
+ if param_start == -1:
+ break
+ name_start = param_start + len(self.parameter_prefix)
+ name_end = parameters.find(">", name_start)
+ if name_end == -1:
+ break
+ param_name = parameters[name_start:name_end]
+ value_text = parameters[name_end + 1 :]
+
+ param_end = self._find_true_param_end(value_text, valid_param_names)
+ if param_end == -1:
+ # No true ``parameter_end_token`` found (malformed XML or incomplete).
+ # Fallback 1: next structural boundary or end
+ func_end = self._find_true_function_end(value_text)
+ if func_end != -1:
+ param_value = value_text[:func_end]
+ else:
+ param_value = value_text
+ pos = len(parameters)
+ else:
+ param_value = value_text[:param_end]
+ pos = (name_end + 1) + param_end + len(self.parameter_end_token)
+
# Remove prefix and trailing \n
if param_value.startswith("\n"):
param_value = param_value[1:]
@@ -161,23 +653,79 @@ def _parse_xml_function_call(self, function_call_str: str) -> ToolCall | None:
)
def _get_function_calls(self, model_output: str) -> list[str]:
- # Find all tool calls
- matched_ranges = self.tool_call_regex.findall(model_output)
- raw_tool_calls = [
- match[0] if match[0] else match[1] for match in matched_ranges
- ]
+ # Find tool_calls using a structural delimiter approach: a real
+ # ``tool_call_end_token`` is followed by another ``tool_call_start_token``
+ # or end-of-text. This skips a ``tool_call_end_token`` that appears as
+ # literal text inside a parameter value.
+ raw_tool_calls: list[str] = []
+ search_pos = 0
+ while True:
+ tc_start = model_output.find(self.tool_call_start_token, search_pos)
+ if tc_start == -1:
+ break
+ after_open = model_output[tc_start + len(self.tool_call_start_token) :]
+ tc_end = -1
+ inner_search = 0
+ while True:
+ idx = after_open.find(self.tool_call_end_token, inner_search)
+ if idx == -1:
+ tc_end = -1
+ break
+ after_close = after_open[idx + len(self.tool_call_end_token) :]
+ stripped = after_close.lstrip()
+ if stripped == "" or stripped.startswith(self.tool_call_start_token):
+ tc_end = idx
+ break
+ inner_search = idx + len(self.tool_call_end_token)
+ if tc_end == -1:
+ raw_tool_calls.append(after_open)
+ break
+ raw_tool_calls.append(after_open[:tc_end])
+ search_pos = (
+ tc_start
+ + len(self.tool_call_start_token)
+ + tc_end
+ + len(self.tool_call_end_token)
+ )
# Back-off strategy if no tool_call tags found
if len(raw_tool_calls) == 0:
raw_tool_calls = [model_output]
- raw_function_calls = []
+ # Use a parameter-aware walk to find the structural function end
+ # token: when the value of a parameter embeds a complete literal
+ # tool-call block, the nested ``function_end_token`` is followed by
+ # ``tool_call_end_token`` and would pass the simple "followed by
+ # ``tool_call_end_token``" lookahead. Walking the
+ # body parameter-by-parameter with ``_find_true_param_end``
+ # correctly steps over the literal.
+ function_calls: list[str] = []
for tool_call in raw_tool_calls:
- raw_function_calls.extend(self.tool_call_function_regex.findall(tool_call))
-
- function_calls = [
- match[0] if match[0] else match[1] for match in raw_function_calls
- ]
+ func_start = tool_call.find(self.tool_call_prefix)
+ if func_start == -1:
+ continue
+ after_func_open = tool_call[func_start + len(self.tool_call_prefix) :]
+ name_end = after_func_open.find(">")
+ valid_param_names: set[str] | None = None
+ body_start = 0
+ if name_end != -1:
+ func_name = after_func_open[:name_end]
+ cfg = find_tool_properties(self.tools, func_name)
+ if cfg:
+ valid_param_names = set(cfg.keys())
+ body_start = name_end + 1
+ scan_end = self._scan_to_structural_function_end(
+ after_func_open[body_start:], valid_param_names
+ )
+ if scan_end != -1:
+ function_calls.append(after_func_open[: body_start + scan_end])
+ continue
+ # Fallback to legacy heuristic.
+ func_end = self._find_true_function_end(after_func_open)
+ if func_end == -1:
+ function_calls.append(after_func_open)
+ else:
+ function_calls.append(after_func_open[:func_end])
return function_calls
def extract_tool_calls(
@@ -213,11 +761,39 @@ def extract_tool_calls(
}
)
- # Extract content before tool calls
- content_index = model_output.find(self.tool_call_start_token)
- idx = model_output.find(self.tool_call_prefix)
- content_index = content_index if content_index >= 0 else idx
- content = model_output[:content_index] # .rstrip()
+ # Extract content before tool calls. Anchor at the FIRST
+ # ```` that contains a real ````
+ # opener — a bare ``...`` written by
+ # the model in its narrative text (no function inside) is
+ # NOT a real tool call and the surrounding text MUST stay
+ # in ``content``.
+ content_index = -1
+ search_pos = 0
+ tc_start_token = self.tool_call_start_token
+ tc_end_token = self.tool_call_end_token
+ while True:
+ tc_pos = model_output.find(tc_start_token, search_pos)
+ if tc_pos == -1:
+ break
+ tc_close = model_output.find(tc_end_token, tc_pos + len(tc_start_token))
+ # Look for a ```` block contains a
+ # ``= 0 else model_output
+ )
valid_tool_calls = [tc for tc in tool_calls if tc is not None]
return ExtractedToolCallInformation(
tools_called=(len(valid_tool_calls) > 0),
@@ -277,77 +853,116 @@ def extract_tool_calls_streaming(
# Check if we need to advance to next tool
if self.json_closed and not self.in_function:
- # Check if this tool call has ended
- tool_ends = current_text.count(self.tool_call_end_token)
+ # Use structural ``tool_call_end_token`` count: a literal one
+ # embedded in a parameter value must not trigger a spurious
+ # advance.
+ tool_ends = len(self._structural_tool_call_end_positions(current_text))
if tool_ends > self.current_tool_index:
- # This tool has ended, advance to next
- self.current_tool_index += 1
- self.header_sent = False
- self.param_count = 0
- self.json_started = False
- self.json_closed = False
- self.accumulated_params = {}
-
- # Check if there are more tool calls
- tool_starts = current_text.count(self.tool_call_start_token)
- if self.current_tool_index >= tool_starts:
- # No more tool calls
- self.is_tool_call_started = False
- # Continue processing next tool
- return None
-
+ # Advance to next tool; is_tool_call_started is reset so
+ # content between or after tool calls is emitted correctly.
+ # We deliberately fall through (no early ``return None``):
+ # the rest of this delta may carry trailing free text after
+ # the closed or even an entire next tool call
+ # (MTP / speculative decoding). The downstream code handles
+ # both — emitting trailing content via the not-started
+ # branch, or starting the next tool via tool_starts_count.
+ self._advance_to_next_tool(current_text)
+
+ content_message = None
# Handle normal content before tool calls
if not self.is_tool_call_started:
- # Check if tool call is starting
- if (
+ tool_starts_count = current_text.count(self.tool_call_start_token)
+ start_signal = (
self.tool_call_start_token_id in delta_token_ids
- or self.tool_call_start_token in delta_text
- ):
+ or tool_starts_count > self.current_tool_index
+ )
+ # ``tool_starts_count`` is naive and over-counts when an
+ # earlier tool's parameter value contains a literal
+ # ````. Confirm a REAL next tool by locating an
+ # opener past ``_sent_content_idx`` (which sits after the last
+ # processed tool's structural ````).
+ last_start = -1
+ if start_signal:
+ last_start = current_text.find(
+ self.tool_call_start_token, self._sent_content_idx
+ )
+ if start_signal and last_start != -1:
self.is_tool_call_started = True
# Return any content before the tool call
- if self.tool_call_start_token in delta_text:
- content_before = delta_text[
- : delta_text.index(self.tool_call_start_token)
- ]
+ if last_start > self._sent_content_idx:
+ content_before = current_text[self._sent_content_idx : last_start]
+ self._sent_content_idx = last_start
if content_before:
- return DeltaMessage(content=content_before)
- return None
+ content_message = DeltaMessage(content=content_before)
else:
- # Check if we're between tool calls - skip whitespace
+ # No real new tool starting in this delta — emit any
+ # trailing/inter-call content.
+ overlap = partial_tag_overlap(current_text, self.tool_call_start_token)
+ sendable_idx = len(current_text) - overlap
+
+ # Skip whitespace-only deltas right after a closed tool.
if (
current_text.rstrip().endswith(self.tool_call_end_token)
and delta_text.strip() == ""
):
- # We just ended a tool call, skip whitespace
+ self._sent_content_idx = len(current_text)
return None
- # Normal content, no tool call
- return DeltaMessage(content=delta_text)
-
- # Check if we're between tool calls (waiting for next one)
- # Count tool calls we've seen vs processed
- tool_starts_count = current_text.count(self.tool_call_start_token)
- if self.current_tool_index >= tool_starts_count:
- # We're past all tool calls, shouldn't be here
- return None
- # We're in a tool call, find the current tool call portion
- # Need to find the correct tool call based on current_tool_index
+ if sendable_idx > self._sent_content_idx:
+ content = current_text[self._sent_content_idx : sendable_idx]
+ self._sent_content_idx = sendable_idx
+ if content:
+ return DeltaMessage(content=content)
+ return None
+
+ # Check if we're between tool calls (waiting for next one).
+ # Only count structural starts (skip past each
+ # of completed calls) so that tokens
+ # embedded in a parameter value of a completed call are not
+ # counted as spurious new tool calls.
+ if self.tool_call_start_token not in current_text[self._sent_content_idx :]:
+ return content_message
+
+ # We're in a tool call, find the current tool call portion.
+ # Build tool_start_positions by jumping OVER completed tool
+ # calls (past each ), so that tokens
+ # embedded in parameter values of completed calls are never
+ # included.
+ # Use STRUCTURAL positions when jumping past
+ # completed tool calls — naive ``current_text.find()``
+ # matches a literal ```` embedded in a parameter
+ # value and would land inside an earlier tool's content.
+ structural_ends = self._structural_tool_call_end_positions(current_text)
tool_start_positions: list[int] = []
- idx = 0
- while True:
- idx = current_text.find(self.tool_call_start_token, idx)
+ search_pos = 0
+ for i in range(self.current_tool_index + 1):
+ idx = current_text.find(self.tool_call_start_token, search_pos)
if idx == -1:
break
tool_start_positions.append(idx)
- idx += len(self.tool_call_start_token)
+ if i < self.current_tool_index:
+ # Completed tool call: jump past its STRUCTURAL .
+ end_idx = -1
+ for end_pos in structural_ends:
+ if end_pos > idx:
+ end_idx = end_pos
+ break
+ if end_idx == -1:
+ break
+ search_pos = end_idx + len(self.tool_call_end_token)
if self.current_tool_index >= len(tool_start_positions):
- # No more tool calls to process yet
- return None
+ return content_message
tool_start_idx = tool_start_positions[self.current_tool_index]
- # Find where this tool call ends (or current position if not ended yet)
- tool_end_idx = current_text.find(self.tool_call_end_token, tool_start_idx)
+ # Find this tool call's STRUCTURAL end (or use rest of text if
+ # the tool isn't closed yet). A naive find would truncate at a
+ # literal inside a parameter value.
+ tool_end_idx = -1
+ for end_pos in structural_ends:
+ if end_pos > tool_start_idx:
+ tool_end_idx = end_pos
+ break
if tool_end_idx == -1:
tool_text = current_text[tool_start_idx:]
else:
@@ -355,6 +970,7 @@ def extract_tool_calls_streaming(
tool_start_idx : tool_end_idx + len(self.tool_call_end_token)
]
+ tool_call_fragments = None
# Looking for function header
if not self.header_sent:
if self.tool_call_prefix in tool_text:
@@ -387,21 +1003,18 @@ def extract_tool_calls_streaming(
# accesses streamed_args_for_tool[index].
self.streamed_args_for_tool.append("")
- # Send header with function info
- return DeltaMessage(
- tool_calls=[
- DeltaToolCall(
- index=self.current_tool_index,
- id=self.current_tool_id,
- function=DeltaFunctionCall(
- name=self.current_function_name, arguments=""
- ),
- type="function",
- )
- ]
+ tool_call_fragments = DeltaToolCall(
+ index=self.current_tool_index,
+ id=self.current_tool_id,
+ function=DeltaFunctionCall(
+ name=self.current_function_name, arguments=""
+ ),
+ type="function",
)
- return None
+ if not self.header_sent:
+ return content_message
+ arguments_to_emit = ""
# We've sent header, now handle function body
if self.in_function:
# Always send opening brace first, regardless of whether
@@ -412,24 +1025,91 @@ def extract_tool_calls_streaming(
if not self.json_started:
self.json_started = True
self.streamed_args_for_tool[self.current_tool_index] += "{"
- return DeltaMessage(
- tool_calls=[
- DeltaToolCall(
- index=self.current_tool_index,
- function=DeltaFunctionCall(arguments="{"),
- )
- ]
- )
-
- # Find all parameter start positions in current tool_text
- param_starts = []
+ arguments_to_emit += "{"
+
+ # Build param_starts using structural-aware lookup. Plain
+ # tool_text.find(parameter_prefix) would return positions
+ # inside parameter VALUES (e.g. Python code that embeds the
+ # XML format), creating spurious extra params. Use the
+ # schema to filter nested and advance
+ # sequentially past each complete parameter's value.
+ streaming_param_config = find_tool_properties(
+ self.tools, self.current_function_name or ""
+ )
+ valid_param_names: set[str] | None = (
+ set(streaming_param_config.keys()) if streaming_param_config else None
+ )
+ param_starts: list[int] = []
search_idx = 0
while True:
- search_idx = tool_text.find(self.parameter_prefix, search_idx)
- if search_idx == -1:
+ # Don't filter top-level by schema:
+ # callers may send params whose names aren't declared
+ # (e.g. renamed fields). Schema filtering is applied
+ # below when walking INSIDE a parameter value to
+ # disambiguate nested literal XML.
+ param_start_pos = self._next_structural_param_start(
+ tool_text, search_idx, None
+ )
+ if param_start_pos == -1:
break
- param_starts.append(search_idx)
- search_idx += len(self.parameter_prefix)
+ param_starts.append(param_start_pos)
+ # Advance past this parameter's content.
+ name_end_pos = tool_text.find(
+ ">", param_start_pos + len(self.parameter_prefix)
+ )
+ if name_end_pos == -1:
+ break
+ after_name = tool_text[name_end_pos + 1 :]
+ after_name_stripped = (
+ after_name[1:] if after_name.startswith("\n") else after_name
+ )
+ end_in_after = self._find_true_param_end(
+ after_name_stripped,
+ valid_param_names,
+ require_lookahead=True,
+ )
+ if end_in_after == -1:
+ # No structural ```` close yet. A
+ # legitimate "missing " malformation —
+ # the model jumps from ```` straight to
+ # ```` — is recoverable: treat the
+ # next structural ```` as implicit
+ # end of the current param. But only if NAME has
+ # NOT already been parsed as a sibling param of this
+ # tool call (and is not the param currently being
+ # scanned). A repeated NAME is almost always a
+ # literal embedded in the unfinished value, not a
+ # real next parameter.
+ cand_name = tool_text[
+ param_start_pos + len(self.parameter_prefix) : name_end_pos
+ ]
+ already_seen = set(self.accumulated_params.keys()) | (
+ {cand_name} if cand_name else set()
+ )
+ unseen_valid: set[str] | None = (
+ (valid_param_names - already_seen)
+ if valid_param_names is not None
+ else None
+ )
+ implicit_end = self._next_structural_param_start(
+ after_name_stripped, 0, unseen_valid
+ )
+ if implicit_end != -1:
+ search_idx = (
+ (name_end_pos + 1)
+ + (1 if after_name.startswith("\n") else 0)
+ + implicit_end
+ )
+ else:
+ # Wait for more data.
+ break
+ else:
+ search_idx = (
+ (name_end_pos + 1)
+ + (1 if after_name.startswith("\n") else 0)
+ + end_in_after
+ + len(self.parameter_end_token)
+ )
# Process ALL complete params in a loop (spec decode fix).
# With speculative decoding a single delta can deliver
@@ -455,30 +1135,67 @@ def extract_tool_calls_streaming(
if value_text.startswith("\n"):
value_text = value_text[1:]
- param_end_idx = value_text.find(self.parameter_end_token)
+ param_end_idx = self._find_true_param_end(
+ value_text, valid_param_names, require_lookahead=True
+ )
if param_end_idx == -1:
- next_param_idx = value_text.find(self.parameter_prefix)
- func_end_idx = value_text.find(self.function_end_token)
-
- if next_param_idx != -1 and (
- func_end_idx == -1 or next_param_idx < func_end_idx
- ):
- param_end_idx = next_param_idx
- elif func_end_idx != -1:
- param_end_idx = func_end_idx
- else:
- # Fallback for malformed XML where
- # is missing. Use as a delimiter
- # if present in the value so we don't include
- # the closing tag as part of the param value.
- tool_end_in_value = value_text.find(self.tool_call_end_token)
- if tool_end_in_value != -1:
- param_end_idx = tool_end_in_value
+ # Confirm via the parameter-aware walker that the
+ # function body is truly complete. The legacy
+ # ``_find_true_function_end`` matches a ````
+ # at end-of-buffer (lstripped lookahead == ""), which
+ # is wrong in streaming when the literal close of a
+ # nested tool_call inside a parameter value sits at
+ # the buffer's end. Walking the body via
+ # ``_scan_to_structural_function_end`` correctly
+ # steps over literal tags inside parameter values
+ # and returns -1 if any param is still open.
+ tc_open_in_tool = tool_text.find(self.tool_call_prefix)
+ body_func_end_in_value = -1
+ if tc_open_in_tool != -1:
+ name_end_in_tool = tool_text.find(
+ ">", tc_open_in_tool + len(self.tool_call_prefix)
+ )
+ if name_end_in_tool != -1:
+ body_after_name = tool_text[name_end_in_tool + 1 :]
+ body_func_end_rel = self._scan_to_structural_function_end(
+ body_after_name, valid_param_names
+ )
+ if body_func_end_rel != -1:
+ body_func_end_abs = (
+ name_end_in_tool + 1 + body_func_end_rel
+ )
+ body_func_end_in_value = body_func_end_abs - value_start
+
+ if body_func_end_in_value > 0:
+ # Function body is structurally complete; the
+ # current param has missing . Use
+ # the next legitimate (NAME
+ # unseen) before the structural as
+ # the implicit end.
+ already_seen = set(self.accumulated_params.keys()) | (
+ {current_param_name} if current_param_name else set()
+ )
+ unseen_valid: set[str] | None = (
+ (valid_param_names - already_seen)
+ if valid_param_names is not None
+ else None
+ )
+ next_param_idx = self._next_structural_param_start(
+ value_text, 0, unseen_valid
+ )
+ if (
+ next_param_idx != -1
+ and next_param_idx < body_func_end_in_value
+ ):
+ param_end_idx = next_param_idx
else:
- # Parameter incomplete — break so we still
- # emit any fragments accumulated by earlier
- # loop iterations.
- break
+ param_end_idx = body_func_end_in_value
+ else:
+ # Body not yet complete — wait for more data.
+ # Do NOT truncate at a literal or
+ # that may sit inside a still-open
+ # parameter value.
+ break
if param_end_idx == -1:
break
@@ -522,15 +1239,7 @@ def extract_tool_calls_streaming(
self.current_tool_index,
len(self.streamed_args_for_tool),
)
-
- return DeltaMessage(
- tool_calls=[
- DeltaToolCall(
- index=self.current_tool_index,
- function=DeltaFunctionCall(arguments=combined),
- )
- ]
- )
+ arguments_to_emit += combined
# Check for function end AFTER processing parameters.
# This ordering is critical: with speculative decoding a
@@ -538,13 +1247,31 @@ def extract_tool_calls_streaming(
# . If the close check ran first it would emit
# "}" and set in_function=False before the parameter loop
# ever ran, causing the parameter to be silently dropped.
- if not self.json_closed and self.function_end_token in tool_text:
+ # Use the parameter-aware walker so a literal ''
+ # inside a parameter value (e.g. a content arg embedding a
+ # complete nested tool_call) does not trigger a premature
+ # close.
+ true_func_end = -1
+ tc_open_in_tool_for_close = tool_text.find(self.tool_call_prefix)
+ if tc_open_in_tool_for_close != -1:
+ name_end_in_tool = tool_text.find(
+ ">",
+ tc_open_in_tool_for_close + len(self.tool_call_prefix),
+ )
+ if name_end_in_tool != -1:
+ body_after_name = tool_text[name_end_in_tool + 1 :]
+ body_func_end_rel = self._scan_to_structural_function_end(
+ body_after_name, valid_param_names
+ )
+ if body_func_end_rel != -1:
+ true_func_end = name_end_in_tool + 1 + body_func_end_rel
+ if not self.json_closed and true_func_end != -1:
self.json_closed = True
func_start = tool_text.find(self.tool_call_prefix) + len(
self.tool_call_prefix
)
- func_content_end = tool_text.find(self.function_end_token, func_start)
+ func_content_end = true_func_end
if func_content_end != -1:
func_content = tool_text[func_start:func_content_end]
try:
@@ -572,23 +1299,88 @@ def extract_tool_calls_streaming(
self.current_tool_index,
len(self.streamed_args_for_tool),
)
-
- result = DeltaMessage(
- tool_calls=[
- DeltaToolCall(
- index=self.current_tool_index,
- function=DeltaFunctionCall(arguments="}"),
- )
- ]
- )
-
+ arguments_to_emit += "}"
self.in_function = False
self.json_closed = True
self.accumulated_params = {}
- return result
+ if tool_call_fragments or arguments_to_emit:
+ if not tool_call_fragments:
+ tool_call_fragments = DeltaToolCall(
+ index=self.current_tool_index,
+ function=DeltaFunctionCall(arguments=arguments_to_emit),
+ )
+ else:
+ tool_call_fragments.function.arguments += arguments_to_emit
+
+ if content_message:
+ content_message.tool_calls = [tool_call_fragments]
+ result = content_message
+ else:
+ result = DeltaMessage(tool_calls=[tool_call_fragments])
+
+ # Speculative decoding can deliver multiple complete tool
+ # calls in a single delta. If we just finished one and
+ # another complete ... remains in
+ # current_text, advance and re-enter to emit it. We pass a
+ # non-empty `previous_text` sentinel so reset_streaming_state
+ # is NOT triggered inside the recursion (which would clear
+ # current_tool_index back to 0 and loop forever).
+ if (
+ self.json_closed
+ and not self.in_function
+ and len(self._structural_tool_call_end_positions(current_text))
+ > self.current_tool_index + 1
+ ):
+ # Speculative decoding delivered multiple complete tool
+ # calls in one delta; advance and recurse for the next.
+ self._advance_to_next_tool(current_text)
+
+ # Recurse with a sentinel previous_text so the entry
+ # check `if not previous_text` does NOT reset the state.
+ next_delta = self.extract_tool_calls_streaming(
+ previous_text or " ",
+ current_text,
+ delta_text,
+ previous_token_ids,
+ current_token_ids,
+ delta_token_ids,
+ request,
+ )
+ if next_delta is not None and next_delta.tool_calls:
+ if result.tool_calls is None:
+ result.tool_calls = []
+ result.tool_calls.extend(next_delta.tool_calls)
+ # Concatenate the recursion's content (e.g. text
+ # BETWEEN tool 1 and tool 2) with the outer's content
+ # (e.g. text BEFORE tool 1). Without this, the "between"
+ # fragment is silently dropped whenever the outer
+ # already produced its own content.
+ if next_delta.content:
+ result.content = (result.content or "") + next_delta.content
+
+ # Emit trailing free text that follows the LAST structural
+ # in this delta (MTP / spec-decoding bursts that
+ # bundle N tool calls + trailing content into one chunk).
+ # Without this the trailing text is buffered indefinitely:
+ # the per-tool processing never advances ``_sent_content_idx``
+ # past its tool's ````, and an EOS-style empty
+ # delta cannot recover content that was never emitted.
+ if self.json_closed and not self.in_function:
+ end_positions = self._structural_tool_call_end_positions(current_text)
+ if end_positions:
+ last_end = end_positions[-1] + len(self.tool_call_end_token)
+ if (
+ last_end < len(current_text)
+ and last_end > self._sent_content_idx
+ ):
+ trailing = current_text[last_end:]
+ if trailing:
+ self._sent_content_idx = len(current_text)
+ result.content = (result.content or "") + trailing
+ return result
- return None
+ return content_message
def get_structural_tag(self, request: ChatCompletionRequest):
return get_model_structural_tag(
diff --git a/vllm/tool_parsers/qwen3xml_tool_parser.py b/vllm/tool_parsers/qwen3xml_tool_parser.py
index 8ee10dcbc9e6..3f2ae4d253bf 100644
--- a/vllm/tool_parsers/qwen3xml_tool_parser.py
+++ b/vllm/tool_parsers/qwen3xml_tool_parser.py
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
+import contextlib
import json
from collections.abc import Sequence
from typing import Any
@@ -26,11 +27,28 @@
Tool,
ToolParser,
)
+from vllm.tool_parsers.structural_tag_registry import (
+ get_enable_structured_outputs_in_reasoning,
+ get_model_structural_tag,
+)
from vllm.tool_parsers.utils import find_tool_properties
logger = init_logger(__name__)
+def _is_valid_function_name(name: str) -> bool:
+    """Return True when ``name`` looks like a real function identifier.
+
+    A name is rejected when it is empty or contains a template-syntax
+    character (``{``, ``}``, ``<``, ``>``), a quote (``"`` or ``'``), or
+    any whitespace character as reported by ``str.isspace``. That covers
+    space, tab, CR and LF as well as form feed, vertical tab and Unicode
+    spaces, none of which can appear in a real function name.
+
+    Args:
+        name: Candidate function name extracted from the model output.
+
+    Returns:
+        ``True`` if ``name`` is non-empty and free of forbidden characters.
+    """
+    if not name:
+        return False
+    # Plain string membership avoids rebuilding a set on every call.
+    forbidden = "{}<>\"'"
+    return not any(c in forbidden or c.isspace() for c in name)
+
+
class StreamingXMLToolCallParser:
"""
Simplified streaming XML tool call parser
@@ -53,9 +71,16 @@ def reset_streaming_state(self):
"""Reset streaming parsing state"""
self.deltas = []
+ # When True (delta-by-delta streaming), _process_complete_xml_elements
+ # holds off on ``parameter_end_token`` when nothing follows in the buffer yet —
+ # that would be ambiguous since more tokens may still arrive. When
+ # False (full output passed at once), an empty lookahead is a
+ # genuine end.
+ self._streaming_mode: bool = False
# state for streaming
self.tool_call_index = 0
self.current_call_id = None
+ self.id_emitted = False
self.last_completed_call_id = None
self.current_function_name = None
self.current_function_open = False
@@ -79,6 +104,21 @@ def reset_streaming_state(self):
self.defer_current_parameter = False
self.deferred_param_raw_value = ""
+ # Depth of LITERAL nested tool_call/function opening tags
+ # encountered inside the current parameter's value. Each literal
+ # opener bumps the depth; each function/tool_call end token
+ # encountered while depth > 0 is also literal (decrements the
+ # depth) and must not be treated as a structural close. Reset
+ # to 0 when leaving a parameter.
+ self._literal_tag_depth = 0
+ # Number of literal tool_call/function open or close events seen
+ # in the current ``parse_single_streaming_chunks`` call. Used to
+ # suppress the post-processing structural-close fallback when
+ # the chunk contained literal nested-tag events: those events
+ # are already handled (escaped) by the preprocess pass and must
+ # not trigger ``_end_element`` calls.
+ self._literal_events_this_chunk = 0
+
# recreate parser
self.parser = ParserCreate()
self.setup_parser()
@@ -98,72 +138,58 @@ def parse_single_streaming_chunks(self, xml_chunk: str) -> DeltaMessage:
# Record delta count before processing
initial_delta_count = len(self.deltas)
+ # Reset literal-event counter for this chunk: it will be
+ # incremented by the preprocess pass whenever it encounters a
+ # literal nested tool_call/function opening tag or
+ # the matching close inside a parameter value.
+ self._literal_events_this_chunk = 0
+
self.streaming_buffer += xml_chunk
found_elements = self._process_complete_xml_elements()
if found_elements:
# If complete elements found, check if end events were missed
- # some tags may not have been triggered
+ # some tags may not have been triggered. Use structural-aware
+ # checks so that function/tool_call end tokens appearing as literal
+ # text inside a parameter value (e.g. file content) do NOT
+ # trigger a spurious close that emits a duplicate '}' or ''.
+ # When ``_literal_tag_depth > 0`` we are still inside a
+ # literal nested tool_call/function block in
+ # the current parameter's value — the chunk's function or
+ # tool_call end token matches a literal opener, not a real
+ # structural close, so skip the fallback close events.
try:
- new_deltas = self.deltas[initial_delta_count:]
- # If this chunk contains
- # but didn't generate '}', then complete it
+ # Skip the fallback close events when this chunk
+ # contained any literal nested-tag event: those
+ # function/tool_call end tokens are matched
+ # to literal openers in the param value and have
+ # already been escaped — firing ``_end_element`` here
+ # would prematurely close the OUTER parameter and
+ # truncate its value.
+ literals_in_chunk = self._literal_events_this_chunk > 0
if (
self.current_call_id is not None
- and self.function_end_token in xml_chunk
+ and not literals_in_chunk
+ and self._literal_tag_depth == 0
+ and self._chunk_has_structural_function_end(xml_chunk)
+ and self.current_function_open
):
- # - Added '}' (non-empty parameter ending)
- # - Added '{}' (empty parameter function)
- has_function_close = any(
- (
- td.tool_calls
- and any(
- (
- tc.function
- and tc.id == self.current_call_id
- and isinstance(tc.function.arguments, str)
- and (tc.function.arguments in ("}", "{}"))
- )
- for tc in td.tool_calls
- )
- )
- for td in new_deltas
- )
- if not has_function_close:
- # Close potentially unclosed element
- if self.current_param_name:
- self._end_element("parameter")
- if self.current_function_name:
- self._end_element("function")
- # If this chunk contains
- # but didn't generate final empty delta, then complete it
+ if self.current_param_name:
+ self._end_element("parameter")
+ if self.current_function_name:
+ self._end_element("function")
if (
self.current_call_id is not None
- and self.tool_call_end_token in xml_chunk
+ and not literals_in_chunk
+ and self._literal_tag_depth == 0
+ and self._chunk_has_structural_tool_call_end(xml_chunk)
):
- has_toolcall_close = any(
- (
- td.tool_calls
- and any(
- (
- tc.type == "function"
- and tc.function
- and tc.function.arguments == ""
- and tc.id == self.current_call_id
- )
- for tc in td.tool_calls
- )
- )
- for td in new_deltas
- )
- if not has_toolcall_close:
- # Close potentially unclosed element
- if self.current_param_name:
- self._end_element("parameter")
- if self.current_function_name:
- self._end_element("function")
- self._end_element("tool_call")
+ if self.current_param_name:
+ self._end_element("parameter")
+ if self.current_function_open:
+ self._end_element("function")
+ self._end_element("tool_call")
except Exception as e:
logger.warning("Error with fallback parsing: %s", e)
# Merge newly generated deltas into single response
@@ -173,29 +199,37 @@ def parse_single_streaming_chunks(self, xml_chunk: str) -> DeltaMessage:
return result_delta
else:
# No complete elements, check if there's unoutput text content
- if self.text_content_buffer and self.tool_call_index == 0:
- # Has text content but no tool_call yet, output text content
+ if self.text_content_buffer:
+ # Output buffered text content
text_delta = DeltaMessage(content=self.text_content_buffer)
self._emit_delta(text_delta)
# Clear buffer to avoid duplicate output
self.text_content_buffer = ""
return text_delta
- # If this chunk contains end tags but wasn't triggered by parser,
- # manually complete end events
- # Only execute when still on the same call as when entered,
- # to prevent accidentally closing new calls
- # in multi scenarios
- if self.current_call_id is not None and (
- self.function_end_token in xml_chunk
- or self.tool_call_end_token in xml_chunk
+ # If this chunk contains structural end tags but wasn't
+ # triggered by parser, manually complete end events. Only
+ # execute when still on the same call as when entered, to
+ # prevent accidentally closing new calls in multi-tool-call
+ # scenarios. Also skip when ``_literal_tag_depth > 0``: the
+ # chunk's function/tool_call end token matches a literal
+ # opener inside the current parameter's value.
+ if (
+ self.current_call_id is not None
+ and self._literal_tag_depth == 0
+ and (
+ self._chunk_has_structural_function_end(xml_chunk)
+ or self._chunk_has_structural_tool_call_end(xml_chunk)
+ )
):
- # Close potentially unclosed element
if self.current_param_name:
self._end_element("parameter")
- if self.function_end_token in xml_chunk and self.current_function_name:
+ if (
+ self._chunk_has_structural_function_end(xml_chunk)
+ and self.current_function_name
+ ):
self._end_element("function")
- if self.tool_call_end_token in xml_chunk:
+ if self._chunk_has_structural_tool_call_end(xml_chunk):
self._end_element("tool_call")
# Return the merged delta result generated by this fallback
result_delta = self._merge_new_deltas_to_single_response(
@@ -227,6 +261,141 @@ def _escape_xml_special_chars(self, text: str) -> str:
return text
+ def _is_structural_tag_position(self) -> bool:
+     """Tell whether the element being processed sits at a line start.
+
+     The check looks at the character just before ``last_processed_pos``
+     in ``streaming_buffer``: the position counts as structural when
+     that character is a newline, or when ``last_processed_pos`` is 0
+     (nothing precedes the element). An opening tag embedded in
+     parameter content is instead preceded by some other character,
+     such as a quote.
+     """
+     pos = self.last_processed_pos
+     return pos == 0 or self.streaming_buffer[pos - 1] == "\n"
+
+ def _get_valid_param_names(self) -> set[str] | None:
+     """Look up the schema-declared parameter names of the current function.
+
+     The result is used to recognise parameter-style tags that are really
+     literal text inside a parameter value (e.g. Jinja2 templates, test
+     fixtures, or files that document the tool-call format).
+
+     Returns:
+         The set of property names that ``find_tool_properties`` reports
+         for ``current_function_name``, or ``None`` when no tools or no
+         current function are set, or when that lookup yields nothing.
+     """
+     if self.tools and self.current_function_name:
+         schema_props = find_tool_properties(self.tools, self.current_function_name)
+         if schema_props:
+             return set(schema_props.keys())
+     return None
+
+ def _is_already_emitted_param(self, name: str) -> bool:
+     """Check whether ``name`` was already used as a parameter of this call.
+
+     A name counts as used when it equals the parameter currently being
+     parsed (``current_param_name``) or is already present in
+     ``parameters``. A parameter opening tag that repeats such a name is
+     almost always literal text inside another parameter's value (e.g. a
+     parser fixture or a file documenting the tool-call format); treating
+     it as a structural opening would truncate the value and add
+     spurious extra params.
+     """
+     return name == self.current_param_name or name in self.parameters
+
+ def _is_structural_closing_tag(self, chunk: str) -> bool:
+ """Return True when a closing tag at the current buffer position is
+ a real structural delimiter rather than literal text content.
+
+ A closing tag is structural when the text that follows it in the
+ streaming buffer (after stripping leading whitespace) begins with
+ another structural token or is empty (end of buffered output).
+
+ A following parameter opening tag (``parameter_start_token`` + NAME
+ + ``>``) is only considered structural if NAME has not already been
+ emitted for this call and, when the schema is available, NAME is a
+ known parameter of the current function. This prevents literal
+ parameter-style lines in file content from being mistaken for real
+ structural boundaries.
+
+ ``chunk`` is the closing-tag element located at ``last_processed_pos``
+ in ``streaming_buffer``; only its length is used here.
+ """
+ # Lookahead: everything buffered after the closing tag, with leading
+ # whitespace removed.
+ after_pos = self.last_processed_pos + len(chunk)
+ rest = self.streaming_buffer[after_pos:].lstrip()
+
+ structural_param_follows = False
+ if rest.startswith(self.parameter_start_token):
+ valid_names = self._get_valid_param_names()
+ name_start = len(self.parameter_start_token)
+ name_end = rest.find(">", name_start)
+ # NOTE(review): when the opener's ``>`` has not arrived yet
+ # (name_end == -1) the opener is not counted as structural —
+ # confirm the caller re-evaluates once more text is buffered.
+ if name_end != -1:
+ candidate = rest[name_start:name_end]
+ if valid_names is not None:
+ structural_param_follows = (
+ candidate in valid_names
+ and not self._is_already_emitted_param(candidate)
+ )
+ else:
+ # Fallback (no schema): trust the name unless it is a
+ # repeat of the current/already-emitted param, which
+ # is almost always a literal in a parser fixture.
+ structural_param_follows = not self._is_already_emitted_param(
+ candidate
+ )
+
+ # Return True when rest is an incomplete prefix of a structural
+ # closing token (i.e. only the first characters of an end token have
+ # arrived yet). The empty-rest case is handled by the deferral in
+ # _process_complete_xml_elements; this guards against the
+ # partial-tag scenario where the deferral does not fire (rest is
+ # non-empty) but the token is still incomplete.
+ # NOTE(review): a partial prefix of ``parameter_start_token`` is not
+ # covered by this check — confirm whether that is intended.
+ is_partial_structural_prefix = any(
+ tok.startswith(rest)
+ for tok in (
+ self.parameter_end_token,
+ self.function_end_token,
+ self.tool_call_end_token,
+ )
+ )
+
+ return (
+ not rest
+ or is_partial_structural_prefix
+ or structural_param_follows
+ or rest.startswith(self.parameter_end_token)
+ or rest.startswith(self.function_end_token)
+ or rest.startswith(self.tool_call_end_token)
+ )
+
+ def _chunk_has_structural_function_end(self, chunk: str) -> bool:
+     """Report whether ``chunk`` holds a structural function end token.
+
+     An occurrence of ``function_end_token`` is structural when the rest
+     of the chunk after it, ignoring leading whitespace, is empty or
+     starts with ``tool_call_end_token``. Occurrences followed by
+     anything else (for instance inside a file body carried as a
+     parameter value) are skipped and the scan continues.
+     """
+     closer = self.function_end_token
+     follower = self.tool_call_end_token
+     hit = chunk.find(closer)
+     while hit != -1:
+         resume = hit + len(closer)
+         tail = chunk[resume:].lstrip()
+         if not tail or tail.startswith(follower):
+             return True
+         hit = chunk.find(closer, resume)
+     return False
+
+ def _chunk_has_structural_tool_call_end(self, chunk: str) -> bool:
+     """Report whether ``chunk`` holds a structural tool-call end token.
+
+     An occurrence of ``tool_call_end_token`` is structural when the rest
+     of the chunk after it, ignoring leading whitespace, is empty or
+     starts with ``tool_call_start_token``. Other occurrences are
+     skipped and the scan continues with the next one.
+     """
+     closer = self.tool_call_end_token
+     opener = self.tool_call_start_token
+     hit = chunk.find(closer)
+     while hit != -1:
+         resume = hit + len(closer)
+         tail = chunk[resume:].lstrip()
+         if not tail or tail.startswith(opener):
+             return True
+         hit = chunk.find(closer, resume)
+     return False
+
def _process_complete_xml_elements(self) -> bool:
"""
Process complete XML elements in buffer
@@ -243,6 +412,23 @@ def _process_complete_xml_elements(self) -> bool:
# No complete element found, wait for more data
break
+ # In streaming mode, hold off on ``parameter_end_token`` when nothing
+ # follows in the buffer yet. We need the lookahead to
+ # distinguish a real structural close (followed by another
+ # structural end token or a schema-known parameter opener) from
+ # literal text content that happens to be a parameter end tag on
+ # its own line (e.g. Jinja2 template files). When not in
+ # _pre_inside_parameter mode the SAX-level decision is made
+ # here; skip for now and re-evaluate on the next delta.
+ if (
+ self._streaming_mode
+ and element == self.parameter_end_token
+ and self.current_param_name is not None
+ and not self._pre_inside_parameter
+ and not self.streaming_buffer[end_pos:].lstrip()
+ ):
+ break
+
# Check if this element should be skipped
if self._should_skip_element(element):
self.last_processed_pos = end_pos
@@ -251,16 +437,12 @@ def _process_complete_xml_elements(self) -> bool:
# Found complete XML element, process it
try:
preprocessed_element = self._preprocess_xml_chunk(element)
- # Check if this is the first tool_call start
+ # Check if a new tool_call starts and we have buffered text content
if (
- (
- preprocessed_element.strip().startswith("")
- or preprocessed_element.strip().startswith("")
+ or preprocessed_element.strip().startswith(" bool:
# Update processed position
self.last_processed_pos = end_pos
+ # Flush any text accumulated AFTER the last processed tool call
+ # in this batch. Without this, trailing free text that arrives in
+ # the SAME delta as the closing tool_call end token (MTP / speculative
+ # decoding) is buffered but never emitted — and is lost entirely
+ # if EOS comes before any subsequent delta.
+ if found_any and self.text_content_buffer and self.current_call_id is None:
+ text_delta = DeltaMessage(content=self.text_content_buffer)
+ self._emit_delta(text_delta)
+ self.text_content_buffer = ""
+
return found_any
def _should_skip_element(self, element: str) -> bool:
@@ -441,10 +633,10 @@ def _merge_new_deltas_to_single_response(self, initial_count: int) -> DeltaMessa
if delta.tool_calls:
# For tool_calls, we need to intelligently merge arguments
for tool_call in delta.tool_calls:
- # Find if there's already a tool_call with the same call_id
+ # Find if there's already a tool_call with the same index
existing_call = None
for existing in merged_tool_calls:
- if existing.id == tool_call.id:
+ if existing.index == tool_call.index:
existing_call = existing
break
@@ -534,36 +726,59 @@ def _preprocess_xml_chunk(self, chunk: str) -> str:
if self._pre_current_param_name
else "string"
)
- # Only these types need deferred parsing to
- # handle Python literals containing single quotes
- is_object_type = param_type in ["object"]
+ # Container types always need deferred parsing so the
+ # full value is available for json.loads /
+ # ast.literal_eval — even when the first streaming
+ # token is just "\n".
+ is_object_type = param_type == "object"
is_complex_type = (
param_type in ["array", "arr", "sequence"]
or param_type.startswith("dict")
or param_type.startswith("list")
)
-
- # Only delay when contains container symbols
- # and has single quotes and is complex type
- has_container_hint = (
- ("[" in original_chunk)
- or ("{" in original_chunk)
- or ("(" in original_chunk)
+ # Boolean also needs deferral: streaming "t" as the
+ # first char would otherwise be converted to False and
+ # emit "false", shadowing the real "true" that follows.
+ is_bool_type = param_type in ["boolean", "bool", "binary"]
+ # Numeric types need deferral too: a nullable
+ # parameter rendered as the literal "None" (Qwen3.5
+ # template) or "null" (Qwen3.6 template) flips from
+ # the partial-string fallback to JSON ``null`` only
+ # when the FULL value is in. Without deferral the
+ # diff-based char emission would interleave the
+ # partial string ("Non") with the JSON literal
+ # ("null") and produce invalid output ("Nonl").
+ is_numeric_type = (
+ param_type.startswith("int")
+ or param_type.startswith("uint")
+ or param_type.startswith("long")
+ or param_type.startswith("short")
+ or param_type.startswith("unsigned")
+ or param_type.startswith("num")
+ or param_type.startswith("float")
)
- # Determine if deferred parsing is needed
- need_defer = False
- if is_complex_type:
- # Complex type, always need deferred parsing
- need_defer = True
- elif (
- is_object_type
- and has_container_hint
- and ("'" in original_chunk)
- ):
- # Object type with container symbols
- # and single quotes, need deferred parsing
- need_defer = True
+ # Nullable string params (``anyOf: [string, null]``)
+ # must defer too: the literal ``null`` / ``None`` is
+ # only recognisable when the full value is in.
+ # Without deferral, the streaming string path emits
+ # ``"`` + chars + ``"`` and the literal stays
+ # quoted.
+ is_nullable_string = param_type in [
+ "string",
+ "str",
+ "text",
+ "varchar",
+ "char",
+ "enum",
+ ] and self._param_allows_null(self._pre_current_param_name)
+ need_defer = (
+ is_complex_type
+ or is_object_type
+ or is_bool_type
+ or is_numeric_type
+ or is_nullable_string
+ )
if not need_defer:
# No need for deferred parsing,
@@ -573,6 +788,69 @@ def _preprocess_xml_chunk(self, chunk: str) -> str:
self._pre_param_buffer += original_chunk
return ""
+ # When a parameter value is being streamed (SAX state says we are
+ # inside a parameter element), structural-looking tokens that arrive as
+ # subsequent elements are literal text — e.g. a file whose content
+ # describes the tool-call format. Escape them unless they are
+ # genuine structural delimiters.
+ if self.current_param_name is not None:
+ if chunk.startswith(self.tool_call_start_token) or chunk.startswith(
+ self.function_start_token
+ ):
+ # Opening tool_call/function tags are always literal inside
+ # a parameter value. Track nesting depth so that the
+ # matching ``tool_call_end_token`` / ``function_end_token`` is also
+ # treated as literal even when its lookahead would
+ # otherwise satisfy the structural heuristic.
+ self._literal_tag_depth += 1
+ self._literal_events_this_chunk += 1
+ return self._escape_xml_special_chars(chunk)
+ if chunk.startswith(self.parameter_start_token):
+ # A structural parameter opening tag always follows a newline in
+ # the buffer. When a schema is available, also require
+ # NAME to be a known parameter of the current function so
+ # that a literal parameter-style tag inside file
+ # content is treated as text. A NAME already emitted
+ # for this tool (or equal to the param currently being
+ # parsed) is also literal text — a parser fixture or a
+ # file that documents the tool-call format.
+ if not self._is_structural_tag_position():
+ return self._escape_xml_special_chars(chunk)
+ name_start = len(self.parameter_start_token)
+ name_end = chunk.find(">", name_start)
+ if name_end != -1:
+ candidate = chunk[name_start:name_end]
+ if self._is_already_emitted_param(candidate):
+ return self._escape_xml_special_chars(chunk)
+ valid_names = self._get_valid_param_names()
+ if valid_names is not None and candidate not in valid_names:
+ return self._escape_xml_special_chars(chunk)
+ if (
+ chunk.startswith(self.parameter_end_token)
+ or chunk.startswith(self.function_end_token)
+ or chunk.startswith(self.tool_call_end_token)
+ ):
+ # Inside a literal nested tool_call/function (depth > 0),
+ # any closing tag pairs with the literal opener and is
+ # itself literal — regardless of what the lookahead says.
+ # ``parameter_end_token`` does not affect depth (parameters do
+ # not nest in the Qwen format).
+ if self._literal_tag_depth > 0:
+ if chunk.startswith(self.function_end_token) or (
+ chunk.startswith(self.tool_call_end_token)
+ ):
+ self._literal_tag_depth -= 1
+ self._literal_events_this_chunk += 1
+ else:
+ # Literal ``parameter_end_token`` inside a nested literal
+ # block — count it as a literal event so the
+ # post-processing fallback knows the chunk
+ # contained literals and skips spurious closes.
+ self._literal_events_this_chunk += 1
+ return self._escape_xml_special_chars(chunk)
+ if not self._is_structural_closing_tag(chunk):
+ return self._escape_xml_special_chars(chunk)
+
# Parameter start: enable accumulation
if processed.startswith("', processed)
@@ -593,6 +871,12 @@ def _emit_delta(self, delta: DeltaMessage):
"""Emit Delta response (streaming output)"""
self.deltas.append(delta)
+ def _get_call_id_for_delta(self) -> str | None:
+     """Return the call id for the first delta only, ``None`` afterwards.
+
+     The first invocation while ``id_emitted`` is false sets the flag and
+     returns ``current_call_id``; every later invocation returns ``None``
+     for as long as ``id_emitted`` stays set.
+     """
+     if self.id_emitted:
+         return None
+     self.id_emitted = True
+     return self.current_call_id
+
def _auto_close_open_parameter_if_needed(self, incoming_tag: str | None = None):
"""Before starting to process new elements,
if there are unclosed tags from before,
@@ -648,7 +932,7 @@ def _start_element(self, name: str, attrs: dict[str, str]):
tool_calls=[
DeltaToolCall(
index=self.tool_call_index - 1,
- id=self.current_call_id,
+ id=self._get_call_id_for_delta(),
type="function",
function=DeltaFunctionCall(
name=function_name, arguments=""
@@ -679,7 +963,7 @@ def _start_element(self, name: str, attrs: dict[str, str]):
tool_calls=[
DeltaToolCall(
index=self.tool_call_index - 1,
- id=self.current_call_id,
+ id=self._get_call_id_for_delta(),
type="function",
function=DeltaFunctionCall(
name=None, arguments=json_start
@@ -697,7 +981,7 @@ def _start_element(self, name: str, attrs: dict[str, str]):
tool_calls=[
DeltaToolCall(
index=self.tool_call_index - 1,
- id=self.current_call_id,
+ id=self._get_call_id_for_delta(),
type="function",
function=DeltaFunctionCall(
name=None, arguments=json_continue
@@ -740,7 +1024,7 @@ def _char_data(self, data: str):
tool_calls=[
DeltaToolCall(
index=self.tool_call_index - 1,
- id=self.current_call_id,
+ id=self._get_call_id_for_delta(),
type="function",
function=DeltaFunctionCall(name=None, arguments='"'),
)
@@ -775,7 +1059,7 @@ def _char_data(self, data: str):
tool_calls=[
DeltaToolCall(
index=self.tool_call_index - 1,
- id=self.current_call_id,
+ id=self._get_call_id_for_delta(),
type="function",
function=DeltaFunctionCall(name=None, arguments=delta_data),
)
@@ -799,7 +1083,9 @@ def _end_element(self, name: str):
if (
name.startswith("parameter") or name == "parameter"
) and self.current_param_name:
- # End current parameter
+ # End current parameter; reset literal-tag depth tracker
+ # since we are leaving the param's value scope.
+ self._literal_tag_depth = 0
param_name = self.current_param_name
param_value = self.current_param_value
@@ -812,27 +1098,118 @@ def _end_element(self, name: str):
if self.deferred_param_raw_value
else param_value
)
- parsed_value = None
- output_arguments = None
- try:
- # If previously delayed trailing newline,
- # add it back before parsing
- if self.should_emit_end_newline:
- raw_for_parse = raw_text + "\n"
+ parsed_value: Any = None
+ output_arguments: str | None = None
+ if self.should_emit_end_newline:
+ raw_for_parse = raw_text + "\n"
+ else:
+ raw_for_parse = raw_text
+ # Nullable-string short-circuit: when the schema is
+ # ``anyOf: [string, null]``, ``"null"`` and Python's
+ # ``"None"`` map to JSON null. Any other value is
+ # kept verbatim as a string — never parsed as int,
+ # float, JSON, etc., even if it LOOKS like one.
+ _param_type_for_check = self._get_param_type(param_name)
+ if _param_type_for_check in [
+ "string",
+ "str",
+ "text",
+ "varchar",
+ "char",
+ "enum",
+ ] and self._param_allows_null(param_name):
+ if raw_for_parse.strip().lower() in ("null", "none"):
+ parsed_value = None
+ output_arguments = "null"
else:
- raw_for_parse = raw_text
- parsed_value = ast.literal_eval(raw_for_parse)
- output_arguments = json.dumps(parsed_value, ensure_ascii=False)
- except Exception:
- # Fallback: output as string as-is
- output_arguments = json.dumps(raw_text, ensure_ascii=False)
- parsed_value = raw_text
+ parsed_value = raw_for_parse
+ output_arguments = json.dumps(raw_for_parse, ensure_ascii=False)
+ delta = DeltaMessage(
+ tool_calls=[
+ DeltaToolCall(
+ index=self.tool_call_index - 1,
+ id=self._get_call_id_for_delta(),
+ type="function",
+ function=DeltaFunctionCall(
+ name=None, arguments=output_arguments
+ ),
+ )
+ ]
+ )
+ self._emit_delta(delta)
+ self.parameters[param_name] = parsed_value
+ self.current_param_name = None
+ self.current_param_value = ""
+ self.current_param_value_converted = ""
+ self.start_quote_emitted = False
+ self.should_emit_end_newline = False
+ self.defer_current_parameter = False
+ self.deferred_param_raw_value = ""
+ return
+ raw_lower = raw_for_parse.strip().lower()
+ # Handle JSON literals that ast.literal_eval cannot parse
+ # (true/false/null are JSON, not Python).
+ if raw_lower == "null":
+ parsed_value = None
+ output_arguments = "null"
+ elif raw_lower == "true":
+ parsed_value = True
+ output_arguments = "true"
+ elif raw_lower == "false":
+ parsed_value = False
+ output_arguments = "false"
+ else:
+ # Try JSON first: handles arrays/objects that use JSON
+ # native tokens (true, false, null) which
+ # ast.literal_eval cannot parse.
+ try:
+ parsed_value = json.loads(raw_for_parse)
+ # A model trained with a buggy template
+ # (json.dumps(str(dict))) may output a JSON-encoded
+ # Python repr like "\"{'k': 'v'}\"". json.loads
+ # returns a str in that case — try one more level.
+ if isinstance(parsed_value, str):
+ try:
+ parsed_value = ast.literal_eval(parsed_value)
+ except (ValueError, SyntaxError, TypeError):
+ with contextlib.suppress(
+ json.JSONDecodeError, ValueError
+ ):
+ parsed_value = json.loads(parsed_value)
+ output_arguments = json.dumps(parsed_value, ensure_ascii=False)
+ except (json.JSONDecodeError, ValueError):
+ try:
+ parsed_value = ast.literal_eval(raw_for_parse)
+ # A model trained with a buggy template
+ # (json.dumps(str(dict))) may output a
+ # JSON-encoded Python repr like "{'k': 'v'}".
+ # ast.literal_eval returns a str in that
+ # case — try one more level.
+ if isinstance(parsed_value, str):
+ try:
+ parsed_value = ast.literal_eval(parsed_value)
+ except (
+ ValueError,
+ SyntaxError,
+ TypeError,
+ ):
+ with contextlib.suppress(
+ json.JSONDecodeError, ValueError
+ ):
+ parsed_value = json.loads(parsed_value)
+ output_arguments = json.dumps(
+ parsed_value, ensure_ascii=False
+ )
+ except (ValueError, SyntaxError, TypeError):
+ # Fallback: output as string as-is
+ output_arguments = json.dumps(raw_text, ensure_ascii=False)
+ parsed_value = raw_text
delta = DeltaMessage(
tool_calls=[
DeltaToolCall(
index=self.tool_call_index - 1,
- id=self.current_call_id,
+ id=self._get_call_id_for_delta(),
type="function",
function=DeltaFunctionCall(
name=None, arguments=output_arguments
@@ -868,7 +1245,7 @@ def _end_element(self, name: str):
tool_calls=[
DeltaToolCall(
index=self.tool_call_index - 1,
- id=self.current_call_id,
+ id=self._get_call_id_for_delta(),
type="function",
function=DeltaFunctionCall(name=None, arguments='""'),
)
@@ -881,7 +1258,7 @@ def _end_element(self, name: str):
tool_calls=[
DeltaToolCall(
index=self.tool_call_index - 1,
- id=self.current_call_id,
+ id=self._get_call_id_for_delta(),
type="function",
function=DeltaFunctionCall(name=None, arguments='"'),
)
@@ -904,7 +1281,7 @@ def _end_element(self, name: str):
tool_calls=[
DeltaToolCall(
index=self.tool_call_index - 1,
- id=self.current_call_id,
+ id=self._get_call_id_for_delta(),
type="function",
function=DeltaFunctionCall(name=None, arguments="}"),
)
@@ -917,7 +1294,7 @@ def _end_element(self, name: str):
tool_calls=[
DeltaToolCall(
index=self.tool_call_index - 1,
- id=self.current_call_id,
+ id=self._get_call_id_for_delta(),
type="function",
function=DeltaFunctionCall(name=None, arguments="{}"),
)
@@ -940,7 +1317,7 @@ def _end_element(self, name: str):
tool_calls=[
DeltaToolCall(
index=self.tool_call_index - 1,
- id=self.current_call_id,
+ id=self._get_call_id_for_delta(),
type="function",
function=DeltaFunctionCall(name=None, arguments=""),
)
@@ -1003,11 +1380,52 @@ def _get_param_type(self, param_name: str) -> str:
properties = find_tool_properties(self.tools, self.current_function_name)
if param_name in properties and isinstance(properties[param_name], dict):
- return self.repair_param_type(
- str(properties[param_name].get("type", "string"))
- )
+ prop = properties[param_name]
+ param_type = prop.get("type")
+ if isinstance(param_type, list):
+ # JSON-Schema list-form type, e.g.
+ # {"type": ["integer", "null"]}. Pick the first non-null
+ # type, mirroring the anyOf handling below.
+ for option_type in param_type:
+ if str(option_type).lower() != "null":
+ return self.repair_param_type(str(option_type))
+ return "string"
+ if param_type is None and "anyOf" in prop:
+ # Handle anyOf schemas (e.g. nullable types like
+ # anyOf: [{type: "integer"}, {type: "null"}]).
+ # Pick the first non-null type; fall back to "string".
+ for option in prop["anyOf"]:
+ if isinstance(option, dict) and "type" in option:
+ opt_type = str(option["type"])
+ if opt_type != "null":
+ return self.repair_param_type(opt_type)
+ return "string"
+
+ return self.repair_param_type(str(param_type or "string"))
return "string"
+ def _param_allows_null(self, param_name: str | None) -> bool:
+     """Return True when the schema for ``param_name`` admits a null value.
+
+     Null is admitted when the property's ``type`` is ``"null"``, when
+     ``type`` is a JSON-Schema list containing ``"null"`` (e.g.
+     ``["string", "null"]`` — the list form ``_get_param_type`` also
+     accepts), or when one ``anyOf`` alternative declares such a type.
+     Used to recognise the literal ``"null"`` / ``"None"`` as JSON null
+     even when the primary type is string.
+
+     Args:
+         param_name: Name of the parameter to look up; ``None`` or an
+             empty name yields ``False``.
+
+     Returns:
+         ``True`` if the schema allows null, ``False`` otherwise —
+         including when no tools or no current function are set, or the
+         parameter has no dict-valued schema entry.
+     """
+     if not self.tools or not self.current_function_name or not param_name:
+         return False
+     properties = find_tool_properties(self.tools, self.current_function_name)
+     if param_name not in properties or not isinstance(properties[param_name], dict):
+         return False
+
+     def _mentions_null(type_value: Any) -> bool:
+         # ``type`` may be a single name or a list of names; without the
+         # list branch, ``str(["string", "null"])`` never equals "null".
+         if isinstance(type_value, (list, tuple)):
+             return any(str(item).lower() == "null" for item in type_value)
+         return str(type_value).lower() == "null"
+
+     prop = properties[param_name]
+     if _mentions_null(prop.get("type", "")):
+         return True
+     return any(
+         isinstance(option, dict) and _mentions_null(option.get("type", ""))
+         for option in prop.get("anyOf", []) or []
+     )
+
def repair_param_type(self, param_type: str) -> str:
"""Repair unknown parameter types by treating them as string
Args:
@@ -1045,13 +1463,29 @@ def _convert_param_value(self, param_value: str, param_type: str) -> Any:
Returns:
Converted value
"""
- if param_value.lower() == "null":
- return None
-
param_type = param_type.strip().lower()
+ # Nullable schemas (``anyOf: [string, null]`` or similar): the
+ # primary type may be string but the literal ``"null"`` /
+ # ``"None"`` must still convert to JSON null. Caller passes the
+ # current parameter name via the parser state so we can query
+ # the schema.
+ if self._param_allows_null(self.current_param_name) and param_value.lower() in (
+ "null",
+ "none",
+ ):
+ return None
+ # String type takes precedence: the literal value "null" must remain
+ # the string "null" instead of being converted to Python None.
if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
return param_value
- elif (
+ # Non-string: "null" → Python None → JSON null. Also accept the
+ # Python literal "None" so that Qwen3.5-trained models — whose
+ # chat template renders null args via ``| string`` (yielding the
+ # literal "None" in the prompt) — round-trip nullable values
+ # correctly.
+ if param_value.lower() in ("null", "none"):
+ return None
+ if (
param_type.startswith("int")
or param_type.startswith("uint")
or param_type.startswith("long")
@@ -1062,11 +1496,10 @@ def _convert_param_value(self, param_value: str, param_type: str) -> Any:
return int(param_value)
except (ValueError, TypeError):
logger.warning(
- "Parsed value '%s' of parameter '%s' is not an integer "
- "in tool '%s', degenerating to string.",
+ "Parsed value '%s' is not an integer, degenerating to string.",
param_value,
)
- return param_value
+ return param_value
elif param_type.startswith("num") or param_type.startswith("float"):
try:
float_param_value: float = float(param_value)
@@ -1077,14 +1510,12 @@ def _convert_param_value(self, param_value: str, param_type: str) -> Any:
)
except (ValueError, TypeError):
logger.warning(
- "Parsed value '%s' of parameter '%s' is not a float "
- "in tool '%s', degenerating to string.",
+ "Parsed value '%s' is not a float, degenerating to string.",
param_value,
)
- return param_value
+ return param_value
elif param_type in ["boolean", "bool", "binary"]:
- param_value = param_value.lower()
- return param_value == "true"
+ return param_value.lower() == "true"
else:
return param_value
@@ -1098,9 +1529,12 @@ def _convert_for_json_streaming(self, converted_value: Any, param_type: str) ->
Returns:
Converted string for streaming output
"""
- # Check if value is empty, but exclude numeric 0
- if converted_value is None or converted_value == "":
+ # Empty string: no output.
+ if converted_value == "":
return ""
+ # None → JSON null literal (e.g. for nullable integer/object params).
+ if converted_value is None:
+ return "null"
if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
# String type, remove double quotes
@@ -1126,6 +1560,7 @@ def _reset_xml_parser_after_tool_call(self):
if self.current_call_id:
self.last_completed_call_id = self.current_call_id
self.current_call_id = None
+ self.id_emitted = False
self.current_function_name = None
self.current_function_open = False
self.parameters = {}
@@ -1179,6 +1614,13 @@ def extract_tool_calls(
tool_calls = []
for tool_call in result.tool_calls:
if tool_call.function and tool_call.function.name:
+ # Reject phantom tool calls produced when the model
+ # writes an unrendered Jinja template or pseudo-XML
+ # in its response.
+ # Surfacing such names as real tool calls causes
+ # "tool not found" errors at the client.
+ if not _is_valid_function_name(tool_call.function.name):
+ continue
tool_calls.append(
ToolCall(
id=tool_call.id,
@@ -1235,6 +1677,7 @@ def extract_tool_calls_streaming(
) -> DeltaMessage | None:
if not previous_text:
self.parser.reset_streaming_state()
+ self.parser._streaming_mode = True
# Reset tool call tracking arrays for new streaming session
self.prev_tool_call_arr = []
self.streamed_args_for_tool = []
@@ -1296,3 +1739,11 @@ def extract_tool_calls_streaming(
# If no content and no tool calls, return None to indicate no update
return None
return delta
+
+    def get_structural_tag(self, request: ChatCompletionRequest):
+        """Return the structural tag built for ``request`` by
+        ``get_model_structural_tag`` with ``model="qwen_3_5"``.
+        """
+        options = {"tools": request.tools, "tool_choice": request.tool_choice}
+        options["reasoning"] = get_enable_structured_outputs_in_reasoning()
+        return get_model_structural_tag(model="qwen_3_5", **options)