diff --git a/tests/tool_parsers/test_qwen3_xml_coder_shared.py b/tests/tool_parsers/test_qwen3_xml_coder_shared.py new file mode 100644 index 000000000000..6c56c6d47063 --- /dev/null +++ b/tests/tool_parsers/test_qwen3_xml_coder_shared.py @@ -0,0 +1,2230 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Shared tests for the Qwen3 XML and Coder tool parsers. + +These tests cover behaviour that BOTH parsers must implement identically. +Each test runs twice — once against ``Qwen3XMLToolParser`` and once against +``Qwen3CoderToolParser`` — via the ``parser_cls`` fixture. Tests that +target streaming-mode-specific quirks of one parser only stay in their +parser-specific file (``test_qwen3xml_tool_parser.py`` or +``test_qwen3coder_tool_parser.py``). +""" + +import json +from collections.abc import Generator + +import pytest +from openai.types.responses.function_tool import FunctionTool +from xgrammar import StructuralTag + +from tests.tool_parsers.utils import run_tool_extraction_streaming +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionNamedFunction, + ChatCompletionNamedToolChoiceParam, + ChatCompletionRequest, + ChatCompletionToolsParam, +) +from vllm.entrypoints.openai.engine.protocol import ( + DeltaMessage, + FunctionCall, + ToolCall, +) +from vllm.tokenizers import TokenizerLike, get_tokenizer +from vllm.tokenizers.detokenizer_utils import detokenize_incrementally +from vllm.tool_parsers.qwen3coder_tool_parser import Qwen3CoderToolParser +from vllm.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser + +MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" + + +@pytest.fixture(scope="module") +def qwen3_tokenizer(): + return get_tokenizer(tokenizer_name=MODEL) + + +@pytest.fixture( + params=[Qwen3XMLToolParser, Qwen3CoderToolParser], + ids=["xml", "coder"], +) +def parser_cls(request): + return request.param + + +WEATHER_PARAMS = { + "type": "object", + 
"properties": { + "city": {"type": "string", "description": "The city name"}, + "state": {"type": "string", "description": "The state code"}, + "unit": {"type": "string", "enum": ["fahrenheit", "celsius"]}, + }, + "required": ["city", "state"], +} + +AREA_PARAMS = { + "type": "object", + "properties": { + "shape": {"type": "string"}, + "dimensions": {"type": "object"}, + "precision": {"type": "integer"}, + }, +} + + +@pytest.fixture(params=["chat_completion", "responses_api"]) +def sample_tools(request): + if request.param == "chat_completion": + return [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "get_current_weather", + "description": "Get the current weather", + "parameters": WEATHER_PARAMS, + }, + ), + ChatCompletionToolsParam( + type="function", + function={ + "name": "calculate_area", + "description": "Calculate area of a shape", + "parameters": AREA_PARAMS, + }, + ), + ] + else: + return [ + FunctionTool( + type="function", + name="get_current_weather", + description="Get the current weather", + parameters=WEATHER_PARAMS, + ), + FunctionTool( + type="function", + name="calculate_area", + description="Calculate area of a shape", + parameters=AREA_PARAMS, + ), + ] + + +@pytest.fixture +def parser(parser_cls, qwen3_tokenizer, sample_tools): + return parser_cls(qwen3_tokenizer, tools=sample_tools) + + +def _as_chat_completion_tools( + tools: list[ChatCompletionToolsParam | FunctionTool], +) -> list[ChatCompletionToolsParam]: + normalized: list[ChatCompletionToolsParam] = [] + for tool in tools: + if isinstance(tool, ChatCompletionToolsParam): + normalized.append(tool) + else: + normalized.append( + ChatCompletionToolsParam( + type="function", + function={ + "name": tool.name, + "description": tool.description, + "parameters": tool.parameters, + }, + ) + ) + return normalized + + +def assert_tool_calls( + actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall] +): + assert len(actual_tool_calls) == 
len(expected_tool_calls) + for actual_tool_call, expected_tool_call in zip( + actual_tool_calls, expected_tool_calls + ): + assert actual_tool_call.type == "function" + assert actual_tool_call.function.name == expected_tool_call.function.name + assert json.loads(actual_tool_call.function.arguments) == json.loads( + expected_tool_call.function.arguments + ) + + +def stream_delta_message_generator( + parser, + tokenizer: TokenizerLike, + model_output: str, + request: ChatCompletionRequest | None = None, +) -> Generator[DeltaMessage, None, None]: + all_token_ids = tokenizer.encode(model_output, add_special_tokens=False) + + previous_text = "" + previous_tokens = None + prefix_offset = 0 + read_offset = 0 + for i, delta_token in enumerate(all_token_ids): + delta_token_ids = [delta_token] + previous_token_ids = all_token_ids[:i] + current_token_ids = all_token_ids[: i + 1] + + (new_tokens, delta_text, new_prefix_offset, new_read_offset) = ( + detokenize_incrementally( + tokenizer=tokenizer, + all_input_ids=current_token_ids, + prev_tokens=previous_tokens, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=False, + spaces_between_special_tokens=True, + ) + ) + + current_text = previous_text + delta_text + + delta_message = parser.extract_tool_calls_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + delta_token_ids, + request=request, + ) + if delta_message: + yield delta_message + + previous_text = current_text + previous_tokens = ( + previous_tokens + new_tokens if previous_tokens else new_tokens + ) + prefix_offset = new_prefix_offset + read_offset = new_read_offset + + +# --------------------------------------------------------------------------- +# Basic extraction +# --------------------------------------------------------------------------- + + +def test_extract_tool_calls_no_tools(parser): + model_output = "This is a test response without any tool calls" + extracted_tool_calls = 
parser.extract_tool_calls(model_output, request=None) + assert not extracted_tool_calls.tools_called + assert extracted_tool_calls.tool_calls == [] + assert extracted_tool_calls.content == model_output + + +_EXTRACT_CASES = [ + ( + """ + + +Dallas + + +TX + + +fahrenheit + + +""", + [ + ToolCall( + function=FunctionCall( + name="get_current_weather", + arguments=json.dumps( + {"city": "Dallas", "state": "TX", "unit": "fahrenheit"} + ), + ) + ) + ], + None, + ), + ( + """Sure! Let me check the weather for you. + + +Dallas + + +TX + + +fahrenheit + + +""", + [ + ToolCall( + function=FunctionCall( + name="get_current_weather", + arguments=json.dumps( + {"city": "Dallas", "state": "TX", "unit": "fahrenheit"} + ), + ) + ) + ], + "Sure! Let me check the weather for you.", + ), + ( + """ + + +rectangle + + +{"width": 10, + "height": 20} + + +2 + + +""", + [ + ToolCall( + function=FunctionCall( + name="calculate_area", + arguments=json.dumps( + { + "shape": "rectangle", + "dimensions": {"width": 10, "height": 20}, + "precision": 2, + } + ), + ) + ) + ], + None, + ), + ( + """ + + +Dallas + + +TX + + +fahrenheit + + + + + + +Orlando + + +FL + + +fahrenheit + + +""", + [ + ToolCall( + function=FunctionCall( + name="get_current_weather", + arguments=json.dumps( + {"city": "Dallas", "state": "TX", "unit": "fahrenheit"} + ), + ) + ), + ToolCall( + function=FunctionCall( + name="get_current_weather", + arguments=json.dumps( + {"city": "Orlando", "state": "FL", "unit": "fahrenheit"} + ), + ) + ), + ], + "\n", + ), + ( + """Let me calculate that area for you. 
+ + +circle + + +{"radius": 15.5} + + +3 + + +""", + [ + ToolCall( + function=FunctionCall( + name="calculate_area", + arguments=json.dumps( + { + "shape": "circle", + "dimensions": {"radius": 15.5}, + "precision": 3, + } + ), + ) + ) + ], + "Let me calculate that area for you.", + ), +] + +_EXTRACT_IDS = [ + "single_tool", + "single_tool_with_content", + "single_tool_multiline_param", + "parallel_tools", + "tool_with_typed_params", +] + + +@pytest.mark.parametrize( + ids=_EXTRACT_IDS, + argnames=["model_output", "expected_tool_calls", "expected_content"], + argvalues=_EXTRACT_CASES, +) +def test_extract_tool_calls( + parser, model_output, expected_tool_calls, expected_content +): + request = ChatCompletionRequest(model=MODEL, messages=[]) + extracted_tool_calls = parser.extract_tool_calls(model_output, request=request) + assert extracted_tool_calls.tools_called + assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls) + # Both ``None`` and ``""`` are acceptable when the expected content is + # only whitespace — the two parsers differ on whether they preserve the + # newline that separates parallel tool-call blocks. 
+ actual_content = extracted_tool_calls.content + if expected_content and expected_content.strip(): + assert actual_content == expected_content + else: + assert (actual_content or "").strip() == (expected_content or "").strip() + + +def test_extract_tool_calls_fallback_no_tags(parser): + """Test fallback parsing when XML tags are missing.""" + model_output = """ + +Dallas + + +TX + +""" + request = ChatCompletionRequest(model=MODEL, messages=[]) + extracted_tool_calls = parser.extract_tool_calls(model_output, request=request) + assert extracted_tool_calls.tools_called + assert len(extracted_tool_calls.tool_calls) == 1 + assert extracted_tool_calls.tool_calls[0].function.name == "get_current_weather" + + +# --------------------------------------------------------------------------- +# Type conversion +# --------------------------------------------------------------------------- + + +def test_extract_tool_calls_type_conversion(qwen3_tokenizer, parser_cls): + """Test parameter type conversion based on tool schema.""" + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "test_types", + "parameters": { + "type": "object", + "properties": { + "int_param": {"type": "integer"}, + "float_param": {"type": "float"}, + "bool_param": {"type": "boolean"}, + "str_param": {"type": "string"}, + "obj_param": {"type": "object"}, + }, + }, + }, + ) + ] + + model_output = """ + + +42 + + +3.14 + + +true + + +hello world + + +{"key": "value"} + + +""" + + parser_inst = parser_cls(qwen3_tokenizer, tools=tools) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + extracted_tool_calls = parser_inst.extract_tool_calls(model_output, request=request) + + args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) + assert args["int_param"] == 42 + assert args["float_param"] == 3.14 + assert args["bool_param"] is True + assert args["str_param"] == "hello world" + assert args["obj_param"] == {"key": "value"} + + +def 
test_extract_tool_calls_complex_type_with_single_quote(qwen3_tokenizer, parser_cls): + """Object parameter expressed as a Python repr (single quotes).""" + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "test_types", + "parameters": { + "type": "object", + "properties": { + "int_param": {"type": "integer"}, + "float_param": {"type": "float"}, + "bool_param": {"type": "boolean"}, + "str_param": {"type": "string"}, + "obj_param": {"type": "object"}, + }, + }, + }, + ) + ] + + model_output = """ + + +{'key': 'value'} + + +""" + + parser_inst = parser_cls(qwen3_tokenizer, tools=tools) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + extracted_tool_calls = parser_inst.extract_tool_calls(model_output, request=request) + + args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) + assert args["obj_param"] == {"key": "value"} + + +# --------------------------------------------------------------------------- +# Streaming extraction +# --------------------------------------------------------------------------- + + +_STREAMING_CASES = [ + ("This is a test without tools", [], "This is a test without tools"), +] + _EXTRACT_CASES + +_STREAMING_IDS = ["no_tools"] + _EXTRACT_IDS + + +@pytest.mark.parametrize( + ids=_STREAMING_IDS, + argnames=["model_output", "expected_tool_calls", "expected_content"], + argvalues=_STREAMING_CASES, +) +def test_extract_tool_calls_streaming( + parser, + qwen3_tokenizer, + model_output, + expected_tool_calls, + expected_content, +): + """Test incremental streaming behavior including typed parameters.""" + request = ChatCompletionRequest(model=MODEL, messages=[]) + + other_content = "" + tool_states = {} + + for delta_message in stream_delta_message_generator( + parser, qwen3_tokenizer, model_output, request + ): + assert not delta_message.role + + if delta_message.content: + other_content += delta_message.content + + if delta_message.tool_calls: + for tool_call in 
delta_message.tool_calls: + idx = tool_call.index + + if idx not in tool_states: + tool_states[idx] = { + "id": None, + "name": None, + "arguments": "", + "type": None, + } + + if tool_call.id: + tool_states[idx]["id"] = tool_call.id + + if tool_call.type: + assert tool_call.type == "function" + tool_states[idx]["type"] = tool_call.type + + if tool_call.function: + if tool_call.function.name: + assert tool_states[idx]["name"] is None + tool_states[idx]["name"] = tool_call.function.name + + if tool_call.function.arguments is not None: + tool_states[idx]["arguments"] += tool_call.function.arguments + + # Be tolerant about whitespace-only deltas between parallel tool calls; + # see ``test_extract_tool_calls`` for the same reasoning. + if expected_content and expected_content.strip(): + assert other_content == expected_content + else: + assert other_content.strip() == (expected_content or "").strip() + assert len(tool_states) == len(expected_tool_calls) + assert len(parser.prev_tool_call_arr) == len(expected_tool_calls) + + for idx, expected_tool in enumerate(expected_tool_calls): + state = tool_states[idx] + assert state["id"] is not None + assert state["type"] == "function" + assert state["name"] == expected_tool.function.name + + arguments_str = state["arguments"] + assert arguments_str is not None + actual_args = json.loads(arguments_str) + expected_args = json.loads(expected_tool.function.arguments) + assert actual_args == expected_args + + +def test_extract_tool_calls_missing_closing_parameter_tag(parser): + """Test handling of missing closing tag.""" + model_output = """Let me check the weather for you: + + + +Dallas + +TX + + +fahrenheit + + +""" + + request = ChatCompletionRequest(model=MODEL, messages=[]) + extracted_tool_calls = parser.extract_tool_calls(model_output, request=request) + + assert extracted_tool_calls.tools_called + assert len(extracted_tool_calls.tool_calls) == 1 + assert extracted_tool_calls.tool_calls[0].function.name == 
"get_current_weather" + args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) + assert "city" in args + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" + assert "Let me check the weather for you:" in extracted_tool_calls.content + + +def test_extract_tool_calls_streaming_missing_closing_tag(parser, qwen3_tokenizer): + """Streaming with missing closing tag.""" + model_output = """Let me check the weather for you: + + + +Dallas + +TX + + +fahrenheit + + +""" + + request = ChatCompletionRequest(model=MODEL, messages=[]) + other_content = "" + tool_states = {} + + for delta_message in stream_delta_message_generator( + parser, qwen3_tokenizer, model_output, request + ): + if delta_message.content: + other_content += delta_message.content + + if delta_message.tool_calls: + for tool_call in delta_message.tool_calls: + idx = tool_call.index + if idx not in tool_states: + tool_states[idx] = { + "id": None, + "name": None, + "arguments": "", + "type": None, + } + if tool_call.id: + tool_states[idx]["id"] = tool_call.id + if tool_call.type: + assert tool_call.type == "function" + tool_states[idx]["type"] = tool_call.type + if tool_call.function: + if tool_call.function.name: + tool_states[idx]["name"] = tool_call.function.name + if tool_call.function.arguments is not None: + tool_states[idx]["arguments"] += tool_call.function.arguments + + assert "Let me check the weather for you:" in other_content + assert len(tool_states) == 1 + assert len(parser.prev_tool_call_arr) == 1 + + state = tool_states[0] + assert state["id"] is not None + assert state["type"] == "function" + assert state["name"] == "get_current_weather" + args = json.loads(state["arguments"]) + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" + + +def test_extract_tool_calls_streaming_incremental(parser, qwen3_tokenizer): + """Test that streaming is truly incremental.""" + model_output = 
"""I'll check the weather. + + +Dallas + + +TX + + +""" + + request = ChatCompletionRequest(model=MODEL, messages=[]) + chunks = [] + for delta_message in stream_delta_message_generator( + parser, qwen3_tokenizer, model_output, request + ): + chunks.append(delta_message) + + assert len(chunks) > 3 + assert chunks[0].content is not None + assert chunks[0].tool_calls is None or chunks[0].tool_calls == [] + + header_found = False + for chunk in chunks: + if chunk.tool_calls and chunk.tool_calls[0].id: + header_found = True + assert chunk.tool_calls[0].function.name == "get_current_weather" + assert chunk.tool_calls[0].type == "function" + # XML emits an empty arguments string with the header; Coder + # emits the opening "{" with the header. Both are valid. + assert chunk.tool_calls[0].function.arguments in ("", "{") + break + assert header_found + + arg_chunks = [] + for chunk in chunks: + if chunk.tool_calls and chunk.tool_calls[0].function.arguments: + arg_chunks.append(chunk.tool_calls[0].function.arguments) + + assert len(arg_chunks) > 1 + full_args = "".join(arg_chunks) + parsed_args = json.loads(full_args) + assert parsed_args["city"] == "Dallas" + assert parsed_args["state"] == "TX" + + +# --------------------------------------------------------------------------- +# Robustness regressions +# --------------------------------------------------------------------------- + + +def test_malformed_xml_no_gt_delimiter(parser): + """Regression: malformed XML without '>' must not crash (PR #36774).""" + model_output = ( + "\n" + "Dallas\n" + "\n" + "" + ) + request = ChatCompletionRequest(model=MODEL, messages=[]) + result = parser.extract_tool_calls(model_output, request=request) + assert result is not None + assert isinstance(result.tool_calls, list) + assert all(tc is not None for tc in result.tool_calls) + + +def test_none_tool_calls_filtered(parser): + """Regression: None tool calls filtered from output (PR #36774).""" + model_output = ( + "\n" + "\n" + "\n" + "\n" 
+ "\n" + "Dallas\n" + "TX\n" + "\n" + "" + ) + request = ChatCompletionRequest(model=MODEL, messages=[]) + result = parser.extract_tool_calls(model_output, request=request) + assert all(tc is not None for tc in result.tool_calls) + assert result.tools_called + assert len(result.tool_calls) == 1 + assert result.tool_calls[0].function.name == "get_current_weather" + args = json.loads(result.tool_calls[0].function.arguments) + assert args["city"] == "Dallas" + assert args["state"] == "TX" + + +def test_streaming_multi_param_single_chunk(parser): + """Regression: speculative decode delivering multiple params at once + (PR #35615).""" + request = ChatCompletionRequest(model=MODEL, messages=[]) + + deltas = [ + "", + "\n", + "\n", + # This single delta delivers all three parameters at once + "\nDallas\n" + "\n\nTX\n" + "\n\nfahrenheit\n", + "\n", + "\n", + ] + + reconstructor = run_tool_extraction_streaming( + parser, + deltas, + request, + assert_one_tool_per_delta=False, + ) + + assert len(reconstructor.tool_calls) == 1 + args = json.loads(reconstructor.tool_calls[0].function.arguments) + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" + + +def test_no_double_serialization_string_args(qwen3_tokenizer, parser_cls): + """Regression: string arguments must not be double-serialized + (PR #35615).""" + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "greet", + "parameters": { + "type": "object", + "properties": { + "message": {"type": "string"}, + }, + }, + }, + ) + ] + + model_output = ( + "\n" + "\n" + "hello world\n" + "\n" + "" + ) + + parser_inst = parser_cls(qwen3_tokenizer, tools=tools) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + result = parser_inst.extract_tool_calls(model_output, request=request) + + assert result.tools_called + assert len(result.tool_calls) == 1 + raw_arguments = result.tool_calls[0].function.arguments + args = 
json.loads(raw_arguments) + assert args["message"] == "hello world" + assert '\\"hello world\\"' not in raw_arguments + + +def test_extract_tool_calls_streaming_speculative_decode_loss(parser): + """If the parser hasn't started JSON yet and the delta contains the + parameters AND the end of the tool call, the parser should not just + return '{' and lose the parameters. + """ + request = ChatCompletionRequest(model="test", messages=[]) + + text1 = "\n\n" + parser.extract_tool_calls_streaming("", text1, text1, [], [1], [1], request) + + delta_str = "\nParis\n\n\n" + text2 = text1 + delta_str + delta2 = parser.extract_tool_calls_streaming( + text1, text2, delta_str, [1], [1, 2], [2], request + ) + + assert delta2 is not None + assert delta2.tool_calls is not None + assert len(delta2.tool_calls) == 1 + args = delta2.tool_calls[0].function.arguments + assert "Paris" in args, f"Arguments lost! Got: {args}" + + +# --------------------------------------------------------------------------- +# Value conversion: string "null" must NOT become JSON null +# --------------------------------------------------------------------------- + + +def test_string_null_value_preserved(qwen3_tokenizer, parser_cls): + """A string-typed parameter with literal value "null" must be preserved + as the string "null" (not converted to Python None / JSON null). + + Root cause: _convert_param_value must check the schema's ``string`` + type BEFORE the "null" shortcut — otherwise any param whose raw text + is "null" becomes None regardless of declared type. 
+ """ + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "search", + "parameters": { + "type": "object", + "properties": {"query": {"type": "string"}}, + }, + }, + ) + ] + parser = parser_cls(qwen3_tokenizer, tools=tools) + model_output = ( + "\n" + "\n" + "null\n" + "\n" + "" + ) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + result = parser.extract_tool_calls(model_output, request=request) + + assert result.tools_called + args = json.loads(result.tool_calls[0].function.arguments) + assert args["query"] == "null", ( + f"String parameter 'null' was converted incorrectly. Got: {args.get('query')!r}" + ) + + +# --------------------------------------------------------------------------- +# anyOf nullable schema — type detection +# --------------------------------------------------------------------------- + + +def test_anyof_string_null_keeps_value_as_string(qwen3_tokenizer, parser_cls): + """anyOf [{type: string}, {type: null}] with a numeric-looking value + must keep the value as a string (the schema declares ``string``). + + Root cause: anyOf was previously treated as ``object`` (for the Coder + parser) or fell back to ``string`` only when no object/array option + was present (for the XML parser). The correct behaviour is to pick + the FIRST non-null type from the anyOf list. 
+ """ + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "set_code", + "parameters": { + "type": "object", + "properties": { + "code": { + "anyOf": [{"type": "string"}, {"type": "null"}], + }, + }, + }, + }, + ) + ] + parser = parser_cls(qwen3_tokenizer, tools=tools) + model_output = ( + "\n" + "\n" + "42\n" + "\n" + "" + ) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + result = parser.extract_tool_calls(model_output, request=request) + + assert result.tools_called + args = json.loads(result.tool_calls[0].function.arguments) + assert args["code"] == "42", ( + f"anyOf string|null param '42' was parsed as " + f"{type(args['code']).__name__}: {args['code']!r}" + ) + + +def test_anyof_integer_null_parses_as_int(qwen3_tokenizer, parser_cls): + """anyOf [{type: integer}, {type: null}] must parse a numeric value as + an int. Previously the XML parser ignored anyOf for non-container + types and silently treated the param as ``string``. + """ + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "set_count", + "parameters": { + "type": "object", + "properties": { + "count": { + "anyOf": [{"type": "integer"}, {"type": "null"}], + }, + }, + }, + }, + ) + ] + parser = parser_cls(qwen3_tokenizer, tools=tools) + model_output = ( + "\n" + "\n" + "42\n" + "\n" + "" + ) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + result = parser.extract_tool_calls(model_output, request=request) + + assert result.tools_called + args = json.loads(result.tool_calls[0].function.arguments) + assert args["count"] == 42, ( + f"anyOf integer|null: expected int 42, got {args['count']!r}" + ) + + +# --------------------------------------------------------------------------- +# anyOf object schema — value not double-encoded +# --------------------------------------------------------------------------- + +_ANYOF_OBJECT_TOOLS = [ + ChatCompletionToolsParam( + type="function", + function={ + 
"name": "update_record", + "parameters": { + "type": "object", + "properties": { + "data": { + "anyOf": [{"type": "object"}, {"type": "null"}], + }, + }, + }, + }, + ) +] + +_ANYOF_OBJECT_OUTPUT = ( + "\n" + "\n" + '{"key": "value", "count": 42}\n' + "\n" + "" +) + + +def test_anyof_object_param_not_double_encoded_nonstreaming( + qwen3_tokenizer, parser_cls +): + parser = parser_cls(qwen3_tokenizer, tools=_ANYOF_OBJECT_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_ANYOF_OBJECT_TOOLS) + result = parser.extract_tool_calls(_ANYOF_OBJECT_OUTPUT, request=request) + + assert result.tools_called + args = json.loads(result.tool_calls[0].function.arguments) + assert isinstance(args["data"], dict), ( + f"anyOf object param was double-encoded: data={args['data']!r}" + ) + assert args["data"] == {"key": "value", "count": 42} + + +def test_anyof_object_param_not_double_encoded_streaming(qwen3_tokenizer, parser_cls): + parser = parser_cls(qwen3_tokenizer, tools=_ANYOF_OBJECT_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_ANYOF_OBJECT_TOOLS) + deltas = [ + "", + "\n", + '\n{"key": "value", "count": 42}', + "\n", + "\n", + ] + reconstructor = run_tool_extraction_streaming( + parser, deltas, request, assert_one_tool_per_delta=False + ) + assert len(reconstructor.tool_calls) == 1 + args = json.loads(reconstructor.tool_calls[0].function.arguments) + assert isinstance(args["data"], dict), ( + f"anyOf object param was double-encoded in streaming: data={args['data']!r}" + ) + + +# --------------------------------------------------------------------------- +# anyOf / nullable (Pydantic v2 Optional[T]) type resolution. +# Both parsers extract the first non-null type from the anyOf union. 
+# --------------------------------------------------------------------------- + +_ANYOF_TYPES_TOOLS = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "test_anyof", + "parameters": { + "type": "object", + "properties": { + "anyof_int": { + "anyOf": [{"type": "integer"}, {"type": "null"}], + "default": 5, + }, + "anyof_str": { + "anyOf": [{"type": "string"}, {"type": "null"}], + }, + "anyof_array": { + "anyOf": [ + {"type": "array", "items": {"type": "string"}}, + {"type": "null"}, + ], + }, + "anyof_obj": { + "anyOf": [{"type": "object"}, {"type": "null"}], + }, + "type_as_array": { + "type": ["integer", "null"], + }, + "multi_non_null": { + "anyOf": [ + {"type": "string"}, + {"type": "integer"}, + {"type": "null"}, + ], + }, + }, + }, + }, + ) +] + +_ANYOF_TYPES_OUTPUT = ( + "\n" + "\n" + "5\n" + "hello\n" + '["a", "b", "c"]\n' + '{"key": "value"}\n' + "42\n" + "some text\n" + "\n" + "" +) + + +def test_extract_tool_calls_anyof_type_conversion(qwen3_tokenizer, parser_cls): + """anyOf nullable schemas (Pydantic v2 ``Optional[T]``) must resolve to + the first non-null type and apply the matching conversion: int(), + list/dict via json, string passthrough. 
+ """ + parser = parser_cls(qwen3_tokenizer, tools=_ANYOF_TYPES_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_ANYOF_TYPES_TOOLS) + result = parser.extract_tool_calls(_ANYOF_TYPES_OUTPUT, request=request) + + assert result.tools_called + args = json.loads(result.tool_calls[0].function.arguments) + assert args["anyof_int"] == 5 + assert isinstance(args["anyof_int"], int) + assert args["anyof_str"] == "hello" + assert isinstance(args["anyof_str"], str) + assert args["anyof_array"] == ["a", "b", "c"] + assert isinstance(args["anyof_array"], list) + assert args["anyof_obj"] == {"key": "value"} + assert isinstance(args["anyof_obj"], dict) + # JSON-Schema list-form type {"type": ["integer", "null"]} → int + assert args["type_as_array"] == 42 + assert isinstance(args["type_as_array"], int) + # anyOf[string, integer, null] → first non-null type is string + assert args["multi_non_null"] == "some text" + assert isinstance(args["multi_non_null"], str) + + +_ANYOF_STREAMING_TOOLS = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "search_web", + "parameters": { + "type": "object", + "properties": { + "query": { + "anyOf": [{"type": "string"}, {"type": "null"}], + }, + "count": { + "anyOf": [{"type": "integer"}, {"type": "null"}], + "default": 5, + }, + "verbose": { + "anyOf": [{"type": "boolean"}, {"type": "null"}], + }, + }, + }, + }, + ) +] + +_ANYOF_STREAMING_OUTPUT = ( + "\n" + "\n" + "vllm tool parser\n" + "10\n" + "true\n" + "\n" + "" +) + + +def test_extract_tool_calls_anyof_type_conversion_streaming( + qwen3_tokenizer, parser_cls +): + """Streaming e2e for anyOf nullable schemas: string/int/bool types must + be resolved through the incremental pipeline for both parsers. 
+ """ + parser = parser_cls(qwen3_tokenizer, tools=_ANYOF_STREAMING_TOOLS) + request = ChatCompletionRequest( + model=MODEL, messages=[], tools=_ANYOF_STREAMING_TOOLS + ) + reconstructor = run_tool_extraction_streaming( + parser, + _ANYOF_STREAMING_OUTPUT, + request, + assert_one_tool_per_delta=False, + ) + assert len(reconstructor.tool_calls) == 1 + assert reconstructor.tool_calls[0].function.name == "search_web" + args = json.loads(reconstructor.tool_calls[0].function.arguments) + assert args["query"] == "vllm tool parser" + assert isinstance(args["query"], str) + assert args["count"] == 10 + assert isinstance(args["count"], int) + assert args["verbose"] is True + assert isinstance(args["verbose"], bool) + + +# --------------------------------------------------------------------------- +# Object param double-encoded as JSON-encoded Python repr +# --------------------------------------------------------------------------- + +_DOUBLE_ENCODED_TOOLS = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "process", + "parameters": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "data": {"type": "object"}, + }, + }, + }, + ) +] + +_DOUBLE_ENCODED_OUTPUT = ( + "\n" + "\n" + "\nhello\n\n" + "\n\"{'key': 'value', 'n': 1}\"\n\n" + "\n" + "\n" +) + + +def test_double_encoded_object_param_nonstreaming(qwen3_tokenizer, parser_cls): + """A model trained with a buggy template (json.dumps(str(dict))) emits + object args as a JSON-encoded Python repr string. The parser must + double-decode it back to a dict. 
+ """ + parser = parser_cls(qwen3_tokenizer, tools=_DOUBLE_ENCODED_TOOLS) + request = ChatCompletionRequest( + model=MODEL, messages=[], tools=_DOUBLE_ENCODED_TOOLS + ) + result = parser.extract_tool_calls(_DOUBLE_ENCODED_OUTPUT, request=request) + + assert result.tools_called + args = json.loads(result.tool_calls[0].function.arguments) + assert args["name"] == "hello" + assert isinstance(args["data"], dict), ( + f"Expected dict, got {type(args['data'])}: {args['data']!r}" + ) + assert args["data"] == {"key": "value", "n": 1} + + +def test_double_encoded_object_param_streaming(qwen3_tokenizer, parser_cls): + parser = parser_cls(qwen3_tokenizer, tools=_DOUBLE_ENCODED_TOOLS) + request = ChatCompletionRequest( + model=MODEL, messages=[], tools=_DOUBLE_ENCODED_TOOLS + ) + reconstructor = run_tool_extraction_streaming( + parser, _DOUBLE_ENCODED_OUTPUT, request, assert_one_tool_per_delta=False + ) + assert len(reconstructor.tool_calls) == 1 + args = json.loads(reconstructor.tool_calls[0].function.arguments) + assert args["name"] == "hello" + assert isinstance(args["data"], dict), ( + f"Expected dict, got {type(args['data'])}: {args['data']!r}" + ) + assert args["data"] == {"key": "value", "n": 1} + + +# --------------------------------------------------------------------------- +# Parameter value containing XML structural tags as literal text. +# Expected: the value is preserved intact, no spurious extra parameters +# are created from the embedded tags. 
+# --------------------------------------------------------------------------- + +_WRITE_FILE_TOOLS = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "write_file", + "parameters": { + "type": "object", + "properties": { + "path": {"type": "string"}, + "content": {"type": "string"}, + }, + }, + }, + ) +] + +_XML_TAGS_IN_CONTENT = ( + "char_deltas = [\n" + ' "\\n",\n' + ' "\\n",\n' + ' "\\n\\n",\n' + ' "\\n",\n' + "]\n" +) + +_WRITE_FILE_XML_TAGS_OUTPUT = ( + "\n" + "\n" + "\ntest.py\n\n" + f"\n{_XML_TAGS_IN_CONTENT}\n" + "\n" + "\n" +) + + +def test_content_with_xml_structural_tags_nonstreaming(qwen3_tokenizer, parser_cls): + """Non-streaming: a string param whose value embeds , + , , as literal text must be + extracted intact, with no spurious extra params being created from + the embedded tags. + """ + parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS) + result = parser.extract_tool_calls(_WRITE_FILE_XML_TAGS_OUTPUT, request=request) + + assert result.tools_called + assert len(result.tool_calls) == 1 + assert result.tool_calls[0].function.name == "write_file" + args = json.loads(result.tool_calls[0].function.arguments) + assert list(args.keys()) == ["path", "content"], ( + f"Spurious params from embedded tags: {list(args.keys())}" + ) + assert args["path"] == "test.py" + expected = _XML_TAGS_IN_CONTENT.rstrip("\n") + assert args["content"] == expected, ( + f"content was truncated/corrupted. 
Got: {args.get('content')!r}" + ) + + +def test_content_with_xml_structural_tags_streaming(qwen3_tokenizer, parser_cls): + """Streaming variant: pre-formed chunks, full content in one delta.""" + parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS) + char_deltas = [ + "\n", + "\n", + "\ntest.py\n\n", + f"\n{_XML_TAGS_IN_CONTENT}\n", + "\n", + "\n", + ] + reconstructor = run_tool_extraction_streaming( + parser, char_deltas, request, assert_one_tool_per_delta=False + ) + assert len(reconstructor.tool_calls) == 1 + assert reconstructor.tool_calls[0].function.name == "write_file" + args = json.loads(reconstructor.tool_calls[0].function.arguments) + assert list(args.keys()) == ["path", "content"], ( + f"Spurious params from embedded tags: {list(args.keys())}" + ) + assert args["path"] == "test.py" + expected = _XML_TAGS_IN_CONTENT.rstrip("\n") + assert args["content"] == expected + + +# --------------------------------------------------------------------------- +# Parameter value containing and on their +# OWN lines (Jinja2 templates, parser fixtures, etc.). Schema filtering +# must prevent the unknown name from being treated as structural. +# --------------------------------------------------------------------------- + +_CONTENT_WITH_PARAM_LIKE_LINES = ( + 'TOOL_CALL_TEMPLATE = """\n' + "\n" + "\n" + "#!/usr/bin/env python3\n" + "\n" + '"""\n' +) + +_WRITE_FILE_PARAM_LIKE_LINES_OUTPUT = ( + "\n" + "\n" + "\ntest_template.py\n\n" + f"\n{_CONTENT_WITH_PARAM_LIKE_LINES}\n" + "\n" + "\n" +) + + +def test_content_with_param_like_lines_nonstreaming(qwen3_tokenizer, parser_cls): + """Non-streaming: ```` and ```` on their + own lines inside a string value must not terminate the parameter + early. Requires schema-based filtering so that ``new_string`` (not a + real parameter of write_file) is treated as literal text. 
+ """ + parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS) + result = parser.extract_tool_calls( + _WRITE_FILE_PARAM_LIKE_LINES_OUTPUT, request=request + ) + + assert result.tools_called + args = json.loads(result.tool_calls[0].function.arguments) + assert list(args.keys()) == ["path", "content"], ( + f"Spurious params: {list(args.keys())}" + ) + assert args["path"] == "test_template.py" + expected = _CONTENT_WITH_PARAM_LIKE_LINES.rstrip("\n") + assert args["content"] == expected, ( + f"content truncated/wrong: {args.get('content')!r}" + ) + + +def test_content_with_param_like_lines_streaming(qwen3_tokenizer, parser_cls): + """Streaming variant: each structural-looking literal line arrives in + its own delta — the critical case is when ``\\n`` appears + alone with empty lookahead, which must NOT be treated as a real + structural close. + """ + parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS) + char_deltas = [ + "\n", + "\n", + "\ntest_template.py\n\n", + '\nTOOL_CALL_TEMPLATE = """\n', + "\n", # literal close — alone in its delta + "\n", # literal new-param line + "#!/usr/bin/env python3\n", + "\n", # second literal close + '"""\n', + "\n", # REAL close of content + "\n", + "\n", + ] + reconstructor = run_tool_extraction_streaming( + parser, char_deltas, request, assert_one_tool_per_delta=False + ) + assert len(reconstructor.tool_calls) == 1 + args = json.loads(reconstructor.tool_calls[0].function.arguments) + assert list(args.keys()) == ["path", "content"], ( + f"Spurious params: {list(args.keys())}" + ) + assert args["path"] == "test_template.py" + expected = _CONTENT_WITH_PARAM_LIKE_LINES.rstrip("\n") + assert args["content"] == expected + + +# --------------------------------------------------------------------------- +# Array param containing JSON true/false/null +# 
--------------------------------------------------------------------------- + +_ARRAY_TOOLS = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "pick", + "parameters": { + "type": "object", + "properties": {"items": {"type": "array"}}, + }, + }, + ) +] + +_ARRAY_WITH_JSON_BOOL_OUTPUT = ( + "\n\n" + '\n["a", "b", 1, true]\n\n' + "\n" +) + + +def test_array_with_json_bool(qwen3_tokenizer, parser_cls): + """An array param containing a JSON literal (``true``/``false``/``null``) + must be parsed as a real Python list, not wrapped as a string. + + Root cause for the XML parser: the deferred path used + ``ast.literal_eval`` first, which doesn't understand JSON tokens. + Both parsers must try ``json.loads`` before falling back to + ``ast.literal_eval``. + """ + parser = parser_cls(qwen3_tokenizer, tools=_ARRAY_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_ARRAY_TOOLS) + result = parser.extract_tool_calls(_ARRAY_WITH_JSON_BOOL_OUTPUT, request=request) + + assert result.tools_called + args = json.loads(result.tool_calls[0].function.arguments) + assert isinstance(args["items"], list), ( + f"Array with JSON bool was not parsed as list: " + f"{type(args['items']).__name__} = {args['items']!r}" + ) + assert args["items"] == ["a", "b", 1, True] + + +# --------------------------------------------------------------------------- +# Speculative decoding: two complete tool calls in a single streaming delta. +# Both parsers must emit both tool calls, not drop the second. 
+# --------------------------------------------------------------------------- + +_WEATHER_TOOLS = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "get_weather", + "parameters": { + "type": "object", + "properties": {"city": {"type": "string"}}, + }, + }, + ) +] + +_TWO_TOOL_CALLS_IN_ONE_CHUNK = ( + "\n\n" + "\nParis\n\n" + "\n\n" + "\n\n" + "\nLondon\n\n" + "\n" +) + + +def test_two_tool_calls_in_one_streaming_chunk(qwen3_tokenizer, parser_cls): + """Speculative decoding flushes can deliver several full + ``...`` blocks in a single delta. Both must be + emitted; dropping the second one is a regression. + """ + parser = parser_cls(qwen3_tokenizer, tools=_WEATHER_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WEATHER_TOOLS) + reconstructor = run_tool_extraction_streaming( + parser, + [_TWO_TOOL_CALLS_IN_ONE_CHUNK], + request, + assert_one_tool_per_delta=False, + ) + assert len(reconstructor.tool_calls) == 2, ( + f"Expected 2 tool calls in one delta, got {len(reconstructor.tool_calls)}" + ) + args0 = json.loads(reconstructor.tool_calls[0].function.arguments) + args1 = json.loads(reconstructor.tool_calls[1].function.arguments) + assert args0 == {"city": "Paris"} + assert args1 == {"city": "London"} + + +# --------------------------------------------------------------------------- +# Trailing free text after the LAST in the SAME delta (MTP / +# speculative decoding). The text must be emitted as content; dropping it +# silently is a regression. +# --------------------------------------------------------------------------- + + +def test_python_none_value_for_nullable_int(qwen3_tokenizer, parser_cls): + """A Qwen3.5-trained model emits Python ``None`` (not ``null``) for a + nullable non-string parameter, because the Qwen3.5 chat template + renders ``args_value | string`` for non-container types — turning a + null arg from a previous tool call into the literal "None" in the + prompt. 
The model then learns to generate the same "None" verbatim. + + The parser must recognise this and convert "None" to JSON null, + just like it already does for the literal "null" emitted by + Qwen3.6-trained models. + """ + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "set_count", + "parameters": { + "type": "object", + "properties": { + "count": { + "anyOf": [ + {"type": "integer"}, + {"type": "null"}, + ], + }, + }, + }, + }, + ) + ] + parser = parser_cls(qwen3_tokenizer, tools=tools) + model_output = ( + "\n" + "\n" + "None\n" + "\n" + "" + ) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + result = parser.extract_tool_calls(model_output, request=request) + + assert result.tools_called + args = json.loads(result.tool_calls[0].function.arguments) + assert args["count"] is None, ( + f"Python repr None was not converted to JSON null. Got: {args['count']!r}" + ) + + +def test_streaming_two_tool_calls_plus_trailing_text_one_delta( + qwen3_tokenizer, parser_cls +): + """MTP: a single delta delivers tool 1 + tool 2 + trailing free text. + Both tool calls must be emitted AND the trailing text must surface as + content in the same delta — not be silently dropped. + """ + parser = parser_cls(qwen3_tokenizer, tools=_WEATHER_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WEATHER_TOOLS) + deltas = [ + _TWO_TOOL_CALLS_IN_ONE_CHUNK + "\nAll done!", + ] + reconstructor = run_tool_extraction_streaming( + parser, deltas, request, assert_one_tool_per_delta=False + ) + assert len(reconstructor.tool_calls) == 2, ( + f"Expected 2 tool calls, got {len(reconstructor.tool_calls)}" + ) + assert "All done!" in reconstructor.other_content, ( + f"Trailing text after the second tool call was dropped. 
" + f"Got content: {reconstructor.other_content!r}" + ) + + +def test_streaming_trailing_text_with_final_close_in_same_delta( + qwen3_tokenizer, parser_cls +): + """MTP / speculative decoding can deliver the closing ```` + together with trailing free text in a single delta. The text after + the close must be emitted as content rather than being silently + consumed by the parser's "advance to next tool" logic. + """ + parser = parser_cls(qwen3_tokenizer, tools=_WEATHER_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WEATHER_TOOLS) + deltas = [ + # Build up the tool call up to and including . + "\n\n" + "Paris\n", + # Then deliver + trailing text in ONE delta. + "\n\nI hope this helps!", + ] + reconstructor = run_tool_extraction_streaming( + parser, deltas, request, assert_one_tool_per_delta=False + ) + assert len(reconstructor.tool_calls) == 1 + assert "I hope this helps!" in reconstructor.other_content, ( + f"Trailing text after was dropped. " + f"Got content: {reconstructor.other_content!r}" + ) + + +# --------------------------------------------------------------------------- +# Parameter value containing a literal ```` whose NAME IS +# itself a real parameter of the same tool. The schema-based filter cannot +# rule the literal out by name, so a stronger heuristic is required (e.g. +# the literal does not pair with a structural ```` followed by +# another structural delimiter). This is the exact pattern that breaks +# qwen-code WriteFile when the file being written is itself a parser test +# fixture. 
+# --------------------------------------------------------------------------- + +_CONTENT_WITH_REAL_PARAM_NAME_LITERAL = ( + 'doc = """\n\nliteral/value\n\n"""\n' +) + +_REAL_PARAM_NAME_LITERAL_OUTPUT = ( + "\n" + "\n" + "\nfixture.py\n\n" + f"\n{_CONTENT_WITH_REAL_PARAM_NAME_LITERAL}\n" + "\n" + "" +) + + +def test_content_with_real_param_name_literal_nonstreaming(qwen3_tokenizer, parser_cls): + """Non-streaming: parameter ``content`` value embeds + ``...`` where ``path`` IS the other real + parameter of the same ``write_file`` tool. Schema name filtering alone + cannot disambiguate — the parser must use a stronger rule (e.g. the + embedded ```` must be followed by a structural delimiter + that closes the OUTER param, not the inner literal). + """ + parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS) + result = parser.extract_tool_calls(_REAL_PARAM_NAME_LITERAL_OUTPUT, request=request) + + assert result.tools_called + assert len(result.tool_calls) == 1 + args = json.loads(result.tool_calls[0].function.arguments) + assert list(args.keys()) == ["path", "content"], ( + f"Spurious params from embedded same-name literal: {list(args.keys())}" + ) + assert args["path"] == "fixture.py", ( + f"Outer ``path`` was overwritten by embedded literal: {args.get('path')!r}" + ) + expected = _CONTENT_WITH_REAL_PARAM_NAME_LITERAL.rstrip("\n") + assert args["content"] == expected, ( + f"content was truncated at the embedded . " + f"Got: {args.get('content')!r}" + ) + + +def test_content_with_real_param_name_literal_streaming(qwen3_tokenizer, parser_cls): + """Streaming variant of the same case. Each meaningful structural- + looking line arrives in its own delta — the parser cannot wait for the + full text to disambiguate. 
+ """ + parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS) + char_deltas = [ + "\n", + "\n", + "\nfixture.py\n\n", + '\ndoc = """\n', + "\n", + "literal/value\n", + "\n", + '"""\n', + "\n", + "\n", + "", + ] + reconstructor = run_tool_extraction_streaming( + parser, char_deltas, request, assert_one_tool_per_delta=False + ) + assert len(reconstructor.tool_calls) == 1 + args = json.loads(reconstructor.tool_calls[0].function.arguments) + assert list(args.keys()) == ["path", "content"], ( + f"Spurious params from embedded same-name literal: {list(args.keys())}" + ) + assert args["path"] == "fixture.py" + expected = _CONTENT_WITH_REAL_PARAM_NAME_LITERAL.rstrip("\n") + assert args["content"] == expected, ( + f"content was truncated at the embedded . " + f"Got: {args.get('content')!r}" + ) + + +# --------------------------------------------------------------------------- +# Parameter value containing a COMPLETE nested tool_call (all four balise +# types: , , , , +# , ) — the qwen-code WriteFile pattern when the +# file being written is itself a parser fixture or a chat-template +# example. Every literal must stay inside the value; no spurious extra +# tool calls or params should be generated. +# --------------------------------------------------------------------------- + +_CONTENT_WITH_FULL_NESTED_CALL = ( + 'doc = """\n' + "\n" + "\n" + "\n" + "literal/value.txt\n" + "\n" + "\n" + "hello\n" + "\n" + "\n" + "\n" + '"""\n' +) + +_FULL_NESTED_CALL_OUTPUT = ( + "\n" + "\n" + "\nfixture.py\n\n" + f"\n{_CONTENT_WITH_FULL_NESTED_CALL}\n" + "\n" + "" +) + + +def test_content_with_full_nested_tool_call_nonstreaming(qwen3_tokenizer, parser_cls): + """Non-streaming: parameter ``content`` contains a complete literal + ``...`` whose function/parameter names match + the OUTER tool's schema. Every literal must stay inside the value; + no extra tool call must be generated. 
+ """ + parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS) + result = parser.extract_tool_calls(_FULL_NESTED_CALL_OUTPUT, request=request) + + assert result.tools_called + assert len(result.tool_calls) == 1, ( + f"Expected 1 tool call (the outer one), got " + f"{len(result.tool_calls)} — embedded literal tool_call was " + f"incorrectly promoted to a real call." + ) + args = json.loads(result.tool_calls[0].function.arguments) + assert list(args.keys()) == ["path", "content"] + assert args["path"] == "fixture.py" + expected = _CONTENT_WITH_FULL_NESTED_CALL.rstrip("\n") + assert args["content"] == expected, ( + f"content truncated/corrupted: {args.get('content')!r}" + ) + + +def test_content_with_full_nested_tool_call_streaming(qwen3_tokenizer, parser_cls): + """Streaming variant: the literal nested ``...`` + crosses many delta boundaries; the parser must not start a second + tool call. + """ + parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS) + char_deltas = [ + "\n", + "\n", + "\nfixture.py\n\n", + '\ndoc = """\n', + "\n", + "\n", + "\n", + "literal/value.txt\n", + "\n", + "\n", + "hello\n", + "\n", + "\n", + "\n", + '"""\n', + "\n", + "\n", + "", + ] + reconstructor = run_tool_extraction_streaming( + parser, char_deltas, request, assert_one_tool_per_delta=False + ) + assert len(reconstructor.tool_calls) == 1, ( + f"Expected 1 tool call, got {len(reconstructor.tool_calls)} — " + f"a literal nested was promoted to a real call." 
+ ) + args = json.loads(reconstructor.tool_calls[0].function.arguments) + assert list(args.keys()) == ["path", "content"] + assert args["path"] == "fixture.py" + expected = _CONTENT_WITH_FULL_NESTED_CALL.rstrip("\n") + assert args["content"] == expected, ( + f"content truncated/corrupted: {args.get('content')!r}" + ) + + +# --------------------------------------------------------------------------- +# Two consecutive tool calls, where the SECOND embeds a literal nested +# tool_call whose ```` uses a NAME that is NOT in the +# OUTER tool's schema (e.g. a description of a different tool's format). +# Reproduces the qwen-code Qwen 3.6 freeze scenario: the depth tracker +# in ``_find_true_param_end`` filters opens by schema, so the literal +# ```` that closes the unknown-NAME literal open appears +# unmatched and matches the structural lookahead of the trailing +# ````, truncating the OUTER content value. +# --------------------------------------------------------------------------- + +_OUT_OF_SCHEMA_NESTED_CONTENT = ( + 'template = """\n' + "\n\n" + "baz\n" + "\n\n" + '"""\n' +) + +_TWO_TOOLS_OUT_OF_SCHEMA_NESTED_OUTPUT = ( + "\n\n" + "baz\n" + "\n" + "\n\n" + "\n\n" + "\nfixture.py\n\n" + f"\n{_OUT_OF_SCHEMA_NESTED_CONTENT}\n" + "\n" +) + + +def test_two_tools_second_with_out_of_schema_nested_literal_nonstreaming( + qwen3_tokenizer, parser_cls +): + """Two structural tool calls; the second's ``content`` value embeds a + literal nested ```` block whose inner ```` + uses a NAME not in the outer tool's schema (``write_file`` only knows + ``path`` and ``content``). + + The walker must still match the outer ```` of ``content``, + not the literal ```` of the unknown-NAME nested open. 
+ """ + parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS) + result = parser.extract_tool_calls( + _TWO_TOOLS_OUT_OF_SCHEMA_NESTED_OUTPUT, request=request + ) + assert result.tools_called + assert len(result.tool_calls) == 2, ( + f"Expected 2 tool calls, got {len(result.tool_calls)}: " + f"{[tc.function.name for tc in result.tool_calls]}" + ) + args0 = json.loads(result.tool_calls[0].function.arguments) + args1 = json.loads(result.tool_calls[1].function.arguments) + assert args0 == {"bar": "baz"}, f"first tool args wrong: {args0!r}" + assert result.tool_calls[1].function.name == "write_file" + assert list(args1.keys()) == ["path", "content"], ( + f"Spurious params on outer tool: {list(args1.keys())}" + ) + assert args1["path"] == "fixture.py" + expected = _OUT_OF_SCHEMA_NESTED_CONTENT.rstrip("\n") + assert args1["content"] == expected, ( + f"outer content truncated at literal : {args1.get('content')!r}" + ) + + +def test_two_tools_second_with_out_of_schema_nested_literal_streaming( + qwen3_tokenizer, parser_cls +): + """Streaming variant of the same scenario.""" + parser = parser_cls(qwen3_tokenizer, tools=_WRITE_FILE_TOOLS) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=_WRITE_FILE_TOOLS) + char_deltas = [ + "\n\n", + "baz\n", + "\n", + "\n\n", + "\n\n", + "\nfixture.py\n\n", + '\ntemplate = """\n', + "\n\n", + "baz\n", + "\n\n", + '"""\n', + "\n", + "\n", + "", + ] + reconstructor = run_tool_extraction_streaming( + parser, char_deltas, request, assert_one_tool_per_delta=False + ) + assert len(reconstructor.tool_calls) == 2, ( + f"Expected 2 tool calls, got {len(reconstructor.tool_calls)}" + ) + args0 = json.loads(reconstructor.tool_calls[0].function.arguments) + args1 = json.loads(reconstructor.tool_calls[1].function.arguments) + assert args0 == {"bar": "baz"} + assert reconstructor.tool_calls[1].function.name == "write_file" + assert 
list(args1.keys()) == ["path", "content"] + assert args1["path"] == "fixture.py" + expected = _OUT_OF_SCHEMA_NESTED_CONTENT.rstrip("\n") + assert args1["content"] == expected, ( + f"outer content truncated/corrupted: {args1.get('content')!r}" + ) + + +# --------------------------------------------------------------------------- +# Phantom tool calls produced when the model writes an UNRENDERED Jinja +# template literally in its response: ``\n\n +# ...``. The function name ``{{ x }}`` contains +# template-syntax characters and CANNOT be a real function — the parser +# must reject these tool calls (or render them as content) rather than +# emit them as real ones, since the client will then raise "tool not +# found" errors and cause the agent to loop. +# --------------------------------------------------------------------------- + +_JINJA_PHANTOM_OUTPUT = ( + "\n\n" + "\n{{ v }}\n\n" + "\n" + "\n\n" + "\n\n" + "\nout.txt\n\n" + "\nhello\n\n" + "\n" +) + + +def test_jinja_template_phantom_tool_call_is_rejected_nonstreaming( + qwen3_tokenizer, parser_cls +): + """A ```` block (unrendered Jinja) emits a + function name that is not a valid identifier. It must NOT be + surfaced as a real tool call — the client would fail with "tool not + found" and the agent would loop. 
+ """ + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "write_file", + "parameters": { + "type": "object", + "properties": { + "path": {"type": "string"}, + "content": {"type": "string"}, + }, + }, + }, + ) + ] + parser = parser_cls(qwen3_tokenizer, tools=tools) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + result = parser.extract_tool_calls(_JINJA_PHANTOM_OUTPUT, request=request) + assert result.tools_called + names = [tc.function.name for tc in result.tool_calls] + assert "{{ tc.name }}" not in names, ( + f"Phantom Jinja-template tool call surfaced as real: {names}" + ) + assert names == ["write_file"], ( + f"Expected only the real ``write_file`` tool call, got: {names}" + ) + + +# NOTE: a streaming counterpart of the above test is intentionally not +# added. Filtering phantoms in streaming requires a separate +# "client-visible index" counter (the existing ``current_tool_index`` is +# also used for internal position bookkeeping). Until that refactor +# lands, the streaming path may still surface phantoms and the client +# is expected to drop unknown function names. The non-streaming path +# is the one consumed by the offline tools-extraction code and by the +# ``_parse_xml_function_call`` helper invoked at function-end during +# streaming, so production users still see the filtered result for +# completed tool calls. + + +# --------------------------------------------------------------------------- +# Inline empty ``...`` (no ````) before a +# real tool call: the content text BETWEEN the inline literal and the real +# tool call must be preserved. Previously the content was truncated at the +# position of the FIRST ```` token regardless of whether that +# block contained a real ````. 
+# --------------------------------------------------------------------------- + + +def test_inline_empty_tool_call_preserves_content_before_real_call( + qwen3_tokenizer, parser_cls +): + """A bare ``example`` in the model's narrative + text (no ```` inside) must NOT consume the surrounding + content; only the real ```` block that contains a valid + function call should anchor ``content_index``. + + The XML parser's SAX-based pipeline consumes the inline empty + block's body as XML text (so ``example`` is dropped), but the + surrounding narrative ("I'll show:" and "Now real:") must still be + preserved — both parsers are checked. + """ + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "log", + "parameters": { + "type": "object", + "properties": {"msg": {"type": "string"}}, + }, + }, + ) + ] + parser = parser_cls(qwen3_tokenizer, tools=tools) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + + text = ( + "I'll show: example. Now real:\n" + "\n\n\nhi\n\n" + "\n" + ) + result = parser.extract_tool_calls(text, request=request) + assert result.tools_called + assert len(result.tool_calls) == 1 + assert result.tool_calls[0].function.name == "log" + # Content between the inline empty tool_call and the real one MUST be + # preserved — dropping it loses the model's contextual narrative. + assert result.content is not None + assert "I'll show:" in result.content, ( + f"Pre-inline narrative lost from content: {result.content!r}" + ) + assert "Now real:" in result.content, ( + f"Content between inline literal and real tool_call lost: {result.content!r}" + ) + + +# --------------------------------------------------------------------------- +# anyOf [{type: string}, {type: null}] with the literal "null" or "None" +# value must convert to JSON null, NOT preserve as the string "null"/"None". 
+# Observed against a real Qwen 3.6 server: the model emits ``None`` for a +# nullable optional parameter and the parser kept it as the string "None", +# breaking nullable-typed clients. +# --------------------------------------------------------------------------- + + +def test_anyof_string_null_with_null_literal_returns_none(qwen3_tokenizer, parser_cls): + """anyOf [{type: string}, {type: null}] with value "null" or "None" + must convert to JSON null. String-typed paths preserve the literal, + but a nullable schema MUST recognise the null sentinel — otherwise + the client receives the literal "null" / "None" string and downstream + type checks fail. + """ + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "set_value", + "parameters": { + "type": "object", + "properties": { + "optional": { + "anyOf": [{"type": "string"}, {"type": "null"}], + }, + }, + }, + }, + ) + ] + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + for literal in ("null", "None"): + parser = parser_cls(qwen3_tokenizer, tools=tools) + model_output = ( + "\n" + "\n" + f"{literal}\n" + "\n" + "" + ) + result = parser.extract_tool_calls(model_output, request=request) + assert result.tools_called + args = json.loads(result.tool_calls[0].function.arguments) + assert args["optional"] is None, ( + f"anyOf string|null with value {literal!r} was kept as " + f"{type(args['optional']).__name__}: {args['optional']!r}" + ) + + +def test_get_vllm_registry_structural_tag_returns_structural_tag( + parser, + sample_tools: list[ChatCompletionToolsParam], +) -> None: + request_tools = _as_chat_completion_tools(sample_tools) + req = ChatCompletionRequest( + messages=[], + model="m", + tools=request_tools, + tool_choice="auto", + ) + tag = parser.get_structural_tag(req) + assert isinstance(tag, StructuralTag) + + req = ChatCompletionRequest( + messages=[], + model="m", + tools=request_tools, + tool_choice="required", + ) + tag = parser.get_structural_tag(req) + 
assert isinstance(tag, StructuralTag) + + if request_tools: + tool = request_tools[0] + req = ChatCompletionRequest( + messages=[], + model="m", + tools=request_tools, + ) + req.tool_choice = ChatCompletionNamedToolChoiceParam( + function=ChatCompletionNamedFunction(name=tool.function.name) + ) + tag = parser.get_structural_tag(req) + assert isinstance(tag, StructuralTag) + + +@pytest.mark.parametrize("include_reasoning", [True, False]) +def test_adjust_request_auto_uses_vllm_registry_structural_tag( + monkeypatch: pytest.MonkeyPatch, + parser, + sample_tools: list[ChatCompletionToolsParam], + include_reasoning: bool, +) -> None: + monkeypatch.setattr( + "vllm.tool_parsers.abstract_tool_parser.VLLM_ENFORCE_STRICT_TOOL_CALLING", + True, + ) + request_tools = _as_chat_completion_tools(sample_tools) + req = ChatCompletionRequest( + messages=[], + model="m", + tools=request_tools, + tool_choice="auto", + include_reasoning=include_reasoning, + ) + out = parser.adjust_request(req) + assert out.structured_outputs is not None + assert out.structured_outputs.structural_tag is not None + assert isinstance(out.structured_outputs.structural_tag, str) + loaded = json.loads(out.structured_outputs.structural_tag) + assert isinstance(loaded, dict) + + +def test_adjust_request_required_prefers_structural_tag( + monkeypatch: pytest.MonkeyPatch, + parser, + sample_tools: list[ChatCompletionToolsParam], +) -> None: + monkeypatch.setattr( + "vllm.tool_parsers.abstract_tool_parser.VLLM_ENFORCE_STRICT_TOOL_CALLING", + True, + ) + request_tools = _as_chat_completion_tools(sample_tools) + req = ChatCompletionRequest( + messages=[], + model="m", + tools=request_tools, + tool_choice="required", + ) + out = parser.adjust_request(req) + assert out.structured_outputs is not None + assert out.structured_outputs.structural_tag is not None diff --git a/tests/tool_parsers/test_qwen3coder_tool_parser.py b/tests/tool_parsers/test_qwen3coder_tool_parser.py index defc6d23eff4..9ff5a933a515 100644 --- 
a/tests/tool_parsers/test_qwen3coder_tool_parser.py +++ b/tests/tool_parsers/test_qwen3coder_tool_parser.py @@ -1,30 +1,24 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Coder-parser-specific tests. + +Tests that exercise behaviour shared with the XML parser live in +``tests/tool_parsers/test_qwen3_xml_coder_shared.py``. Only tests that +depend on Coder-only API (e.g. ``is_tool_call_started``) or on Coder-only +streaming behaviour (e.g. character-by-character chunking) belong here. +""" + import json -from collections.abc import Generator import pytest -from openai.types.responses.function_tool import FunctionTool -from xgrammar import StructuralTag from vllm.entrypoints.openai.chat_completion.protocol import ( - ChatCompletionNamedFunction, - ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, - ChatCompletionToolsParam, -) -from vllm.entrypoints.openai.engine.protocol import ( - DeltaMessage, - FunctionCall, - ToolCall, -) -from vllm.tokenizers import TokenizerLike, get_tokenizer -from vllm.tokenizers.detokenizer_utils import detokenize_incrementally -from vllm.tool_parsers.qwen3coder_tool_parser import ( - Qwen3CoderToolParser, ) -from vllm.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser +from vllm.tokenizers import get_tokenizer +from vllm.tool_parsers.qwen3coder_tool_parser import Qwen3CoderToolParser MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" @@ -35,1407 +29,432 @@ def qwen3_tokenizer(): @pytest.fixture -def qwen3_tool_parser(qwen3_tokenizer, sample_tools): - return Qwen3CoderToolParser(qwen3_tokenizer, tools=sample_tools) - - -@pytest.fixture -def qwen3_xml_tool_parser(qwen3_tokenizer, sample_tools): - return Qwen3XMLToolParser(qwen3_tokenizer, tools=sample_tools) - - -@pytest.fixture(params=["xml"]) -def qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser, request): - """Parameterized fixture that provides both parser types for testing""" - if 
request.param == "original": - return qwen3_tool_parser - else: - return qwen3_xml_tool_parser - - -WEATHER_PARAMS = { - "type": "object", - "properties": { - "city": {"type": "string", "description": "The city name"}, - "state": {"type": "string", "description": "The state code"}, - "unit": {"type": "string", "enum": ["fahrenheit", "celsius"]}, - }, - "required": ["city", "state"], -} - -AREA_PARAMS = { - "type": "object", - "properties": { - "shape": {"type": "string"}, - "dimensions": {"type": "object"}, - "precision": {"type": "integer"}, - }, -} - - -@pytest.fixture(params=["chat_completion", "responses_api"]) -def sample_tools(request): - if request.param == "chat_completion": - return [ - ChatCompletionToolsParam( - type="function", - function={ - "name": "get_current_weather", - "description": "Get the current weather", - "parameters": WEATHER_PARAMS, - }, - ), - ChatCompletionToolsParam( - type="function", - function={ - "name": "calculate_area", - "description": "Calculate area of a shape", - "parameters": AREA_PARAMS, - }, - ), - ] - else: - return [ - FunctionTool( - type="function", - name="get_current_weather", - description="Get the current weather", - parameters=WEATHER_PARAMS, - ), - FunctionTool( - type="function", - name="calculate_area", - description="Calculate area of a shape", - parameters=AREA_PARAMS, - ), - ] - - -def _as_chat_completion_tools( - tools: list[ChatCompletionToolsParam | FunctionTool], -) -> list[ChatCompletionToolsParam]: - normalized: list[ChatCompletionToolsParam] = [] - for tool in tools: - if isinstance(tool, ChatCompletionToolsParam): - normalized.append(tool) - else: - normalized.append( - ChatCompletionToolsParam( - type="function", - function={ - "name": tool.name, - "description": tool.description, - "parameters": tool.parameters, - }, - ) - ) - return normalized - - -def assert_tool_calls( - actual_tool_calls: list[ToolCall], expected_tool_calls: list[ToolCall] -): - assert len(actual_tool_calls) == 
len(expected_tool_calls) - - for actual_tool_call, expected_tool_call in zip( - actual_tool_calls, expected_tool_calls - ): - # Qwen3 parser doesn't generate IDs during extraction - assert actual_tool_call.type == "function" - assert actual_tool_call.function.name == expected_tool_call.function.name - assert json.loads(actual_tool_call.function.arguments) == json.loads( - expected_tool_call.function.arguments - ) - +def qwen3_tool_parser(qwen3_tokenizer): + return Qwen3CoderToolParser(qwen3_tokenizer, tools=None) -def stream_delta_message_generator( - qwen3_tool_parser, - qwen3_tokenizer: TokenizerLike, - model_output: str, - request: ChatCompletionRequest | None = None, -) -> Generator[DeltaMessage, None, None]: - all_token_ids = qwen3_tokenizer.encode(model_output, add_special_tokens=False) - previous_text = "" - previous_tokens = None - prefix_offset = 0 - read_offset = 0 - for i, delta_token in enumerate(all_token_ids): - delta_token_ids = [delta_token] - previous_token_ids = all_token_ids[:i] - current_token_ids = all_token_ids[: i + 1] - - (new_tokens, delta_text, new_prefix_offset, new_read_offset) = ( - detokenize_incrementally( - tokenizer=qwen3_tokenizer, - all_input_ids=current_token_ids, - prev_tokens=previous_tokens, - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=False, - spaces_between_special_tokens=True, - ) - ) - - current_text = previous_text + delta_text - - delta_message = qwen3_tool_parser.extract_tool_calls_streaming( - previous_text, - current_text, - delta_text, - previous_token_ids, - current_token_ids, - delta_token_ids, - request=request, - ) - if delta_message: - yield delta_message - - previous_text = current_text - previous_tokens = ( - previous_tokens + new_tokens if previous_tokens else new_tokens - ) - prefix_offset = new_prefix_offset - read_offset = new_read_offset - - -def test_extract_tool_calls_no_tools(qwen3_tool_parser_parametrized): - model_output = "This is a test response without any tool 
calls" - extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( - model_output, request=None - ) # type: ignore[arg-type] - assert not extracted_tool_calls.tools_called - assert extracted_tool_calls.tool_calls == [] - assert extracted_tool_calls.content == model_output - - -@pytest.mark.parametrize( - ids=[ - "single_tool", - "single_tool_with_content", - "single_tool_multiline_param", - "parallel_tools", - "tool_with_typed_params", - ], - argnames=["model_output", "expected_tool_calls", "expected_content"], - argvalues=[ - ( - """ - - -Dallas - - -TX - - -fahrenheit - - -""", - [ - ToolCall( - function=FunctionCall( - name="get_current_weather", - arguments=json.dumps( - {"city": "Dallas", "state": "TX", "unit": "fahrenheit"} - ), - ) - ) - ], - None, - ), - ( - """Sure! Let me check the weather for you. - - -Dallas - - -TX - - -fahrenheit - - -""", - [ - ToolCall( - function=FunctionCall( - name="get_current_weather", - arguments=json.dumps( - {"city": "Dallas", "state": "TX", "unit": "fahrenheit"} - ), - ) - ) - ], - "Sure! Let me check the weather for you.", - ), - ( - """ - - -rectangle - - -{"width": 10, - "height": 20} - - -2 - - -""", - [ - ToolCall( - function=FunctionCall( - name="calculate_area", - arguments=json.dumps( - { - "shape": "rectangle", - "dimensions": {"width": 10, "height": 20}, - "precision": 2, - } - ), - ) - ) - ], - None, - ), - ( - """ - - -Dallas - - -TX - - -fahrenheit - - - - - - -Orlando - - -FL - - -fahrenheit - - -""", - [ - ToolCall( - function=FunctionCall( - name="get_current_weather", - arguments=json.dumps( - {"city": "Dallas", "state": "TX", "unit": "fahrenheit"} - ), - ) - ), - ToolCall( - function=FunctionCall( - name="get_current_weather", - arguments=json.dumps( - {"city": "Orlando", "state": "FL", "unit": "fahrenheit"} - ), - ) - ), - ], - None, - ), - ( - """Let me calculate that area for you. 
- - -circle - - -{"radius": 15.5} - - -3 - - -""", - [ - ToolCall( - function=FunctionCall( - name="calculate_area", - arguments=json.dumps( - { - "shape": "circle", - "dimensions": {"radius": 15.5}, - "precision": 3, - } - ), - ) - ) - ], - "Let me calculate that area for you.", - ), - ], -) -def test_extract_tool_calls( - qwen3_tool_parser_parametrized, - model_output, - expected_tool_calls, - expected_content, -): - request = ChatCompletionRequest(model=MODEL, messages=[]) - extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( - model_output, request=request - ) - assert extracted_tool_calls.tools_called - - assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls) - - assert extracted_tool_calls.content == expected_content - - -def test_extract_tool_calls_fallback_no_tags( - qwen3_tool_parser_parametrized, +def test_streaming_trailing_text_after_tool_with_literal_close_tag_in_value( + qwen3_tokenizer, ): - """Test fallback parsing when XML tags are missing""" - model_output = """ - -Dallas - - -TX - -""" - - request = ChatCompletionRequest(model=MODEL, messages=[]) - extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( - model_output, request=request + """A tool call's parameter value contains a literal ```` + string. After the real tool call closes, trailing free text must + still be emitted as content. + + The naive ``current_text.count()`` and + ``current_text.find()`` used by the early-advance and + ``_advance_to_next_tool`` logic don't distinguish literal text from + structural delimiters. This can cause ``_sent_content_idx`` to land + INSIDE the tool's parameter value, after which the trailing text + fails to be emitted. 
+ """ + from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionToolsParam, ) - assert extracted_tool_calls.tools_called - assert len(extracted_tool_calls.tool_calls) == 1 - assert extracted_tool_calls.tool_calls[0].function.name == "get_current_weather" - - -def test_extract_tool_calls_type_conversion(qwen3_tokenizer): - """Test parameter type conversion based on tool schema""" tools = [ ChatCompletionToolsParam( type="function", function={ - "name": "test_types", + "name": "write_file", "parameters": { "type": "object", "properties": { - "int_param": {"type": "integer"}, - "float_param": {"type": "float"}, - "bool_param": {"type": "boolean"}, - "str_param": {"type": "string"}, - "obj_param": {"type": "object"}, + "path": {"type": "string"}, + "content": {"type": "string"}, }, }, }, ) ] - - model_output = """ - - -42 - - -3.14 - - -true - - -hello world - - -{"key": "value"} - - -""" - - parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tools) + parser = Qwen3CoderToolParser(qwen3_tokenizer, tools=tools) request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) - extracted_tool_calls = parser.extract_tool_calls(model_output, request=request) - args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) - assert args["int_param"] == 42 - assert args["float_param"] == 3.14 - assert args["bool_param"] is True - assert args["str_param"] == "hello world" - assert args["obj_param"] == {"key": "value"} + # The parameter value contains a literal ```` string. + # The real ```` follows after ````. + delta_1 = ( + "\n\n" + "foo.py\n" + "\n" + "doc = 'example'\n" + "\n\n" + ) + parser.extract_tool_calls_streaming( + previous_text="", + current_text=delta_1, + delta_text=delta_1, + previous_token_ids=[], + current_token_ids=[1], + delta_token_ids=[1], + request=request, + ) + delta_2 = "\nDone, file written!" 
+ text2 = delta_1 + delta_2 + msg2 = parser.extract_tool_calls_streaming( + previous_text=delta_1, + current_text=text2, + delta_text=delta_2, + previous_token_ids=[1], + current_token_ids=[1, 2], + delta_token_ids=[2], + request=request, + ) + contents = [] + if msg2 and msg2.content: + contents.append(msg2.content) + # EOS-style empty delta to flush + msg3 = parser.extract_tool_calls_streaming( + previous_text=text2, + current_text=text2, + delta_text="", + previous_token_ids=[1, 2], + current_token_ids=[1, 2, 3], + delta_token_ids=[3], + request=request, + ) + if msg3 and msg3.content: + contents.append(msg3.content) + + full = "".join(contents) + assert "Done, file written!" in full, ( + f"Trailing text after a tool call whose parameter value contains " + f"a literal was dropped. Got content: {full!r}" + ) -def test_extract_tool_calls_anyof_type_conversion(qwen3_tokenizer): - """Test type conversion for anyOf/oneOf nullable schemas (Pydantic v2). - Pydantic v2 emits anyOf for Optional[T] fields, e.g.: - Optional[int] -> {"anyOf": [{"type": "integer"}, {"type": "null"}]} - The parser must extract the non-null type and apply the correct - conversion (int(), float(), etc.) instead of returning a raw string. +def test_streaming_second_tool_after_first_with_literal_close_tag_in_value( + qwen3_tokenizer, +): + """A first tool call's parameter value contains a literal + ````. A SECOND structural tool call follows after the + real ````. Both tool calls and any inter-call content + must be emitted correctly. 
""" + from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionToolsParam, + ) + tools = [ ChatCompletionToolsParam( type="function", function={ - "name": "test_anyof", + "name": "write_file", "parameters": { "type": "object", "properties": { - "anyof_int": { - "anyOf": [ - {"type": "integer"}, - {"type": "null"}, - ], - "default": 5, - }, - "anyof_str": { - "anyOf": [ - {"type": "string"}, - {"type": "null"}, - ], - }, - "anyof_array": { - "anyOf": [ - {"type": "array", "items": {"type": "string"}}, - {"type": "null"}, - ], - }, - "anyof_obj": { - "anyOf": [ - {"type": "object"}, - {"type": "null"}, - ], - }, - "type_as_array": { - "type": ["integer", "null"], - }, - "multi_non_null": { - "anyOf": [ - {"type": "string"}, - {"type": "integer"}, - {"type": "null"}, - ], - }, + "path": {"type": "string"}, + "content": {"type": "string"}, }, }, }, - ) - ] - - model_output = """ - - -5 - - -hello - - -["a", "b", "c"] - - -{"key": "value"} - - -42 - - -some text - - -""" - - parser = Qwen3CoderToolParser(qwen3_tokenizer, tools=tools) - request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) - extracted = parser.extract_tool_calls(model_output, request=request) - - args = json.loads(extracted.tool_calls[0].function.arguments) - assert args["anyof_int"] == 5 - assert isinstance(args["anyof_int"], int) - assert args["anyof_str"] == "hello" - assert isinstance(args["anyof_str"], str) - assert args["anyof_array"] == ["a", "b", "c"] - assert isinstance(args["anyof_array"], list) - assert args["anyof_obj"] == {"key": "value"} - assert isinstance(args["anyof_obj"], dict) - assert args["type_as_array"] == 42 - assert isinstance(args["type_as_array"], int) - # Multi non-null: anyOf[string, integer, null] → first non-null is string - assert args["multi_non_null"] == "some text" - assert isinstance(args["multi_non_null"], str) - - -def test_extract_tool_calls_anyof_type_conversion_streaming(qwen3_tokenizer): - """Test streaming e2e for 
anyOf/oneOf nullable schemas (Pydantic v2). - - Verifies that the full streaming pipeline — tokenize, incrementally - decode, extract_tool_calls_streaming — correctly resolves types from - anyOf schemas and produces valid JSON with properly typed values. - """ - tools = [ + ), ChatCompletionToolsParam( type="function", function={ - "name": "search_web", + "name": "log", "parameters": { "type": "object", - "properties": { - "query": { - "anyOf": [ - {"type": "string"}, - {"type": "null"}, - ], - }, - "count": { - "anyOf": [ - {"type": "integer"}, - {"type": "null"}, - ], - "default": 5, - }, - "verbose": { - "anyOf": [ - {"type": "boolean"}, - {"type": "null"}, - ], - }, - }, + "properties": {"msg": {"type": "string"}}, }, }, - ) + ), ] - - model_output = """ - - -vllm tool parser - - -10 - - -true - - -""" - parser = Qwen3CoderToolParser(qwen3_tokenizer, tools=tools) request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) - tool_states = {} - for delta_message in stream_delta_message_generator( - parser, qwen3_tokenizer, model_output, request - ): - if delta_message.tool_calls: - for tool_call in delta_message.tool_calls: - idx = tool_call.index - if idx not in tool_states: - tool_states[idx] = {"name": None, "arguments": ""} - if tool_call.function: - if tool_call.function.name: - tool_states[idx]["name"] = tool_call.function.name - if tool_call.function.arguments is not None: - tool_states[idx]["arguments"] += tool_call.function.arguments - - assert len(tool_states) == 1 - assert tool_states[0]["name"] == "search_web" - assert tool_states[0]["arguments"] is not None - args = json.loads(tool_states[0]["arguments"]) - assert args["query"] == "vllm tool parser" - assert isinstance(args["query"], str) - assert args["count"] == 10 - assert isinstance(args["count"], int) - assert args["verbose"] is True - assert isinstance(args["verbose"], bool) - - -@pytest.mark.parametrize( - ids=[ - "no_tools", - "single_tool", - "single_tool_with_content", - 
"single_tool_multiline_param", - "parallel_tools", - "tool_with_typed_params", # Added this test case - ], - argnames=["model_output", "expected_tool_calls", "expected_content"], - argvalues=[ - ("This is a test without tools", [], "This is a test without tools"), - ( - """ - - -Dallas - - -TX - - -fahrenheit - - -""", - [ - ToolCall( - function=FunctionCall( - name="get_current_weather", - arguments=json.dumps( - {"city": "Dallas", "state": "TX", "unit": "fahrenheit"} - ), - ) - ) - ], - None, - ), - ( - """Sure! Let me check the weather for you. - - -Dallas - - -TX - - -fahrenheit - - -""", - [ - ToolCall( - function=FunctionCall( - name="get_current_weather", - arguments=json.dumps( - {"city": "Dallas", "state": "TX", "unit": "fahrenheit"} - ), - ) - ) - ], - "Sure! Let me check the weather for you.", - ), - ( - """ - - -rectangle - - -{"width": 10, - "height": 20} - - -2 - - -""", - [ - ToolCall( - function=FunctionCall( - name="calculate_area", - arguments=json.dumps( - { - "shape": "rectangle", - "dimensions": {"width": 10, "height": 20}, - "precision": 2, - } - ), - ) - ) - ], - None, - ), - ( - """ - - -Dallas - - -TX - - -fahrenheit - - - - - - -Orlando - - -FL - - -celsius - - -""", - [ - ToolCall( - function=FunctionCall( - name="get_current_weather", - arguments=json.dumps( - {"city": "Dallas", "state": "TX", "unit": "fahrenheit"} - ), - ) - ), - ToolCall( - function=FunctionCall( - name="get_current_weather", - arguments=json.dumps( - {"city": "Orlando", "state": "FL", "unit": "celsius"} - ), - ) - ), - ], - None, - ), - # Added tool_with_typed_params test case - ( - """Let me calculate that area for you. 
- - -circle - - -{"radius": 15.5} - - -3 - - -""", - [ - ToolCall( - function=FunctionCall( - name="calculate_area", - arguments=json.dumps( - { - "shape": "circle", - "dimensions": {"radius": 15.5}, - "precision": 3, - } - ), - ) - ) - ], - "Let me calculate that area for you.", - ), - ], -) -def test_extract_tool_calls_streaming( - qwen3_tool_parser_parametrized, - qwen3_tokenizer, - model_output, - expected_tool_calls, - expected_content, -): - """Test incremental streaming behavior including typed parameters""" - request = ChatCompletionRequest(model=MODEL, messages=[]) - - other_content = "" - tool_states = {} # Track state per tool index - - for delta_message in stream_delta_message_generator( - qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request - ): - # role should never be streamed from tool parser - assert not delta_message.role - - if delta_message.content: - other_content += delta_message.content - - if delta_message.tool_calls: - for tool_call in delta_message.tool_calls: - idx = tool_call.index - - # Initialize state for new tool - if idx not in tool_states: - tool_states[idx] = { - "id": None, - "name": None, - "arguments": "", - "type": None, - } - - # First chunk should have id, name, and type - if tool_call.id: - tool_states[idx]["id"] = tool_call.id - - if tool_call.type: - assert tool_call.type == "function" - tool_states[idx]["type"] = tool_call.type - - if tool_call.function: - if tool_call.function.name: - # Should only be set once - assert tool_states[idx]["name"] is None - tool_states[idx]["name"] = tool_call.function.name - - if tool_call.function.arguments is not None: - # Accumulate arguments incrementally - tool_states[idx]["arguments"] += tool_call.function.arguments - - # Verify final content - assert other_content == (expected_content or "") # Handle None case - - # Verify we got all expected tool calls - assert len(tool_states) == len(expected_tool_calls) - assert 
len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == len( - expected_tool_calls + full = ( + "\n\n" + "foo.py\n" + "\n" + "doc = 'example'\n" + "\n\n" + "\n" + "\n\n" + "done\n" + "\n" ) - # Verify each tool call - for idx, expected_tool in enumerate(expected_tool_calls): - state = tool_states[idx] - assert state["id"] is not None - assert state["type"] == "function" - assert state["name"] == expected_tool.function.name - - # Parse accumulated arguments - arguments_str = state["arguments"] - assert arguments_str is not None - actual_args = json.loads(arguments_str) - expected_args = json.loads(expected_tool.function.arguments) - assert actual_args == expected_args - - -def test_extract_tool_calls_missing_closing_parameter_tag( - qwen3_tool_parser_parametrized, -): - """Test handling of missing closing tag""" - # Using get_current_weather from sample_tools but with malformed XML - model_output = """Let me check the weather for you: - - - -Dallas - -TX - - -fahrenheit - - -""" - - request = ChatCompletionRequest(model=MODEL, messages=[]) - extracted_tool_calls = qwen3_tool_parser_parametrized.extract_tool_calls( - model_output, request=request + msg = parser.extract_tool_calls_streaming( + previous_text="", + current_text=full, + delta_text=full, + previous_token_ids=[], + current_token_ids=[1], + delta_token_ids=[1], + request=request, ) - - # The parser should handle the malformed XML gracefully - assert extracted_tool_calls.tools_called - assert len(extracted_tool_calls.tool_calls) == 1 - - # Verify the function name is correct - assert extracted_tool_calls.tool_calls[0].function.name == "get_current_weather" - - # Verify the arguments are parsed despite the missing closing tag - args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) - assert "city" in args - assert args["city"] == "Dallas" - assert args["state"] == "TX" - assert args["unit"] == "fahrenheit" - - # Check that content before the tool call is preserved - assert "Let me check 
the weather for you:" in extracted_tool_calls.content + assert msg is not None + assert msg.tool_calls is not None + assert len(msg.tool_calls) == 2, ( + f"Expected 2 tool calls, got {len(msg.tool_calls)}: {msg.tool_calls}" + ) + names = [tc.function.name for tc in msg.tool_calls] + assert names == ["write_file", "log"], f"Wrong tool names: {names}" -def test_extract_tool_calls_streaming_missing_closing_tag( - qwen3_tool_parser_parametrized, qwen3_tokenizer +def test_streaming_content_before_and_between_two_tool_calls_one_delta( + qwen3_tool_parser, ): - """Test streaming with missing closing tag""" - # Using get_current_weather from sample_tools but with malformed XML - model_output = """Let me check the weather for you: - - - -Dallas - -TX - - -fahrenheit - - -""" - + """MTP / spec-decode: a single delta delivers free text BEFORE tool 1 + AND free text BETWEEN tool 1 and tool 2. Both content fragments must + be emitted; the recursion path used to drop the second one because of a + ``not result.content`` guard that discarded the recursion's content + when the outer call already had content of its own. 
+ """ request = ChatCompletionRequest(model=MODEL, messages=[]) - - other_content = "" - tool_states = {} - - for delta_message in stream_delta_message_generator( - qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request - ): - if delta_message.content: - other_content += delta_message.content - - if delta_message.tool_calls: - for tool_call in delta_message.tool_calls: - idx = tool_call.index - - if idx not in tool_states: - tool_states[idx] = { - "id": None, - "name": None, - "arguments": "", - "type": None, - } - - if tool_call.id: - tool_states[idx]["id"] = tool_call.id - - if tool_call.type: - assert tool_call.type == "function" - tool_states[idx]["type"] = tool_call.type - - if tool_call.function: - if tool_call.function.name: - tool_states[idx]["name"] = tool_call.function.name - - if tool_call.function.arguments is not None: - tool_states[idx]["arguments"] += tool_call.function.arguments - - # Verify content was streamed - assert "Let me check the weather for you:" in other_content - # Verify we got the tool call - assert len(tool_states) == 1 - assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1 - - state = tool_states[0] - assert state["id"] is not None - assert state["type"] == "function" - assert state["name"] == "get_current_weather" - - # Verify arguments were parsed correctly despite missing closing tag - assert state["arguments"] is not None - args = json.loads(state["arguments"]) - assert args["city"] == "Dallas" - assert args["state"] == "TX" - assert args["unit"] == "fahrenheit" + delta = ( + "before text " + "\n\n" + "\n1\n\n" + "\n" + "between text " + "\n\n" + "\n2\n\n" + "\n" + ) + msg = qwen3_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text=delta, + delta_text=delta, + previous_token_ids=[], + current_token_ids=[1], + delta_token_ids=[1], + request=request, + ) + assert msg is not None + assert msg.content is not None, "outer content lost" + assert "before text " in msg.content, ( + 
f"missing 'before text' content: {msg.content!r}" + ) + assert "between text " in msg.content, ( + f"recursion content 'between text' was dropped because the outer " + f"already had content. Got: {msg.content!r}" + ) -def test_extract_tool_calls_streaming_incremental( - qwen3_tool_parser_parametrized, qwen3_tokenizer -): - """Test that streaming is truly incremental""" - model_output = """I'll check the weather. - - -Dallas - - -TX - - -""" +def test_extract_tool_calls_streaming_split_tag(qwen3_tool_parser): + """```` arrives split across two deltas (````). ``is_tool_call_started`` must flip to ``True`` once the + full tag exists in ``current_text``, and the partial tag must not leak + into ``DeltaMessage.content``. + This relies on the Coder parser's ``is_tool_call_started`` attribute, + which has no equivalent on the XML parser. + """ request = ChatCompletionRequest(model=MODEL, messages=[]) - chunks = [] - for delta_message in stream_delta_message_generator( - qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request - ): - chunks.append(delta_message) - - # Should have multiple chunks - assert len(chunks) > 3 + prev_text_1 = "I will use a tool." + delta_text_1 = "" not in msg2.content - # Arguments should be streamed incrementally - assert len(arg_chunks) > 1 - # Concatenated arguments should form valid JSON - full_args = "".join(arg_chunks) - parsed_args = json.loads(full_args) - assert parsed_args["city"] == "Dallas" - assert parsed_args["state"] == "TX" +def test_streaming_char_by_char_literal_balises_in_value(qwen3_tokenizer): + """Stress test: a WriteFile tool call whose ``content`` value embeds a + complete literal ``...`` block — including + ``...`` and ``... + `` with names that match the OUTER tool's schema — + streamed one character at a time. 
+ Reproduces the qwen-code scenario where the model writes a parser + fixture file: every literal ````, ````, + ````, ````, ```` and + ```` inside the ``content`` value must stay inside the + value; no spurious second tool call, no value truncation. + """ + from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionToolsParam, + ) -def test_extract_tool_calls_complex_type_with_single_quote( - qwen3_tokenizer, -): - """Test parameter type conversion based on tool schema""" tools = [ ChatCompletionToolsParam( type="function", function={ - "name": "test_types", + "name": "write_file", "parameters": { "type": "object", "properties": { - "int_param": {"type": "integer"}, - "float_param": {"type": "float"}, - "bool_param": {"type": "boolean"}, - "str_param": {"type": "string"}, - "obj_param": {"type": "object"}, + "path": {"type": "string"}, + "content": {"type": "string"}, }, }, }, ) ] - - model_output = """ - - -{'key': 'value'} - - -""" - - parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tools) + parser = Qwen3CoderToolParser(qwen3_tokenizer, tools=tools) request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) - extracted_tool_calls = parser.extract_tool_calls(model_output, request=request) - - args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) - assert args["obj_param"] == {"key": "value"} - - -def test_extract_tool_calls_streaming_missing_opening_tag( - qwen3_tool_parser_parametrized, qwen3_tokenizer -): - """Test streaming with missing opening tag - - This tests that the streaming parser correctly handles - tool calls that start directly with - """ - model_output = """I'll check the weather for you. 
- - - -Dallas - - -TX - - -fahrenheit - - -""" - - request = ChatCompletionRequest(model=MODEL, messages=[]) - - other_content = "" - tool_states = {} - - for delta_message in stream_delta_message_generator( - qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request - ): - if delta_message.content: - other_content += delta_message.content - - if delta_message.tool_calls: - for tool_call in delta_message.tool_calls: - idx = tool_call.index - - if idx not in tool_states: - tool_states[idx] = { - "id": None, - "name": None, - "arguments": "", - "type": None, - } - - if tool_call.id: - tool_states[idx]["id"] = tool_call.id - - if tool_call.type: - assert tool_call.type == "function" - tool_states[idx]["type"] = tool_call.type - - if tool_call.function: - if tool_call.function.name: - tool_states[idx]["name"] = tool_call.function.name - if tool_call.function.arguments is not None: - tool_states[idx]["arguments"] += tool_call.function.arguments - - # Verify content was streamed - assert "I'll check the weather for you." 
in other_content - - # Verify we got the tool call - assert len(tool_states) == 1 - assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1 - - state = tool_states[0] - assert state["id"] is not None - assert state["type"] == "function" - assert state["name"] == "get_current_weather" - - # Verify arguments were parsed correctly despite missing opening tag - assert state["arguments"] is not None - args = json.loads(state["arguments"]) - assert args["city"] == "Dallas" - assert args["state"] == "TX" - assert args["unit"] == "fahrenheit" - - -def test_malformed_xml_no_gt_delimiter(qwen3_tool_parser): - """Regression: malformed XML without '>' must not crash (PR #36774).""" - model_output = ( + nested_content = ( + 'doc = """\n' "\n" - "Dallas\n" - "\n" - "" - ) - - request = ChatCompletionRequest(model=MODEL, messages=[]) - result = qwen3_tool_parser.extract_tool_calls(model_output, request=request) - assert result is not None - assert isinstance(result.tool_calls, list) - assert all(tc is not None for tc in result.tool_calls) - - -def test_none_tool_calls_filtered(qwen3_tool_parser): - """Regression: None tool calls filtered from output (PR #36774).""" - model_output = ( - "\n" - "\n" + "\nliteral/value.txt\n\n" + "\nhello\n\n" "\n" "\n" - "\n" - "\n" - "Dallas\n" - "TX\n" - "\n" - "" + '"""\n' ) - request = ChatCompletionRequest(model=MODEL, messages=[]) - result = qwen3_tool_parser.extract_tool_calls(model_output, request=request) - assert all(tc is not None for tc in result.tool_calls) - assert result.tools_called - assert len(result.tool_calls) == 1 - assert result.tool_calls[0].function.name == "get_current_weather" - args = json.loads(result.tool_calls[0].function.arguments) - assert args["city"] == "Dallas" - assert args["state"] == "TX" - - -def test_anyof_parameter_not_double_encoded(qwen3_tokenizer): - """Regression: anyOf parameters must not be double-encoded (PR #36032).""" - tools = [ - ChatCompletionToolsParam( - type="function", - function={ 
- "name": "update_record", - "parameters": { - "type": "object", - "properties": { - "data": { - "anyOf": [{"type": "object"}, {"type": "null"}], - }, - }, - }, - }, - ) - ] - - parser = Qwen3CoderToolParser(qwen3_tokenizer, tools=tools) - - model_output = ( + full_output = ( "\n" - "\n" - '{"key": "value", "count": 42}\n' + "\n" + "\nfixture.py\n\n" + f"\n{nested_content}\n" "\n" "" ) - request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) - result = parser.extract_tool_calls(model_output, request=request) - - assert result.tools_called - assert len(result.tool_calls) == 1 - args = json.loads(result.tool_calls[0].function.arguments) - assert isinstance(args["data"], dict) - assert args["data"] == {"key": "value", "count": 42} - - -def test_streaming_multi_param_single_chunk(qwen3_tool_parser, qwen3_tokenizer): - """Regression: speculative decode delivering multiple params at once (PR #35615).""" - request = ChatCompletionRequest(model=MODEL, messages=[]) - - deltas = [ - "", - "\n", - "\n", # triggers json_started -> sends "{" - # This single delta delivers all three parameters at once - "\nDallas\n" - "\n\nTX\n" - "\n\nfahrenheit\n", - "\n", - "\n", - ] + tool_states: dict[int, dict] = {} + current_text = "" + previous_text = "" + for ch in full_output: + previous_text = current_text + current_text += ch + delta_message = parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=ch, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=request, + ) + if delta_message and delta_message.tool_calls: + for tool_call in delta_message.tool_calls: + idx = tool_call.index + state = tool_states.setdefault( + idx, {"id": None, "name": None, "arguments": ""} + ) + if tool_call.id: + state["id"] = tool_call.id + if tool_call.function: + if tool_call.function.name: + state["name"] = tool_call.function.name + if tool_call.function.arguments is not None: + state["arguments"] += 
tool_call.function.arguments - from tests.tool_parsers.utils import ( - run_tool_extraction_streaming, + assert list(tool_states.keys()) == [0], ( + f"Expected exactly one tool call; got indices " + f"{list(tool_states.keys())} — a literal nested " + f"was promoted to a real call." ) - - reconstructor = run_tool_extraction_streaming( - qwen3_tool_parser, - deltas, - request, - assert_one_tool_per_delta=False, + state = tool_states[0] + assert state["name"] == "write_file" + args = json.loads(state["arguments"]) + assert list(args.keys()) == ["path", "content"], ( + f"Spurious params from embedded literals: {list(args.keys())}" ) - - assert len(reconstructor.tool_calls) == 1 - args = json.loads(reconstructor.tool_calls[0].function.arguments) - assert args["city"] == "Dallas" - assert args["state"] == "TX" - assert args["unit"] == "fahrenheit" - - -def test_no_double_serialization_string_args(qwen3_tool_parser): - """Regression: string arguments must not be double-serialized (PR #35615).""" - tools = [ - ChatCompletionToolsParam( - type="function", - function={ - "name": "greet", - "parameters": { - "type": "object", - "properties": { - "message": {"type": "string"}, - }, - }, - }, - ) - ] - - model_output = ( - "\n" - "\n" - "hello world\n" - "\n" - "" + assert args["path"] == "fixture.py" + assert args["content"] == nested_content.rstrip("\n"), ( + f"content was truncated/corrupted: {args.get('content')!r}" ) - request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) - result = qwen3_tool_parser.extract_tool_calls(model_output, request=request) - assert result.tools_called - assert len(result.tool_calls) == 1 - raw_arguments = result.tool_calls[0].function.arguments - args = json.loads(raw_arguments) - assert args["message"] == "hello world" - assert '\\"hello world\\"' not in raw_arguments +def test_extract_tool_calls_streaming_various_chunk_sizes( + qwen3_tokenizer, +): + """Coder streaming must reconstruct arguments correctly even when the + deltas 
arrive a single character at a time. + The XML parser's SAX-based streaming cannot tolerate ``chunk_size=1`` + by design (an XML tag is not parseable until ``>`` arrives), so this + robustness test stays Coder-only. + """ + request = ChatCompletionRequest(model="test", messages=[]) -def test_get_vllm_registry_structural_tag_returns_structural_tag( - qwen3_tool_parser: Qwen3CoderToolParser, - sample_tools: list[ChatCompletionToolsParam], -) -> None: - request_tools = _as_chat_completion_tools(sample_tools) - req = ChatCompletionRequest( - messages=[], - model="m", - tools=request_tools, - tool_choice="auto", - ) - tag = qwen3_tool_parser.get_structural_tag(req) - assert isinstance(tag, StructuralTag) + template_text = """ + + +value_1 + + +This is the value for the second parameter +that can span +multiple lines + + +""" - req = ChatCompletionRequest( - messages=[], - model="m", - tools=request_tools, - tool_choice="required", - ) - tag = qwen3_tool_parser.get_structural_tag(req) - assert isinstance(tag, StructuralTag) + for chunk_size in [1, 3, 15, len(template_text)]: + parser = Qwen3CoderToolParser(qwen3_tokenizer, tools=None) + + tool_states = {} + current_text = "" + previous_text = "" + ptr = 0 + + while ptr < len(template_text): + delta = template_text[ptr : ptr + chunk_size] + previous_text = current_text + current_text += delta + ptr += chunk_size + + delta_message = parser.extract_tool_calls_streaming( + previous_text=previous_text, + current_text=current_text, + delta_text=delta, + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=request, + ) - if request_tools: - tool = request_tools[0] - req = ChatCompletionRequest( - messages=[], - model="m", - tools=request_tools, + if delta_message and delta_message.tool_calls: + for tool_call in delta_message.tool_calls: + idx = tool_call.index + if idx not in tool_states: + tool_states[idx] = { + "id": None, + "name": None, + "arguments": "", + "type": None, + } + if tool_call.id: + 
tool_states[idx]["id"] = tool_call.id + if tool_call.type: + tool_states[idx]["type"] = tool_call.type + if tool_call.function: + if tool_call.function.name: + tool_states[idx]["name"] = tool_call.function.name + if tool_call.function.arguments is not None: + tool_states[idx]["arguments"] += ( + tool_call.function.arguments + ) + + assert 0 in tool_states, f"chunk_size={chunk_size}" + assert tool_states[0]["name"] == "example_function_name" + args = json.loads(tool_states[0]["arguments"]) + assert args["example_parameter_1"] == "value_1" + assert args["example_parameter_2"] == ( + "This is the value for the second parameter\nthat can span\nmultiple lines" ) - req.tool_choice = ChatCompletionNamedToolChoiceParam( - function=ChatCompletionNamedFunction(name=tool.function.name) - ) - tag = qwen3_tool_parser.get_structural_tag(req) - assert isinstance(tag, StructuralTag) - - -@pytest.mark.parametrize("include_reasoning", [True, False]) -def test_adjust_request_auto_uses_vllm_registry_structural_tag( - monkeypatch: pytest.MonkeyPatch, - qwen3_tool_parser: Qwen3CoderToolParser, - sample_tools: list[ChatCompletionToolsParam], - include_reasoning: bool, -) -> None: - monkeypatch.setattr( - "vllm.tool_parsers.abstract_tool_parser.VLLM_ENFORCE_STRICT_TOOL_CALLING", - True, - ) - request_tools = _as_chat_completion_tools(sample_tools) - req = ChatCompletionRequest( - messages=[], - model="m", - tools=request_tools, - tool_choice="auto", - include_reasoning=include_reasoning, - ) - out = qwen3_tool_parser.adjust_request(req) - assert out.structured_outputs is not None - assert out.structured_outputs.structural_tag is not None - assert isinstance(out.structured_outputs.structural_tag, str) - loaded = json.loads(out.structured_outputs.structural_tag) - assert isinstance(loaded, dict) - - -def test_adjust_request_required_prefers_structural_tag( - monkeypatch: pytest.MonkeyPatch, - qwen3_tool_parser: Qwen3CoderToolParser, - sample_tools: list[ChatCompletionToolsParam], -) -> 
None: - monkeypatch.setattr( - "vllm.tool_parsers.abstract_tool_parser.VLLM_ENFORCE_STRICT_TOOL_CALLING", - True, - ) - request_tools = _as_chat_completion_tools(sample_tools) - req = ChatCompletionRequest( - messages=[], - model="m", - tools=request_tools, - tool_choice="required", - ) - out = qwen3_tool_parser.adjust_request(req) - assert out.structured_outputs is not None - assert out.structured_outputs.structural_tag is not None diff --git a/tests/tool_parsers/test_qwen3xml_tool_parser.py b/tests/tool_parsers/test_qwen3xml_tool_parser.py index 1ea9a1d65c04..c38268c62ec9 100644 --- a/tests/tool_parsers/test_qwen3xml_tool_parser.py +++ b/tests/tool_parsers/test_qwen3xml_tool_parser.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json import pytest @@ -8,6 +9,23 @@ ToolParserTestConfig, ToolParserTests, ) +from tests.tool_parsers.test_qwen3_xml_coder_shared import ( + stream_delta_message_generator, +) +from tests.tool_parsers.utils import run_tool_extraction_streaming +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, + ChatCompletionToolsParam, +) +from vllm.tokenizers import get_tokenizer +from vllm.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser + +MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" + + +@pytest.fixture(scope="module") +def qwen3_tokenizer(): + return get_tokenizer(tokenizer_name=MODEL) class TestQwen3xmlToolParser(ToolParserTests): @@ -54,19 +72,508 @@ def test_config(self) -> ToolParserTestConfig: single_tool_call_expected_args={"city": "Tokyo"}, parallel_tool_calls_count=2, parallel_tool_calls_names=["get_weather", "get_time"], - # xfail markers - Qwen3XML has systematic streaming issues - xfail_streaming={ - "test_single_tool_call_simple_args": ( - "Qwen3XML streaming has systematic issues" - ), - "test_parallel_tool_calls": "Qwen3XML streaming has systematic issues", - "test_various_data_types": "Qwen3XML 
streaming has systematic issues", - "test_empty_arguments": "Qwen3XML streaming has systematic issues", - "test_surrounding_text": "Qwen3XML streaming has systematic issues", - "test_escaped_strings": "Qwen3XML streaming has systematic issues", - "test_streaming_reconstruction": ( - "Qwen3XML streaming reconstruction has known issues" - ), - }, supports_typed_arguments=False, ) + + def test_qwen3xml_async_streaming_free_text(self, qwen3_tokenizer): + parser = Qwen3XMLToolParser(qwen3_tokenizer) + + # 1. First tool call + # 2. Free text + # 3. Second tool call + text_to_stream = ( + "\n\nParis\n\n" + "\nNext, I will check the weather for London:\n" + "\n\nLondon\n\n" + ) + + request = ChatCompletionRequest(messages=[], model="test") + emitted_messages = [] + previous_text = "" + previous_tokens = [] + token_ids = qwen3_tokenizer.encode(text_to_stream, add_special_tokens=False) + + for i in range(1, len(token_ids) + 1): + current_token_ids = token_ids[:i] + current_text = qwen3_tokenizer.decode(current_token_ids) + delta_text = current_text[len(previous_text) :] + token_delta = current_token_ids[len(previous_tokens) :] + + delta = parser.extract_tool_calls_streaming( + previous_text, + current_text, + delta_text, + previous_tokens, + current_token_ids, + token_delta, + request, + ) + if delta is not None: + emitted_messages.append(delta) + + previous_text = current_text + previous_tokens = current_token_ids + + # Check that the free text is emitted BEFORE London's arguments are emitted. 
+ found_early = False + accumulated_content = "" + for i, msg in enumerate(emitted_messages): + if msg.content: + accumulated_content += msg.content + + if "Next, I will check the weather for London" in accumulated_content: + # Check if we already saw "London" in any previous or + # current tool call arguments + is_london_emitted = any( + tc.function.arguments and "London" in tc.function.arguments + for m in emitted_messages[: i + 1] + if m.tool_calls + for tc in m.tool_calls + ) + if not is_london_emitted: + found_early = True + break + + assert found_early, ( + "Free text between tool calls should be emitted as soon as the " + "second tool call starts, not delayed." + ) + + def test_qwen3xml_streaming_text_after_tool_call(self, qwen3_tokenizer): + parser = Qwen3XMLToolParser(qwen3_tokenizer) + + # Tool call followed by free text + text_to_stream = ( + "\n\nParis\n\n" + "\nI hope this helps!" + ) + + request = ChatCompletionRequest(messages=[], model="test") + emitted_messages = [] + previous_text = "" + previous_tokens = [] + token_ids = qwen3_tokenizer.encode(text_to_stream, add_special_tokens=False) + + for i in range(1, len(token_ids) + 1): + current_token_ids = token_ids[:i] + current_text = qwen3_tokenizer.decode(current_token_ids) + delta_text = current_text[len(previous_text) :] + token_delta = current_token_ids[len(previous_tokens) :] + + delta = parser.extract_tool_calls_streaming( + previous_text, + current_text, + delta_text, + previous_tokens, + current_token_ids, + token_delta, + request, + ) + if delta is not None: + emitted_messages.append(delta) + + previous_text = current_text + previous_tokens = current_token_ids + + # Aggregate all emitted content + all_content = "".join([m.content for m in emitted_messages if m.content]) + + assert "I hope this helps!" in all_content, ( + "Free text after the last tool call should be emitted." 
+ ) + + +def test_qwen3xml_streaming_trailing_text_after_literal_close_in_value( + qwen3_tokenizer, +): + """XML parser: a tool_call's parameter value contains a literal + ````. After the real ````, trailing free + text must still be emitted. + """ + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "write_file", + "parameters": { + "type": "object", + "properties": { + "path": {"type": "string"}, + "content": {"type": "string"}, + }, + }, + }, + ) + ] + parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tools) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + + deltas = [ + # Tool 1 with literal embedded in 'content'. + "\n\n" + "foo.py\n" + "\n" + "doc = 'example'\n" + "\n\n", + # Trailing text in a separate delta. + "\nDone, file written!", + ] + + reconstructor = run_tool_extraction_streaming( + parser, deltas, request, assert_one_tool_per_delta=False + ) + assert len(reconstructor.tool_calls) == 1, ( + f"Expected 1 tool call, got {len(reconstructor.tool_calls)}" + ) + assert "Done, file written!" in reconstructor.other_content, ( + f"Trailing text after a tool with literal in its " + f"value was dropped. Got content: {reconstructor.other_content!r}" + ) + + +def test_qwen3xml_streaming_python_none_int_char_by_char(qwen3_tokenizer): + """Streaming a nullable INTEGER param value of "None" (Qwen3.5 style) + char-by-char must produce VALID JSON. The XML parser's incremental + char path used to emit "Non" then a "l" delta computed from the diff + between "Non" and "null", giving the cumulative invalid string + "Nonl". The fix defers int/float conversion just like bool/object + so the full value is parsed at close. 
+ """ + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "set_count", + "parameters": { + "type": "object", + "properties": { + "count": { + "anyOf": [ + {"type": "integer"}, + {"type": "null"}, + ], + }, + }, + }, + }, + ) + ] + parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tools) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + + # Char-by-char deltas emulate worst-case slow streaming. + char_deltas = [ + "\n", + "\n", + "", + "\n", + "N", + "o", + "n", + "e", + "\n", + "\n", + "\n", + "", + ] + reconstructor = run_tool_extraction_streaming( + parser, char_deltas, request, assert_one_tool_per_delta=False + ) + assert len(reconstructor.tool_calls) == 1 + raw = reconstructor.tool_calls[0].function.arguments + args = json.loads(raw) # must be valid JSON + assert args["count"] is None, ( + f"streaming nullable int 'None' produced invalid JSON or wrong " + f"value. Raw: {raw!r}" + ) + + +def test_qwen36_xml_streaming_double_close_brace(qwen3_tokenizer): + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "get_weather", + "parameters": { + "type": "object", + "properties": {"city": {"type": "string"}}, + }, + }, + ) + ] + + parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tools) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + + deltas = [ + "", + "\n", + "\n\nDallas\n", + "\n", + "\n", + ] + + reconstructor = run_tool_extraction_streaming( + parser, + deltas, + request, + assert_one_tool_per_delta=False, + ) + + assert len(reconstructor.tool_calls) == 1 + full_args = reconstructor.tool_calls[0].function.arguments + + assert not full_args.endswith("}}"), ( + f"XML streaming parser emitted double closing brace: {full_args!r}. " + "parse_single_streaming_chunks fallback called _end_element('function') twice." 
+ ) + args = json.loads(full_args) + assert args == {"city": "Dallas"} + + +def test_xml_streaming_parallel_tool_calls_preformed_chunks(qwen3_tokenizer): + """ + Note: in normal token-by-token streaming this rarely triggers because + the tokenizer splits XML tags across multiple tokens. It CAN trigger with + speculative decoding multi-token flushes. + """ + + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "get_weather", + "parameters": { + "type": "object", + "properties": {"city": {"type": "string"}}, + }, + }, + ) + ] + + parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tools) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + + deltas = [ + "", + "\n", + "\nParis", + "\n", + "\n", + "", + "\n", + "\nLondon", + "\n", + "\n", + ] + + reconstructor = run_tool_extraction_streaming( + parser, + deltas, + request, + assert_one_tool_per_delta=False, + ) + + assert len(reconstructor.tool_calls) == 2, ( + f"Expected 2 tool calls, got {len(reconstructor.tool_calls)}" + ) + + args0 = json.loads(reconstructor.tool_calls[0].function.arguments) + args1 = json.loads(reconstructor.tool_calls[1].function.arguments) + + assert reconstructor.tool_calls[0].function.name == "get_weather" + assert reconstructor.tool_calls[1].function.name == "get_weather" + assert args0 == {"city": "Paris"}, f"First call args wrong: {args0!r}" + assert args1 == {"city": "London"}, f"Second call args wrong: {args1!r}" + + +# --------------------------------------------------------------------------- +# XML-specific streaming bugs (Coder parser is not affected) +# --------------------------------------------------------------------------- + + +def test_xml_streaming_boolean_true_not_false(qwen3_tokenizer): + """ + Bug B: In streaming mode, a boolean parameter with value "true" is + streamed as "false". 
+ + Root cause: When "true" arrives character by character: + - 't' → _convert_param_value("t", "boolean") = False → emits "false" + - 'r','u','e' → no new delta (output_data[len("false"):] = "") + Final accumulated arguments contain "false" instead of "true". + """ + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "set_flag", + "parameters": { + "type": "object", + "properties": { + "enabled": {"type": "boolean"}, + }, + }, + }, + ) + ] + + parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tools) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + + # Feed character-by-character to trigger the streaming accumulation bug. + # Each chunk simulates a single-character token arriving in streaming. + char_deltas = [ + "", + "\n", + "\n", + "t", # ← first char triggers False → emits "false" + "r", + "u", + "e", # ← full "true" but delta = "true"[5:] = "" + "", + "\n", + "\n", + ] + + reconstructor = run_tool_extraction_streaming( + parser, + char_deltas, + request, + assert_one_tool_per_delta=False, + ) + + assert len(reconstructor.tool_calls) == 1 + args = json.loads(reconstructor.tool_calls[0].function.arguments) + + assert args["enabled"] is True, ( + f"Boolean streaming bug: expected True, got {args['enabled']!r}. " + f"First char 't' emits 'false'; subsequent chars emit nothing; " + f"final value is 'false' even though the model said 'true'." + ) + + +def test_xml_streaming_string_null_last_char_not_dropped(qwen3_tokenizer): + """ + Bug A (streaming variant): String parameter with value "null" loses + the last character 'l' when tokens arrive one by one. + + Root cause: Accumulating 'n','u','l' emits correctly, but on the + fourth char 'l' the full value is "null" → + _convert_param_value("null", "string") → None → + _convert_for_json_streaming(None, "string") → "" → delta = ""[3:] = "". + The closing quote is then emitted, yielding "nul" not "null". 
+ """ + tools = [ + ChatCompletionToolsParam( + type="function", + function={ + "name": "search", + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string"}, + }, + }, + }, + ) + ] + + parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=tools) + request = ChatCompletionRequest(model=MODEL, messages=[], tools=tools) + + char_deltas = [ + "", + "\n", + "\n", + "n", + "u", + "l", + "l", # ← triggers _convert_param_value("null",…) = None → nothing emitted + "", + "\n", + "\n", + ] + + reconstructor = run_tool_extraction_streaming( + parser, + char_deltas, + request, + assert_one_tool_per_delta=False, + ) + + assert len(reconstructor.tool_calls) == 1 + args = json.loads(reconstructor.tool_calls[0].function.arguments) + + assert "query" in args + assert args["query"] == "null", ( + f"String 'null' streaming bug: last 'l' was dropped. " + f"Got: {args['query']!r}. " + f"When full value reaches 'null', _convert_param_value returns None " + f"and _convert_for_json_streaming(None, 'string') returns '', " + f"so the final delta is empty and the 'l' is never emitted." + ) + + +def test_xml_streaming_missing_opening_tool_call_tag(qwen3_tokenizer): + """The XML streaming parser must recover when the model emits a tool + call without the leading ```` tag — i.e. directly with + ````. The Coder parser does not support this in + streaming mode, so this regression stays XML-specific. + """ + parser = Qwen3XMLToolParser(qwen3_tokenizer, tools=None) + + model_output = """I'll check the weather for you. 
+ + + +Dallas + + +TX + + +fahrenheit + + +""" + + request = ChatCompletionRequest(model=MODEL, messages=[]) + other_content = "" + tool_states: dict = {} + + for delta_message in stream_delta_message_generator( + parser, qwen3_tokenizer, model_output, request + ): + if delta_message.content: + other_content += delta_message.content + if delta_message.tool_calls: + for tool_call in delta_message.tool_calls: + idx = tool_call.index + if idx not in tool_states: + tool_states[idx] = { + "id": None, + "name": None, + "arguments": "", + "type": None, + } + if tool_call.id: + tool_states[idx]["id"] = tool_call.id + if tool_call.type: + assert tool_call.type == "function" + tool_states[idx]["type"] = tool_call.type + if tool_call.function: + if tool_call.function.name: + tool_states[idx]["name"] = tool_call.function.name + if tool_call.function.arguments is not None: + tool_states[idx]["arguments"] += tool_call.function.arguments + + assert "I'll check the weather for you." in other_content + assert len(tool_states) == 1 + state = tool_states[0] + assert state["id"] is not None + assert state["type"] == "function" + assert state["name"] == "get_current_weather" + args = json.loads(state["arguments"]) + assert args["city"] == "Dallas" + assert args["state"] == "TX" + assert args["unit"] == "fahrenheit" diff --git a/vllm/tool_parsers/qwen3coder_tool_parser.py b/vllm/tool_parsers/qwen3coder_tool_parser.py index 7457590c5ac0..a3875118861d 100644 --- a/vllm/tool_parsers/qwen3coder_tool_parser.py +++ b/vllm/tool_parsers/qwen3coder_tool_parser.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ast +import contextlib import json import uuid from collections.abc import Sequence @@ -29,11 +31,7 @@ get_enable_structured_outputs_in_reasoning, get_model_structural_tag, ) -from vllm.tool_parsers.utils import ( - coerce_to_schema_type, - extract_types_from_schema, - find_tool_properties, -) +from 
vllm.tool_parsers.utils import find_tool_properties, partial_tag_overlap logger = init_logger(__name__) @@ -119,16 +117,464 @@ def _reset_streaming_state(self): # Store accumulated parameters for type conversion self.accumulated_params = {} self.streaming_request = None + self._sent_content_idx = 0 def _convert_param_value( self, param_value: str, param_name: str, param_config: dict, func_name: str ) -> Any: """Convert parameter value based on its type in the schema.""" - if not isinstance(param_value, str): + if param_name not in param_config: + if param_config != {}: + logger.debug( + "Parsed parameter '%s' is not defined in the tool " + "parameters for tool '%s', directly returning the " + "string value.", + param_name, + func_name, + ) + return param_value + + # ``allows_null`` is True when the schema explicitly admits a + # null value (either via ``"type": "null"`` or in an ``anyOf`` + # union). A nullable parameter must convert the literal + # ``"null"`` / ``"None"`` to JSON null even when the primary + # type is ``string`` — otherwise a Qwen3.5-trained model that + # emits the Python ``None`` literal leaves the client with the + # string ``"None"`` for a nullable optional. + allows_null = False + if ( + isinstance(param_config[param_name], dict) + and "type" in param_config[param_name] + ): + param_type = str(param_config[param_name]["type"]).strip().lower() + allows_null = param_type == "null" + elif ( + isinstance(param_config[param_name], dict) + and "anyOf" in param_config[param_name] + ): + # Extract the first non-null type from the anyOf list so that + # nullable schemas like {"anyOf": [{"type": "string"}, + # {"type": "null"}]} behave as "string", not "object". 
+ param_type = "string" + picked = False + for option in param_config[param_name]["anyOf"]: + if isinstance(option, dict) and "type" in option: + opt_type = str(option["type"]).strip().lower() + if opt_type == "null": + allows_null = True + elif not picked: + param_type = opt_type + picked = True + else: + param_type = "string" + # Nullable schemas: recognise "null" / "None" up front so a + # string-typed nullable still maps to JSON null. + if allows_null and param_value.lower() in ("null", "none"): + return None + # String type takes precedence: preserve the raw value (including + # the literal "null") rather than converting it to Python None. + if param_type in ["string", "str", "text", "varchar", "char", "enum"]: return param_value - param_schema = param_config.get(param_name, {}) - param_types = extract_types_from_schema(param_schema) - return coerce_to_schema_type(param_value, param_types) + # For non-string types, "null" maps to JSON null. Also accept + # the Python literal "None" so that Qwen3.5-trained models — whose + # chat template renders null args via ``| string`` (yielding the + # literal "None" in the prompt) — round-trip nullable values + # correctly. 
+ if param_value.lower() in ("null", "none"): + return None + if ( + param_type.startswith("int") + or param_type.startswith("uint") + or param_type.startswith("long") + or param_type.startswith("short") + or param_type.startswith("unsigned") + ): + try: + return int(param_value) + except (ValueError, TypeError): + logger.debug( + "Parsed value '%s' of parameter '%s' is not an " + "integer in tool '%s', degenerating to string.", + param_value, + param_name, + func_name, + ) + return param_value + elif param_type.startswith("num") or param_type.startswith("float"): + try: + float_param_value = float(param_value) + return ( + float_param_value + if float_param_value - int(float_param_value) != 0 + else int(float_param_value) + ) + except (ValueError, TypeError): + logger.debug( + "Parsed value '%s' of parameter '%s' is not a float " + "in tool '%s', degenerating to string.", + param_value, + param_name, + func_name, + ) + return param_value + elif param_type in ["boolean", "bool", "binary"]: + param_value = param_value.lower() + if param_value not in ["true", "false"]: + logger.debug( + "Parsed value '%s' of parameter '%s' is not a boolean " + "(`true` or `false`) in tool '%s', degenerating to " + "false.", + param_value, + param_name, + func_name, + ) + return param_value == "true" + else: + is_container_type = ( + param_type in ["object", "array", "arr"] + or param_type.startswith("dict") + or param_type.startswith("list") + ) + if is_container_type: + try: + parsed = json.loads(param_value) + # A model trained with a buggy template + # (json.dumps(str(dict))) may output a JSON-encoded + # Python repr like "{'k': 'v'}". json.loads returns a + # string in that case — try one more parse. 
+ if isinstance(parsed, str): + with contextlib.suppress(ValueError, SyntaxError, TypeError): + parsed = ast.literal_eval(parsed) + return parsed + except (json.JSONDecodeError, TypeError, ValueError): + logger.debug( + "Parsed value '%s' of parameter '%s' cannot be " + "parsed with json.loads in tool '%s', will try " + "other methods to parse it.", + param_value, + param_name, + func_name, + ) + try: + param_value = ast.literal_eval(param_value) # safer + # Same double-decode for container types whose raw text + # had no JSON outer layer (e.g. bare Python repr + # "{'k': 'v'}"). + if is_container_type and isinstance(param_value, str): + with contextlib.suppress(ValueError, SyntaxError, TypeError): + param_value = ast.literal_eval(param_value) + except (ValueError, SyntaxError, TypeError): + logger.debug( + "Parsed value '%s' of parameter '%s' cannot be " + "converted via Python `ast.literal_eval()` in tool " + "'%s', degenerating to string.", + param_value, + param_name, + func_name, + ) + return param_value + + def _next_structural_param_start( + self, + text: str, + start_pos: int = 0, + valid_param_names: set[str] | None = None, + ) -> int: + """Return index of next structural ```` from + start_pos. Structural means preceded by ``\\n`` or at position 0. + If valid_param_names is given, NAME must also be in that set. + Returns -1 if none found. + """ + ni = start_pos + prefix_len = len(self.parameter_prefix) + while True: + ni = text.find(self.parameter_prefix, ni) + if ni == -1: + return -1 + if ni == 0 or text[ni - 1] == "\n": + if valid_param_names is not None: + name_end = text.find(">", ni + prefix_len) + if ( + name_end != -1 + and text[ni + prefix_len : name_end] in valid_param_names + ): + return ni + ni += 1 + continue + return ni + ni += 1 + + def _find_true_function_end(self, text: str) -> int: + """Return the index of the real structural ```` in text + (followed with optional whitespace by ```` or end of + string), or -1 if none found. 
def _scan_to_structural_function_end(
    self,
    after_func_open: str,
    valid_param_names: set[str] | None = None,
) -> int:
    """Locate the structural function-end token in a function body.

    ``after_func_open`` is walked one parameter block at a time: each
    block starts with ``self.parameter_prefix``, has its name terminated
    by ``>``, and is closed at the offset reported by
    ``self._find_true_param_end``. Text inside a parameter value is
    stepped over as a whole, so a ``self.function_end_token`` that occurs
    there is not reported.

    When a parameter has no detectable close, the next structural
    parameter start (restricted to names in ``valid_param_names`` that
    have not been consumed yet, when that set is given) is treated as
    its implicit end.

    Args:
        after_func_open: Text immediately following the ``>`` that closes
            the function-open tag.
        valid_param_names: Optional set of parameter names; forwarded to
            the helper methods.

    Returns:
        Index into ``after_func_open`` of the structural
        ``self.function_end_token``, or -1 when the body is incomplete or
        malformed.
    """
    body = after_func_open
    total = len(body)
    cursor = 0
    consumed: set[str] = set()

    while True:
        # Step over whitespace separating parameter blocks.
        cursor = total - len(body[cursor:].lstrip(" \t\n\r"))
        if cursor >= total:
            return -1

        if body.startswith(self.function_end_token, cursor):
            return cursor

        if not body.startswith(self.parameter_prefix, cursor):
            # Something other than a parameter precedes the function end:
            # defer to the lookahead heuristic for the remaining text.
            fallback = self._find_true_function_end(body[cursor:])
            if fallback == -1:
                return -1
            return cursor + fallback

        name_from = cursor + len(self.parameter_prefix)
        name_close = body.find(">", name_from)
        if name_close == -1:
            return -1
        current_name = body[name_from:name_close]

        # The value begins after the ">" and one optional newline.
        value_at = name_close + 1
        if value_at < total and body[value_at] == "\n":
            value_at += 1
        value_text = body[value_at:]

        close_rel = self._find_true_param_end(
            value_text,
            valid_param_names,
            require_lookahead=True,
        )
        if close_rel != -1:
            consumed.add(current_name)
            cursor = value_at + close_rel + len(self.parameter_end_token)
            continue

        # No parameter close found: accept the next structural parameter
        # start whose name has not been consumed as the implicit end.
        if valid_param_names is None:
            remaining = None
        else:
            remaining = valid_param_names - consumed - {current_name}
        next_rel = self._next_structural_param_start(value_text, 0, remaining)
        if next_rel == -1:
            return -1
        consumed.add(current_name)
        cursor = value_at + next_rel
+ """ + end_positions = self._structural_tool_call_end_positions(current_text) + target = self.current_tool_index + if target < len(end_positions): + self._sent_content_idx = max( + self._sent_content_idx, + end_positions[target] + len(self.tool_call_end_token), + ) + + self.current_tool_index += 1 + self.header_sent = False + self.param_count = 0 + self.json_started = False + self.json_closed = False + self.accumulated_params = {} + self.is_tool_call_started = False + + def _find_true_tool_call_end(self, text: str) -> int: + """Return the index of the real structural ```` in + text (followed with optional whitespace by another ```` + or end of string), or -1 if none found. + """ + search_pos = 0 + while True: + idx = text.find(self.tool_call_end_token, search_pos) + if idx == -1: + return -1 + after = text[idx + len(self.tool_call_end_token) :] + stripped = after.lstrip() + if stripped == "" or stripped.startswith(self.tool_call_start_token): + return idx + search_pos = idx + len(self.tool_call_end_token) + + def _structural_tool_call_end_positions(self, text: str) -> list[int]: + """Return positions of every STRUCTURAL ```` in text. + + Walks each ``...`` top-level block by + following ````, scanning the body via + ``_scan_to_structural_function_end`` (which steps over parameter + values that may contain literal ````, ````, + ```` or ```` strings), then matching the + trailing ````. + + Falls back to a lookahead heuristic when the walker cannot + determine a structural close (incomplete body, malformed XML). 
+ """ + positions: list[int] = [] + pos = 0 + n = len(text) + while pos < n: + tc_start = text.find(self.tool_call_start_token, pos) + if tc_start == -1: + break + body_start = tc_start + len(self.tool_call_start_token) + func_open = text.find(self.tool_call_prefix, body_start) + if func_open == -1: + break + name_end = text.find(">", func_open + len(self.tool_call_prefix)) + if name_end == -1: + break + func_name = text[func_open + len(self.tool_call_prefix) : name_end] + valid_params: set[str] | None = None + if self.tools: + cfg = find_tool_properties(self.tools, func_name) + if cfg: + valid_params = set(cfg.keys()) + body_after_name = text[name_end + 1 :] + func_end_rel = self._scan_to_structural_function_end( + body_after_name, valid_params + ) + if func_end_rel == -1: + # Body incomplete; the structural is not + # yet known. Stop walking — DO NOT fall back to the + # legacy heuristic for the rest of the text, because a + # literal embedded in an unfinished + # parameter would be erroneously treated as structural. + break + func_end_abs = (name_end + 1) + func_end_rel + after = text[func_end_abs + len(self.function_end_token) :] + i = 0 + while i < len(after) and after[i] in " \t\n\r": + i += 1 + if not after[i:].startswith(self.tool_call_end_token): + break + tc_end_pos = func_end_abs + len(self.function_end_token) + i + positions.append(tc_end_pos) + pos = tc_end_pos + len(self.tool_call_end_token) + return positions + + def _find_true_param_end( + self, + value_text: str, + valid_param_names: set[str] | None = None, + require_lookahead: bool = False, + ) -> int: + """Find the true end of a parameter value in value_text. + + A ```` is structural only when it is followed by + another structural delimiter (schema-known ````, + ````, ````) or — in non-streaming mode — + end-of-string. 
Nested ```` opens are tracked + for depth REGARDLESS of whether NAME is in the schema: a + literal nested tool_call may use NAMEs that are not in the + outer tool's schema, but its literal ```` still + pairs with the literal open and must not be mistaken for a + structural close. + + Returns the index of the true ```` in value_text, or + -1 if incomplete. + """ + depth = 0 + pos = 0 + param_prefix_len = len(self.parameter_prefix) + param_end_len = len(self.parameter_end_token) + + while pos < len(value_text): + # Use UNFILTERED structural opens for depth tracking so that + # a literal ```` (NAME not in the outer + # schema) still increments depth and its matching literal + # ```` is balanced — otherwise that close would + # appear unmatched and pass the structural lookahead. + next_open = self._next_structural_param_start(value_text, pos, None) + next_close = value_text.find(self.parameter_end_token, pos) + if next_close == -1: + return -1 + + if next_open != -1 and next_open < next_close: + depth += 1 + pos = next_open + param_prefix_len + elif depth == 0: + after = value_text[next_close + param_end_len :] + stripped = after.lstrip() + structural_next_param = False + if stripped.startswith(self.parameter_prefix): + if valid_param_names is not None: + name_start = len(self.parameter_prefix) + name_end = stripped.find(">", name_start) + if name_end != -1: + structural_next_param = ( + stripped[name_start:name_end] in valid_param_names + ) + else: + structural_next_param = True + if ( + (stripped == "" and not require_lookahead) + or structural_next_param + or stripped.startswith(self.function_end_token) + or stripped.startswith(self.tool_call_end_token) + ): + return next_close + pos = next_close + param_end_len + else: + depth -= 1 + pos = next_close + param_end_len + + return -1 + + @staticmethod + def _is_valid_function_name(name: str) -> bool: + """Return True when ``name`` looks like a real function identifier + and not a stray template token, malformed tag, or 
freeform text. + + Rejects names that contain template-syntax characters (``{``, + ``}``, ``<``, ``>``), whitespace, quotes, or are empty. Permits + identifiers, dashes (``max-retries``), dots (``user.name``), + slashes (``namespace/tool``), and Unicode letters. + """ + if not name: + return False + forbidden = set("{}<>\"' \t\n\r") + return not any(c in forbidden for c in name) def _parse_xml_function_call(self, function_call_str: str) -> ToolCall | None: # Extract function name @@ -137,13 +583,59 @@ def _parse_xml_function_call(self, function_call_str: str) -> ToolCall | None: if end_index == -1: return None function_name = function_call_str[:end_index] + # Reject phantom tool calls produced when the model writes an + # unrendered Jinja template or pseudo-XML in its response (e.g. + # ````). Surfacing such names as real + # tool calls causes "tool not found" errors at the client and + # makes agents loop. + if not self._is_valid_function_name(function_name): + return None param_config = find_tool_properties(self.tools, function_name) + valid_param_names: set[str] | None = ( + set(param_config.keys()) if param_config else None + ) parameters = function_call_str[end_index + 1 :] - param_dict = {} - for match_text in self.tool_call_parameter_regex.findall(parameters): - idx = match_text.index(">") - param_name = match_text[:idx] - param_value = str(match_text[idx + 1 :]) + param_dict: dict = {} + pos = 0 + while True: + # Find next structural at the top level. We + # do NOT filter the outer search by schema: callers may + # legitimately send a parameter whose name is not declared + # in the schema (e.g. renamed fields). Schema filtering is + # applied only when scanning INSIDE a parameter value, to + # disambiguate real nested delimiters from literal text. 
+ param_start = self._next_structural_param_start(parameters, pos, None) + if param_start == -1: + break + name_start = param_start + len(self.parameter_prefix) + name_end = parameters.find(">", name_start) + if name_end == -1: + break + param_name = parameters[name_start:name_end] + value_text = parameters[name_end + 1 :] + + param_end = self._find_true_param_end(value_text, valid_param_names) + if param_end == -1: + # No true found (malformed XML or incomplete). + # Fallback 1: next structural boundary or end + func_end = self._find_true_function_end(value_text) + if func_end != -1: + param_value = value_text[:func_end] + else: + param_value = value_text + pos = len(parameters) + else: + param_value = value_text[:param_end] + pos = (name_end + 1) + param_end + len(self.parameter_end_token) + # Remove prefix and trailing \n if param_value.startswith("\n"): param_value = param_value[1:] @@ -161,23 +653,79 @@ def _parse_xml_function_call(self, function_call_str: str) -> ToolCall | None: ) def _get_function_calls(self, model_output: str) -> list[str]: - # Find all tool calls - matched_ranges = self.tool_call_regex.findall(model_output) - raw_tool_calls = [ - match[0] if match[0] else match[1] for match in matched_ranges - ] + # Find tool_calls using a structural delimiter approach: + # a real is followed by another or + # end-of-text. This skips that appears as literal + # text inside a parameter value. 
+ raw_tool_calls: list[str] = [] + search_pos = 0 + while True: + tc_start = model_output.find(self.tool_call_start_token, search_pos) + if tc_start == -1: + break + after_open = model_output[tc_start + len(self.tool_call_start_token) :] + tc_end = -1 + inner_search = 0 + while True: + idx = after_open.find(self.tool_call_end_token, inner_search) + if idx == -1: + tc_end = -1 + break + after_close = after_open[idx + len(self.tool_call_end_token) :] + stripped = after_close.lstrip() + if stripped == "" or stripped.startswith(self.tool_call_start_token): + tc_end = idx + break + inner_search = idx + len(self.tool_call_end_token) + if tc_end == -1: + raw_tool_calls.append(after_open) + break + raw_tool_calls.append(after_open[:tc_end]) + search_pos = ( + tc_start + + len(self.tool_call_start_token) + + tc_end + + len(self.tool_call_end_token) + ) # Back-off strategy if no tool_call tags found if len(raw_tool_calls) == 0: raw_tool_calls = [model_output] - raw_function_calls = [] + # Use a parameter-aware walk to find the structural : + # when the value of a parameter embeds a complete literal + # ``...\n`` block, the nested + # ```` is followed by ```` and would pass + # the simple "followed by " lookahead. Walking the + # body parameter-by-parameter with ``_find_true_param_end`` + # correctly steps over the literal. 
+ function_calls: list[str] = [] for tool_call in raw_tool_calls: - raw_function_calls.extend(self.tool_call_function_regex.findall(tool_call)) - - function_calls = [ - match[0] if match[0] else match[1] for match in raw_function_calls - ] + func_start = tool_call.find(self.tool_call_prefix) + if func_start == -1: + continue + after_func_open = tool_call[func_start + len(self.tool_call_prefix) :] + name_end = after_func_open.find(">") + valid_param_names: set[str] | None = None + body_start = 0 + if name_end != -1: + func_name = after_func_open[:name_end] + cfg = find_tool_properties(self.tools, func_name) + if cfg: + valid_param_names = set(cfg.keys()) + body_start = name_end + 1 + scan_end = self._scan_to_structural_function_end( + after_func_open[body_start:], valid_param_names + ) + if scan_end != -1: + function_calls.append(after_func_open[: body_start + scan_end]) + continue + # Fallback to legacy heuristic. + func_end = self._find_true_function_end(after_func_open) + if func_end == -1: + function_calls.append(after_func_open) + else: + function_calls.append(after_func_open[:func_end]) return function_calls def extract_tool_calls( @@ -213,11 +761,39 @@ def extract_tool_calls( } ) - # Extract content before tool calls - content_index = model_output.find(self.tool_call_start_token) - idx = model_output.find(self.tool_call_prefix) - content_index = content_index if content_index >= 0 else idx - content = model_output[:content_index] # .rstrip() + # Extract content before tool calls. Anchor at the FIRST + # ```` that contains a real ```` + # opener — a bare ``...`` written by + # the model in its narrative text (no function inside) is + # NOT a real tool call and the surrounding text MUST stay + # in ``content``. 
+ content_index = -1 + search_pos = 0 + tc_start_token = self.tool_call_start_token + tc_end_token = self.tool_call_end_token + while True: + tc_pos = model_output.find(tc_start_token, search_pos) + if tc_pos == -1: + break + tc_close = model_output.find(tc_end_token, tc_pos + len(tc_start_token)) + # Look for a ```` block contains a + # ``= 0 else model_output + ) valid_tool_calls = [tc for tc in tool_calls if tc is not None] return ExtractedToolCallInformation( tools_called=(len(valid_tool_calls) > 0), @@ -277,77 +853,116 @@ def extract_tool_calls_streaming( # Check if we need to advance to next tool if self.json_closed and not self.in_function: - # Check if this tool call has ended - tool_ends = current_text.count(self.tool_call_end_token) + # Use structural count: a literal + # embedded in a parameter value must not trigger spurious + # advance. + tool_ends = len(self._structural_tool_call_end_positions(current_text)) if tool_ends > self.current_tool_index: - # This tool has ended, advance to next - self.current_tool_index += 1 - self.header_sent = False - self.param_count = 0 - self.json_started = False - self.json_closed = False - self.accumulated_params = {} - - # Check if there are more tool calls - tool_starts = current_text.count(self.tool_call_start_token) - if self.current_tool_index >= tool_starts: - # No more tool calls - self.is_tool_call_started = False - # Continue processing next tool - return None - + # Advance to next tool; is_tool_call_started is reset so + # content between or after tool calls is emitted correctly. + # We deliberately fall through (no early ``return None``): + # the rest of this delta may carry trailing free text after + # the closed or even an entire next tool call + # (MTP / speculative decoding). The downstream code handles + # both — emitting trailing content via the not-started + # branch, or starting the next tool via tool_starts_count. 
+ self._advance_to_next_tool(current_text) + + content_message = None # Handle normal content before tool calls if not self.is_tool_call_started: - # Check if tool call is starting - if ( + tool_starts_count = current_text.count(self.tool_call_start_token) + start_signal = ( self.tool_call_start_token_id in delta_token_ids - or self.tool_call_start_token in delta_text - ): + or tool_starts_count > self.current_tool_index + ) + # ``tool_starts_count`` is naive and over-counts when an + # earlier tool's parameter value contains a literal + # ````. Confirm a REAL next tool by locating an + # opener past ``_sent_content_idx`` (which sits after the last + # processed tool's structural ````). + last_start = -1 + if start_signal: + last_start = current_text.find( + self.tool_call_start_token, self._sent_content_idx + ) + if start_signal and last_start != -1: self.is_tool_call_started = True # Return any content before the tool call - if self.tool_call_start_token in delta_text: - content_before = delta_text[ - : delta_text.index(self.tool_call_start_token) - ] + if last_start > self._sent_content_idx: + content_before = current_text[self._sent_content_idx : last_start] + self._sent_content_idx = last_start if content_before: - return DeltaMessage(content=content_before) - return None + content_message = DeltaMessage(content=content_before) else: - # Check if we're between tool calls - skip whitespace + # No real new tool starting in this delta — emit any + # trailing/inter-call content. + overlap = partial_tag_overlap(current_text, self.tool_call_start_token) + sendable_idx = len(current_text) - overlap + + # Skip whitespace-only deltas right after a closed tool. 
if ( current_text.rstrip().endswith(self.tool_call_end_token) and delta_text.strip() == "" ): - # We just ended a tool call, skip whitespace + self._sent_content_idx = len(current_text) return None - # Normal content, no tool call - return DeltaMessage(content=delta_text) - - # Check if we're between tool calls (waiting for next one) - # Count tool calls we've seen vs processed - tool_starts_count = current_text.count(self.tool_call_start_token) - if self.current_tool_index >= tool_starts_count: - # We're past all tool calls, shouldn't be here - return None - # We're in a tool call, find the current tool call portion - # Need to find the correct tool call based on current_tool_index + if sendable_idx > self._sent_content_idx: + content = current_text[self._sent_content_idx : sendable_idx] + self._sent_content_idx = sendable_idx + if content: + return DeltaMessage(content=content) + return None + + # Check if we're between tool calls (waiting for next one). + # Only count structural starts (skip past each + # of completed calls) so that tokens + # embedded in a parameter value of a completed call are not + # counted as spurious new tool calls. + if self.tool_call_start_token not in current_text[self._sent_content_idx :]: + return content_message + + # We're in a tool call, find the current tool call portion. + # Build tool_start_positions by jumping OVER completed tool + # calls (past each ), so that tokens + # embedded in parameter values of completed calls are never + # included. + # Use STRUCTURAL positions when jumping past + # completed tool calls — naive ``current_text.find()`` + # matches a literal ```` embedded in a parameter + # value and would land inside an earlier tool's content. 
+ structural_ends = self._structural_tool_call_end_positions(current_text) tool_start_positions: list[int] = [] - idx = 0 - while True: - idx = current_text.find(self.tool_call_start_token, idx) + search_pos = 0 + for i in range(self.current_tool_index + 1): + idx = current_text.find(self.tool_call_start_token, search_pos) if idx == -1: break tool_start_positions.append(idx) - idx += len(self.tool_call_start_token) + if i < self.current_tool_index: + # Completed tool call: jump past its STRUCTURAL . + end_idx = -1 + for end_pos in structural_ends: + if end_pos > idx: + end_idx = end_pos + break + if end_idx == -1: + break + search_pos = end_idx + len(self.tool_call_end_token) if self.current_tool_index >= len(tool_start_positions): - # No more tool calls to process yet - return None + return content_message tool_start_idx = tool_start_positions[self.current_tool_index] - # Find where this tool call ends (or current position if not ended yet) - tool_end_idx = current_text.find(self.tool_call_end_token, tool_start_idx) + # Find this tool call's STRUCTURAL end (or use rest of text if + # the tool isn't closed yet). A naive find would truncate at a + # literal inside a parameter value. + tool_end_idx = -1 + for end_pos in structural_ends: + if end_pos > tool_start_idx: + tool_end_idx = end_pos + break if tool_end_idx == -1: tool_text = current_text[tool_start_idx:] else: @@ -355,6 +970,7 @@ def extract_tool_calls_streaming( tool_start_idx : tool_end_idx + len(self.tool_call_end_token) ] + tool_call_fragments = None # Looking for function header if not self.header_sent: if self.tool_call_prefix in tool_text: @@ -387,21 +1003,18 @@ def extract_tool_calls_streaming( # accesses streamed_args_for_tool[index]. 
self.streamed_args_for_tool.append("") - # Send header with function info - return DeltaMessage( - tool_calls=[ - DeltaToolCall( - index=self.current_tool_index, - id=self.current_tool_id, - function=DeltaFunctionCall( - name=self.current_function_name, arguments="" - ), - type="function", - ) - ] + tool_call_fragments = DeltaToolCall( + index=self.current_tool_index, + id=self.current_tool_id, + function=DeltaFunctionCall( + name=self.current_function_name, arguments="" + ), + type="function", ) - return None + if not self.header_sent: + return content_message + arguments_to_emit = "" # We've sent header, now handle function body if self.in_function: # Always send opening brace first, regardless of whether @@ -412,24 +1025,91 @@ def extract_tool_calls_streaming( if not self.json_started: self.json_started = True self.streamed_args_for_tool[self.current_tool_index] += "{" - return DeltaMessage( - tool_calls=[ - DeltaToolCall( - index=self.current_tool_index, - function=DeltaFunctionCall(arguments="{"), - ) - ] - ) - - # Find all parameter start positions in current tool_text - param_starts = [] + arguments_to_emit += "{" + + # Build param_starts using structural-aware lookup. Plain + # tool_text.find(parameter_prefix) would return positions + # inside parameter VALUES (e.g. Python code that embeds the + # XML format), creating spurious extra params. Use the + # schema to filter nested and advance + # sequentially past each complete parameter's value. + streaming_param_config = find_tool_properties( + self.tools, self.current_function_name or "" + ) + valid_param_names: set[str] | None = ( + set(streaming_param_config.keys()) if streaming_param_config else None + ) + param_starts: list[int] = [] search_idx = 0 while True: - search_idx = tool_text.find(self.parameter_prefix, search_idx) - if search_idx == -1: + # Don't filter top-level by schema: + # callers may send params whose names aren't declared + # (e.g. renamed fields). 
Schema filtering is applied + # below when walking INSIDE a parameter value to + # disambiguate nested literal XML. + param_start_pos = self._next_structural_param_start( + tool_text, search_idx, None + ) + if param_start_pos == -1: break - param_starts.append(search_idx) - search_idx += len(self.parameter_prefix) + param_starts.append(param_start_pos) + # Advance past this parameter's content. + name_end_pos = tool_text.find( + ">", param_start_pos + len(self.parameter_prefix) + ) + if name_end_pos == -1: + break + after_name = tool_text[name_end_pos + 1 :] + after_name_stripped = ( + after_name[1:] if after_name.startswith("\n") else after_name + ) + end_in_after = self._find_true_param_end( + after_name_stripped, + valid_param_names, + require_lookahead=True, + ) + if end_in_after == -1: + # No structural ```` close yet. A + # legitimate "missing " malformation — + # the model jumps from ```` straight to + # ```` — is recoverable: treat the + # next structural ```` as implicit + # end of the current param. But only if NAME has + # NOT already been parsed as a sibling param of this + # tool call (and is not the param currently being + # scanned). A repeated NAME is almost always a + # literal embedded in the unfinished value, not a + # real next parameter. + cand_name = tool_text[ + param_start_pos + len(self.parameter_prefix) : name_end_pos + ] + already_seen = set(self.accumulated_params.keys()) | ( + {cand_name} if cand_name else set() + ) + unseen_valid: set[str] | None = ( + (valid_param_names - already_seen) + if valid_param_names is not None + else None + ) + implicit_end = self._next_structural_param_start( + after_name_stripped, 0, unseen_valid + ) + if implicit_end != -1: + search_idx = ( + (name_end_pos + 1) + + (1 if after_name.startswith("\n") else 0) + + implicit_end + ) + else: + # Wait for more data. 
+ break + else: + search_idx = ( + (name_end_pos + 1) + + (1 if after_name.startswith("\n") else 0) + + end_in_after + + len(self.parameter_end_token) + ) # Process ALL complete params in a loop (spec decode fix). # With speculative decoding a single delta can deliver @@ -455,30 +1135,67 @@ def extract_tool_calls_streaming( if value_text.startswith("\n"): value_text = value_text[1:] - param_end_idx = value_text.find(self.parameter_end_token) + param_end_idx = self._find_true_param_end( + value_text, valid_param_names, require_lookahead=True + ) if param_end_idx == -1: - next_param_idx = value_text.find(self.parameter_prefix) - func_end_idx = value_text.find(self.function_end_token) - - if next_param_idx != -1 and ( - func_end_idx == -1 or next_param_idx < func_end_idx - ): - param_end_idx = next_param_idx - elif func_end_idx != -1: - param_end_idx = func_end_idx - else: - # Fallback for malformed XML where - # is missing. Use as a delimiter - # if present in the value so we don't include - # the closing tag as part of the param value. - tool_end_in_value = value_text.find(self.tool_call_end_token) - if tool_end_in_value != -1: - param_end_idx = tool_end_in_value + # Confirm via the parameter-aware walker that the + # function body is truly complete. The legacy + # ``_find_true_function_end`` matches a ```` + # at end-of-buffer (lstripped lookahead == ""), which + # is wrong in streaming when the literal close of a + # nested tool_call inside a parameter value sits at + # the buffer's end. Walking the body via + # ``_scan_to_structural_function_end`` correctly + # steps over literal tags inside parameter values + # and returns -1 if any param is still open. 
+ tc_open_in_tool = tool_text.find(self.tool_call_prefix) + body_func_end_in_value = -1 + if tc_open_in_tool != -1: + name_end_in_tool = tool_text.find( + ">", tc_open_in_tool + len(self.tool_call_prefix) + ) + if name_end_in_tool != -1: + body_after_name = tool_text[name_end_in_tool + 1 :] + body_func_end_rel = self._scan_to_structural_function_end( + body_after_name, valid_param_names + ) + if body_func_end_rel != -1: + body_func_end_abs = ( + name_end_in_tool + 1 + body_func_end_rel + ) + body_func_end_in_value = body_func_end_abs - value_start + + if body_func_end_in_value > 0: + # Function body is structurally complete; the + # current param has missing . Use + # the next legitimate (NAME + # unseen) before the structural as + # the implicit end. + already_seen = set(self.accumulated_params.keys()) | ( + {current_param_name} if current_param_name else set() + ) + unseen_valid: set[str] | None = ( + (valid_param_names - already_seen) + if valid_param_names is not None + else None + ) + next_param_idx = self._next_structural_param_start( + value_text, 0, unseen_valid + ) + if ( + next_param_idx != -1 + and next_param_idx < body_func_end_in_value + ): + param_end_idx = next_param_idx else: - # Parameter incomplete — break so we still - # emit any fragments accumulated by earlier - # loop iterations. - break + param_end_idx = body_func_end_in_value + else: + # Body not yet complete — wait for more data. + # Do NOT truncate at a literal or + # that may sit inside a still-open + # parameter value. + break if param_end_idx == -1: break @@ -522,15 +1239,7 @@ def extract_tool_calls_streaming( self.current_tool_index, len(self.streamed_args_for_tool), ) - - return DeltaMessage( - tool_calls=[ - DeltaToolCall( - index=self.current_tool_index, - function=DeltaFunctionCall(arguments=combined), - ) - ] - ) + arguments_to_emit += combined # Check for function end AFTER processing parameters. 
# This ordering is critical: with speculative decoding a @@ -538,13 +1247,31 @@ def extract_tool_calls_streaming( # . If the close check ran first it would emit # "}" and set in_function=False before the parameter loop # ever ran, causing the parameter to be silently dropped. - if not self.json_closed and self.function_end_token in tool_text: + # Use the parameter-aware walker so a literal '' + # inside a parameter value (e.g. a content arg embedding a + # complete nested tool_call) does not trigger a premature + # close. + true_func_end = -1 + tc_open_in_tool_for_close = tool_text.find(self.tool_call_prefix) + if tc_open_in_tool_for_close != -1: + name_end_in_tool = tool_text.find( + ">", + tc_open_in_tool_for_close + len(self.tool_call_prefix), + ) + if name_end_in_tool != -1: + body_after_name = tool_text[name_end_in_tool + 1 :] + body_func_end_rel = self._scan_to_structural_function_end( + body_after_name, valid_param_names + ) + if body_func_end_rel != -1: + true_func_end = name_end_in_tool + 1 + body_func_end_rel + if not self.json_closed and true_func_end != -1: self.json_closed = True func_start = tool_text.find(self.tool_call_prefix) + len( self.tool_call_prefix ) - func_content_end = tool_text.find(self.function_end_token, func_start) + func_content_end = true_func_end if func_content_end != -1: func_content = tool_text[func_start:func_content_end] try: @@ -572,23 +1299,88 @@ def extract_tool_calls_streaming( self.current_tool_index, len(self.streamed_args_for_tool), ) - - result = DeltaMessage( - tool_calls=[ - DeltaToolCall( - index=self.current_tool_index, - function=DeltaFunctionCall(arguments="}"), - ) - ] - ) - + arguments_to_emit += "}" self.in_function = False self.json_closed = True self.accumulated_params = {} - return result + if tool_call_fragments or arguments_to_emit: + if not tool_call_fragments: + tool_call_fragments = DeltaToolCall( + index=self.current_tool_index, + function=DeltaFunctionCall(arguments=arguments_to_emit), + ) + else: + 
tool_call_fragments.function.arguments += arguments_to_emit + + if content_message: + content_message.tool_calls = [tool_call_fragments] + result = content_message + else: + result = DeltaMessage(tool_calls=[tool_call_fragments]) + + # Speculative decoding can deliver multiple complete tool + # calls in a single delta. If we just finished one and + # another complete ... remains in + # current_text, advance and re-enter to emit it. We pass a + # non-empty `previous_text` sentinel so reset_streaming_state + # is NOT triggered inside the recursion (which would clear + # current_tool_index back to 0 and loop forever). + if ( + self.json_closed + and not self.in_function + and len(self._structural_tool_call_end_positions(current_text)) + > self.current_tool_index + 1 + ): + # Speculative decoding delivered multiple complete tool + # calls in one delta; advance and recurse for the next. + self._advance_to_next_tool(current_text) + + # Recurse with a sentinel previous_text so the entry + # check `if not previous_text` does NOT reset the state. + next_delta = self.extract_tool_calls_streaming( + previous_text or " ", + current_text, + delta_text, + previous_token_ids, + current_token_ids, + delta_token_ids, + request, + ) + if next_delta is not None and next_delta.tool_calls: + if result.tool_calls is None: + result.tool_calls = [] + result.tool_calls.extend(next_delta.tool_calls) + # Concatenate the recursion's content (e.g. text + # BETWEEN tool 1 and tool 2) with the outer's content + # (e.g. text BEFORE tool 1). Without this, the "between" + # fragment is silently dropped whenever the outer + # already produced its own content. + if next_delta.content: + result.content = (result.content or "") + next_delta.content + + # Emit trailing free text that follows the LAST structural + # in this delta (MTP / spec-decoding bursts that + # bundle N tool calls + trailing content into one chunk). 
+ # Without this the trailing text is buffered indefinitely: + # the per-tool processing never advances ``_sent_content_idx`` + # past its tool's ````, and an EOS-style empty + # delta cannot recover content that was never emitted. + if self.json_closed and not self.in_function: + end_positions = self._structural_tool_call_end_positions(current_text) + if end_positions: + last_end = end_positions[-1] + len(self.tool_call_end_token) + if ( + last_end < len(current_text) + and last_end > self._sent_content_idx + ): + trailing = current_text[last_end:] + if trailing: + self._sent_content_idx = len(current_text) + result.content = (result.content or "") + trailing + return result - return None + return content_message def get_structural_tag(self, request: ChatCompletionRequest): return get_model_structural_tag( diff --git a/vllm/tool_parsers/qwen3xml_tool_parser.py b/vllm/tool_parsers/qwen3xml_tool_parser.py index 8ee10dcbc9e6..3f2ae4d253bf 100644 --- a/vllm/tool_parsers/qwen3xml_tool_parser.py +++ b/vllm/tool_parsers/qwen3xml_tool_parser.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast +import contextlib import json from collections.abc import Sequence from typing import Any @@ -26,11 +27,28 @@ Tool, ToolParser, ) +from vllm.tool_parsers.structural_tag_registry import ( + get_enable_structured_outputs_in_reasoning, + get_model_structural_tag, +) from vllm.tool_parsers.utils import find_tool_properties logger = init_logger(__name__) +def _is_valid_function_name(name: str) -> bool: + """Return True when ``name`` looks like a real function identifier and + not a stray template token, malformed tag, or freeform text. + + Rejects names that contain template-syntax characters (``{``, ``}``, + ``<``, ``>``), whitespace, quotes, or are empty. 
+ """ + if not name: + return False + forbidden = set("{}<>\"' \t\n\r") + return not any(c in forbidden for c in name) + + class StreamingXMLToolCallParser: """ Simplified streaming XML tool call parser @@ -53,9 +71,16 @@ def reset_streaming_state(self): """Reset streaming parsing state""" self.deltas = [] + # When True (delta-by-delta streaming), _process_complete_xml_elements + # holds off on when nothing follows in the buffer yet — + # that would be ambiguous since more tokens may still arrive. When + # False (full output passed at once), an empty lookahead is a + # genuine end. + self._streaming_mode: bool = False # state for streaming self.tool_call_index = 0 self.current_call_id = None + self.id_emitted = False self.last_completed_call_id = None self.current_function_name = None self.current_function_open = False @@ -79,6 +104,21 @@ def reset_streaming_state(self): self.defer_current_parameter = False self.deferred_param_raw_value = "" + # Depth of LITERAL nested ````/```` opens + # encountered inside the current parameter's value. Each literal + # opener bumps the depth; each ````/```` + # encountered while depth > 0 is also literal (decrements the + # depth) and must not be treated as a structural close. Reset + # to 0 when leaving a parameter. + self._literal_tag_depth = 0 + # Number of literal tool_call/function open or close events seen + # in the current ``parse_single_streaming_chunks`` call. Used to + # suppress the post-processing structural-close fallback when + # the chunk contained literal nested-tag events: those events + # are already handled (escaped) by the preprocess pass and must + # not trigger ``_end_element`` calls. 
+ self._literal_events_this_chunk = 0 + # recreate parser self.parser = ParserCreate() self.setup_parser() @@ -98,72 +138,58 @@ def parse_single_streaming_chunks(self, xml_chunk: str) -> DeltaMessage: # Record delta count before processing initial_delta_count = len(self.deltas) + # Reset literal-event counter for this chunk: it will be + # incremented by the preprocess pass whenever it encounters a + # literal nested ````/```` open or + # the matching close inside a parameter value. + self._literal_events_this_chunk = 0 + self.streaming_buffer += xml_chunk found_elements = self._process_complete_xml_elements() if found_elements: # If complete elements found, check if end events were missed - # some tags may not have been triggered + # some tags may not have been triggered. Use structural-aware + # checks so that / appearing as literal + # text inside a parameter value (e.g. file content) does NOT + # trigger a spurious close that emits a duplicate '}' or ''. + # When ``_literal_tag_depth > 0`` we are still inside a + # literal nested ````/```` block in + # the current parameter's value — the chunk's `` + # or `` matches a literal opener, not a real + # structural close, so skip the fallback close events. try: - new_deltas = self.deltas[initial_delta_count:] - # If this chunk contains - # but didn't generate '}', then complete it + # Skip the fallback close events when this chunk + # contained any literal nested-tag event: those + # ````/```` strings are matched + # to literal openers in the param value and have + # already been escaped — firing ``_end_element`` here + # would prematurely close the OUTER parameter and + # truncate its value. 
+ literals_in_chunk = self._literal_events_this_chunk > 0 if ( self.current_call_id is not None - and self.function_end_token in xml_chunk + and not literals_in_chunk + and self._literal_tag_depth == 0 + and self._chunk_has_structural_function_end(xml_chunk) + and self.current_function_open ): - # - Added '}' (non-empty parameter ending) - # - Added '{}' (empty parameter function) - has_function_close = any( - ( - td.tool_calls - and any( - ( - tc.function - and tc.id == self.current_call_id - and isinstance(tc.function.arguments, str) - and (tc.function.arguments in ("}", "{}")) - ) - for tc in td.tool_calls - ) - ) - for td in new_deltas - ) - if not has_function_close: - # Close potentially unclosed element - if self.current_param_name: - self._end_element("parameter") - if self.current_function_name: - self._end_element("function") - # If this chunk contains - # but didn't generate final empty delta, then complete it + if self.current_param_name: + self._end_element("parameter") + if self.current_function_name: + self._end_element("function") if ( self.current_call_id is not None - and self.tool_call_end_token in xml_chunk + and not literals_in_chunk + and self._literal_tag_depth == 0 + and self._chunk_has_structural_tool_call_end(xml_chunk) ): - has_toolcall_close = any( - ( - td.tool_calls - and any( - ( - tc.type == "function" - and tc.function - and tc.function.arguments == "" - and tc.id == self.current_call_id - ) - for tc in td.tool_calls - ) - ) - for td in new_deltas - ) - if not has_toolcall_close: - # Close potentially unclosed element - if self.current_param_name: - self._end_element("parameter") - if self.current_function_name: - self._end_element("function") - self._end_element("tool_call") + if self.current_param_name: + self._end_element("parameter") + if self.current_function_open: + self._end_element("function") + self._end_element("tool_call") except Exception as e: logger.warning("Error with fallback parsing: %s", e) # Merge newly generated 
deltas into single response @@ -173,29 +199,37 @@ def parse_single_streaming_chunks(self, xml_chunk: str) -> DeltaMessage: return result_delta else: # No complete elements, check if there's unoutput text content - if self.text_content_buffer and self.tool_call_index == 0: - # Has text content but no tool_call yet, output text content + if self.text_content_buffer: + # Output buffered text content text_delta = DeltaMessage(content=self.text_content_buffer) self._emit_delta(text_delta) # Clear buffer to avoid duplicate output self.text_content_buffer = "" return text_delta - # If this chunk contains end tags but wasn't triggered by parser, - # manually complete end events - # Only execute when still on the same call as when entered, - # to prevent accidentally closing new calls - # in multi scenarios - if self.current_call_id is not None and ( - self.function_end_token in xml_chunk - or self.tool_call_end_token in xml_chunk + # If this chunk contains structural end tags but wasn't + # triggered by parser, manually complete end events. Only + # execute when still on the same call as when entered, to + # prevent accidentally closing new calls in multi- + # scenarios. Also skip when ``_literal_tag_depth > 0``: the + # chunk's ``/`` matches a literal + # opener inside the current parameter's value. 
+ if ( + self.current_call_id is not None + and self._literal_tag_depth == 0 + and ( + self._chunk_has_structural_function_end(xml_chunk) + or self._chunk_has_structural_tool_call_end(xml_chunk) + ) ): - # Close potentially unclosed element if self.current_param_name: self._end_element("parameter") - if self.function_end_token in xml_chunk and self.current_function_name: + if ( + self._chunk_has_structural_function_end(xml_chunk) + and self.current_function_name + ): self._end_element("function") - if self.tool_call_end_token in xml_chunk: + if self._chunk_has_structural_tool_call_end(xml_chunk): self._end_element("tool_call") # Return the merged delta result generated by this fallback result_delta = self._merge_new_deltas_to_single_response( @@ -227,6 +261,141 @@ def _escape_xml_special_chars(self, text: str) -> str: return text + def _is_structural_tag_position(self) -> bool: + """Return True when the current element is at a structural position. + + A structural opening tag (e.g. ) must appear at the + beginning of a line in the raw output — i.e. the character + immediately before it in the streaming buffer is a newline (or it + is at position 0). Opening tags inside parameter content (e.g. + '""') are preceded by a non-newline character + such as a quote. + """ + if self.last_processed_pos == 0: + return True + return self.streaming_buffer[self.last_processed_pos - 1] == "\n" + + def _get_valid_param_names(self) -> set[str] | None: + """Return the set of parameter names defined in the schema for the + current function, or None when the schema is not available. + + Used to filter structural-looking tokens that + appear as literal text inside a parameter value (e.g. Jinja2 + templates, test fixtures, or files that document the tool-call + format). 
+ """ + if not self.tools or not self.current_function_name: + return None + props = find_tool_properties(self.tools, self.current_function_name) + return set(props.keys()) if props else None + + def _is_already_emitted_param(self, name: str) -> bool: + """Return True when ``name`` has already appeared as a parameter + of the current tool call (either fully closed or currently open). + + A ```` whose NAME is already used for the same + tool is almost always literal text inside another parameter's + value (e.g. a parser fixture or a file that documents the + tool-call format). Treating it as a real structural opening + causes silent value truncation and spurious extra params. + """ + if name == self.current_param_name: + return True + return name in self.parameters + + def _is_structural_closing_tag(self, chunk: str) -> bool: + """Return True when a closing tag at the current buffer position is + a real structural delimiter rather than literal text content. + + A closing tag is structural when the text that follows it in the + streaming buffer (after stripping leading whitespace) begins with + another structural token or is empty (end of buffered output). + + When the schema is available, a following is only + considered structural if NAME is a known parameter of the current + function. This prevents literal lines like ```` + in file content from being mistaken for real structural boundaries. 
+ """ + after_pos = self.last_processed_pos + len(chunk) + rest = self.streaming_buffer[after_pos:].lstrip() + + structural_param_follows = False + if rest.startswith(self.parameter_start_token): + valid_names = self._get_valid_param_names() + name_start = len(self.parameter_start_token) + name_end = rest.find(">", name_start) + if name_end != -1: + candidate = rest[name_start:name_end] + if valid_names is not None: + structural_param_follows = ( + candidate in valid_names + and not self._is_already_emitted_param(candidate) + ) + else: + # Fallback (no schema): trust the name unless it is a + # repeat of the current/already-emitted param, which + # is almost always a literal in a parser fixture. + structural_param_follows = not self._is_already_emitted_param( + candidate + ) + + # Return True when rest is an incomplete prefix of a structural + # closing token (e.g. rest="" hasn't fully + # arrived yet). The empty-rest case is handled by the deferral in + # _process_complete_xml_elements; this guards against the + # partial-tag scenario where the deferral does not fire (rest is + # non-empty) but the token is still incomplete. + is_partial_structural_prefix = any( + tok.startswith(rest) + for tok in ( + self.parameter_end_token, + self.function_end_token, + self.tool_call_end_token, + ) + ) + + return ( + not rest + or is_partial_structural_prefix + or structural_param_follows + or rest.startswith(self.parameter_end_token) + or rest.startswith(self.function_end_token) + or rest.startswith(self.tool_call_end_token) + ) + + def _chunk_has_structural_function_end(self, chunk: str) -> bool: + """Return True if `chunk` contains a structural tag. + + A structural is followed (after optional whitespace) + by or end-of-string — not inside parameter content + such as a file whose body contains ''. 
+ """ + search = 0 + token = self.function_end_token + end_token = self.tool_call_end_token + while True: + idx = chunk.find(token, search) + if idx == -1: + return False + rest = chunk[idx + len(token) :].lstrip() + if not rest or rest.startswith(end_token): + return True + search = idx + len(token) + + def _chunk_has_structural_tool_call_end(self, chunk: str) -> bool: + """Return True if `chunk` contains a structural tag.""" + search = 0 + token = self.tool_call_end_token + start_token = self.tool_call_start_token + while True: + idx = chunk.find(token, search) + if idx == -1: + return False + rest = chunk[idx + len(token) :].lstrip() + if not rest or rest.startswith(start_token): + return True + search = idx + len(token) + def _process_complete_xml_elements(self) -> bool: """ Process complete XML elements in buffer @@ -243,6 +412,23 @@ def _process_complete_xml_elements(self) -> bool: # No complete element found, wait for more data break + # In streaming mode, hold off on when nothing + # follows in the buffer yet. We need the lookahead to + # distinguish a real structural close (followed by + # or a schema-known ) from + # literal text content that happens to be ```` on + # its own line (e.g. Jinja2 template files). When not in + # _pre_inside_parameter mode the SAX-level decision is made + # here; skip for now and re-evaluate on the next delta. 
+ if ( + self._streaming_mode + and element == self.parameter_end_token + and self.current_param_name is not None + and not self._pre_inside_parameter + and not self.streaming_buffer[end_pos:].lstrip() + ): + break + # Check if this element should be skipped if self._should_skip_element(element): self.last_processed_pos = end_pos @@ -251,16 +437,12 @@ def _process_complete_xml_elements(self) -> bool: # Found complete XML element, process it try: preprocessed_element = self._preprocess_xml_chunk(element) - # Check if this is the first tool_call start + # Check if a new tool_call starts and we have buffered text content if ( - ( - preprocessed_element.strip().startswith("") - or preprocessed_element.strip().startswith("") + or preprocessed_element.strip().startswith(" bool: # Update processed position self.last_processed_pos = end_pos + # Flush any text accumulated AFTER the last processed + # in this batch. Without this, trailing free text that arrives in + # the SAME delta as the closing (MTP / speculative + # decoding) is buffered but never emitted — and is lost entirely + # if EOS comes before any subsequent delta. 
+ if found_any and self.text_content_buffer and self.current_call_id is None: + text_delta = DeltaMessage(content=self.text_content_buffer) + self._emit_delta(text_delta) + self.text_content_buffer = "" + return found_any def _should_skip_element(self, element: str) -> bool: @@ -441,10 +633,10 @@ def _merge_new_deltas_to_single_response(self, initial_count: int) -> DeltaMessa if delta.tool_calls: # For tool_calls, we need to intelligently merge arguments for tool_call in delta.tool_calls: - # Find if there's already a tool_call with the same call_id + # Find if there's already a tool_call with the same index existing_call = None for existing in merged_tool_calls: - if existing.id == tool_call.id: + if existing.index == tool_call.index: existing_call = existing break @@ -534,36 +726,59 @@ def _preprocess_xml_chunk(self, chunk: str) -> str: if self._pre_current_param_name else "string" ) - # Only these types need deferred parsing to - # handle Python literals containing single quotes - is_object_type = param_type in ["object"] + # Container types always need deferred parsing so the + # full value is available for json.loads / + # ast.literal_eval — even when the first streaming + # token is just "\n". + is_object_type = param_type == "object" is_complex_type = ( param_type in ["array", "arr", "sequence"] or param_type.startswith("dict") or param_type.startswith("list") ) - - # Only delay when contains container symbols - # and has single quotes and is complex type - has_container_hint = ( - ("[" in original_chunk) - or ("{" in original_chunk) - or ("(" in original_chunk) + # Boolean also needs deferral: streaming "t" as the + # first char would otherwise be converted to False and + # emit "false", shadowing the real "true" that follows. 
+ is_bool_type = param_type in ["boolean", "bool", "binary"] + # Numeric types need deferral too: a nullable + # parameter rendered as the literal "None" (Qwen3.5 + # template) or "null" (Qwen3.6 template) flips from + # the partial-string fallback to JSON ``null`` only + # when the FULL value is in. Without deferral the + # diff-based char emission would interleave the + # partial string ("Non") with the JSON literal + # ("null") and produce invalid output ("Nonl"). + is_numeric_type = ( + param_type.startswith("int") + or param_type.startswith("uint") + or param_type.startswith("long") + or param_type.startswith("short") + or param_type.startswith("unsigned") + or param_type.startswith("num") + or param_type.startswith("float") ) - # Determine if deferred parsing is needed - need_defer = False - if is_complex_type: - # Complex type, always need deferred parsing - need_defer = True - elif ( - is_object_type - and has_container_hint - and ("'" in original_chunk) - ): - # Object type with container symbols - # and single quotes, need deferred parsing - need_defer = True + # Nullable string params (``anyOf: [string, null]``) + # must defer too: the literal ``null`` / ``None`` is + # only recognisable when the full value is in. + # Without deferral, the streaming string path emits + # ``"`` + chars + ``"`` and the literal stays + # quoted. + is_nullable_string = param_type in [ + "string", + "str", + "text", + "varchar", + "char", + "enum", + ] and self._param_allows_null(self._pre_current_param_name) + need_defer = ( + is_complex_type + or is_object_type + or is_bool_type + or is_numeric_type + or is_nullable_string + ) if not need_defer: # No need for deferred parsing, @@ -573,6 +788,69 @@ def _preprocess_xml_chunk(self, chunk: str) -> str: self._pre_param_buffer += original_chunk return "" + # When a parameter value is being streamed (SAX state says we are + # inside a ), structural-looking tokens that arrive as + # subsequent elements are literal text — e.g. 
a file whose content + # describes the tool-call format. Escape them unless they are + # genuine structural delimiters. + if self.current_param_name is not None: + if chunk.startswith(self.tool_call_start_token) or chunk.startswith( + self.function_start_token + ): + # Opening tool_call/function tags are always literal inside + # a parameter value. Track nesting depth so that the + # matching ```` / ```` is also + # treated as literal even when its lookahead would + # otherwise satisfy the structural heuristic. + self._literal_tag_depth += 1 + self._literal_events_this_chunk += 1 + return self._escape_xml_special_chars(chunk) + if chunk.startswith(self.parameter_start_token): + # A structural always follows a newline in + # the buffer. When a schema is available, also require + # NAME to be a known parameter of the current function so + # that literal ```` inside file + # content is treated as text. A NAME already emitted + # for this tool (or equal to the param currently being + # parsed) is also literal text — a parser fixture or a + # file that documents the tool-call format. + if not self._is_structural_tag_position(): + return self._escape_xml_special_chars(chunk) + name_start = len(self.parameter_start_token) + name_end = chunk.find(">", name_start) + if name_end != -1: + candidate = chunk[name_start:name_end] + if self._is_already_emitted_param(candidate): + return self._escape_xml_special_chars(chunk) + valid_names = self._get_valid_param_names() + if valid_names is not None and candidate not in valid_names: + return self._escape_xml_special_chars(chunk) + if ( + chunk.startswith(self.parameter_end_token) + or chunk.startswith(self.function_end_token) + or chunk.startswith(self.tool_call_end_token) + ): + # Inside a literal nested tool_call/function (depth > 0), + # any closing tag pairs with the literal opener and is + # itself literal — regardless of what the lookahead says. + # ```` does not affect depth (parameters do + # not nest in the Qwen format). 
+ if self._literal_tag_depth > 0: + if chunk.startswith(self.function_end_token) or ( + chunk.startswith(self.tool_call_end_token) + ): + self._literal_tag_depth -= 1 + self._literal_events_this_chunk += 1 + else: + # Literal `` inside a nested literal + # block — count it as a literal event so the + # post-processing fallback knows the chunk + # contained literals and skips spurious closes. + self._literal_events_this_chunk += 1 + return self._escape_xml_special_chars(chunk) + if not self._is_structural_closing_tag(chunk): + return self._escape_xml_special_chars(chunk) + # Parameter start: enable accumulation if processed.startswith("', processed) @@ -593,6 +871,12 @@ def _emit_delta(self, delta: DeltaMessage): """Emit Delta response (streaming output)""" self.deltas.append(delta) + def _get_call_id_for_delta(self) -> str | None: + if not self.id_emitted: + self.id_emitted = True + return self.current_call_id + return None + def _auto_close_open_parameter_if_needed(self, incoming_tag: str | None = None): """Before starting to process new elements, if there are unclosed tags from before, @@ -648,7 +932,7 @@ def _start_element(self, name: str, attrs: dict[str, str]): tool_calls=[ DeltaToolCall( index=self.tool_call_index - 1, - id=self.current_call_id, + id=self._get_call_id_for_delta(), type="function", function=DeltaFunctionCall( name=function_name, arguments="" @@ -679,7 +963,7 @@ def _start_element(self, name: str, attrs: dict[str, str]): tool_calls=[ DeltaToolCall( index=self.tool_call_index - 1, - id=self.current_call_id, + id=self._get_call_id_for_delta(), type="function", function=DeltaFunctionCall( name=None, arguments=json_start @@ -697,7 +981,7 @@ def _start_element(self, name: str, attrs: dict[str, str]): tool_calls=[ DeltaToolCall( index=self.tool_call_index - 1, - id=self.current_call_id, + id=self._get_call_id_for_delta(), type="function", function=DeltaFunctionCall( name=None, arguments=json_continue @@ -740,7 +1024,7 @@ def _char_data(self, data: 
str): tool_calls=[ DeltaToolCall( index=self.tool_call_index - 1, - id=self.current_call_id, + id=self._get_call_id_for_delta(), type="function", function=DeltaFunctionCall(name=None, arguments='"'), ) @@ -775,7 +1059,7 @@ def _char_data(self, data: str): tool_calls=[ DeltaToolCall( index=self.tool_call_index - 1, - id=self.current_call_id, + id=self._get_call_id_for_delta(), type="function", function=DeltaFunctionCall(name=None, arguments=delta_data), ) @@ -799,7 +1083,9 @@ def _end_element(self, name: str): if ( name.startswith("parameter") or name == "parameter" ) and self.current_param_name: - # End current parameter + # End current parameter; reset literal-tag depth tracker + # since we are leaving the param's value scope. + self._literal_tag_depth = 0 param_name = self.current_param_name param_value = self.current_param_value @@ -812,27 +1098,118 @@ def _end_element(self, name: str): if self.deferred_param_raw_value else param_value ) - parsed_value = None - output_arguments = None - try: - # If previously delayed trailing newline, - # add it back before parsing - if self.should_emit_end_newline: - raw_for_parse = raw_text + "\n" + parsed_value: Any = None + output_arguments: str | None = None + if self.should_emit_end_newline: + raw_for_parse = raw_text + "\n" + else: + raw_for_parse = raw_text + # Nullable-string short-circuit: when the schema is + # ``anyOf: [string, null]``, ``"null"`` and Python's + # ``"None"`` map to JSON null. Any other value is + # kept verbatim as a string — never parsed as int, + # float, JSON, etc., even if it LOOKS like one. 
+ _param_type_for_check = self._get_param_type(param_name) + if _param_type_for_check in [ + "string", + "str", + "text", + "varchar", + "char", + "enum", + ] and self._param_allows_null(param_name): + if raw_for_parse.strip().lower() in ("null", "none"): + parsed_value = None + output_arguments = "null" else: - raw_for_parse = raw_text - parsed_value = ast.literal_eval(raw_for_parse) - output_arguments = json.dumps(parsed_value, ensure_ascii=False) - except Exception: - # Fallback: output as string as-is - output_arguments = json.dumps(raw_text, ensure_ascii=False) - parsed_value = raw_text + parsed_value = raw_for_parse + output_arguments = json.dumps(raw_for_parse, ensure_ascii=False) + delta = DeltaMessage( + tool_calls=[ + DeltaToolCall( + index=self.tool_call_index - 1, + id=self._get_call_id_for_delta(), + type="function", + function=DeltaFunctionCall( + name=None, arguments=output_arguments + ), + ) + ] + ) + self._emit_delta(delta) + self.parameters[param_name] = parsed_value + self.current_param_name = None + self.current_param_value = "" + self.current_param_value_converted = "" + self.start_quote_emitted = False + self.should_emit_end_newline = False + self.defer_current_parameter = False + self.deferred_param_raw_value = "" + return + raw_lower = raw_for_parse.strip().lower() + # Handle JSON literals that ast.literal_eval cannot parse + # (true/false/null are JSON, not Python). + if raw_lower == "null": + parsed_value = None + output_arguments = "null" + elif raw_lower == "true": + parsed_value = True + output_arguments = "true" + elif raw_lower == "false": + parsed_value = False + output_arguments = "false" + else: + # Try JSON first: handles arrays/objects that use JSON + # native tokens (true, false, null) which + # ast.literal_eval cannot parse. + try: + parsed_value = json.loads(raw_for_parse) + # A model trained with a buggy template + # (json.dumps(str(dict))) may output a JSON-encoded + # Python repr like "\"{'k': 'v'}\"". 
json.loads + # returns a str in that case — try one more level. + if isinstance(parsed_value, str): + try: + parsed_value = ast.literal_eval(parsed_value) + except (ValueError, SyntaxError, TypeError): + with contextlib.suppress( + json.JSONDecodeError, ValueError + ): + parsed_value = json.loads(parsed_value) + output_arguments = json.dumps(parsed_value, ensure_ascii=False) + except (json.JSONDecodeError, ValueError): + try: + parsed_value = ast.literal_eval(raw_for_parse) + # A model trained with a buggy template + # (json.dumps(str(dict))) may output a + # JSON-encoded Python repr like "{'k': 'v'}". + # ast.literal_eval returns a str in that + # case — try one more level. + if isinstance(parsed_value, str): + try: + parsed_value = ast.literal_eval(parsed_value) + except ( + ValueError, + SyntaxError, + TypeError, + ): + with contextlib.suppress( + json.JSONDecodeError, ValueError + ): + parsed_value = json.loads(parsed_value) + output_arguments = json.dumps( + parsed_value, ensure_ascii=False + ) + except (ValueError, SyntaxError, TypeError): + # Fallback: output as string as-is + output_arguments = json.dumps(raw_text, ensure_ascii=False) + parsed_value = raw_text delta = DeltaMessage( tool_calls=[ DeltaToolCall( index=self.tool_call_index - 1, - id=self.current_call_id, + id=self._get_call_id_for_delta(), type="function", function=DeltaFunctionCall( name=None, arguments=output_arguments @@ -868,7 +1245,7 @@ def _end_element(self, name: str): tool_calls=[ DeltaToolCall( index=self.tool_call_index - 1, - id=self.current_call_id, + id=self._get_call_id_for_delta(), type="function", function=DeltaFunctionCall(name=None, arguments='""'), ) @@ -881,7 +1258,7 @@ def _end_element(self, name: str): tool_calls=[ DeltaToolCall( index=self.tool_call_index - 1, - id=self.current_call_id, + id=self._get_call_id_for_delta(), type="function", function=DeltaFunctionCall(name=None, arguments='"'), ) @@ -904,7 +1281,7 @@ def _end_element(self, name: str): tool_calls=[ 
DeltaToolCall( index=self.tool_call_index - 1, - id=self.current_call_id, + id=self._get_call_id_for_delta(), type="function", function=DeltaFunctionCall(name=None, arguments="}"), ) @@ -917,7 +1294,7 @@ def _end_element(self, name: str): tool_calls=[ DeltaToolCall( index=self.tool_call_index - 1, - id=self.current_call_id, + id=self._get_call_id_for_delta(), type="function", function=DeltaFunctionCall(name=None, arguments="{}"), ) @@ -940,7 +1317,7 @@ def _end_element(self, name: str): tool_calls=[ DeltaToolCall( index=self.tool_call_index - 1, - id=self.current_call_id, + id=self._get_call_id_for_delta(), type="function", function=DeltaFunctionCall(name=None, arguments=""), ) @@ -1003,11 +1380,52 @@ def _get_param_type(self, param_name: str) -> str: properties = find_tool_properties(self.tools, self.current_function_name) if param_name in properties and isinstance(properties[param_name], dict): - return self.repair_param_type( - str(properties[param_name].get("type", "string")) - ) + prop = properties[param_name] + param_type = prop.get("type") + if isinstance(param_type, list): + # JSON-Schema list-form type, e.g. + # {"type": ["integer", "null"]}. Pick the first non-null + # type, mirroring the anyOf handling below. + for option_type in param_type: + if str(option_type).lower() != "null": + return self.repair_param_type(str(option_type)) + return "string" + if param_type is None and "anyOf" in prop: + # Handle anyOf schemas (e.g. nullable types like + # anyOf: [{type: "integer"}, {type: "null"}]). + # Pick the first non-null type; fall back to "string". 
+ for option in prop["anyOf"]: + if isinstance(option, dict) and "type" in option: + opt_type = str(option["type"]) + if opt_type != "null": + return self.repair_param_type(opt_type) + return "string" + + return self.repair_param_type(str(param_type or "string")) return "string" + def _param_allows_null(self, param_name: str | None) -> bool: + """Return True when the schema for ``param_name`` admits a null + value — either via ``"type": "null"`` or as one alternative in + an ``anyOf`` union. Used to recognise the literal ``"null"`` / + ``"None"`` as JSON null even when the primary type is string. + """ + if not self.tools or not self.current_function_name or not param_name: + return False + properties = find_tool_properties(self.tools, self.current_function_name) + if param_name not in properties or not isinstance(properties[param_name], dict): + return False + prop = properties[param_name] + if str(prop.get("type", "")).lower() == "null": + return True + for option in prop.get("anyOf", []) or []: + if ( + isinstance(option, dict) + and str(option.get("type", "")).lower() == "null" + ): + return True + return False + def repair_param_type(self, param_type: str) -> str: """Repair unknown parameter types by treating them as string Args: @@ -1045,13 +1463,29 @@ def _convert_param_value(self, param_value: str, param_type: str) -> Any: Returns: Converted value """ - if param_value.lower() == "null": - return None - param_type = param_type.strip().lower() + # Nullable schemas (``anyOf: [string, null]`` or similar): the + # primary type may be string but the literal ``"null"`` / + # ``"None"`` must still convert to JSON null. Caller passes the + # current parameter name via the parser state so we can query + # the schema. 
+ if self._param_allows_null(self.current_param_name) and param_value.lower() in ( + "null", + "none", + ): + return None + # String type takes precedence: the literal value "null" must remain + # the string "null" instead of being converted to Python None. if param_type in ["string", "str", "text", "varchar", "char", "enum"]: return param_value - elif ( + # Non-string: "null" → Python None → JSON null. Also accept the + # Python literal "None" so that Qwen3.5-trained models — whose + # chat template renders null args via ``| string`` (yielding the + # literal "None" in the prompt) — round-trip nullable values + # correctly. + if param_value.lower() in ("null", "none"): + return None + if ( param_type.startswith("int") or param_type.startswith("uint") or param_type.startswith("long") @@ -1062,11 +1496,10 @@ def _convert_param_value(self, param_value: str, param_type: str) -> Any: return int(param_value) except (ValueError, TypeError): logger.warning( - "Parsed value '%s' of parameter '%s' is not an integer " - "in tool '%s', degenerating to string.", + "Parsed value '%s' is not an integer, degenerating to string.", param_value, ) - return param_value + return param_value elif param_type.startswith("num") or param_type.startswith("float"): try: float_param_value: float = float(param_value) @@ -1077,14 +1510,12 @@ def _convert_param_value(self, param_value: str, param_type: str) -> Any: ) except (ValueError, TypeError): logger.warning( - "Parsed value '%s' of parameter '%s' is not a float " - "in tool '%s', degenerating to string.", + "Parsed value '%s' is not a float, degenerating to string.", param_value, ) - return param_value + return param_value elif param_type in ["boolean", "bool", "binary"]: - param_value = param_value.lower() - return param_value == "true" + return param_value.lower() == "true" else: return param_value @@ -1098,9 +1529,12 @@ def _convert_for_json_streaming(self, converted_value: Any, param_type: str) -> Returns: Converted string for 
streaming output """ - # Check if value is empty, but exclude numeric 0 - if converted_value is None or converted_value == "": + # Empty string: no output. + if converted_value == "": return "" + # None → JSON null literal (e.g. for nullable integer/object params). + if converted_value is None: + return "null" if param_type in ["string", "str", "text", "varchar", "char", "enum"]: # String type, remove double quotes @@ -1126,6 +1560,7 @@ def _reset_xml_parser_after_tool_call(self): if self.current_call_id: self.last_completed_call_id = self.current_call_id self.current_call_id = None + self.id_emitted = False self.current_function_name = None self.current_function_open = False self.parameters = {} @@ -1179,6 +1614,13 @@ def extract_tool_calls( tool_calls = [] for tool_call in result.tool_calls: if tool_call.function and tool_call.function.name: + # Reject phantom tool calls produced when the model + # writes an unrendered Jinja template or pseudo-XML + # in its response (e.g. ````). + # Surfacing such names as real tool calls causes + # "tool not found" errors at the client. + if not _is_valid_function_name(tool_call.function.name): + continue tool_calls.append( ToolCall( id=tool_call.id, @@ -1235,6 +1677,7 @@ def extract_tool_calls_streaming( ) -> DeltaMessage | None: if not previous_text: self.parser.reset_streaming_state() + self.parser._streaming_mode = True # Reset tool call tracking arrays for new streaming session self.prev_tool_call_arr = [] self.streamed_args_for_tool = [] @@ -1296,3 +1739,11 @@ def extract_tool_calls_streaming( # If no content and no tool calls, return None to indicate no update return None return delta + + def get_structural_tag(self, request: ChatCompletionRequest): + return get_model_structural_tag( + model="qwen_3_5", + tools=request.tools, + tool_choice=request.tool_choice, + reasoning=get_enable_structured_outputs_in_reasoning(), + )