diff --git a/tests/test_responses_api.py b/tests/test_responses_api.py index 769199b8..94c9224e 100644 --- a/tests/test_responses_api.py +++ b/tests/test_responses_api.py @@ -574,6 +574,68 @@ def test_streaming_response_sequence_metadata_is_monotonic(self, client): assert created_payload["response"]["id"] == completed_payload["response"]["id"] assert completed_payload["response"]["output_text"] == "Hello stream" + def test_streaming_response_bracket_tool_call_does_not_leak_text( + self, client, monkeypatch + ): + import vllm_mlx.server as srv + + engine = _mock_engine(_output("unused")) + engine.chat = AsyncMock( + side_effect=AssertionError("stream path should not call chat") + ) + engine._stream_outputs = [ + _stream_output('[Calling tool: add({"a": 1, "b": 2})'), + _stream_output("]", completion_tokens=2, finish_reason="stop"), + ] + srv._engine = engine + monkeypatch.setattr(srv, "_enable_auto_tool_choice", True) + monkeypatch.setattr(srv, "_tool_call_parser", "qwen3") + monkeypatch.setattr(srv, "_tool_parser_instance", None) + monkeypatch.setattr(srv, "_reasoning_parser", None) + + with client.stream( + "POST", + "/v1/responses", + json={ + "model": "test-model", + "input": "Add two numbers", + "stream": True, + "tools": [ + { + "type": "function", + "name": "add", + "description": "Add two numbers", + "parameters": { + "type": "object", + "properties": { + "a": {"type": "integer"}, + "b": {"type": "integer"}, + }, + "required": ["a", "b"], + }, + } + ], + }, + ) as resp: + body = "".join(resp.iter_text()) + + assert resp.status_code == 200 + events = _parse_sse_events(body) + output_text_deltas = [ + payload["delta"] + for event_type, payload in events + if event_type == "response.output_text.delta" + ] + function_call_deltas = [ + payload + for event_type, payload in events + if event_type == "response.function_call_arguments.delta" + ] + + assert not any("[Calling tool:" in delta for delta in output_text_deltas) + assert len(function_call_deltas) == 1 + assert function_call_deltas[0]["delta"] == '{"a": 1, "b": 2}' + def test_json_object_response_format_is_rejected(self, client): import vllm_mlx.server as srv diff --git a/tests/test_tool_parsers.py b/tests/test_tool_parsers.py index 1d37e948..6e0211a3 100644 --- a/tests/test_tool_parsers.py +++ b/tests/test_tool_parsers.py @@ -1286,6 +1286,29 @@ def test_streaming_function_format_complete(self, parser): break assert tool_calls_found + def test_streaming_bracket_call_closing_marker_split(self, parser): + """Qwen bracket calls should complete when ')' and ']' split chunks.""" + chunks = [ + '[Calling tool: add({"a": 1, "b": 2})', + "]", + ] + + accumulated = "" + emitted = None + for chunk in chunks: + previous = accumulated + accumulated += chunk + emitted = parser.extract_tool_calls_streaming( + previous_text=previous, + current_text=accumulated, + delta_text=chunk, + ) + + assert emitted is not None + assert "tool_calls" in emitted + assert emitted["tool_calls"][0]["function"]["name"] == "add" + assert emitted["tool_calls"][0]["function"]["arguments"] == ('{"a": 1, "b": 2}') + def test_streaming_partial_marker_buffered(self, parser): """Test that partial ' list[str]: content = SPECIAL_TOKENS_PATTERN.sub("", delta_text) if tool_parser and delta_text: - if not tool_markup_possible and "<" not in delta_text: + # Fast path: skip parsing until a tool-markup marker appears. + # Use _streaming_tool_markup_possible to catch all supported + # shapes (, " in delta_text or ")]" in delta_text: + # If we're in a tool call, accumulate and parse at the end. + # Check current_text (accumulated), not delta_text — closing markers + # like ")]" or "" often span token boundaries and may + # never appear within a single delta chunk. + if "" in current_text or ")]" in current_text: # Tool call complete, parse the whole thing result = self.extract_tool_calls(current_text) if result.tools_called: