From 0d94472891894f68546227cb5fefd999bb0589d9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 22 May 2026 14:29:14 +0000 Subject: [PATCH 1/2] Studio: inject synthetic respond tool into agentic loop Small local models (~8B) routinely emit bare assistant text mid-loop when they should be calling a tool. Today that triggers the _MAX_REPROMPTS plan-without-action nudge and often times out. This patch gives the model a structured exit by injecting a synthetic respond(message: str) tool into the agentic loop's tools list. When the model calls respond, the loop strips the call from the response and emits message as plain assistant content with finish_reason=stop. Injection is defensive: * Empty / None tools list: skipped. * The caller already supplied a tool named "respond": skipped (the client's semantics win). * The input tools list is not mutated; a fresh list is constructed. Wired into both generate_chat_completion_with_tools (GGUF / llama-server) and run_safetensors_tool_loop (transformers). Adapted from forge (https://github.com/antoinezambelli/forge, MIT). --- studio/backend/core/inference/llama_cpp.py | 23 ++- .../core/inference/safetensors_agentic.py | 22 +++ studio/backend/core/inference/tools.py | 68 ++++++++ .../tests/test_safetensors_tool_loop.py | 159 ++++++++++++++++++ 4 files changed, 271 insertions(+), 1 deletion(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index bf8a3c04df..9491c904a1 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -4437,12 +4437,22 @@ def generate_chat_completion_with_tools( {"type": "content", "text": "token"} -- streamed content tokens (cumulative) {"type": "reasoning", "text": "token"} -- streamed reasoning tokens (cumulative) """ - from core.inference.tools import execute_tool + from core.inference.tools import ( + execute_tool, + extract_respond_message, + inject_respond_tool, + is_respond_call, + ) if not self.is_loaded: raise RuntimeError("llama-server is not loaded") conversation = list(messages) + + # Inject the synthetic respond tool so the model has a structured + # exit for plain assistant text. The unwrap path below strips + # the call and emits ``message`` as content. + tools, _respond_injected = inject_respond_tool(tools) url = f"{self.base_url}/v1/chat/completions" _accumulated_completion_tokens = 0 _accumulated_predicted_ms = 0.0 @@ -5000,6 +5010,17 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str: conversation.append(assistant_msg) for tc in tool_calls or []: + # Synthetic respond unwrap: emit the message as + # plain content and end the loop. The status reset + # below clears prev_text in the SSE consumer so the + # message streams as a fresh cumulative. + if _respond_injected and is_respond_call(tc): + message = extract_respond_message(tc) + yield {"type": "status", "text": ""} + if message: + yield {"type": "content", "text": message} + return + func = tc.get("function", {}) tool_name = func.get("name", "") raw_args = func.get("arguments", {}) diff --git a/studio/backend/core/inference/safetensors_agentic.py b/studio/backend/core/inference/safetensors_agentic.py index 73bb3d090a..edde2b0bda 100644 --- a/studio/backend/core/inference/safetensors_agentic.py +++ b/studio/backend/core/inference/safetensors_agentic.py @@ -134,6 +134,18 @@ def run_safetensors_tool_loop( * ``{"type": "tool_end", "tool_name", "tool_call_id", "result"}`` """ conversation = list(messages) + + # Inject the synthetic respond tool so the model has a structured + # exit for plain text. Unwrap below emits the message as content + # and returns. ``inject_respond_tool`` skips when ``tools`` is empty + # or already carries a tool named "respond". + from core.inference.tools import ( + extract_respond_message, + inject_respond_tool, + is_respond_call, + ) + tools, _respond_injected = inject_respond_tool(tools) + tool_call_history: list[tuple[str, bool]] = [] final_attempt_done = False allowed_tool_names = { @@ -300,6 +312,16 @@ def run_safetensors_tool_loop( conversation.append(assistant_msg) for tc in tool_calls or []: + # Synthetic respond unwrap: emit the message as content and + # end the loop. Empty status resets the consumer's prev_text + # so the message streams as a fresh cumulative. + if _respond_injected and is_respond_call(tc): + message = extract_respond_message(tc) + yield {"type": "status", "text": ""} + if message: + yield {"type": "content", "text": message} + return + func = tc.get("function", {}) or {} tool_name = func.get("name", "") or "" arguments = _coerce_arguments( diff --git a/studio/backend/core/inference/tools.py b/studio/backend/core/inference/tools.py index 0e9cce7c3e..c6e494bd29 100644 --- a/studio/backend/core/inference/tools.py +++ b/studio/backend/core/inference/tools.py @@ -9,6 +9,7 @@ import ast import http.client +import json import os import signal @@ -505,6 +506,73 @@ def _get_workdir(session_id: str | None = None) -> str: ALL_TOOLS = [WEB_SEARCH_TOOL, PYTHON_TOOL, TERMINAL_TOOL] +# Synthetic respond tool. Adapted from forge +# (https://github.com/antoinezambelli/forge, MIT). The model calls +# respond(message=...) instead of producing bare text so the agentic +# loop has a structured exit. The unwrap path strips the call from the +# response and emits the message as plain assistant content. +RESPOND_TOOL_NAME = "respond" + +RESPOND_TOOL = { + "type": "function", + "function": { + "name": RESPOND_TOOL_NAME, + "description": ( + "Respond to the user with a message. Use this when the user " + "is chatting, asking a question, when you need to ask a " + "clarifying question before proceeding, or when no other " + "tool action is needed. Also use this after completing the " + "user's request to report the result." + ), + "parameters": { + "type": "object", + "properties": { + "message": { + "type": "string", + "description": "The message to send to the user.", + }, + }, + "required": ["message"], + }, + }, +} + + +def inject_respond_tool(tools: list[dict] | None) -> tuple[list[dict] | None, bool]: + """Defensively append ``RESPOND_TOOL`` to ``tools``. Skips when the + list is empty/None or a tool named ``respond`` already exists. + Returns ``(new_tools, was_injected)``. The input list is not mutated. + """ + if not tools: + return tools, False + for t in tools: + if (t.get("function") or {}).get("name") == RESPOND_TOOL_NAME: + return tools, False + return list(tools) + [RESPOND_TOOL], True + + +def is_respond_call(tc: dict) -> bool: + """True if ``tc`` is a tool call to the synthetic ``respond`` tool.""" + return (tc.get("function") or {}).get("name") == RESPOND_TOOL_NAME + + +def extract_respond_message(tc: dict) -> str: + """Pull the ``message`` arg out of a respond call. Tolerates + arguments arriving as either a JSON string or a dict. Returns ``""`` + if missing or malformed. + """ + args = (tc.get("function") or {}).get("arguments", "") + if isinstance(args, str): + try: + args = json.loads(args) + except (json.JSONDecodeError, ValueError): + return "" + if not isinstance(args, dict): + return "" + msg = args.get("message", "") + return msg if isinstance(msg, str) else "" + + _TIMEOUT_UNSET = object() diff --git a/studio/backend/tests/test_safetensors_tool_loop.py b/studio/backend/tests/test_safetensors_tool_loop.py index 923af87c4f..692dfda786 100644 --- a/studio/backend/tests/test_safetensors_tool_loop.py +++ b/studio/backend/tests/test_safetensors_tool_loop.py @@ -41,6 +41,13 @@ parse_tool_calls_from_text, strip_tool_markup, ) +from core.inference.tools import ( + RESPOND_TOOL, + RESPOND_TOOL_NAME, + extract_respond_message, + inject_respond_tool, + is_respond_call, +) from utils.datasets import is_gpt_oss_model_name @@ -498,6 +505,158 @@ def test_max_iterations_caps_loop(self): assert contents and "final answer" in contents[-1]["text"] +class TestRespondToolHelpers: + """Unit coverage for the synthetic respond tool helpers.""" + + def test_inject_into_real_tools(self): + base = [{"type": "function", "function": {"name": "web_search"}}] + out, injected = inject_respond_tool(base) + assert injected is True + # Original list not mutated. + assert len(base) == 1 + assert len(out) == 2 + assert out[-1]["function"]["name"] == RESPOND_TOOL_NAME + + def test_inject_skips_empty(self): + out, injected = inject_respond_tool([]) + assert injected is False + assert out == [] + + def test_inject_skips_none(self): + out, injected = inject_respond_tool(None) + assert injected is False + assert out is None + + def test_inject_skips_on_collision(self): + base = [ + {"type": "function", "function": {"name": "respond"}}, + {"type": "function", "function": {"name": "python"}}, + ] + out, injected = inject_respond_tool(base) + assert injected is False + assert out is base + + def test_is_respond_call(self): + assert is_respond_call( + {"function": {"name": "respond", "arguments": "{}"}} + ) + assert not is_respond_call( + {"function": {"name": "web_search", "arguments": "{}"}} + ) + assert not is_respond_call({}) + + def test_extract_message_from_json_string(self): + tc = {"function": {"name": "respond", "arguments": '{"message":"hi"}'}} + assert extract_respond_message(tc) == "hi" + + def test_extract_message_from_dict(self): + tc = {"function": {"name": "respond", "arguments": {"message": "hi"}}} + assert extract_respond_message(tc) == "hi" + + def test_extract_message_bad_json_returns_empty(self): + tc = {"function": {"name": "respond", "arguments": "{not json}"}} + assert extract_respond_message(tc) == "" + + def test_extract_message_missing_key_returns_empty(self): + tc = {"function": {"name": "respond", "arguments": '{"other":"x"}'}} + assert extract_respond_message(tc) == "" + + def test_extract_message_non_string_returns_empty(self): + tc = {"function": {"name": "respond", "arguments": '{"message":123}'}} + assert extract_respond_message(tc) == "" + + +class TestRespondToolUnwrap: + """Loop-level coverage for the unwrap path in the agentic loop.""" + + def test_respond_call_emits_message_as_content(self): + # Model calls respond({message:"hi"}); loop must emit content + # "hi" and end without executing any real tool. + loop, exec_fn = _make_loop( + turns = [ + [ + '{"name":"respond",' + '"arguments":{"message":"hi"}}' + ], + ], + exec_results = [], + ) + events = _collect_events(loop) + contents = [e for e in events if e["type"] == "content"] + assert any(e.get("text") == "hi" for e in contents) + assert exec_fn.calls == [] + + def test_respond_call_with_empty_message(self): + # An empty message string still terminates cleanly with no + # crash and no leaked tool execution. + loop, exec_fn = _make_loop( + turns = [ + [ + '{"name":"respond",' + '"arguments":{"message":""}}' + ], + ], + exec_results = [], + ) + events = _collect_events(loop) + assert exec_fn.calls == [] + # Status reset is yielded even when the message is empty. + statuses = [e for e in events if e["type"] == "status"] + assert statuses + + def test_real_tool_call_does_not_unwrap(self): + # A non-respond tool call goes through the normal execute path. + loop, exec_fn = _make_loop( + turns = [ + [ + '{"name":"web_search",' + '"arguments":{"query":"weather"}}' + ], + ["The weather is sunny."], + ], + exec_results = ["Sunny and 22C"], + ) + events = _collect_events(loop) + assert exec_fn.calls == [("web_search", {"query": "weather"})] + contents = [e for e in events if e["type"] == "content"] + assert any("sunny" in c.get("text", "").lower() for c in contents) + + def test_respond_call_when_client_owns_tool(self): + # When the caller supplies a real "respond" tool, the synthetic + # one is NOT injected and the call goes through execute_tool + # like any other tool, preserving the client's semantics. + turn_iter = iter([ + [ + '{"name":"respond",' + '"arguments":{"message":"hi"}}' + ], + ["thanks"], + ]) + + def _gen(_messages): + try: + chunks = next(turn_iter) + except StopIteration: + return + acc = "" + for c in chunks: + acc += c + yield acc + + exec_fn = FakeExecuteTool(["ack"]) + loop = run_safetensors_tool_loop( + single_turn = _gen, + messages = [{"role": "user", "content": "hi"}], + tools = [ + {"type": "function", "function": {"name": "respond"}}, + ], + execute_tool = exec_fn, + ) + events = _collect_events(loop) + # The client's respond tool was executed, not unwrapped. + assert exec_fn.calls == [("respond", {"message": "hi"})] + + class TestStatusFormatting: def test_status_for_known_tools(self): # Use the private helper directly to verify status formatting. From b1b2d019cdd2fd2ffcc324dec7fb6d94ee551b9e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 22 May 2026 14:30:13 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../core/inference/safetensors_agentic.py | 1 + .../tests/test_safetensors_tool_loop.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/studio/backend/core/inference/safetensors_agentic.py b/studio/backend/core/inference/safetensors_agentic.py index edde2b0bda..a300e986b3 100644 --- a/studio/backend/core/inference/safetensors_agentic.py +++ b/studio/backend/core/inference/safetensors_agentic.py @@ -144,6 +144,7 @@ def run_safetensors_tool_loop( inject_respond_tool, is_respond_call, ) + tools, _respond_injected = inject_respond_tool(tools) tool_call_history: list[tuple[str, bool]] = [] diff --git a/studio/backend/tests/test_safetensors_tool_loop.py b/studio/backend/tests/test_safetensors_tool_loop.py index 692dfda786..59d67e1e9e 100644 --- a/studio/backend/tests/test_safetensors_tool_loop.py +++ b/studio/backend/tests/test_safetensors_tool_loop.py @@ -537,9 +537,7 @@ def test_inject_skips_on_collision(self): assert out is base def test_is_respond_call(self): - assert is_respond_call( - {"function": {"name": "respond", "arguments": "{}"}} - ) + assert is_respond_call({"function": {"name": "respond", "arguments": "{}"}}) assert not is_respond_call( {"function": {"name": "web_search", "arguments": "{}"}} ) @@ -625,13 +623,15 @@ def test_respond_call_when_client_owns_tool(self): # When the caller supplies a real "respond" tool, the synthetic # one is NOT injected and the call goes through execute_tool # like any other tool, preserving the client's semantics. - turn_iter = iter([ + turn_iter = iter( [ - '{"name":"respond",' - '"arguments":{"message":"hi"}}' - ], - ["thanks"], - ]) + [ + '{"name":"respond",' + '"arguments":{"message":"hi"}}' + ], + ["thanks"], + ] + ) def _gen(_messages): try: