From 0d94472891894f68546227cb5fefd999bb0589d9 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 22 May 2026 14:29:14 +0000
Subject: [PATCH 1/2] Studio: inject synthetic respond tool into agentic loop

Small local models (~8B) routinely emit bare assistant text mid-loop
when they should be calling a tool. Today that triggers the
_MAX_REPROMPTS plan-without-action nudge, and the request often ends
up timing out. This patch gives the model a structured exit by
injecting a synthetic respond(message: str) tool into the agentic
loop's tools list. When the model calls respond, the loop strips the
call from the response and emits message as plain assistant content
with finish_reason=stop.

Injection is defensive:

* Empty / None tools list: skipped.
* The caller already supplied a tool named "respond": skipped (the
  client's semantics win).
* The input tools list is not mutated; a fresh list is constructed.

Wired into both generate_chat_completion_with_tools (GGUF /
llama-server) and run_safetensors_tool_loop (transformers).

Adapted from forge (https://github.com/antoinezambelli/forge, MIT).
---
 studio/backend/core/inference/llama_cpp.py    |  23 ++-
 .../core/inference/safetensors_agentic.py     |  22 +++
 studio/backend/core/inference/tools.py        |  68 ++++++++
 .../tests/test_safetensors_tool_loop.py       | 159 ++++++++++++++++++
 4 files changed, 271 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index bf8a3c04df..9491c904a1 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -4437,12 +4437,22 @@ def generate_chat_completion_with_tools(
           {"type": "content", "text": "token"}            -- streamed content tokens (cumulative)
           {"type": "reasoning", "text": "token"}          -- streamed reasoning tokens (cumulative)
         """
-        from core.inference.tools import execute_tool
+        from core.inference.tools import (
+            execute_tool,
+            extract_respond_message,
+            inject_respond_tool,
+            is_respond_call,
+        )
 
         if not self.is_loaded:
             raise RuntimeError("llama-server is not loaded")
 
         conversation = list(messages)
+
+        # Inject the synthetic respond tool so the model has a structured
+        # exit for plain assistant text. The unwrap path below strips
+        # the call and emits ``message`` as content.
+        tools, _respond_injected = inject_respond_tool(tools)
         url = f"{self.base_url}/v1/chat/completions"
         _accumulated_completion_tokens = 0
         _accumulated_predicted_ms = 0.0
@@ -5000,6 +5010,17 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
                 conversation.append(assistant_msg)
 
                 for tc in tool_calls or []:
+                    # Synthetic respond unwrap: emit the message as
+                    # plain content and end the loop. The status reset
+                    # below clears prev_text in the SSE consumer so the
+                    # message streams as a fresh cumulative.
+                    if _respond_injected and is_respond_call(tc):
+                        message = extract_respond_message(tc)
+                        yield {"type": "status", "text": ""}
+                        if message:
+                            yield {"type": "content", "text": message}
+                        return
+
                     func = tc.get("function", {})
                     tool_name = func.get("name", "")
                     raw_args = func.get("arguments", {})
diff --git a/studio/backend/core/inference/safetensors_agentic.py b/studio/backend/core/inference/safetensors_agentic.py
index 73bb3d090a..edde2b0bda 100644
--- a/studio/backend/core/inference/safetensors_agentic.py
+++ b/studio/backend/core/inference/safetensors_agentic.py
@@ -134,6 +134,18 @@ def run_safetensors_tool_loop(
     * ``{"type": "tool_end", "tool_name", "tool_call_id", "result"}``
     """
     conversation = list(messages)
+
+    # Inject the synthetic respond tool so the model has a structured
+    # exit for plain text. Unwrap below emits the message as content
+    # and returns. ``inject_respond_tool`` skips when ``tools`` is empty
+    # or already carries a tool named "respond".
+    from core.inference.tools import (
+        extract_respond_message,
+        inject_respond_tool,
+        is_respond_call,
+    )
+    tools, _respond_injected = inject_respond_tool(tools)
+
     tool_call_history: list[tuple[str, bool]] = []
     final_attempt_done = False
     allowed_tool_names = {
@@ -300,6 +312,16 @@ def run_safetensors_tool_loop(
         conversation.append(assistant_msg)
 
         for tc in tool_calls or []:
+            # Synthetic respond unwrap: emit the message as content and
+            # end the loop. Empty status resets the consumer's prev_text
+            # so the message streams as a fresh cumulative.
+            if _respond_injected and is_respond_call(tc):
+                message = extract_respond_message(tc)
+                yield {"type": "status", "text": ""}
+                if message:
+                    yield {"type": "content", "text": message}
+                return
+
             func = tc.get("function", {}) or {}
             tool_name = func.get("name", "") or ""
             arguments = _coerce_arguments(
diff --git a/studio/backend/core/inference/tools.py b/studio/backend/core/inference/tools.py
index 0e9cce7c3e..c6e494bd29 100644
--- a/studio/backend/core/inference/tools.py
+++ b/studio/backend/core/inference/tools.py
@@ -9,6 +9,7 @@
 
 import ast
 import http.client
+import json
 import os
 import signal
 
@@ -505,6 +506,73 @@ def _get_workdir(session_id: str | None = None) -> str:
 ALL_TOOLS = [WEB_SEARCH_TOOL, PYTHON_TOOL, TERMINAL_TOOL]
 
 
+# Synthetic ``respond`` tool, adapted from forge
+# (https://github.com/antoinezambelli/forge, MIT). Gives the model a
+# structured exit: it calls respond(message=...) instead of emitting
+# bare text. When the tool was injected, the agentic loops unwrap the
+# call, emit a non-empty ``message`` as assistant content, and return.
+RESPOND_TOOL_NAME = "respond"
+
+RESPOND_TOOL = {
+    "type": "function",
+    "function": {
+        "name": RESPOND_TOOL_NAME,
+        "description": (
+            "Respond to the user with a message. Use this when the user "
+            "is chatting, asking a question, when you need to ask a "
+            "clarifying question before proceeding, or when no other "
+            "tool action is needed. Also use this after completing the "
+            "user's request to report the result."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "message": {
+                    "type": "string",
+                    "description": "The message to send to the user.",
+                },
+            },
+            "required": ["message"],
+        },
+    },
+}
+
+
+def inject_respond_tool(tools: list[dict] | None) -> tuple[list[dict] | None, bool]:
+    """Return ``(tools + [RESPOND_TOOL], True)`` as a fresh list, or the input
+    unchanged as ``(tools, False)`` when it is empty/None or already has a tool
+    named ``respond``. Malformed (non-dict) entries are skipped, not raised on.
+    """
+    if not tools:
+        return tools, False
+    funcs = (t.get("function") for t in tools if isinstance(t, dict))
+    if any(isinstance(f, dict) and f.get("name") == RESPOND_TOOL_NAME for f in funcs):
+        return tools, False
+    return [*tools, RESPOND_TOOL], True
+
+
+def is_respond_call(tc: dict) -> bool:
+    """True if ``tc`` calls a tool named ``respond`` (synthetic or client-owned)."""
+    return (tc.get("function") or {}).get("name") == RESPOND_TOOL_NAME
+
+
+def extract_respond_message(tc: dict) -> str:
+    """Return the ``message`` argument of a respond call. ``arguments`` may be
+    a dict or a JSON-encoded string. Returns ``""`` when the payload is missing
+    or malformed, or when ``message`` is absent or not a string.
+    """
+    func = tc.get("function")
+    args = func.get("arguments") if isinstance(func, dict) else None
+    if isinstance(args, str):
+        try:
+            args = json.loads(args)
+        except (ValueError, RecursionError):  # JSONDecodeError is a ValueError
+            args = None
+    # Anything but a JSON object (list, number, null, ...) carries no message.
+    msg = args.get("message") if isinstance(args, dict) else None
+    return msg if isinstance(msg, str) else ""
+
+
 _TIMEOUT_UNSET = object()
 
 
diff --git a/studio/backend/tests/test_safetensors_tool_loop.py b/studio/backend/tests/test_safetensors_tool_loop.py
index 923af87c4f..692dfda786 100644
--- a/studio/backend/tests/test_safetensors_tool_loop.py
+++ b/studio/backend/tests/test_safetensors_tool_loop.py
@@ -41,6 +41,13 @@
     parse_tool_calls_from_text,
     strip_tool_markup,
 )
+from core.inference.tools import (
+    RESPOND_TOOL,
+    RESPOND_TOOL_NAME,
+    extract_respond_message,
+    inject_respond_tool,
+    is_respond_call,
+)
 from utils.datasets import is_gpt_oss_model_name
 
 
@@ -498,6 +505,158 @@ def test_max_iterations_caps_loop(self):
         assert contents and "final answer" in contents[-1]["text"]
 
 
+class TestRespondToolHelpers:
+    """Unit coverage for the synthetic respond tool helpers."""
+
+    def test_inject_into_real_tools(self):
+        base = [{"type": "function", "function": {"name": "web_search"}}]
+        out, injected = inject_respond_tool(base)
+        assert injected is True
+        # Original list not mutated.
+        assert len(base) == 1
+        assert len(out) == 2
+        assert out[-1]["function"]["name"] == RESPOND_TOOL_NAME
+
+    def test_inject_skips_empty(self):
+        out, injected = inject_respond_tool([])
+        assert injected is False
+        assert out == []
+
+    def test_inject_skips_none(self):
+        out, injected = inject_respond_tool(None)
+        assert injected is False
+        assert out is None
+
+    def test_inject_skips_on_collision(self):
+        base = [
+            {"type": "function", "function": {"name": "respond"}},
+            {"type": "function", "function": {"name": "python"}},
+        ]
+        out, injected = inject_respond_tool(base)
+        assert injected is False
+        assert out is base
+
+    def test_is_respond_call(self):
+        assert is_respond_call(
+            {"function": {"name": "respond", "arguments": "{}"}}
+        )
+        assert not is_respond_call(
+            {"function": {"name": "web_search", "arguments": "{}"}}
+        )
+        assert not is_respond_call({})
+
+    def test_extract_message_from_json_string(self):
+        tc = {"function": {"name": "respond", "arguments": '{"message":"hi"}'}}
+        assert extract_respond_message(tc) == "hi"
+
+    def test_extract_message_from_dict(self):
+        tc = {"function": {"name": "respond", "arguments": {"message": "hi"}}}
+        assert extract_respond_message(tc) == "hi"
+
+    def test_extract_message_bad_json_returns_empty(self):
+        tc = {"function": {"name": "respond", "arguments": "{not json}"}}
+        assert extract_respond_message(tc) == ""
+
+    def test_extract_message_missing_key_returns_empty(self):
+        tc = {"function": {"name": "respond", "arguments": '{"other":"x"}'}}
+        assert extract_respond_message(tc) == ""
+
+    def test_extract_message_non_string_returns_empty(self):
+        tc = {"function": {"name": "respond", "arguments": '{"message":123}'}}
+        assert extract_respond_message(tc) == ""
+
+
+class TestRespondToolUnwrap:
+    """Loop-level coverage for the unwrap path in the agentic loop."""
+
+    def test_respond_call_emits_message_as_content(self):
+        # Model calls respond({message:"hi"}); loop must emit content
+        # "hi" and end without executing any real tool.
+        loop, exec_fn = _make_loop(
+            turns = [
+                [
+                    '<tool_call>{"name":"respond",'
+                    '"arguments":{"message":"hi"}}</tool_call>'
+                ],
+            ],
+            exec_results = [],
+        )
+        events = _collect_events(loop)
+        contents = [e for e in events if e["type"] == "content"]
+        assert any(e.get("text") == "hi" for e in contents)
+        assert exec_fn.calls == []
+
+    def test_respond_call_with_empty_message(self):
+        # An empty message string still terminates cleanly with no
+        # crash and no leaked tool execution.
+        loop, exec_fn = _make_loop(
+            turns = [
+                [
+                    '<tool_call>{"name":"respond",'
+                    '"arguments":{"message":""}}</tool_call>'
+                ],
+            ],
+            exec_results = [],
+        )
+        events = _collect_events(loop)
+        assert exec_fn.calls == []
+        # Status reset is yielded even when the message is empty.
+        statuses = [e for e in events if e["type"] == "status"]
+        assert statuses
+
+    def test_real_tool_call_does_not_unwrap(self):
+        # A non-respond tool call goes through the normal execute path.
+        loop, exec_fn = _make_loop(
+            turns = [
+                [
+                    '<tool_call>{"name":"web_search",'
+                    '"arguments":{"query":"weather"}}</tool_call>'
+                ],
+                ["The weather is sunny."],
+            ],
+            exec_results = ["Sunny and 22C"],
+        )
+        events = _collect_events(loop)
+        assert exec_fn.calls == [("web_search", {"query": "weather"})]
+        contents = [e for e in events if e["type"] == "content"]
+        assert any("sunny" in c.get("text", "").lower() for c in contents)
+
+    def test_respond_call_when_client_owns_tool(self):
+        # When the caller supplies a real "respond" tool, the synthetic
+        # one is NOT injected and the call goes through execute_tool
+        # like any other tool, preserving the client's semantics.
+        turn_iter = iter([
+            [
+                '<tool_call>{"name":"respond",'
+                '"arguments":{"message":"hi"}}</tool_call>'
+            ],
+            ["thanks"],
+        ])
+
+        def _gen(_messages):
+            try:
+                chunks = next(turn_iter)
+            except StopIteration:
+                return
+            acc = ""
+            for c in chunks:
+                acc += c
+                yield acc
+
+        exec_fn = FakeExecuteTool(["ack"])
+        loop = run_safetensors_tool_loop(
+            single_turn = _gen,
+            messages = [{"role": "user", "content": "hi"}],
+            tools = [
+                {"type": "function", "function": {"name": "respond"}},
+            ],
+            execute_tool = exec_fn,
+        )
+        events = _collect_events(loop)
+        # The client's respond tool was executed, not unwrapped.
+        assert exec_fn.calls == [("respond", {"message": "hi"})]
+
+
 class TestStatusFormatting:
     def test_status_for_known_tools(self):
         # Use the private helper directly to verify status formatting.

From b1b2d019cdd2fd2ffcc324dec7fb6d94ee551b9e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 22 May 2026 14:30:13 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../core/inference/safetensors_agentic.py      |  1 +
 .../tests/test_safetensors_tool_loop.py        | 18 +++++++++---------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/studio/backend/core/inference/safetensors_agentic.py b/studio/backend/core/inference/safetensors_agentic.py
index edde2b0bda..a300e986b3 100644
--- a/studio/backend/core/inference/safetensors_agentic.py
+++ b/studio/backend/core/inference/safetensors_agentic.py
@@ -144,6 +144,7 @@ def run_safetensors_tool_loop(
         inject_respond_tool,
         is_respond_call,
     )
+
     tools, _respond_injected = inject_respond_tool(tools)
 
     tool_call_history: list[tuple[str, bool]] = []
diff --git a/studio/backend/tests/test_safetensors_tool_loop.py b/studio/backend/tests/test_safetensors_tool_loop.py
index 692dfda786..59d67e1e9e 100644
--- a/studio/backend/tests/test_safetensors_tool_loop.py
+++ b/studio/backend/tests/test_safetensors_tool_loop.py
@@ -537,9 +537,7 @@ def test_inject_skips_on_collision(self):
         assert out is base
 
     def test_is_respond_call(self):
-        assert is_respond_call(
-            {"function": {"name": "respond", "arguments": "{}"}}
-        )
+        assert is_respond_call({"function": {"name": "respond", "arguments": "{}"}})
         assert not is_respond_call(
             {"function": {"name": "web_search", "arguments": "{}"}}
         )
@@ -625,13 +623,15 @@ def test_respond_call_when_client_owns_tool(self):
         # When the caller supplies a real "respond" tool, the synthetic
         # one is NOT injected and the call goes through execute_tool
         # like any other tool, preserving the client's semantics.
-        turn_iter = iter([
+        turn_iter = iter(
             [
-                '<tool_call>{"name":"respond",'
-                '"arguments":{"message":"hi"}}</tool_call>'
-            ],
-            ["thanks"],
-        ])
+                [
+                    '<tool_call>{"name":"respond",'
+                    '"arguments":{"message":"hi"}}</tool_call>'
+                ],
+                ["thanks"],
+            ]
+        )
 
         def _gen(_messages):
             try: