diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index bf8a3c04df..9491c904a1 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -4437,12 +4437,22 @@ def generate_chat_completion_with_tools(
{"type": "content", "text": "token"} -- streamed content tokens (cumulative)
{"type": "reasoning", "text": "token"} -- streamed reasoning tokens (cumulative)
"""
- from core.inference.tools import execute_tool
+ from core.inference.tools import (
+ execute_tool,
+ extract_respond_message,
+ inject_respond_tool,
+ is_respond_call,
+ )
if not self.is_loaded:
raise RuntimeError("llama-server is not loaded")
conversation = list(messages)
+
+ # Inject the synthetic respond tool so the model has a structured
+ # exit for plain assistant text. The unwrap path below strips
+ # the call and emits ``message`` as content.
+ tools, _respond_injected = inject_respond_tool(tools)
url = f"{self.base_url}/v1/chat/completions"
_accumulated_completion_tokens = 0
_accumulated_predicted_ms = 0.0
@@ -5000,6 +5010,17 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
conversation.append(assistant_msg)
for tc in tool_calls or []:
+ # Synthetic respond unwrap: emit the message as
+ # plain content and end the loop. The status reset
+ # below clears prev_text in the SSE consumer so the
+ # message streams as a fresh cumulative.
+ if _respond_injected and is_respond_call(tc):
+ message = extract_respond_message(tc)
+ yield {"type": "status", "text": ""}
+ if message:
+ yield {"type": "content", "text": message}
+ return
+
func = tc.get("function", {})
tool_name = func.get("name", "")
raw_args = func.get("arguments", {})
diff --git a/studio/backend/core/inference/safetensors_agentic.py b/studio/backend/core/inference/safetensors_agentic.py
index 73bb3d090a..a300e986b3 100644
--- a/studio/backend/core/inference/safetensors_agentic.py
+++ b/studio/backend/core/inference/safetensors_agentic.py
@@ -134,6 +134,19 @@ def run_safetensors_tool_loop(
* ``{"type": "tool_end", "tool_name", "tool_call_id", "result"}``
"""
conversation = list(messages)
+
+ # Inject the synthetic respond tool so the model has a structured
+ # exit for plain text. Unwrap below emits the message as content
+ # and returns. ``inject_respond_tool`` skips when ``tools`` is empty
+ # or already carries a tool named "respond".
+ from core.inference.tools import (
+ extract_respond_message,
+ inject_respond_tool,
+ is_respond_call,
+ )
+
+ tools, _respond_injected = inject_respond_tool(tools)
+
tool_call_history: list[tuple[str, bool]] = []
final_attempt_done = False
allowed_tool_names = {
@@ -300,6 +313,16 @@ def run_safetensors_tool_loop(
conversation.append(assistant_msg)
for tc in tool_calls or []:
+ # Synthetic respond unwrap: emit the message as content and
+ # end the loop. Empty status resets the consumer's prev_text
+ # so the message streams as a fresh cumulative.
+ if _respond_injected and is_respond_call(tc):
+ message = extract_respond_message(tc)
+ yield {"type": "status", "text": ""}
+ if message:
+ yield {"type": "content", "text": message}
+ return
+
func = tc.get("function", {}) or {}
tool_name = func.get("name", "") or ""
arguments = _coerce_arguments(
diff --git a/studio/backend/core/inference/tools.py b/studio/backend/core/inference/tools.py
index 0e9cce7c3e..c6e494bd29 100644
--- a/studio/backend/core/inference/tools.py
+++ b/studio/backend/core/inference/tools.py
@@ -9,6 +9,7 @@
import ast
import http.client
+import json
import os
import signal
@@ -505,6 +506,73 @@ def _get_workdir(session_id: str | None = None) -> str:
ALL_TOOLS = [WEB_SEARCH_TOOL, PYTHON_TOOL, TERMINAL_TOOL]
+# Synthetic respond tool. Adapted from forge
+# (https://github.com/antoinezambelli/forge, MIT). The model calls
+# respond(message=...) instead of producing bare text so the agentic
+# loop has a structured exit. Callers detect the call, skip tool
+# execution, and emit ``message`` as plain assistant content.
+RESPOND_TOOL_NAME = "respond"
+
+RESPOND_TOOL = {
+    "type": "function",
+    "function": {
+        "name": RESPOND_TOOL_NAME,
+        "description": (
+            "Respond to the user with a message. Use this when the user "
+            "is chatting, asking a question, when you need to ask a "
+            "clarifying question before proceeding, or when no other "
+            "tool action is needed. Also use this after completing the "
+            "user's request to report the result."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "message": {
+                    "type": "string",
+                    "description": "The message to send to the user.",
+                },
+            },
+            "required": ["message"],
+        },
+    },
+}
+
+
+def inject_respond_tool(tools: list[dict] | None) -> tuple[list[dict] | None, bool]:
+    """Return ``(tools + [RESPOND_TOOL], True)`` as a fresh list.
+
+    The input list is never mutated. When ``tools`` is empty/None or
+    already declares a tool named ``respond``, it is returned as-is
+    with ``False`` so the caller's own tool keeps its semantics.
+    """
+    names = ((t.get("function") or {}).get("name") for t in tools or ())
+    if not tools or RESPOND_TOOL_NAME in names:
+        return tools, False
+    return [*tools, RESPOND_TOOL], True
+
+
+def is_respond_call(tc: dict) -> bool:
+    """Report whether tool call ``tc`` targets the synthetic ``respond`` tool."""
+    return RESPOND_TOOL_NAME == (tc.get("function") or {}).get("name")
+
+
+def extract_respond_message(tc: dict) -> str:
+    """Return the ``message`` argument of a respond call, or ``""``.
+
+    ``arguments`` may be a JSON string or a dict; anything missing,
+    malformed (bad JSON, runaway nesting) or non-string yields ``""``.
+    """
+    args = (tc.get("function") or {}).get("arguments", "")
+    if isinstance(args, str):
+        try:
+            args = json.loads(args)
+        except (ValueError, RecursionError):
+            # JSONDecodeError is a ValueError; deep nesting raises RecursionError.
+            return ""
+    msg = args.get("message", "") if isinstance(args, dict) else ""
+    return msg if isinstance(msg, str) else ""
+
+
_TIMEOUT_UNSET = object()
diff --git a/studio/backend/tests/test_safetensors_tool_loop.py b/studio/backend/tests/test_safetensors_tool_loop.py
index 923af87c4f..59d67e1e9e 100644
--- a/studio/backend/tests/test_safetensors_tool_loop.py
+++ b/studio/backend/tests/test_safetensors_tool_loop.py
@@ -41,6 +41,13 @@
parse_tool_calls_from_text,
strip_tool_markup,
)
+from core.inference.tools import (
+ RESPOND_TOOL,
+ RESPOND_TOOL_NAME,
+ extract_respond_message,
+ inject_respond_tool,
+ is_respond_call,
+)
from utils.datasets import is_gpt_oss_model_name
@@ -498,6 +505,158 @@ def test_max_iterations_caps_loop(self):
assert contents and "final answer" in contents[-1]["text"]
+class TestRespondToolHelpers:
+    """Unit tests for the synthetic respond tool helpers (inject/detect/extract)."""
+
+    def test_inject_into_real_tools(self):
+        base = [{"type": "function", "function": {"name": "web_search"}}]
+        out, injected = inject_respond_tool(base)
+        assert injected is True
+        # The caller's list must stay untouched; injection returns a copy.
+        assert len(base) == 1
+        assert len(out) == 2
+        assert out[-1]["function"]["name"] == RESPOND_TOOL_NAME
+
+    def test_inject_skips_empty(self):
+        out, injected = inject_respond_tool([])
+        assert injected is False
+        assert out == []
+
+    def test_inject_skips_none(self):
+        out, injected = inject_respond_tool(None)
+        assert injected is False
+        assert out is None
+
+    def test_inject_skips_on_collision(self):
+        base = [
+            {"type": "function", "function": {"name": "respond"}},
+            {"type": "function", "function": {"name": "python"}},
+        ]
+        out, injected = inject_respond_tool(base)
+        assert injected is False
+        assert out is base
+
+    def test_is_respond_call(self):
+        assert is_respond_call({"function": {"name": "respond", "arguments": "{}"}})
+        assert not is_respond_call(
+            {"function": {"name": "web_search", "arguments": "{}"}}
+        )
+        assert not is_respond_call({})
+
+    def test_extract_message_from_json_string(self):
+        tc = {"function": {"name": "respond", "arguments": '{"message":"hi"}'}}
+        assert extract_respond_message(tc) == "hi"
+
+    def test_extract_message_from_dict(self):
+        tc = {"function": {"name": "respond", "arguments": {"message": "hi"}}}
+        assert extract_respond_message(tc) == "hi"
+
+    def test_extract_message_bad_json_returns_empty(self):
+        tc = {"function": {"name": "respond", "arguments": "{not json}"}}
+        assert extract_respond_message(tc) == ""
+
+    def test_extract_message_missing_key_returns_empty(self):
+        tc = {"function": {"name": "respond", "arguments": '{"other":"x"}'}}
+        assert extract_respond_message(tc) == ""
+
+    def test_extract_message_non_string_returns_empty(self):
+        tc = {"function": {"name": "respond", "arguments": '{"message":123}'}}
+        assert extract_respond_message(tc) == ""
+
+
+class TestRespondToolUnwrap:
+    """Loop-level coverage for the synthetic respond unwrap path."""
+
+    def test_respond_call_emits_message_as_content(self):
+        # Model calls respond({message:"hi"}); loop must emit content
+        # "hi" and end without executing any real tool.
+        loop, exec_fn = _make_loop(
+            turns = [
+                [
+                    '{"name":"respond",'
+                    '"arguments":{"message":"hi"}}'
+                ],
+            ],
+            exec_results = [],
+        )
+        events = _collect_events(loop)
+        contents = [e for e in events if e["type"] == "content"]
+        assert any(e.get("text") == "hi" for e in contents)
+        assert exec_fn.calls == []
+
+    def test_respond_call_with_empty_message(self):
+        # An empty message string still terminates cleanly with no
+        # crash and no leaked tool execution.
+        loop, exec_fn = _make_loop(
+            turns = [
+                [
+                    '{"name":"respond",'
+                    '"arguments":{"message":""}}'
+                ],
+            ],
+            exec_results = [],
+        )
+        events = _collect_events(loop)
+        assert exec_fn.calls == []
+        # The empty-text status reset is yielded even with no message.
+        statuses = [e for e in events if e["type"] == "status"]
+        assert any(e.get("text") == "" for e in statuses)
+
+    def test_real_tool_call_does_not_unwrap(self):
+        # A non-respond tool call goes through the normal execute path.
+        loop, exec_fn = _make_loop(
+            turns = [
+                [
+                    '{"name":"web_search",'
+                    '"arguments":{"query":"weather"}}'
+                ],
+                ["The weather is sunny."],
+            ],
+            exec_results = ["Sunny and 22C"],
+        )
+        events = _collect_events(loop)
+        assert exec_fn.calls == [("web_search", {"query": "weather"})]
+        contents = [e for e in events if e["type"] == "content"]
+        assert any("sunny" in c.get("text", "").lower() for c in contents)
+
+    def test_respond_call_when_client_owns_tool(self):
+        # When the caller supplies a real "respond" tool, the synthetic
+        # one is NOT injected and the call goes through execute_tool
+        # like any other tool, preserving the client's semantics.
+        turn_iter = iter(
+            [
+                [
+                    '{"name":"respond",'
+                    '"arguments":{"message":"hi"}}'
+                ],
+                ["thanks"],
+            ]
+        )
+
+        def _gen(_messages):
+            try:
+                chunks = next(turn_iter)
+            except StopIteration:
+                return
+            acc = ""
+            for c in chunks:
+                acc += c
+                yield acc
+
+        exec_fn = FakeExecuteTool(["ack"])
+        loop = run_safetensors_tool_loop(
+            single_turn = _gen,
+            messages = [{"role": "user", "content": "hi"}],
+            tools = [
+                {"type": "function", "function": {"name": "respond"}},
+            ],
+            execute_tool = exec_fn,
+        )
+        _collect_events(loop)
+        # Draining the loop ran the client's respond tool, not the unwrap.
+        assert exec_fn.calls == [("respond", {"message": "hi"})]
+
+
class TestStatusFormatting:
def test_status_for_known_tools(self):
# Use the private helper directly to verify status formatting.