Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4437,12 +4437,22 @@ def generate_chat_completion_with_tools(
{"type": "content", "text": "token"} -- streamed content tokens (cumulative)
{"type": "reasoning", "text": "token"} -- streamed reasoning tokens (cumulative)
"""
from core.inference.tools import execute_tool
from core.inference.tools import (
execute_tool,
extract_respond_message,
inject_respond_tool,
is_respond_call,
)

if not self.is_loaded:
raise RuntimeError("llama-server is not loaded")

conversation = list(messages)

# Inject the synthetic respond tool so the model has a structured
# exit for plain assistant text. The unwrap path below strips
# the call and emits ``message`` as content.
tools, _respond_injected = inject_respond_tool(tools)
url = f"{self.base_url}/v1/chat/completions"
_accumulated_completion_tokens = 0
_accumulated_predicted_ms = 0.0
Expand Down Expand Up @@ -5000,6 +5010,17 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
conversation.append(assistant_msg)

for tc in tool_calls or []:
# Synthetic respond unwrap: emit the message as
# plain content and end the loop. The status reset
# below clears prev_text in the SSE consumer so the
# message streams as a fresh cumulative.
if _respond_injected and is_respond_call(tc):
message = extract_respond_message(tc)
yield {"type": "status", "text": ""}
if message:
yield {"type": "content", "text": message}
return
Comment on lines +5017 to +5022

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The early return here skips the final metadata yield (usage and timings) that normally occurs at the end of the generate_chat_completion_with_tools function (lines 5331-5340). This will cause the request to report zero or missing token counts and performance metrics in the UI when the respond tool is used.

Consider yielding the accumulated metadata before returning, or refactoring the loop to allow a clean exit that reaches the final metadata block to ensure all computed values are returned for callers to reuse.

References
  1. To improve efficiency, avoid redundant data iterations. Combine checks and transformations into a single loop and return computed values for callers to reuse.


func = tc.get("function", {})
tool_name = func.get("name", "")
raw_args = func.get("arguments", {})
Expand Down
23 changes: 23 additions & 0 deletions studio/backend/core/inference/safetensors_agentic.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,19 @@ def run_safetensors_tool_loop(
* ``{"type": "tool_end", "tool_name", "tool_call_id", "result"}``
"""
conversation = list(messages)

# Inject the synthetic respond tool so the model has a structured
# exit for plain text. Unwrap below emits the message as content
# and returns. ``inject_respond_tool`` skips when ``tools`` is empty
# or already carries a tool named "respond".
from core.inference.tools import (
extract_respond_message,
inject_respond_tool,
is_respond_call,
)

tools, _respond_injected = inject_respond_tool(tools)

tool_call_history: list[tuple[str, bool]] = []
final_attempt_done = False
allowed_tool_names = {
Expand Down Expand Up @@ -300,6 +313,16 @@ def run_safetensors_tool_loop(
conversation.append(assistant_msg)

for tc in tool_calls or []:
# Synthetic respond unwrap: emit the message as content and
# end the loop. Empty status resets the consumer's prev_text
# so the message streams as a fresh cumulative.
if _respond_injected and is_respond_call(tc):
message = extract_respond_message(tc)
yield {"type": "status", "text": ""}
if message:
yield {"type": "content", "text": message}
return

func = tc.get("function", {}) or {}
tool_name = func.get("name", "") or ""
arguments = _coerce_arguments(
Expand Down
68 changes: 68 additions & 0 deletions studio/backend/core/inference/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import ast
import http.client
import json
import os
import signal

Expand Down Expand Up @@ -505,6 +506,73 @@ def _get_workdir(session_id: str | None = None) -> str:
ALL_TOOLS = [WEB_SEARCH_TOOL, PYTHON_TOOL, TERMINAL_TOOL]


# Synthetic respond tool. Adapted from forge
# (https://github.com/antoinezambelli/forge, MIT). The model calls
# respond(message=...) instead of producing bare text so the agentic
# loop has a structured exit. The unwrap path strips the call from the
# response and emits the message as plain assistant content.
# Name under which the synthetic tool is advertised and later recognised.
RESPOND_TOOL_NAME = "respond"

# Guidance shown to the model describing when to call the respond tool.
_RESPOND_TOOL_DESCRIPTION = (
    "Respond to the user with a message. Use this when the user "
    "is chatting, asking a question, when you need to ask a "
    "clarifying question before proceeding, or when no other "
    "tool action is needed. Also use this after completing the "
    "user's request to report the result."
)

# JSON-schema for the call arguments: a single required string, ``message``.
_RESPOND_TOOL_PARAMETERS = {
    "type": "object",
    "properties": {
        "message": {
            "type": "string",
            "description": "The message to send to the user.",
        },
    },
    "required": ["message"],
}

# Function-style tool definition for the synthetic respond tool.
RESPOND_TOOL = {
    "type": "function",
    "function": {
        "name": RESPOND_TOOL_NAME,
        "description": _RESPOND_TOOL_DESCRIPTION,
        "parameters": _RESPOND_TOOL_PARAMETERS,
    },
}


def inject_respond_tool(tools: list[dict] | None) -> tuple[list[dict] | None, bool]:
"""Defensively append ``RESPOND_TOOL`` to ``tools``. Skips when the
list is empty/None or a tool named ``respond`` already exists.
Returns ``(new_tools, was_injected)``. The input list is not mutated.
"""
if not tools:
return tools, False
for t in tools:
if (t.get("function") or {}).get("name") == RESPOND_TOOL_NAME:
return tools, False
return list(tools) + [RESPOND_TOOL], True
Comment on lines +541 to +551

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The inject_respond_tool function assumes that every element of tools is a dictionary. While that is the expected type, checking that each element t is actually a dict before calling .get() would make the function more robust against malformed input from external callers: today a non-dict entry raises AttributeError during the name-collision scan. This matters most when tool definitions are merged from multiple sources, where a type mismatch in one source should be handled gracefully rather than break the whole request or shadow the other sources' tools.

Suggested change
def inject_respond_tool(tools: list[dict] | None) -> tuple[list[dict] | None, bool]:
"""Defensively append ``RESPOND_TOOL`` to ``tools``. Skips when the
list is empty/None or a tool named ``respond`` already exists.
Returns ``(new_tools, was_injected)``. The input list is not mutated.
"""
if not tools:
return tools, False
for t in tools:
if (t.get("function") or {}).get("name") == RESPOND_TOOL_NAME:
return tools, False
return list(tools) + [RESPOND_TOOL], True
def inject_respond_tool(tools: list[dict] | None) -> tuple[list[dict] | None, bool]:
"""Defensively append RESPOND_TOOL to tools. Skips when the
list is empty/None or a tool named respond already exists.
Returns (new_tools, was_injected). The input list is not mutated.
"""
if not tools:
return tools, False
for t in tools:
if isinstance(t, dict) and (t.get("function") or {}).get("name") == RESPOND_TOOL_NAME:
return tools, False
return list(tools) + [RESPOND_TOOL], True
References
  1. When multiple sources for enabling tools are present (e.g., protocol-specific requests and payload extensions), merge the sets of tools instead of allowing one to shadow the other.
  2. To improve efficiency, avoid redundant data iterations. Combine checks and transformations into a single loop and return computed values for callers to reuse.



def is_respond_call(tc: dict) -> bool:
    """Report whether ``tc`` is a call to the synthetic ``respond`` tool.

    A missing or falsy ``"function"`` entry is treated as an empty
    mapping, so such a call simply compares unequal and gives ``False``.
    """
    function_spec = tc.get("function") or {}
    called_name = function_spec.get("name")
    return called_name == RESPOND_TOOL_NAME


def extract_respond_message(tc: dict) -> str:
    """Return the ``message`` argument of a respond tool call.

    The arguments may arrive either as a JSON-encoded string or as an
    already-decoded dict. Any malformed shape yields ``""`` rather than
    an exception: a non-dict call or ``"function"`` value, undecodable
    JSON, arguments that are not an object, or a missing or non-string
    ``message``.

    Args:
        tc: The tool-call dict, expected to carry
            ``{"function": {"arguments": ...}}``.

    Returns:
        The message text, or ``""`` when it is missing or malformed.
    """
    func = tc.get("function") if isinstance(tc, dict) else None
    if not isinstance(func, dict):
        # Covers a missing/None "function" as well as junk such as a
        # bare string, which would otherwise fail on ``.get``.
        return ""
    args = func.get("arguments", "")
    if isinstance(args, str):
        try:
            args = json.loads(args)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return ""
    if not isinstance(args, dict):
        return ""
    msg = args.get("message", "")
    return msg if isinstance(msg, str) else ""


_TIMEOUT_UNSET = object()


Expand Down
159 changes: 159 additions & 0 deletions studio/backend/tests/test_safetensors_tool_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@
parse_tool_calls_from_text,
strip_tool_markup,
)
from core.inference.tools import (
RESPOND_TOOL,
RESPOND_TOOL_NAME,
extract_respond_message,
inject_respond_tool,
is_respond_call,
)
from utils.datasets import is_gpt_oss_model_name


Expand Down Expand Up @@ -498,6 +505,158 @@ def test_max_iterations_caps_loop(self):
assert contents and "final answer" in contents[-1]["text"]


class TestRespondToolHelpers:
    """Exercise the synthetic respond tool helpers in isolation."""

    def test_inject_into_real_tools(self):
        # One genuine tool goes in; the respond tool is appended on the way out.
        caller_tools = [{"type": "function", "function": {"name": "web_search"}}]
        result, was_injected = inject_respond_tool(caller_tools)
        assert was_injected is True
        # The caller's list must come back untouched.
        assert len(caller_tools) == 1
        assert len(result) == 2
        assert result[-1]["function"]["name"] == RESPOND_TOOL_NAME

    def test_inject_skips_empty(self):
        result, was_injected = inject_respond_tool([])
        assert was_injected is False
        assert result == []

    def test_inject_skips_none(self):
        result, was_injected = inject_respond_tool(None)
        assert was_injected is False
        assert result is None

    def test_inject_skips_on_collision(self):
        # A caller-supplied "respond" tool wins: the very same list object
        # is handed back and nothing is injected.
        caller_tools = [
            {"type": "function", "function": {"name": "respond"}},
            {"type": "function", "function": {"name": "python"}},
        ]
        result, was_injected = inject_respond_tool(caller_tools)
        assert was_injected is False
        assert result is caller_tools

    def test_is_respond_call(self):
        respond_tc = {"function": {"name": "respond", "arguments": "{}"}}
        search_tc = {"function": {"name": "web_search", "arguments": "{}"}}
        assert is_respond_call(respond_tc)
        assert not is_respond_call(search_tc)
        assert not is_respond_call({})

    def test_extract_message_from_json_string(self):
        call = {"function": {"name": "respond", "arguments": '{"message":"hi"}'}}
        assert extract_respond_message(call) == "hi"

    def test_extract_message_from_dict(self):
        call = {"function": {"name": "respond", "arguments": {"message": "hi"}}}
        assert extract_respond_message(call) == "hi"

    def test_extract_message_bad_json_returns_empty(self):
        call = {"function": {"name": "respond", "arguments": "{not json}"}}
        assert extract_respond_message(call) == ""

    def test_extract_message_missing_key_returns_empty(self):
        call = {"function": {"name": "respond", "arguments": '{"other":"x"}'}}
        assert extract_respond_message(call) == ""

    def test_extract_message_non_string_returns_empty(self):
        call = {"function": {"name": "respond", "arguments": '{"message":123}'}}
        assert extract_respond_message(call) == ""


class TestRespondToolUnwrap:
    """Drive the agentic loop end-to-end through the respond unwrap path."""

    def test_respond_call_emits_message_as_content(self):
        # The model calls respond({message:"hi"}); the loop has to surface
        # "hi" as content and finish without running any real tool.
        respond_turn = [
            '<tool_call>{"name":"respond",'
            '"arguments":{"message":"hi"}}</tool_call>'
        ]
        loop, exec_fn = _make_loop(
            turns = [respond_turn],
            exec_results = [],
        )
        events = _collect_events(loop)
        content_texts = [ev.get("text") for ev in events if ev["type"] == "content"]
        assert "hi" in content_texts
        assert exec_fn.calls == []

    def test_respond_call_with_empty_message(self):
        # An empty message must still end the loop cleanly: no crash and
        # no tool execution leaking through.
        respond_turn = [
            '<tool_call>{"name":"respond",'
            '"arguments":{"message":""}}</tool_call>'
        ]
        loop, exec_fn = _make_loop(
            turns = [respond_turn],
            exec_results = [],
        )
        events = _collect_events(loop)
        assert exec_fn.calls == []
        # The status reset is expected even though the message is empty.
        status_events = [ev for ev in events if ev["type"] == "status"]
        assert status_events

    def test_real_tool_call_does_not_unwrap(self):
        # Anything other than respond takes the ordinary execute path.
        search_turn = [
            '<tool_call>{"name":"web_search",'
            '"arguments":{"query":"weather"}}</tool_call>'
        ]
        answer_turn = ["The weather is sunny."]
        loop, exec_fn = _make_loop(
            turns = [search_turn, answer_turn],
            exec_results = ["Sunny and 22C"],
        )
        events = _collect_events(loop)
        assert exec_fn.calls == [("web_search", {"query": "weather"})]
        content_events = [ev for ev in events if ev["type"] == "content"]
        assert any(
            "sunny" in ev.get("text", "").lower() for ev in content_events
        )

    def test_respond_call_when_client_owns_tool(self):
        # If the caller ships its own "respond" tool, the synthetic one is
        # NOT injected; the call is routed through execute_tool like any
        # other tool so the client's semantics are preserved.
        scripted_turns = iter(
            [
                [
                    '<tool_call>{"name":"respond",'
                    '"arguments":{"message":"hi"}}</tool_call>'
                ],
                ["thanks"],
            ]
        )

        def _gen(_messages):
            # Replay the next scripted turn as cumulative text; yield
            # nothing once the script is exhausted.
            chunks = next(scripted_turns, None)
            if chunks is None:
                return
            so_far = ""
            for piece in chunks:
                so_far += piece
                yield so_far

        exec_fn = FakeExecuteTool(["ack"])
        client_tools = [
            {"type": "function", "function": {"name": "respond"}},
        ]
        loop = run_safetensors_tool_loop(
            single_turn = _gen,
            messages = [{"role": "user", "content": "hi"}],
            tools = client_tools,
            execute_tool = exec_fn,
        )
        # Drain the loop; only the recorded tool executions matter here.
        _collect_events(loop)
        # The client's respond tool was executed rather than unwrapped.
        assert exec_fn.calls == [("respond", {"message": "hi"})]


class TestStatusFormatting:
def test_status_for_known_tools(self):
# Use the private helper directly to verify status formatting.
Expand Down
Loading