Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
238b91e
studio: tool calling for Llama-3, Mistral, Gemma 4 on safetensors + M…
danielhanchen May 19, 2026
a0a0c97
studio: tool-call healing parity between safetensors / MLX and GGUF
danielhanchen May 19, 2026
f790680
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 19, 2026
e9b4d3f
studio: fix tool-call parser bugs from gemini review on #5620
shimmyshimmer May 19, 2026
7d8e725
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 19, 2026
7ef4b11
studio/routes: make python_tag strip multi-line aware
May 22, 2026
0b4fa12
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 22, 2026
e63c4cf
Merge branch 'main' into studio-tools-multi-format-v2
danielhanchen May 27, 2026
7f9177a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 27, 2026
1fe632d
studio: tighten verbose comments in tool-call parser sections
danielhanchen May 27, 2026
9b3a6c6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 27, 2026
2a76a9b
studio: parser robustness fixes for PR #5620
danielhanchen May 27, 2026
1919e2a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 27, 2026
2c108c6
Merge branch 'main' into studio-tools-multi-format-v2
danielhanchen May 27, 2026
c4bbecf
studio: terminate function-XML body at </function>, not just </tool_c…
danielhanchen May 27, 2026
85660c1
Studio: tighten Llama-3.2 bare-JSON guard
danielhanchen May 27, 2026
bc13a38
studio: fix safetensors tool-call parser gaps vs llama.cpp (Mistral C…
danielhanchen May 31, 2026
5a02120
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 31, 2026
c6d1478
studio: fire safetensors tool calls for the bare-JSON (Llama-3.2) form
danielhanchen May 31, 2026
85751a1
Merge remote-tracking branch 'origin/studio-tools-multi-format-v2' in…
danielhanchen May 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 22 additions & 9 deletions studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,14 @@
_TOOL_ALL_PATS,
_TOOL_CLOSED_PATS,
parse_tool_calls_from_text,
strip_tool_call_markup,
)

# Share strip / signal constants with the multi-format parser so the
# BUFFERING state machine also catches Llama-3 / Mistral / Gemma 4
# emissions (legacy helper only knew <tool_call> / <function=).
from core.inference.tool_call_parser import (
TOOL_XML_SIGNALS as _SHARED_TOOL_XML_SIGNALS,
strip_tool_markup as _shared_strip_tool_markup,
)
from utils.native_path_leases import child_env_without_native_path_secret
from utils.subprocess_compat import (
Expand Down Expand Up @@ -4453,14 +4460,12 @@ def generate_chat_completion_with_tools(
def _strip_tool_markup(text: str, *, final: bool = False) -> str:
    """Strip tool-call markup from ``text`` when auto-heal is enabled.

    Args:
        text: Model output that may contain tool-call markup.
        final: Forwarded unchanged to the shared stripper.

    Returns:
        ``text`` untouched when ``auto_heal_tool_calls`` is falsy,
        otherwise whatever the shared multi-format stripper returns.
    """
    if not auto_heal_tool_calls:
        return text
    # Use the stripper shared with the multi-format parser: the legacy
    # helper only knew <tool_call> / <function=, and calling it first
    # left this return unreachable.
    return _shared_strip_tool_markup(text, final = final)

# XML prefixes that signal a tool call in content.
# Empty when auto_heal is disabled so the buffer never
# speculatively holds content for XML detection.
_TOOL_XML_SIGNALS = (
("<tool_call>", "<function=") if auto_heal_tool_calls else ()
)
# Markers the BUFFERING state machine watches for; covers Qwen,
# Qwen3.5, Llama-3, Mistral, and Gemma 4. Empty when auto-heal
# is off so the buffer never speculatively holds content.
_TOOL_XML_SIGNALS = _SHARED_TOOL_XML_SIGNALS if auto_heal_tool_calls else ()
_MAX_BUFFER_CHARS = 32

# ── Duplicate tool-call detection ────────────────────────
Expand Down Expand Up @@ -5011,7 +5016,15 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
arguments = json.loads(raw_args)
except (json.JSONDecodeError, ValueError):
if auto_heal_tool_calls:
arguments = {"query": raw_args}
# Canonical per-tool heal key (must match
# safetensors_agentic._CANONICAL_HEAL_ARG)
# so bare-string emissions still run the
# intended tool.
_heal_key = {
"python": "code",
"terminal": "command",
}.get(tool_name, "query")
arguments = {_heal_key: raw_args}
else:
arguments = {"raw": raw_args}
else:
Expand Down
88 changes: 73 additions & 15 deletions studio/backend/core/inference/safetensors_agentic.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"""

import json
import re
import threading
from typing import Callable, Generator, Optional
from urllib.parse import urlparse
Expand All @@ -42,6 +43,24 @@
# Buffer cap while waiting to disambiguate a possible tool-call prefix.
_MAX_BUFFER_CHARS = 32

# Re-prompt heuristics: detect replies where the model announces a plan
# ("I'll...", "Step 1:", "Here's my plan") instead of answering, so the
# loop can nudge it to call a tool. "I can / I should / I want / let's"
# are deliberately left out because they also show up in direct answers
# and explanations. Mirrors GGUF.

# First-person commitments: "I'll", "I'm going to", "I will", "let me", ...
_INTENT_FIRST_PERSON = (
    r"\b(i['’](ll|m going to|m gonna)|i am (going to|gonna)|i will|i shall|let me|allow me)\b"
)
# Plan openers: the whole word "first", "step <n>" with an optional
# colon, or "here's (my|the|a) plan/approach".
_INTENT_PLAN_OPENER = (
    r"\b(?:first\b|step \d+:?|here['’]?s (?:my |the |a )?(?:plan|approach))"
)
# Sequencing phrases: "now I ...", "next I ...".
_INTENT_SEQUENCING = r"\b(?:now i|next i)\b"

# Case-insensitive alternation of the three fragments above.
_INTENT_SIGNAL = re.compile(
    "(?i)("
    + "|".join((_INTENT_FIRST_PERSON, _INTENT_PLAN_OPENER, _INTENT_SEQUENCING))
    + ")"
)

# Upper bound on how many times one tool loop may re-prompt the model.
_MAX_REPROMPTS = 3
# Only replies strictly shorter than this (after stripping) are eligible
# for a re-prompt.
_REPROMPT_MAX_CHARS = 2000
# User-role message appended to the conversation when re-prompting.
_REPROMPT_INSTRUCTION = (
    "STOP. Do NOT write code or explain. "
    "You MUST call a tool NOW. "
    "Call web_search or python immediately."
)


def _status_for_tool(tool_name: str, arguments: dict) -> str:
"""Return a human-readable status line matching the GGUF path."""
Expand Down Expand Up @@ -142,6 +161,7 @@ def run_safetensors_tool_loop(
if (tool.get("function") or {}).get("name")
}
next_call_id = 0
reprompt_count = 0

if max_tool_iterations <= 0:
# 0 = disabled (same contract as the GGUF loop).
Expand All @@ -152,7 +172,9 @@ def run_safetensors_tool_loop(
_state_streaming = 1
_state_draining = 2

for iteration in range(max_tool_iterations + 1):
# Reserve re-prompt slots so they don't eat the caller's tool budget.
_extra_iters = _MAX_REPROMPTS if max_tool_iterations > 0 else 0
for iteration in range(max_tool_iterations + _extra_iters + 1):
if cancel_event is not None and cancel_event.is_set():
return

Expand Down Expand Up @@ -242,24 +264,57 @@ def run_safetensors_tool_loop(
if stripped and has_tool_signal(stripped):
detect_state = _state_draining
else:
# Drain the buffer and fall through to STREAMING so the
# intent re-prompt + safety-net parser can still fire on
# short emissions like "Let me search." that never exit
# BUFFERING (would otherwise silently end the loop).
if content_buffer:
cumulative_display += content_buffer
yield {
"type": "content",
"text": strip_tool_markup(cumulative_display, final = True),
}
yield {"type": "status", "text": ""}
return
cleaned = strip_tool_markup(cumulative_display, final = True)
if len(cleaned) > len(last_emitted):
last_emitted = cleaned
yield {"type": "content", "text": cleaned}
detect_state = _state_streaming

if detect_state == _state_streaming:
# No tool detected mid-stream -- check for late tool XML.
safety_tc = None
if has_tool_signal(content_accum):
safety_tc = parse_tool_calls_from_text(
content_accum,
id_offset = next_call_id,
)
# No tool XML detected mid-stream -- run the parser anyway.
# The Llama-3.2 bare-JSON tool form ``{"name":..,"parameters":..}``
# carries no XML signal, so gating this on has_tool_signal()
# silently dropped real tool calls and re-prompted the model into
# giving up. parse_tool_calls_from_text is strict (it only fires
# on a valid tool-call shape), so plain answers stay untouched.
# This mirrors what llama-server already does for GGUF.
safety_tc = parse_tool_calls_from_text(
content_accum,
id_offset = next_call_id,
)
if not safety_tc:
# Re-prompt only when the model planned without acting
# (intent signal present); direct answers like "4" or
# "Hello!" never trigger. Mirrors GGUF.
_stripped = content_accum.strip()
if (
tools
and reprompt_count < _MAX_REPROMPTS
and 0 < len(_stripped) < _REPROMPT_MAX_CHARS
and _INTENT_SIGNAL.search(_stripped)
and not final_attempt_done
):
reprompt_count += 1
logger.info(
"Safetensors re-prompt %d/%d: model planned without "
"calling tools (%d chars)",
reprompt_count,
_MAX_REPROMPTS,
len(_stripped),
)
conversation.append({"role": "assistant", "content": _stripped})
conversation.append(
{"role": "user", "content": _REPROMPT_INSTRUCTION}
)
yield {"type": "status", "text": ""}
continue

# Final answer: streaming already emitted content.
# Skip a final=True re-strip so literal "<tool_call>"
# in prose survives when no real tool call parsed.
Expand Down Expand Up @@ -379,7 +434,10 @@ def run_safetensors_tool_loop(
# Clear the status badge before the next turn.
yield {"type": "status", "text": ""}

if iteration + 1 >= max_tool_iterations and not final_attempt_done:
# Track against the caller-requested cap, excluding re-prompt
# slots so a stalling model still gets a final-answer attempt.
_tool_iters_done = iteration + 1 - reprompt_count
if _tool_iters_done >= max_tool_iterations and not final_attempt_done:
# Budget exhausted; nudge a final plain answer.
final_attempt_done = True
conversation.append(
Expand Down
Loading
Loading