Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
ecea9b1
Studio: tools, thinking, code execution and web search for safetensors
danielhanchen May 17, 2026
c55b6b3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 17, 2026
7365491
Studio safetensors: preserve literal <tool_call> prose, split templat…
danielhanchen May 18, 2026
64e2c18
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 18, 2026
f79a265
Scrub .github/workflows for staging push (matches staging base)
danielhanchen May 18, 2026
42ef0d3
Merge origin/main into head
danielhanchen May 18, 2026
6510daa
Studio safetensors: tighten tool loop guardrails
danielhanchen May 18, 2026
2931b60
gpt-oss name detector: short-circuit on empty input
danielhanchen May 18, 2026
18b4897
Studio safetensors tool loop: extend coverage
danielhanchen May 18, 2026
fcc9b8a
Scrub leaked references from comments and string literals
danielhanchen May 18, 2026
46c1b2a
Scrub leaked references from comments and string literals
danielhanchen May 18, 2026
9491c84
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 18, 2026
5d6fcc7
Restore main-pickup workflows and offline-gguf-cache test that stagin…
danielhanchen May 18, 2026
219a223
Merge tests branch into head
danielhanchen May 18, 2026
8746297
Sync .github/workflows with upstream author branch
danielhanchen May 18, 2026
ab36225
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 18, 2026
35db672
Restore PR scope: revert unrelated workflow / image / code changes
danielhanchen May 18, 2026
c5b987c
Drop temp/staging_fixes scratch files accidentally added in previous …
danielhanchen May 18, 2026
50c209a
Merge remote-tracking branch 'origin/main' into studio-safetensors-tools
danielhanchen May 18, 2026
8e0e475
Fix CI: restore llama_cpp.py to main + reapply only the parser delegate
danielhanchen May 18, 2026
327c42c
Studio safetensors: ship chat_template through worker IPC so tool/cod…
danielhanchen May 18, 2026
7c4566e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 18, 2026
3793f03
Merge remote-tracking branch 'origin/main' into studio-safetensors-tools
danielhanchen May 19, 2026
6c92b61
Studio safetensors: address PR #5520 review feedback
danielhanchen May 19, 2026
24266c5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 19, 2026
3f03ebb
Studio safetensors: tighten comments
danielhanchen May 19, 2026
bb6844c
Studio safetensors: gate supports_tools on parser compatibility
danielhanchen May 19, 2026
6124d93
Studio safetensors: pin Qwen3.5 classifier expected output
danielhanchen May 19, 2026
b1b1623
Studio MLX: populate chat_template_info so Mac tool/reasoning pills e…
danielhanchen May 19, 2026
dafad14
Studio MLX: accept tools / enable_thinking / reasoning_effort / prese…
danielhanchen May 19, 2026
b3ac068
Merge remote-tracking branch 'origin/main' into studio-safetensors-tools
danielhanchen May 19, 2026
124c981
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions studio/backend/core/inference/chat_template_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# SPDX-License-Identifier: AGPL-3.0-only
# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0

"""
Dependency-light wrapper around tokenizer.apply_chat_template with a
kwarg fallback for templates that reject reasoning/tools args.
"""

from typing import Optional


def apply_chat_template_for_generation(
    tokenizer,
    messages: list,
    *,
    tools: Optional[list] = None,
    enable_thinking: Optional[bool] = None,
    reasoning_effort: Optional[str] = None,
    preserve_thinking: Optional[bool] = None,
) -> str:
    """Render the chat prompt, retrying with fewer kwargs on ``TypeError``.

    The richest kwarg combination is tried first. Each time
    ``tokenizer.apply_chat_template`` raises ``TypeError`` the next,
    smaller combination is tried, in this order:

    1. ``tools`` + reasoning kwargs (only when both are present)
    2. ``tools`` only
    3. reasoning kwargs only
    4. no extra kwargs

    Args:
        tokenizer: Object exposing ``apply_chat_template(messages,
            tokenize=..., add_generation_prompt=..., **kwargs)``.
        messages: Conversation passed through to the template unchanged.
        tools: Tool schemas. ``None`` or an empty list means the
            ``tools`` kwarg is never sent.
        enable_thinking: Forwarded as a reasoning kwarg when not ``None``.
        reasoning_effort: Forwarded as a reasoning kwarg when not ``None``.
        preserve_thinking: Forwarded as a reasoning kwarg when not ``None``.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for the first
        attempt that succeeds (called with ``tokenize = False`` and
        ``add_generation_prompt = True``).

    Raises:
        TypeError: If every attempt, including the final call without
            extra kwargs, raises ``TypeError``; the error from that
            final call is the one raised.
        Exception: Any other exception (Jinja / missing-variable errors,
            ...) propagates immediately from the attempt that raised it,
            without further retries.
    """
    # Only reasoning controls the caller actually set are forwarded.
    reasoning_kwargs: dict = {
        name: value
        for name, value in (
            ("enable_thinking", enable_thinking),
            ("reasoning_effort", reasoning_effort),
            ("preserve_thinking", preserve_thinking),
        )
        if value is not None
    }

    # Richest first; the bare call is handled separately after the loop.
    fallback_chain: list[dict] = []
    if tools and reasoning_kwargs:
        fallback_chain.append({"tools": tools, **reasoning_kwargs})
    if tools:
        fallback_chain.append({"tools": tools})
    if reasoning_kwargs:
        fallback_chain.append(dict(reasoning_kwargs))

    def _render(extra_kwargs: dict) -> str:
        return tokenizer.apply_chat_template(
            messages,
            tokenize = False,
            add_generation_prompt = True,
            **extra_kwargs,
        )

    for extra_kwargs in fallback_chain:
        try:
            return _render(extra_kwargs)
        except TypeError:
            # Rejected kwargs: fall through to the next, smaller set.
            # Anything other than TypeError is deliberately not caught.
            continue

    # Last resort with no extra kwargs. It runs outside any handler so
    # its error, TypeError included, reaches the caller untouched.
    return _render({})
138 changes: 123 additions & 15 deletions studio/backend/core/inference/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -839,6 +839,74 @@ def generate_with_adapter_control(
cancel_event = cancel_event, _adapter_state = use_adapter, **gen_kwargs
)

def generate_chat_completion_with_tools(
    self,
    messages: list,
    tools: list,
    system_prompt: str = "",
    temperature: float = 0.7,
    top_p: float = 0.9,
    top_k: int = 40,
    min_p: float = 0.0,
    max_new_tokens: int = 2048,
    repetition_penalty: float = 1.0,
    cancel_event = None,
    enable_thinking: Optional[bool] = None,
    reasoning_effort: Optional[str] = None,
    preserve_thinking: Optional[bool] = None,
    max_tool_iterations: int = 25,
    auto_heal_tool_calls: bool = True,
    tool_call_timeout: int = 300,
    session_id: Optional[str] = None,
):
    """Drive an agentic tool loop over single-turn chat generation.

    This is a generator. It delegates to ``run_safetensors_tool_loop``
    and re-yields its events, which follow the same event-dict protocol
    as the GGUF path so the route layer can stream both backends through
    one helper. Each event is one of:

    * ``{"type": "status", "text": ...}``
    * ``{"type": "content", "text": cumulative_text}``
    * ``{"type": "tool_start", "tool_name", "tool_call_id", "arguments"}``
    * ``{"type": "tool_end", "tool_name", "tool_call_id", "result"}``

    ``system_prompt``, when non-empty, is prepended once as a system
    message before the loop starts. Every per-turn generation call then
    receives an empty ``system_prompt`` so it is not added a second time.
    """
    from core.inference.safetensors_agentic import run_safetensors_tool_loop
    from core.inference.tools import execute_tool

    # Settings shared by every model turn inside the loop.
    per_turn_kwargs = dict(
        system_prompt = "",
        temperature = temperature,
        top_p = top_p,
        top_k = top_k,
        min_p = min_p,
        max_new_tokens = max_new_tokens,
        repetition_penalty = repetition_penalty,
        cancel_event = cancel_event,
        tools = tools,
        enable_thinking = enable_thinking,
        reasoning_effort = reasoning_effort,
        preserve_thinking = preserve_thinking,
    )

    def _run_one_turn(conversation: list):
        # The conversation already carries the system message (if any).
        yield from self._generate_chat_response_inner(
            messages = conversation,
            **per_turn_kwargs,
        )

    # Work on a fresh list so the caller's ``messages`` is left alone.
    if system_prompt:
        seeded_messages = [{"role": "system", "content": system_prompt}, *messages]
    else:
        seeded_messages = list(messages)

    yield from run_safetensors_tool_loop(
        single_turn = _run_one_turn,
        messages = seeded_messages,
        tools = tools,
        execute_tool = execute_tool,
        cancel_event = cancel_event,
        auto_heal_tool_calls = auto_heal_tool_calls,
        max_tool_iterations = max_tool_iterations,
        tool_call_timeout = tool_call_timeout,
        session_id = session_id,
    )

def generate_chat_response(
self,
messages: list,
Expand All @@ -851,10 +919,20 @@ def generate_chat_response(
max_new_tokens: int = 256,
repetition_penalty: float = 1.0,
cancel_event = None,
tools: Optional[list] = None,
enable_thinking: Optional[bool] = None,
reasoning_effort: Optional[str] = None,
preserve_thinking: Optional[bool] = None,
) -> Generator[str, None, None]:
"""
Generate response for text or vision models.
The generation lock is acquired by the background generation thread.

``tools`` / ``enable_thinking`` / ``reasoning_effort`` /
``preserve_thinking`` are forwarded into
``tokenizer.apply_chat_template`` so templates that understand
these kwargs (Qwen3, Llama 3.1+, gpt-oss harmony, ...) advertise
the tool schemas and reasoning controls to the model.
"""
yield from self._generate_chat_response_inner(
messages = messages,
Expand All @@ -867,6 +945,10 @@ def generate_chat_response(
max_new_tokens = max_new_tokens,
repetition_penalty = repetition_penalty,
cancel_event = cancel_event,
tools = tools,
enable_thinking = enable_thinking,
reasoning_effort = reasoning_effort,
preserve_thinking = preserve_thinking,
)

def _generate_chat_response_inner(
Expand All @@ -882,6 +964,10 @@ def _generate_chat_response_inner(
repetition_penalty: float = 1.0,
cancel_event = None,
_adapter_state = None,
tools: Optional[list] = None,
enable_thinking: Optional[bool] = None,
reasoning_effort: Optional[str] = None,
preserve_thinking: Optional[bool] = None,
) -> Generator[str, None, None]:
"""
Inner generation logic. Called by both generate_chat_response
Expand Down Expand Up @@ -981,8 +1067,13 @@ def _generate_chat_response_inner(
f"Please use a model that includes a chat template, or manually set "
f"one via tokenizer.chat_template before inference."
)
formatted_prompt = tokenizer.apply_chat_template(
template_messages, tokenize = False, add_generation_prompt = True
formatted_prompt = self._apply_chat_template_for_generation(
tokenizer,
template_messages,
tools = tools,
enable_thinking = enable_thinking,
reasoning_effort = reasoning_effort,
preserve_thinking = preserve_thinking,
)
logger.debug(f"Formatted prompt: {formatted_prompt[:200]}...")
except Exception as e:
Expand Down Expand Up @@ -1319,20 +1410,9 @@ def generate_whisper_response(

def _is_gpt_oss_model(self, model_name: str = None) -> bool:
"""Check if the given (or active) model uses the gpt-oss harmony protocol."""
name = (model_name or self.active_model_name or "").lower()
try:
from utils.datasets import MODEL_TO_TEMPLATE_MAPPER
from utils.datasets import is_gpt_oss_model_name

# Exact match
if MODEL_TO_TEMPLATE_MAPPER.get(name) == "gpt-oss":
return True
# Partial match (e.g. name-bnb-4bit variants)
for key, tmpl in MODEL_TO_TEMPLATE_MAPPER.items():
if tmpl == "gpt-oss" and (key in name or name in key):
return True
except Exception:
pass
return "gpt-oss" in name
return is_gpt_oss_model_name(model_name or self.active_model_name or "")

def generate_stream(
self,
Expand Down Expand Up @@ -1715,6 +1795,34 @@ def __call__(
"Patched RepetitionPenaltyLogitsProcessor with 64-token window for OuteTTS"
)

def _apply_chat_template_for_generation(
    self,
    tokenizer,
    messages: list,
    *,
    tools: Optional[list] = None,
    enable_thinking: Optional[bool] = None,
    reasoning_effort: Optional[str] = None,
    preserve_thinking: Optional[bool] = None,
) -> str:
    """Render the chat prompt via the shared helper module.

    All arguments are passed through unchanged to
    ``core.inference.chat_template_helpers.apply_chat_template_for_generation``,
    which peels off kwargs the template does not understand. The logic
    lives in that dependency-light module so the fallback chain can be
    unit-tested without pulling unsloth / torch into the test sandbox.
    """
    from core.inference.chat_template_helpers import (
        apply_chat_template_for_generation as _render_prompt,
    )

    template_options = {
        "tools": tools,
        "enable_thinking": enable_thinking,
        "reasoning_effort": reasoning_effort,
        "preserve_thinking": preserve_thinking,
    }
    return _render_prompt(tokenizer, messages, **template_options)

def format_chat_prompt(self, messages: list, system_prompt: str = None) -> str:
if not self.active_model_name or self.active_model_name not in self.models:
logger.error("No active model available")
Expand Down
16 changes: 6 additions & 10 deletions studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@
from utils.subprocess_compat import (
windows_hidden_subprocess_kwargs as _windows_hidden_subprocess_kwargs,
)
from core.inference.tool_call_parser import (
parse_tool_calls_from_text as _shared_parse_tool_calls_from_text,
)

logger = get_logger(__name__)

Expand Down Expand Up @@ -3904,16 +3907,9 @@ def _wait_for_health(self, timeout: float = 120.0, interval: float = 0.5) -> boo

@staticmethod
def _parse_tool_calls_from_text(content: str) -> list[dict]:
"""
Parse tool calls from XML markup in content text.

Handles formats like:
<tool_call>{"name":"web_search","arguments":{"query":"..."}}</tool_call>
<tool_call><function=web_search><parameter=query>...</parameter></function></tool_call>
Closing tags (</tool_call>, </function>, </parameter>) are all optional
since models frequently omit them.
"""
return parse_tool_calls_from_text(content)
"""Thin wrapper around the shared parser in tool_call_parser
so safetensors and llama_cpp pick up the same fixes."""
return _shared_parse_tool_calls_from_text(content)

@staticmethod
def _build_openai_messages(
Expand Down
Loading
Loading