unslothai · danielhanchen · May 19, 2026 · May 17, 2026 · May 17, 2026 · May 18, 2026
diff --git a/studio/backend/core/inference/chat_template_helpers.py b/studio/backend/core/inference/chat_template_helpers.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""
+Dependency-light wrapper around tokenizer.apply_chat_template with a
+kwarg fallback for templates that reject reasoning/tools args.
+"""
+
+from typing import Optional
+
+
+def apply_chat_template_for_generation(
+    tokenizer,
+    messages: list,
+    *,
+    tools: Optional[list] = None,
+    enable_thinking: Optional[bool] = None,
+    reasoning_effort: Optional[str] = None,
+    preserve_thinking: Optional[bool] = None,
+) -> str:
+    """Render the generation prompt via ``tokenizer.apply_chat_template``.
+
+    Optional kwargs are tried richest-first: tools + reasoning, tools only,
+    reasoning only, then none. A ``TypeError`` moves on to the next, smaller
+    set; any other exception (Jinja / missing-variable errors) propagates
+    unchanged. ``tools`` is forwarded only when truthy, and each reasoning
+    kwarg only when it is not ``None``.
+
+    Raises ``TypeError`` (the last one seen) when every attempt raised it.
+    """
+    requested = {
+        "enable_thinking": enable_thinking,
+        "reasoning_effort": reasoning_effort,
+        "preserve_thinking": preserve_thinking,
+    }
+    reasoning_kwargs = {k: v for k, v in requested.items() if v is not None}
+
+    attempts: list[dict] = []
+    if tools and reasoning_kwargs:
+        attempts.append({"tools": tools, **reasoning_kwargs})
+    if tools:
+        attempts.append({"tools": tools})
+    if reasoning_kwargs:
+        attempts.append(dict(reasoning_kwargs))
+    attempts.append({})  # bare call is always the final fallback
+
+    last_type_error: Optional[TypeError] = None
+    for kwargs in attempts:
+        try:
+            return tokenizer.apply_chat_template(
+                messages,
+                tokenize = False,
+                add_generation_prompt = True,
+                **kwargs,
+            )
+        except TypeError as e:
+            # Treated as "kwargs not accepted"; retry with the next smaller set.
+            last_type_error = e
+    # ``attempts`` is never empty, so every attempt raised TypeError to get here.
+    raise last_type_error
diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py
@@ -839,6 +839,74 @@ def generate_with_adapter_control(
             cancel_event = cancel_event, _adapter_state = use_adapter, **gen_kwargs
         )
 
+    def generate_chat_completion_with_tools(
+        self,
+        messages: list,
+        tools: list,
+        system_prompt: str = "",
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        top_k: int = 40,
+        min_p: float = 0.0,
+        max_new_tokens: int = 2048,
+        repetition_penalty: float = 1.0,
+        cancel_event = None,
+        enable_thinking: Optional[bool] = None,
+        reasoning_effort: Optional[str] = None,
+        preserve_thinking: Optional[bool] = None,
+        max_tool_iterations: int = 25,
+        auto_heal_tool_calls: bool = True,
+        tool_call_timeout: int = 300,
+        session_id: Optional[str] = None,
+    ):
+        """Drive an agentic tool loop over single-turn chat generation.
+
+        Looping is delegated to ``run_safetensors_tool_loop``. Events are
+        yielded in the same dict protocol as the GGUF path, so the route
+        layer can stream both backends through one helper. Each one is:
+
+        * ``{"type": "status", "text": ...}``
+        * ``{"type": "content", "text": cumulative_text}``
+        * ``{"type": "tool_start", "tool_name", "tool_call_id", "arguments"}``
+        * ``{"type": "tool_end", "tool_name", "tool_call_id", "result"}``
+        """
+        from core.inference.safetensors_agentic import run_safetensors_tool_loop
+        from core.inference.tools import execute_tool
+
+        turn_opts = dict(
+            system_prompt = "",  # conv carries the system message; avoid double-prepend
+            temperature = temperature,
+            top_p = top_p,
+            top_k = top_k,
+            min_p = min_p,
+            max_new_tokens = max_new_tokens,
+            repetition_penalty = repetition_penalty,
+            cancel_event = cancel_event,
+            tools = tools,
+            enable_thinking = enable_thinking,
+            reasoning_effort = reasoning_effort,
+            preserve_thinking = preserve_thinking,
+        )
+
+        def _single_turn(conv: list):
+            yield from self._generate_chat_response_inner(messages = conv, **turn_opts)
+
+        conversation = list(messages)
+        if system_prompt:
+            conversation.insert(0, {"role": "system", "content": system_prompt})
+
+        yield from run_safetensors_tool_loop(
+            single_turn = _single_turn,
+            messages = conversation,
+            tools = tools,
+            execute_tool = execute_tool,
+            cancel_event = cancel_event,
+            auto_heal_tool_calls = auto_heal_tool_calls,
+            max_tool_iterations = max_tool_iterations,
+            tool_call_timeout = tool_call_timeout,
+            session_id = session_id,
+        )
+
     def generate_chat_response(
         self,
         messages: list,
@@ -851,10 +919,20 @@ def generate_chat_response(
         max_new_tokens: int = 256,
         repetition_penalty: float = 1.0,
         cancel_event = None,
+        tools: Optional[list] = None,
+        enable_thinking: Optional[bool] = None,
+        reasoning_effort: Optional[str] = None,
+        preserve_thinking: Optional[bool] = None,
     ) -> Generator[str, None, None]:
         """
         Generate response for text or vision models.
         The generation lock is acquired by the background generation thread.
+
+        ``tools`` / ``enable_thinking`` / ``reasoning_effort`` /
+        ``preserve_thinking`` are forwarded into
+        ``tokenizer.apply_chat_template`` so templates that understand
+        these kwargs (Qwen3, Llama 3.1+, gpt-oss harmony, ...) advertise
+        the tool schemas and reasoning controls to the model.
         """
         yield from self._generate_chat_response_inner(
             messages = messages,
@@ -867,6 +945,10 @@ def generate_chat_response(
             max_new_tokens = max_new_tokens,
             repetition_penalty = repetition_penalty,
             cancel_event = cancel_event,
+            tools = tools,
+            enable_thinking = enable_thinking,
+            reasoning_effort = reasoning_effort,
+            preserve_thinking = preserve_thinking,
         )
 
     def _generate_chat_response_inner(
@@ -882,6 +964,10 @@ def _generate_chat_response_inner(
         repetition_penalty: float = 1.0,
         cancel_event = None,
         _adapter_state = None,
+        tools: Optional[list] = None,
+        enable_thinking: Optional[bool] = None,
+        reasoning_effort: Optional[str] = None,
+        preserve_thinking: Optional[bool] = None,
     ) -> Generator[str, None, None]:
         """
         Inner generation logic. Called by both generate_chat_response
@@ -981,8 +1067,13 @@ def _generate_chat_response_inner(
                     f"Please use a model that includes a chat template, or manually set "
                     f"one via tokenizer.chat_template before inference."
                 )
-            formatted_prompt = tokenizer.apply_chat_template(
-                template_messages, tokenize = False, add_generation_prompt = True
+            formatted_prompt = self._apply_chat_template_for_generation(
+                tokenizer,
+                template_messages,
+                tools = tools,
+                enable_thinking = enable_thinking,
+                reasoning_effort = reasoning_effort,
+                preserve_thinking = preserve_thinking,
             )
             logger.debug(f"Formatted prompt: {formatted_prompt[:200]}...")
         except Exception as e:
@@ -1319,20 +1410,9 @@ def generate_whisper_response(
 
     def _is_gpt_oss_model(self, model_name: str = None) -> bool:
         """Check if the given (or active) model uses the gpt-oss harmony protocol."""
-        name = (model_name or self.active_model_name or "").lower()
-        try:
-            from utils.datasets import MODEL_TO_TEMPLATE_MAPPER
+        from utils.datasets import is_gpt_oss_model_name  # shared gpt-oss name check
 
-            # Exact match
-            if MODEL_TO_TEMPLATE_MAPPER.get(name) == "gpt-oss":
-                return True
-            # Partial match (e.g. name-bnb-4bit variants)
-            for key, tmpl in MODEL_TO_TEMPLATE_MAPPER.items():
-                if tmpl == "gpt-oss" and (key in name or name in key):
-                    return True
-        except Exception:
-            pass
-        return "gpt-oss" in name
+        return is_gpt_oss_model_name(model_name or self.active_model_name or "")
 
     def generate_stream(
         self,
@@ -1715,6 +1795,34 @@ def __call__(
             "Patched RepetitionPenaltyLogitsProcessor with 64-token window for OuteTTS"
         )
 
+    def _apply_chat_template_for_generation(
+        self,
+        tokenizer,
+        messages: list,
+        *,
+        tools: Optional[list] = None,
+        enable_thinking: Optional[bool] = None,
+        reasoning_effort: Optional[str] = None,
+        preserve_thinking: Optional[bool] = None,
+    ) -> str:
+        """Render the generation prompt, shedding kwargs the template rejects.
+
+        Forwards to the dependency-light ``chat_template_helpers`` module so
+        the fallback chain can be unit-tested without unsloth / torch.
+        """
+        from core.inference.chat_template_helpers import (
+            apply_chat_template_for_generation as _render_prompt,
+        )
+
+        # Pass everything through; the helper decides which kwargs survive.
+        template_kwargs = {
+            "tools": tools,
+            "enable_thinking": enable_thinking,
+            "reasoning_effort": reasoning_effort,
+            "preserve_thinking": preserve_thinking,
+        }
+        return _render_prompt(tokenizer, messages, **template_kwargs)
+
     def format_chat_prompt(self, messages: list, system_prompt: str = None) -> str:
         if not self.active_model_name or self.active_model_name not in self.models:
             logger.error("No active model available")

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
@@ -44,6 +44,9 @@
 from utils.subprocess_compat import (
     windows_hidden_subprocess_kwargs as _windows_hidden_subprocess_kwargs,
 )
+from core.inference.tool_call_parser import (
+    parse_tool_calls_from_text as _shared_parse_tool_calls_from_text,
+)
 
 logger = get_logger(__name__)
 
@@ -3904,16 +3907,9 @@ def _wait_for_health(self, timeout: float = 120.0, interval: float = 0.5) -> boo
 
     @staticmethod
     def _parse_tool_calls_from_text(content: str) -> list[dict]:
-        """
-        Parse tool calls from XML markup in content text.
-
-        Handles formats like:
-          <tool_call>{"name":"web_search","arguments":{"query":"..."}}</tool_call>
-          <tool_call><function=web_search><parameter=query>...</parameter></function></tool_call>
-        Closing tags (</tool_call>, </function>, </parameter>) are all optional
-        since models frequently omit them.
-        """
-        return parse_tool_calls_from_text(content)
+        """Parse tool calls from ``content`` via the shared parser in
+        ``tool_call_parser``, so safetensors and llama_cpp share the same fixes."""
+        return _shared_parse_tool_calls_from_text(content)
 
     @staticmethod
     def _build_openai_messages(