From d9380bb8653b355514df06d093bcab619a7ef358 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 10:33:09 +0000 Subject: [PATCH 01/14] Simplify tool-call dedup: drop hashlib, inline helpers The duplicate tool-call detector only compares calls within a single request from the same JSON parser, so dict key order is guaranteed identical for identical calls (Python 3.7+ insertion-ordered dicts). - Replace hashlib.md5(json.dumps(...)) with name + str(args) - Inline _tool_call_key, _is_duplicate_call, _record_tool_call since each was a one-liner used once - Remove unused hashlib import --- studio/backend/core/inference/llama_cpp.py | 23 ++++------------------ 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index c1f87ff936..4dffd67f24 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -10,7 +10,6 @@ import atexit import contextlib -import hashlib import json import re import struct @@ -2181,22 +2180,6 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str: # identical call succeeded). _tool_call_history: list[tuple[str, bool]] = [] # (key, failed) - def _tool_call_key(name: str, args: dict) -> str: - raw = json.dumps({"t": name, "a": args}, sort_keys = True) - return hashlib.md5(raw.encode()).hexdigest() - - def _is_duplicate_call(name: str, args: dict) -> bool: - """Block if the immediately previous call was identical and succeeded.""" - if not _tool_call_history: - return False - key = _tool_call_key(name, args) - last_key, last_failed = _tool_call_history[-1] - return last_key == key and not last_failed - - def _record_tool_call(name: str, args: dict, failed: bool) -> None: - key = _tool_call_key(name, args) - _tool_call_history.append((key, failed)) - for iteration in range(max_tool_iterations): if cancel_event is not None and cancel_event.is_set(): return @@ -2692,7 +2675,9 @@ def _record_tool_call(name: str, args: dict, failed: bool) -> None: } # ── Duplicate call detection ────────────── - if _is_duplicate_call(tool_name, arguments): + _tc_key = tool_name + str(arguments) + _prev = _tool_call_history[-1] if _tool_call_history else None + if _prev and _prev[0] == _tc_key and not _prev[1]: result = ( "You already made this exact call. " "Do not repeat the same tool call. " @@ -2734,7 +2719,7 @@ def _record_tool_call(name: str, args: dict, failed: bool) -> None: _is_error = isinstance(result, str) and result.lstrip().startswith( _error_prefixes ) - _record_tool_call(tool_name, arguments, failed = _is_error) + _tool_call_history.append((_tc_key, _is_error)) _result_content = result if _is_error: _result_content = ( From cc8fbbd559e8b595ca6e7d0af41fdd999855c373 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 10:35:54 +0000 Subject: [PATCH 02/14] Remove tool_calling_benchmark_results.md from repo --- .../tests/tool_calling_benchmark_results.md | 62 ------------------- 1 file changed, 62 deletions(-) delete mode 100644 studio/backend/tests/tool_calling_benchmark_results.md diff --git a/studio/backend/tests/tool_calling_benchmark_results.md b/studio/backend/tests/tool_calling_benchmark_results.md deleted file mode 100644 index c2b0687895..0000000000 --- a/studio/backend/tests/tool_calling_benchmark_results.md +++ /dev/null @@ -1,62 +0,0 @@ -# GGUF Tool Calling Benchmark Results - -Prompt: "List and categorize all the songs that charted #3 on the Billboard Hot 100 in 2015." -10 runs per configuration, web search + code execution + thinking enabled. -GPU: NVIDIA B200, CUDA_VISIBLE_DEVICES=2. - -Ground truth: 4 songs peaked at #3 in 2015 -- "Love Me like You Do" (Ellie Goulding), "Earned It" (The Weeknd), "Watch Me" (Silento), "Drag Me Down" (One Direction). - -## Cartesian Grid: Model x Quant x KV Cache - -| Model | Quant | KV Cache | OK/10 | Avg Time | Avg Tools | XML Leaks | URL Fetch | Peak3 Avg | All 4/4 | Best Songs | -|-------|-------|----------|-------|----------|-----------|-----------|-----------|-----------|---------|------------| -| 4B | UD-Q4_K_XL | f16 | 10/10 | 9.8s | 3.5 | 0/10 | 4/10 | 0.8/4 | 2/10 | 9 | -| 4B | UD-Q4_K_XL | bf16 | 10/10 | 10.6s | 4.5 | 0/10 | 4/10 | 0.4/4 | 1/10 | 5 | -| 4B | Q8_0 | f16 | 10/10 | 4.9s | 2.4 | 0/10 | 8/10 | 0.4/4 | 1/10 | 5 | -| 4B | Q8_0 | bf16 | 10/10 | 8.0s | 3.0 | 0/10 | 5/10 | 0.0/4 | 0/10 | 0 | -| 9B | UD-Q4_K_XL | f16 | 10/10 | 6.7s | 2.0 | 0/10 | 5/10 | 0.0/4 | 0/10 | 3 | -| 9B | UD-Q4_K_XL | bf16 | 9/10 | 49.5s | 2.4 | 0/10 | 5/10 | 0.0/4 | 0/10 | 1 | -| 9B | Q8_0 | f16 | 10/10 | 7.4s | 2.5 | 0/10 | 5/10 | 0.0/4 | 0/10 | 2 | -| 9B | Q8_0 | bf16 | 10/10 | 10.4s | 2.7 | 0/10 | 6/10 | 1.0/4 | 2/10 | 15 | -| **27B** | **UD-Q4_K_XL** | **bf16** | **9/10** | **131.1s** | **13.8** | **0/10** | **7/10** | **2.7/4** | **6/10** | **27** | -| 27B | UD-Q4_K_XL | f16 | 7/10 | 201.6s | 14.1 | 0/10 | 8/10 | 2.0/4 | 5/10 | 26 | -| 27B | Q8_0 | f16 | 4/10 | 312.5s | 16.0 | 1/10 | 10/10 | 2.4/4 | 6/10 | 28 | -| 27B | Q8_0 | bf16 | 5/10 | 258.4s | 16.5 | 2/10 | 10/10 | 0.9/4 | 1/10 | 27 | -| 35B-A3B | UD-Q4_K_XL | f16 | 3/10 | 353.6s | 14.7 | 1/10 | 6/10 | 1.2/4 | 3/10 | 27 | -| 35B-A3B | UD-Q4_K_XL | bf16 | 3/10 | 356.2s | 17.2 | 1/10 | 8/10 | 1.6/4 | 4/10 | 27 | -| 35B-A3B | Q8_0 | f16 | 2/10 | 372.1s | 17.6 | 1/10 | 7/10 | 1.2/4 | 3/10 | 26 | -| 35B-A3B | Q8_0 | bf16 | 6/10 | 267.7s | 17.5 | 1/10 | 8/10 | 2.4/4 | 6/10 | 27 | - -**Column definitions:** -- **Peak3 Avg**: Average number of correct peak-#3 songs found per run (out of 4) -- **All 4/4**: Runs where all 4 correct songs were identified -- **Best Songs**: Maximum number of Billboard 2015 songs mentioned in any single run (out of 31 tracked) -- **URL Fetch**: Runs where the model used web_search with `url` parameter to fetch full page content - -## Key Findings - -1. **27B UD-Q4_K_XL + bf16 KV is the sweet spot.** 6/10 runs found all 4 correct songs, 0 XML leaks, 131s average. Best balance of accuracy, speed, and reliability. - -2. **Larger models use tools more effectively.** 27B and 35B-A3B models used 13-17 tool calls per query (vs 2-4 for 4B/9B), performing multiple searches and URL fetches to find the answer. - -3. **27B Q8_0 had the highest raw accuracy (6/10 all-4/4) but lower reliability** -- only 4/10 OK runs due to timeouts on long agentic chains. The UD-Q4_K_XL quant is more practical. - -4. **4B models were fastest (5-10s) but least accurate.** They occasionally found all 4 songs (2/10 best case) when they happened to fetch the right Wikipedia page. - -5. **9B was surprisingly weaker than 4B on this task.** It used fewer tool calls and rarely extracted song data from fetched pages. The 9B model may need higher temperature or different prompting for this specific task type. - -6. **35B-A3B had reliability issues.** Most runs timed out or errored due to slow per-token generation with many tool iterations. When it completed (2-6/10 OK), accuracy was comparable to 27B. - -7. **bf16 KV cache had mixed effects.** For 27B it improved both speed (131s vs 202s) and accuracy (6/10 vs 5/10 all-4/4). For smaller models it had no consistent benefit. - -8. **XML leaks are nearly eliminated.** 0/10 for all 4B and 9B configs, and only 1-2/10 for the largest models (which generate much more text in complex agentic loops). - -## Before vs After (4B UD-Q4_K_XL, f16 KV) - -| Metric | Before Changes | After Changes | -|--------|---------------|---------------| -| XML leaks | 10/10 | 0/10 | -| URL fetches | 0/10 | 4/10 | -| Peak3 accuracy | 0.0/4 | 0.8/4 | -| Runs with all 4 songs | 0/10 | 2/10 | -| Avg time | 12.3s | 9.8s | From dff2fd3c3fb88c7d2799499e4ee7fa4d1c3e7b44 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 10:55:53 +0000 Subject: [PATCH 03/14] Replace html2text with builtin HTML-to-Markdown converter Drop the external html2text (GPL-3.0) dependency and its regex fallback. Add _html_to_md.py (~190 lines, stdlib only) using html.parser.HTMLParser that handles headings, links, bold/italic, lists, tables, blockquotes, code blocks, and entity decoding. Strips script/style/head tags entirely. --- studio/backend/core/inference/_html_to_md.py | 276 +++++++++++++++++++ studio/backend/core/inference/tools.py | 27 +- 2 files changed, 280 insertions(+), 23 deletions(-) create mode 100644 studio/backend/core/inference/_html_to_md.py diff --git a/studio/backend/core/inference/_html_to_md.py b/studio/backend/core/inference/_html_to_md.py new file mode 100644 index 0000000000..4489e46610 --- /dev/null +++ b/studio/backend/core/inference/_html_to_md.py @@ -0,0 +1,276 @@ +# SPDX-License-Identifier: AGPL-3.0-only +# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0 + +""" +Minimal HTML-to-Markdown converter using only the standard library. + +Replaces the external ``html2text`` (GPL-3.0) dependency with a ~180-line +``html.parser.HTMLParser`` subclass. Covers headings, links, bold/italic, +lists, tables, blockquotes, code blocks, and entity decoding. +""" + +from __future__ import annotations + +import html +import re +from html.parser import HTMLParser + +__all__ = ["html_to_markdown"] + +_SKIP_TAGS = frozenset({"script", "style", "head", "noscript", "svg", "math"}) +_BLOCK_TAGS = frozenset({ + "p", "div", "section", "article", "header", "footer", "main", "aside", + "nav", "figure", "figcaption", "details", "summary", "hr", +}) +_HEADING_TAGS = frozenset({"h1", "h2", "h3", "h4", "h5", "h6"}) +_INLINE_EMPHASIS = {"strong": "**", "b": "**", "em": "*", "i": "*"} + + +class _MarkdownRenderer(HTMLParser): + """HTMLParser subclass that emits Markdown tokens into a list.""" + + def __init__(self): + super().__init__(convert_charrefs=False) + self._out: list[str] = [] + self._skip_depth: int = 0 + + # Link state + self._link_href: str | None = None + self._link_text_parts: list[str] = [] + self._in_link: bool = False + + # List state + self._list_stack: list[str] = [] # "ul" or "ol" + self._ol_counter: list[int] = [] + + # Table state + self._in_table: bool = False + self._current_row: list[str] = [] + self._cell_parts: list[str] = [] + self._in_cell: bool = False + self._header_row_done: bool = False + self._is_header_cell: bool = False + + # Pre/code state + self._in_pre: bool = False + self._pre_parts: list[str] = [] + + # Blockquote depth + self._bq_depth: int = 0 + + # ------------------------------------------------------------------ + def _emit(self, text: str) -> None: + if self._in_link: + self._link_text_parts.append(text) + elif self._in_cell: + self._cell_parts.append(text) + elif self._in_pre: + self._pre_parts.append(text) + else: + self._out.append(text) + + # ------------------------------------------------------------------ + # Tag handlers + # ------------------------------------------------------------------ + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + tag = tag.lower() + + if tag in _SKIP_TAGS: + self._skip_depth += 1 + return + if self._skip_depth: + return + + attr_dict = dict(attrs) + + if tag in _HEADING_TAGS: + level = int(tag[1]) + self._emit("\n\n" + "#" * level + " ") + + elif tag == "a": + self._link_href = attr_dict.get("href") + self._link_text_parts = [] + self._in_link = True + + elif tag in _INLINE_EMPHASIS: + self._emit(_INLINE_EMPHASIS[tag]) + + elif tag == "br": + self._emit("\n") + + elif tag in _BLOCK_TAGS: + self._emit("\n\n") + + elif tag == "hr": + self._emit("\n\n---\n\n") + + elif tag == "blockquote": + self._bq_depth += 1 + self._emit("\n\n" + "> " * self._bq_depth) + + elif tag == "ul": + self._list_stack.append("ul") + self._emit("\n") + + elif tag == "ol": + self._list_stack.append("ol") + self._ol_counter.append(0) + self._emit("\n") + + elif tag == "li": + indent = " " * max(0, len(self._list_stack) - 1) + if self._list_stack and self._list_stack[-1] == "ol": + self._ol_counter[-1] += 1 + self._emit(f"\n{indent}{self._ol_counter[-1]}. ") + else: + self._emit(f"\n{indent}* ") + + elif tag == "pre": + self._in_pre = True + self._pre_parts = [] + self._emit("\n\n```\n") + + elif tag == "code" and not self._in_pre: + self._emit("`") + + elif tag == "table": + self._in_table = True + self._header_row_done = False + self._emit("\n\n") + + elif tag == "tr": + self._current_row = [] + + elif tag in ("th", "td"): + self._cell_parts = [] + self._in_cell = True + self._is_header_cell = tag == "th" + + elif tag == "img": + alt = attr_dict.get("alt", "") + if alt: + self._emit(alt) + + def handle_endtag(self, tag: str) -> None: + tag = tag.lower() + + if tag in _SKIP_TAGS: + self._skip_depth = max(0, self._skip_depth - 1) + return + if self._skip_depth: + return + + if tag in _HEADING_TAGS: + self._emit("\n\n") + + elif tag == "a": + text = "".join(self._link_text_parts).strip() + href = self._link_href or "" + self._in_link = False + if href and text: + self._emit(f"[{text}]({href})") + elif text: + self._emit(text) + + elif tag in _INLINE_EMPHASIS: + self._emit(_INLINE_EMPHASIS[tag]) + + elif tag in _BLOCK_TAGS: + self._emit("\n\n") + + elif tag == "blockquote": + self._bq_depth = max(0, self._bq_depth - 1) + self._emit("\n\n") + + elif tag == "ul": + if self._list_stack and self._list_stack[-1] == "ul": + self._list_stack.pop() + self._emit("\n") + + elif tag == "ol": + if self._list_stack and self._list_stack[-1] == "ol": + self._list_stack.pop() + if self._ol_counter: + self._ol_counter.pop() + self._emit("\n") + + elif tag == "pre": + raw = "".join(self._pre_parts) + self._out.append(raw) + self._in_pre = False + self._emit("\n```\n\n") + + elif tag == "code" and not self._in_pre: + self._emit("`") + + elif tag in ("th", "td"): + self._in_cell = False + cell_text = "".join(self._cell_parts).strip() + self._current_row.append(cell_text) + + elif tag == "tr": + if self._current_row: + line = "| " + " | ".join(self._current_row) + " |" + self._emit(line + "\n") + if self._is_header_cell and not self._header_row_done: + sep = "| " + " | ".join("---" for _ in self._current_row) + " |" + self._emit(sep + "\n") + self._header_row_done = True + self._current_row = [] + self._is_header_cell = False + + elif tag == "table": + self._in_table = False + self._emit("\n") + + # ------------------------------------------------------------------ + # Text / entity handlers + # ------------------------------------------------------------------ + def handle_data(self, data: str) -> None: + if self._skip_depth: + return + if self._in_pre: + self._pre_parts.append(data) + return + # Collapse whitespace for non-pre content + text = re.sub(r"[ \t]+", " ", data) + self._emit(text) + + def handle_entityref(self, name: str) -> None: + if self._skip_depth: + return + self._emit(html.unescape(f"&{name};")) + + def handle_charref(self, name: str) -> None: + if self._skip_depth: + return + self._emit(html.unescape(f"&#{name};")) + + +# ------------------------------------------------------------------ +# Post-processing +# ------------------------------------------------------------------ +def _cleanup(text: str) -> str: + """Normalize whitespace and blank lines in the final output.""" + # Collapse runs of 3+ newlines into 2 + text = re.sub(r"\n{3,}", "\n\n", text) + # Remove trailing spaces on each line + text = re.sub(r" +$", "", text, flags=re.MULTILINE) + return text.strip() + + +# ------------------------------------------------------------------ +# Public API +# ------------------------------------------------------------------ +def html_to_markdown(source_html: str) -> str: + """Convert an HTML string to Markdown. + + Handles headings, links, bold/italic, lists (ordered and unordered), + tables, blockquotes, code blocks, and HTML entities. ``