Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
290 changes: 290 additions & 0 deletions studio/backend/core/inference/_html_to_md.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
# SPDX-License-Identifier: AGPL-3.0-only
# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0

"""
Minimal HTML-to-Markdown converter using only the standard library.

Replaces the external ``html2text`` (GPL-3.0) dependency with a ~180-line
``html.parser.HTMLParser`` subclass. Covers headings, links, bold/italic,
lists, tables, blockquotes, code blocks, and entity decoding.
"""

from __future__ import annotations

import html
import re
from html.parser import HTMLParser

__all__ = ["html_to_markdown"]

_SKIP_TAGS = frozenset({"script", "style", "head", "noscript", "svg", "math"})
_BLOCK_TAGS = frozenset(
{
"p",
"div",
"section",
"article",
"header",
"footer",
"main",
"aside",
"nav",
"figure",
"figcaption",
"details",
"summary",
"hr",

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The "hr" tag should be removed from the _BLOCK_TAGS set. Because it is included here, the specific handling for hr at line 118 in handle_starttag is unreachable, as the elif tag in _BLOCK_TAGS check at line 115 will match first and only emit newlines.

}
)
_HEADING_TAGS = frozenset({"h1", "h2", "h3", "h4", "h5", "h6"})
_INLINE_EMPHASIS = {"strong": "**", "b": "**", "em": "*", "i": "*"}


class _MarkdownRenderer(HTMLParser):
"""HTMLParser subclass that emits Markdown tokens into a list."""

def __init__(self):
super().__init__(convert_charrefs = False)
self._out: list[str] = []
self._skip_depth: int = 0

# Link state
self._link_href: str | None = None
self._link_text_parts: list[str] = []
self._in_link: bool = False

# List state
self._list_stack: list[str] = [] # "ul" or "ol"
self._ol_counter: list[int] = []

# Table state
self._in_table: bool = False
self._current_row: list[str] = []
self._cell_parts: list[str] = []
self._in_cell: bool = False
self._header_row_done: bool = False
self._is_header_cell: bool = False

# Pre/code state
self._in_pre: bool = False
self._pre_parts: list[str] = []

# Blockquote depth
self._bq_depth: int = 0

# ------------------------------------------------------------------
def _emit(self, text: str) -> None:
if self._in_link:
self._link_text_parts.append(text)
elif self._in_cell:
self._cell_parts.append(text)
elif self._in_pre:
self._pre_parts.append(text)
else:
self._out.append(text)

# ------------------------------------------------------------------
# Tag handlers
# ------------------------------------------------------------------
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
tag = tag.lower()

if tag in _SKIP_TAGS:
self._skip_depth += 1
return
Comment on lines +156 to +158

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Stop dropping body content when </head> is omitted

html_to_markdown() treats <head> as a skip-depth region, but HTML end tags for head are optional; many pages validly switch to <body> without emitting </head>. In this branch, _skip_depth is only decremented on explicit end tags, so once <head> is seen, all subsequent body tags/data are ignored and _fetch_page_text() can return (page returned no readable text) for otherwise normal documents. This is a functional regression in page extraction for common HTML5 markup.

Useful? React with 👍 / 👎.

if self._skip_depth:
return

attr_dict = dict(attrs)

if tag in _HEADING_TAGS:
level = int(tag[1])
self._emit("\n\n" + "#" * level + " ")

elif tag == "a":
self._link_href = attr_dict.get("href")
self._link_text_parts = []
self._in_link = True

elif tag in _INLINE_EMPHASIS:
self._emit(_INLINE_EMPHASIS[tag])

elif tag == "br":
self._emit("\n")

elif tag in _BLOCK_TAGS:
self._emit("\n\n")

elif tag == "hr":
self._emit("\n\n---\n\n")

elif tag == "blockquote":
self._bq_depth += 1
self._emit("\n\n" + "> " * self._bq_depth)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Markdown blockquotes require the > prefix on every line of the quoted content. The current implementation only emits the prefix once at the start of the blockquote. For multi-line or multi-paragraph content within a blockquote, subsequent lines will lack the necessary prefix, breaking the formatting.


elif tag == "ul":
self._list_stack.append("ul")
self._emit("\n")

elif tag == "ol":
self._list_stack.append("ol")
self._ol_counter.append(0)
self._emit("\n")

elif tag == "li":
indent = " " * max(0, len(self._list_stack) - 1)
if self._list_stack and self._list_stack[-1] == "ol":
self._ol_counter[-1] += 1
self._emit(f"\n{indent}{self._ol_counter[-1]}. ")
else:
self._emit(f"\n{indent}* ")

elif tag == "pre":
self._in_pre = True
self._pre_parts = []
self._emit("\n\n```\n")

elif tag == "code" and not self._in_pre:
self._emit("`")

elif tag == "table":
self._in_table = True
self._header_row_done = False
self._emit("\n\n")

elif tag == "tr":
self._current_row = []

elif tag in ("th", "td"):
self._cell_parts = []
self._in_cell = True
self._is_header_cell = tag == "th"

elif tag == "img":
alt = attr_dict.get("alt", "")
if alt:
self._emit(alt)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The converter currently only emits the alt text for images, which loses the image URL. It is generally better to preserve the full Markdown image syntax `![alt](src)` to maintain the original context and references.

Suggested change
self._emit(alt)
self._emit(f"![{alt}]({attr_dict.get('src', '')})")


def handle_endtag(self, tag: str) -> None:
tag = tag.lower()

if tag in _SKIP_TAGS:
self._skip_depth = max(0, self._skip_depth - 1)
return
if self._skip_depth:
return

if tag in _HEADING_TAGS:
self._emit("\n\n")

elif tag == "a":
text = "".join(self._link_text_parts).strip()
href = self._link_href or ""
self._in_link = False
if href and text:
self._emit(f"[{text}]({href})")
elif text:
self._emit(text)

elif tag in _INLINE_EMPHASIS:
self._emit(_INLINE_EMPHASIS[tag])

elif tag in _BLOCK_TAGS:
self._emit("\n\n")

elif tag == "blockquote":
self._bq_depth = max(0, self._bq_depth - 1)
self._emit("\n\n")

elif tag == "ul":
if self._list_stack and self._list_stack[-1] == "ul":
self._list_stack.pop()
self._emit("\n")

elif tag == "ol":
if self._list_stack and self._list_stack[-1] == "ol":
self._list_stack.pop()
if self._ol_counter:
self._ol_counter.pop()
self._emit("\n")

elif tag == "pre":
raw = "".join(self._pre_parts)
self._out.append(raw)
self._in_pre = False
self._emit("\n```\n\n")

elif tag == "code" and not self._in_pre:
self._emit("`")

elif tag in ("th", "td"):
self._in_cell = False
cell_text = "".join(self._cell_parts).strip()

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Markdown tables do not support newlines within cells. If the HTML content contains `<br>` tags or other block elements inside a cell, the resulting Markdown will contain newlines that break the table's row structure. It's recommended to replace newlines within cell content with spaces.

Suggested change
cell_text = "".join(self._cell_parts).strip()
cell_text = "".join(self._cell_parts).strip().replace("\n", " ")

self._current_row.append(cell_text)

elif tag == "tr":
if self._current_row:
line = "| " + " | ".join(self._current_row) + " |"
self._emit(line + "\n")
if self._is_header_cell and not self._header_row_done:
sep = "| " + " | ".join("---" for _ in self._current_row) + " |"
self._emit(sep + "\n")
self._header_row_done = True

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The logic for emitting a table separator is currently fragile as it depends on self._is_header_cell, which only reflects the state of the last cell in the row. If a row contains mixed `<th>` and `<td>` tags, or if the header row uses only `<td>` tags, the separator might not be emitted, resulting in an invalid Markdown table. It's recommended to track if the row contains any header cells or to always emit a separator after the first row of every table.

self._current_row = []
self._is_header_cell = False

elif tag == "table":
self._in_table = False
self._emit("\n")

# ------------------------------------------------------------------
# Text / entity handlers
# ------------------------------------------------------------------
def handle_data(self, data: str) -> None:
    """Handle a text node, normalising whitespace outside ``<pre>``.

    Text inside a skipped region is dropped. Text inside ``<pre>`` is
    buffered verbatim. Any other text has each run of HTML ASCII
    whitespace (space, tab, LF, FF, CR) collapsed to a single space
    before it is emitted.
    """
    if self._skip_depth:
        return
    if self._in_pre:
        self._pre_parts.append(data)
        return
    # Line breaks in HTML source are ordinary inter-word whitespace.
    # Leaving them in would split headings, list items and table rows
    # across Markdown lines. The explicit character class (rather than
    # ``\s``) keeps literal non-breaking spaces untouched.
    self._emit(re.sub(r"[ \t\r\n\f]+", " ", data))

def handle_entityref(self, name: str) -> None:
    """Decode a named entity reference (``&name;``) and emit the result.

    Nothing is emitted while inside a skipped region.
    """
    if not self._skip_depth:
        reference = f"&{name};"
        self._emit(html.unescape(reference))

def handle_charref(self, name: str) -> None:
    """Decode a numeric character reference (``&#name;``) and emit it.

    Nothing is emitted while inside a skipped region.
    """
    if not self._skip_depth:
        reference = f"&#{name};"
        self._emit(html.unescape(reference))


# ------------------------------------------------------------------
# Post-processing
# ------------------------------------------------------------------
def _cleanup(text: str) -> str:
"""Normalize whitespace and blank lines in the final output."""
# Collapse runs of 3+ newlines into 2
text = re.sub(r"\n{3,}", "\n\n", text)
# Remove trailing spaces on each line
text = re.sub(r" +$", "", text, flags = re.MULTILINE)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The regex r" +$" only matches trailing spaces. It should be updated to r"[ \t]+$" to correctly handle and strip trailing tabs as well.

Suggested change
text = re.sub(r" +$", "", text, flags = re.MULTILINE)
text = re.sub(r"[ \t]+$", "", text, flags = re.MULTILINE)

return text.strip()


# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
def html_to_markdown(source_html: str) -> str:
"""Convert an HTML string to Markdown.

Handles headings, links, bold/italic, lists (ordered and unordered),
tables, blockquotes, code blocks, and HTML entities. ``<script>``,
``<style>``, and ``<head>`` sections are stripped entirely.
"""
renderer = _MarkdownRenderer()
renderer.feed(source_html)
renderer.close()
raw = "".join(renderer._out)
Comment on lines +436 to +438

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Flush pending parser buffers before returning markdown

html_to_markdown() returns only renderer._out, but _MarkdownRenderer stores content for open <a>, <pre>, and table cells in side buffers until their closing tags fire. Because _fetch_page_text() reads a capped byte count before parsing, inputs can end mid-tag; when that happens, those buffers are never emitted and the tail of the page is silently dropped (for example, a truncated open anchor causes subsequent text to vanish). Flush any still-open buffers on parser close/end-of-document so truncated HTML still yields usable text.

Useful? React with 👍 / 👎.

return _cleanup(raw)
23 changes: 4 additions & 19 deletions studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

import atexit
import contextlib
import hashlib
import json
import re
import struct
Expand Down Expand Up @@ -2181,22 +2180,6 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str:
# identical call succeeded).
_tool_call_history: list[tuple[str, bool]] = [] # (key, failed)

def _tool_call_key(name: str, args: dict) -> str:
raw = json.dumps({"t": name, "a": args}, sort_keys = True)
return hashlib.md5(raw.encode()).hexdigest()

def _is_duplicate_call(name: str, args: dict) -> bool:
"""Block if the immediately previous call was identical and succeeded."""
if not _tool_call_history:
return False
key = _tool_call_key(name, args)
last_key, last_failed = _tool_call_history[-1]
return last_key == key and not last_failed

def _record_tool_call(name: str, args: dict, failed: bool) -> None:
key = _tool_call_key(name, args)
_tool_call_history.append((key, failed))

for iteration in range(max_tool_iterations):
if cancel_event is not None and cancel_event.is_set():
return
Expand Down Expand Up @@ -2692,7 +2675,9 @@ def _record_tool_call(name: str, args: dict, failed: bool) -> None:
}

# ── Duplicate call detection ──────────────
if _is_duplicate_call(tool_name, arguments):
_tc_key = tool_name + json.dumps(arguments, sort_keys = True)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Using str(arguments) for the duplicate tool-call key is unreliable because dictionary string representation depends on key insertion order. Logical duplicates with different key orders (which can occur during LLM generation and parsing) will result in different keys, causing the deduplication check to fail. Using json.dumps(arguments, sort_keys=True) provides a stable and robust key.

Suggested change
_tc_key = tool_name + json.dumps(arguments, sort_keys = True)
_tc_key = tool_name + json.dumps(arguments, sort_keys = True)

_prev = _tool_call_history[-1] if _tool_call_history else None
if _prev and _prev[0] == _tc_key and not _prev[1]:

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Canonicalize args when building duplicate-call key

Building the dedup key with tool_name + str(arguments) makes loop prevention depend on Python dict insertion order rather than argument semantics. Fresh evidence in this revision is that each turn reparses tool arguments from model JSON (json.loads at lines 2637-2646), so the same call with reordered keys (e.g. {"q":...,"limit":...} then {"limit":...,"q":...}) produces different keys and bypasses the "previous successful call" guard, allowing repeated expensive or side-effectful tool executions.

Useful? React with 👍 / 👎.

result = (
"You already made this exact call. "
"Do not repeat the same tool call. "
Expand Down Expand Up @@ -2734,7 +2719,7 @@ def _record_tool_call(name: str, args: dict, failed: bool) -> None:
_is_error = isinstance(result, str) and result.lstrip().startswith(
_error_prefixes
)
_record_tool_call(tool_name, arguments, failed = _is_error)
_tool_call_history.append((_tc_key, _is_error))
_result_content = result
if _is_error:
_result_content = (
Expand Down
27 changes: 4 additions & 23 deletions studio/backend/core/inference/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,6 @@ def _fetch_page_text(
Blocks private/loopback/link-local targets (SSRF protection) and caps
the download size to avoid unbounded memory usage.
"""
import re as _re
from urllib.parse import urlparse

parsed = urlparse(url)
Expand Down Expand Up @@ -282,28 +281,10 @@ def redirect_request(self, req, fp, code, msg, headers, newurl):
except Exception as e:
return f"Failed to fetch URL: {e}"

# Convert HTML to text -- prefer html2text for clean markdown output
try:
import html2text as _h2t

converter = _h2t.HTML2Text()
converter.ignore_links = False
converter.ignore_images = True
converter.body_width = 0 # no wrapping
text = converter.handle(raw_html).strip()
except ImportError:
# Fallback: regex-based stripping
text = _re.sub(
r"<script[^>]*>.*?</script[^>]*>",
"",
raw_html,
flags = _re.DOTALL | _re.IGNORECASE,
)
text = _re.sub(
r"<style[^>]*>.*?</style[^>]*>", "", text, flags = _re.DOTALL | _re.IGNORECASE
)
text = _re.sub(r"<[^>]+>", " ", text)
text = _re.sub(r"\s+", " ", text).strip()
# Convert HTML to Markdown using the builtin converter (no external deps)
from ._html_to_md import html_to_markdown

text = html_to_markdown(raw_html)

if not text:
return "(page returned no readable text)"
Expand Down
Loading
Loading