-
-
Notifications
You must be signed in to change notification settings - Fork 6.1k
Studio: simplify tool-call dedup and replace html2text with builtin converter #4722
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
d9380bb
cc8fbbd
dff2fd3
40cd783
b304264
70341be
2046c5c
c99f828
2c27165
f779091
ad18902
b07afb5
d398489
24b857f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,290 @@ | ||||||
| # SPDX-License-Identifier: AGPL-3.0-only | ||||||
| # Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0 | ||||||
|
|
||||||
| """ | ||||||
| Minimal HTML-to-Markdown converter using only the standard library. | ||||||
|
|
||||||
| Replaces the external ``html2text`` (GPL-3.0) dependency with a ~180-line | ||||||
| ``html.parser.HTMLParser`` subclass. Covers headings, links, bold/italic, | ||||||
| lists, tables, blockquotes, code blocks, and entity decoding. | ||||||
| """ | ||||||
|
|
||||||
| from __future__ import annotations | ||||||
|
|
||||||
| import html | ||||||
| import re | ||||||
| from html.parser import HTMLParser | ||||||
|
|
||||||
| __all__ = ["html_to_markdown"] | ||||||
|
|
||||||
| _SKIP_TAGS = frozenset({"script", "style", "head", "noscript", "svg", "math"}) | ||||||
| _BLOCK_TAGS = frozenset( | ||||||
| { | ||||||
| "p", | ||||||
| "div", | ||||||
| "section", | ||||||
| "article", | ||||||
| "header", | ||||||
| "footer", | ||||||
| "main", | ||||||
| "aside", | ||||||
| "nav", | ||||||
| "figure", | ||||||
| "figcaption", | ||||||
| "details", | ||||||
| "summary", | ||||||
| "hr", | ||||||
| } | ||||||
| ) | ||||||
| _HEADING_TAGS = frozenset({"h1", "h2", "h3", "h4", "h5", "h6"}) | ||||||
| _INLINE_EMPHASIS = {"strong": "**", "b": "**", "em": "*", "i": "*"} | ||||||
|
|
||||||
|
|
||||||
| class _MarkdownRenderer(HTMLParser): | ||||||
| """HTMLParser subclass that emits Markdown tokens into a list.""" | ||||||
|
|
||||||
| def __init__(self): | ||||||
| super().__init__(convert_charrefs = False) | ||||||
| self._out: list[str] = [] | ||||||
| self._skip_depth: int = 0 | ||||||
|
|
||||||
| # Link state | ||||||
| self._link_href: str | None = None | ||||||
| self._link_text_parts: list[str] = [] | ||||||
| self._in_link: bool = False | ||||||
|
|
||||||
| # List state | ||||||
| self._list_stack: list[str] = [] # "ul" or "ol" | ||||||
| self._ol_counter: list[int] = [] | ||||||
|
|
||||||
| # Table state | ||||||
| self._in_table: bool = False | ||||||
| self._current_row: list[str] = [] | ||||||
| self._cell_parts: list[str] = [] | ||||||
| self._in_cell: bool = False | ||||||
| self._header_row_done: bool = False | ||||||
| self._is_header_cell: bool = False | ||||||
|
|
||||||
| # Pre/code state | ||||||
| self._in_pre: bool = False | ||||||
| self._pre_parts: list[str] = [] | ||||||
|
|
||||||
| # Blockquote depth | ||||||
| self._bq_depth: int = 0 | ||||||
|
|
||||||
| # ------------------------------------------------------------------ | ||||||
| def _emit(self, text: str) -> None: | ||||||
| if self._in_link: | ||||||
| self._link_text_parts.append(text) | ||||||
| elif self._in_cell: | ||||||
| self._cell_parts.append(text) | ||||||
| elif self._in_pre: | ||||||
| self._pre_parts.append(text) | ||||||
| else: | ||||||
| self._out.append(text) | ||||||
|
|
||||||
| # ------------------------------------------------------------------ | ||||||
| # Tag handlers | ||||||
| # ------------------------------------------------------------------ | ||||||
| def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: | ||||||
| tag = tag.lower() | ||||||
|
|
||||||
| if tag in _SKIP_TAGS: | ||||||
| self._skip_depth += 1 | ||||||
| return | ||||||
|
Comment on lines
+156
to
+158
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎. |
||||||
| if self._skip_depth: | ||||||
| return | ||||||
|
|
||||||
| attr_dict = dict(attrs) | ||||||
|
|
||||||
| if tag in _HEADING_TAGS: | ||||||
| level = int(tag[1]) | ||||||
| self._emit("\n\n" + "#" * level + " ") | ||||||
|
|
||||||
| elif tag == "a": | ||||||
| self._link_href = attr_dict.get("href") | ||||||
| self._link_text_parts = [] | ||||||
| self._in_link = True | ||||||
|
|
||||||
| elif tag in _INLINE_EMPHASIS: | ||||||
| self._emit(_INLINE_EMPHASIS[tag]) | ||||||
|
|
||||||
| elif tag == "br": | ||||||
| self._emit("\n") | ||||||
|
|
||||||
| elif tag in _BLOCK_TAGS: | ||||||
| self._emit("\n\n") | ||||||
|
|
||||||
| elif tag == "hr": | ||||||
| self._emit("\n\n---\n\n") | ||||||
|
|
||||||
| elif tag == "blockquote": | ||||||
| self._bq_depth += 1 | ||||||
| self._emit("\n\n" + "> " * self._bq_depth) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Markdown blockquotes require the > prefix on every line of the quoted content. The current implementation only emits the prefix once at the start of the blockquote. For multi-line or multi-paragraph content within a blockquote, subsequent lines will lack the necessary prefix, breaking the formatting. |
||||||
|
|
||||||
| elif tag == "ul": | ||||||
| self._list_stack.append("ul") | ||||||
| self._emit("\n") | ||||||
|
|
||||||
| elif tag == "ol": | ||||||
| self._list_stack.append("ol") | ||||||
| self._ol_counter.append(0) | ||||||
| self._emit("\n") | ||||||
|
|
||||||
| elif tag == "li": | ||||||
| indent = " " * max(0, len(self._list_stack) - 1) | ||||||
| if self._list_stack and self._list_stack[-1] == "ol": | ||||||
| self._ol_counter[-1] += 1 | ||||||
| self._emit(f"\n{indent}{self._ol_counter[-1]}. ") | ||||||
| else: | ||||||
| self._emit(f"\n{indent}* ") | ||||||
|
|
||||||
| elif tag == "pre": | ||||||
| self._in_pre = True | ||||||
| self._pre_parts = [] | ||||||
| self._emit("\n\n```\n") | ||||||
|
|
||||||
| elif tag == "code" and not self._in_pre: | ||||||
| self._emit("`") | ||||||
|
|
||||||
| elif tag == "table": | ||||||
| self._in_table = True | ||||||
| self._header_row_done = False | ||||||
| self._emit("\n\n") | ||||||
|
|
||||||
| elif tag == "tr": | ||||||
| self._current_row = [] | ||||||
|
|
||||||
| elif tag in ("th", "td"): | ||||||
| self._cell_parts = [] | ||||||
| self._in_cell = True | ||||||
| self._is_header_cell = tag == "th" | ||||||
|
|
||||||
| elif tag == "img": | ||||||
| alt = attr_dict.get("alt", "") | ||||||
| if alt: | ||||||
| self._emit(alt) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||
|
|
||||||
| def handle_endtag(self, tag: str) -> None: | ||||||
| tag = tag.lower() | ||||||
|
|
||||||
| if tag in _SKIP_TAGS: | ||||||
| self._skip_depth = max(0, self._skip_depth - 1) | ||||||
| return | ||||||
| if self._skip_depth: | ||||||
| return | ||||||
|
|
||||||
| if tag in _HEADING_TAGS: | ||||||
| self._emit("\n\n") | ||||||
|
|
||||||
| elif tag == "a": | ||||||
| text = "".join(self._link_text_parts).strip() | ||||||
| href = self._link_href or "" | ||||||
| self._in_link = False | ||||||
| if href and text: | ||||||
| self._emit(f"[{text}]({href})") | ||||||
| elif text: | ||||||
| self._emit(text) | ||||||
|
|
||||||
| elif tag in _INLINE_EMPHASIS: | ||||||
| self._emit(_INLINE_EMPHASIS[tag]) | ||||||
|
|
||||||
| elif tag in _BLOCK_TAGS: | ||||||
| self._emit("\n\n") | ||||||
|
|
||||||
| elif tag == "blockquote": | ||||||
| self._bq_depth = max(0, self._bq_depth - 1) | ||||||
| self._emit("\n\n") | ||||||
|
|
||||||
| elif tag == "ul": | ||||||
| if self._list_stack and self._list_stack[-1] == "ul": | ||||||
| self._list_stack.pop() | ||||||
| self._emit("\n") | ||||||
|
|
||||||
| elif tag == "ol": | ||||||
| if self._list_stack and self._list_stack[-1] == "ol": | ||||||
| self._list_stack.pop() | ||||||
| if self._ol_counter: | ||||||
| self._ol_counter.pop() | ||||||
| self._emit("\n") | ||||||
|
|
||||||
| elif tag == "pre": | ||||||
| raw = "".join(self._pre_parts) | ||||||
| self._out.append(raw) | ||||||
| self._in_pre = False | ||||||
| self._emit("\n```\n\n") | ||||||
|
|
||||||
| elif tag == "code" and not self._in_pre: | ||||||
| self._emit("`") | ||||||
|
|
||||||
| elif tag in ("th", "td"): | ||||||
| self._in_cell = False | ||||||
| cell_text = "".join(self._cell_parts).strip() | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Markdown tables do not support newlines within cells. If the HTML content contains
Suggested change
|
||||||
| self._current_row.append(cell_text) | ||||||
|
|
||||||
| elif tag == "tr": | ||||||
| if self._current_row: | ||||||
| line = "| " + " | ".join(self._current_row) + " |" | ||||||
| self._emit(line + "\n") | ||||||
| if self._is_header_cell and not self._header_row_done: | ||||||
| sep = "| " + " | ".join("---" for _ in self._current_row) + " |" | ||||||
| self._emit(sep + "\n") | ||||||
| self._header_row_done = True | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The logic for emitting a table separator is currently fragile as it depends on self._is_header_cell, which only reflects the state of the last cell in the row. If a row contains mixed and tags, or if the header row uses only tags, the separator might not be emitted, resulting in an invalid Markdown table. It's recommended to track if the row contains any header cells or to always emit a separator after the first row of every table. |
||||||
| self._current_row = [] | ||||||
| self._is_header_cell = False | ||||||
|
|
||||||
| elif tag == "table": | ||||||
| self._in_table = False | ||||||
| self._emit("\n") | ||||||
|
|
||||||
| # ------------------------------------------------------------------ | ||||||
| # Text / entity handlers | ||||||
| # ------------------------------------------------------------------ | ||||||
| def handle_data(self, data: str) -> None: | ||||||
| if self._skip_depth: | ||||||
| return | ||||||
| if self._in_pre: | ||||||
| self._pre_parts.append(data) | ||||||
| return | ||||||
| # Collapse whitespace for non-pre content | ||||||
| text = re.sub(r"[ \t]+", " ", data) | ||||||
| self._emit(text) | ||||||
|
|
||||||
| def handle_entityref(self, name: str) -> None: | ||||||
| if self._skip_depth: | ||||||
| return | ||||||
| self._emit(html.unescape(f"&{name};")) | ||||||
|
|
||||||
| def handle_charref(self, name: str) -> None: | ||||||
| if self._skip_depth: | ||||||
| return | ||||||
| self._emit(html.unescape(f"&#{name};")) | ||||||
|
|
||||||
|
|
||||||
| # ------------------------------------------------------------------ | ||||||
| # Post-processing | ||||||
| # ------------------------------------------------------------------ | ||||||
| def _cleanup(text: str) -> str: | ||||||
| """Normalize whitespace and blank lines in the final output.""" | ||||||
| # Collapse runs of 3+ newlines into 2 | ||||||
| text = re.sub(r"\n{3,}", "\n\n", text) | ||||||
| # Remove trailing spaces on each line | ||||||
| text = re.sub(r" +$", "", text, flags = re.MULTILINE) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||
| return text.strip() | ||||||
|
|
||||||
|
|
||||||
| # ------------------------------------------------------------------ | ||||||
| # Public API | ||||||
| # ------------------------------------------------------------------ | ||||||
| def html_to_markdown(source_html: str) -> str: | ||||||
| """Convert an HTML string to Markdown. | ||||||
|
|
||||||
| Handles headings, links, bold/italic, lists (ordered and unordered), | ||||||
| tables, blockquotes, code blocks, and HTML entities. ``<script>``, | ||||||
| ``<style>``, and ``<head>`` sections are stripped entirely. | ||||||
| """ | ||||||
| renderer = _MarkdownRenderer() | ||||||
| renderer.feed(source_html) | ||||||
| renderer.close() | ||||||
| raw = "".join(renderer._out) | ||||||
|
Comment on lines
+436
to
+438
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎. |
||||||
| return _cleanup(raw) | ||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -10,7 +10,6 @@ | |||||
|
|
||||||
| import atexit | ||||||
| import contextlib | ||||||
| import hashlib | ||||||
| import json | ||||||
| import re | ||||||
| import struct | ||||||
|
|
@@ -2181,22 +2180,6 @@ def _strip_tool_markup(text: str, *, final: bool = False) -> str: | |||||
| # identical call succeeded). | ||||||
| _tool_call_history: list[tuple[str, bool]] = [] # (key, failed) | ||||||
|
|
||||||
| def _tool_call_key(name: str, args: dict) -> str: | ||||||
| raw = json.dumps({"t": name, "a": args}, sort_keys = True) | ||||||
| return hashlib.md5(raw.encode()).hexdigest() | ||||||
|
|
||||||
| def _is_duplicate_call(name: str, args: dict) -> bool: | ||||||
| """Block if the immediately previous call was identical and succeeded.""" | ||||||
| if not _tool_call_history: | ||||||
| return False | ||||||
| key = _tool_call_key(name, args) | ||||||
| last_key, last_failed = _tool_call_history[-1] | ||||||
| return last_key == key and not last_failed | ||||||
|
|
||||||
| def _record_tool_call(name: str, args: dict, failed: bool) -> None: | ||||||
| key = _tool_call_key(name, args) | ||||||
| _tool_call_history.append((key, failed)) | ||||||
|
|
||||||
| for iteration in range(max_tool_iterations): | ||||||
| if cancel_event is not None and cancel_event.is_set(): | ||||||
| return | ||||||
|
|
@@ -2692,7 +2675,9 @@ def _record_tool_call(name: str, args: dict, failed: bool) -> None: | |||||
| } | ||||||
|
|
||||||
| # ── Duplicate call detection ────────────── | ||||||
| if _is_duplicate_call(tool_name, arguments): | ||||||
| _tc_key = tool_name + json.dumps(arguments, sort_keys = True) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using str(arguments) for the duplicate tool-call key is unreliable because dictionary string representation depends on key insertion order. Logical duplicates with different key orders (which can occur during LLM generation and parsing) will result in different keys, causing the deduplication check to fail. Using json.dumps(arguments, sort_keys=True) provides a stable and robust key.
Suggested change
|
||||||
| _prev = _tool_call_history[-1] if _tool_call_history else None | ||||||
| if _prev and _prev[0] == _tc_key and not _prev[1]: | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Building the dedup key with Useful? React with 👍 / 👎. |
||||||
| result = ( | ||||||
| "You already made this exact call. " | ||||||
| "Do not repeat the same tool call. " | ||||||
|
|
@@ -2734,7 +2719,7 @@ def _record_tool_call(name: str, args: dict, failed: bool) -> None: | |||||
| _is_error = isinstance(result, str) and result.lstrip().startswith( | ||||||
| _error_prefixes | ||||||
| ) | ||||||
| _record_tool_call(tool_name, arguments, failed = _is_error) | ||||||
| _tool_call_history.append((_tc_key, _is_error)) | ||||||
| _result_content = result | ||||||
| if _is_error: | ||||||
| _result_content = ( | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The "hr" tag should be removed from the _BLOCK_TAGS set. Because it is included here, the specific handling for hr at line 118 in handle_starttag is unreachable, as the elif tag in _BLOCK_TAGS check at line 115 will match first and only emit newlines.