def process_a(self, tag: Tag) -> None:
    """Move an anchor that wraps a lone heading inside that heading.

    Rewrites ``<a ...><hN>Text</hN></a>`` in place as
    ``<hN><a ...>Text</a></hN>``, copying every attribute of the original
    anchor onto the new inner anchor. The tree is left untouched when the
    anchor has no parent, already sits directly inside a heading, or wraps
    anything other than exactly one heading element (text nodes that are
    only whitespace are ignored when counting children).

    Args:
        tag: The ``<a>`` element to inspect and possibly restructure.
    """
    heading_names = frozenset({"h1", "h2", "h3", "h4", "h5", "h6"})

    parent = tag.parent
    if parent is None:
        # A detached anchor cannot be swapped for its heading: Beautiful
        # Soup refuses to replace an element that is not part of a tree,
        # and by then the heading would already have been pulled out.
        return

    # An anchor directly inside a heading is already in the target shape.
    if getattr(parent, "name", None) in heading_names:
        return

    # Keep children that carry a tag name, or whose string form is not
    # just whitespace.
    children = [
        child
        for child in tag.children
        if getattr(child, "name", None) is not None or str(child).strip()
    ]
    if len(children) != 1:
        return

    heading = children[0]
    if not isinstance(heading, Tag) or heading.name not in heading_names:
        return

    # Restructure from:
    #   <a href="..."><h2>Text</h2></a>
    # to:
    #   <h2><a href="...">Text</a></h2>
    heading.extract()
    tag.replace_with(heading)

    anchor = self.soup.new_tag("a")
    for attr, value in list(tag.attrs.items()):
        anchor[attr] = value

    # Iterate over a snapshot: extract() mutates heading.contents.
    for child in list(heading.contents):
        anchor.append(child.extract())

    heading.append(anchor)
def test_base_process_a_wraps_heading_anchor_structure():
    """An anchor wrapping a lone heading ends up inside that heading."""
    # Newlines are stripped before parsing so the tree holds no
    # whitespace-only text nodes between the tags.
    html = """
<html>
<body>
<a href="#let-they-who-are-without-syn" id="let-they-who-are-without-syn">
<h2>Let they who are without syn…</h2>
</a>
</body>
</html>
"""

    processor = BaseHtmlPreprocessor(html.replace("\n", ""))
    body = processor.soup.body

    anchor = body.find("a")

    processor.process_a(anchor)

    tag_children = [child for child in body.contents if isinstance(child, Tag)]

    # The heading must have taken the anchor's place as body's only element.
    assert len(tag_children) == 1

    heading = tag_children[0]

    assert isinstance(heading, Tag)
    assert heading.name == "h2"
    assert len(heading.contents) == 1

    inner_anchor = heading.contents[0]

    # The anchor now lives inside the heading and keeps its attributes and text.
    assert isinstance(inner_anchor, Tag)
    assert inner_anchor.name == "a"
    assert inner_anchor.get("href") == "#let-they-who-are-without-syn"
    assert inner_anchor.get("id") == "let-they-who-are-without-syn"
    assert inner_anchor.get_text(strip=True) == "Let they who are without syn…"