joshuadavidthomas · joshuadavidthomas · Nov 20, 2025 · Nov 19, 2025 · Nov 20, 2025
@@ -18,6 +18,10 @@ and this project attempts to adhere to [Semantic Versioning](https://semver.org/
 
 ## [Unreleased]
 
+### Changed
+
+- Added normalization of anchor-wrapped headings (`<a><hN>…</hN></a>` becomes `<hN><a>…</a></hN>`) in the base HTML preprocessor for better Markdown output
+
 ## [0.4.1]
 
 ### Fixed

@@ -88,7 +88,7 @@ def coverage(session):
         # 2 -> code coverage percent unmet
         success_codes = [0, 2]
 
-        report_cmd = ["python", "-m", "coverage", "report"]
+        report_cmd = ["python", "-m", "coverage", "report", "--show-missing"]
         session.run(*report_cmd, success_codes=success_codes)
 
         if summary := os.getenv("GITHUB_STEP_SUMMARY"):

@@ -16,6 +16,49 @@ def __init__(self, html: str) -> None:
         self.content_selectors: list[str] = self.get_content_selectors()
         self.generic_chrome_selectors: list[str] = self.get_generic_chrome_selectors()
 
+    def process_a(self, tag: Tag) -> None:
+        """Normalize an anchor that wraps a single heading element.
+
+        Rewrites ``<a ...><hN>Text</hN></a>`` in place as
+        ``<hN><a ...>Text</a></hN>``, copying the original anchor's attributes
+        onto the new inner anchor. The tree is left untouched when the anchor
+        is detached, sits directly inside a heading, or contains anything
+        other than exactly one heading (whitespace-only text is ignored).
+
+        Args:
+            tag: The ``<a>`` element to inspect.
+        """
+        heading_names = {"h1", "h2", "h3", "h4", "h5", "h6"}
+        parent = tag.parent
+
+        # A detached anchor cannot be replaced in a tree; bail out before
+        # mutating anything instead of failing after the heading was extracted.
+        if parent is None:
+            return
+
+        # An anchor directly inside a heading needs no restructuring.
+        if getattr(parent, "name", None) in heading_names:
+            return
+
+        # Keep element children and non-blank text; drop whitespace-only text.
+        children = [
+            child
+            for child in tag.children
+            if getattr(child, "name", None) is not None or str(child).strip()
+        ]
+
+        if len(children) != 1:
+            return
+
+        heading = children[0]
+        if not isinstance(heading, Tag) or heading.name not in heading_names:
+            return
+
+        # Restructure from:
+        #   <a ...><hN>Text</hN></a>
+        # to:
+        #   <hN><a ...>Text</a></hN>
+        heading.extract()
+        tag.replace_with(heading)
+
+        anchor = self.soup.new_tag("a")
+        for attr, value in tag.attrs.items():
+            anchor[attr] = value
+
+        # Snapshot the contents first: moving a child mutates
+        # ``heading.contents`` while we iterate.
+        for child in list(heading.contents):
+            anchor.append(child.extract())
+
+        heading.append(anchor)
+
     def get_content_selectors(self) -> list[str]:
         return [
             "article#docs-content",

@@ -2,6 +2,7 @@
 
 import pytest
 from bs4 import BeautifulSoup
+from bs4 import Tag
 
 from docs2markdown.html import BaseHtmlPreprocessor
 from docs2markdown.html import SphinxHtmlPreprocessor
@@ -119,3 +120,40 @@ def test_sphinx_process_highlight_div_no_pre(soup):
     # Should return early without crashing, leaving div unchanged
     assert div.find("pre") is None
     assert div.find("code") is not None
+
+
+def test_base_process_a_wraps_heading_anchor_structure():
+    """An anchor wrapping a lone heading becomes a heading wrapping the anchor."""
+    html = """
+<html>
+<body>
+    <a id="let-they-who-are-without-syn" href="#let-they-who-are-without-syn" class="anchor">
+        <h2>Let they who are without syn…</h2>
+    </a>
+</body>
+</html>
+"""
+
+    # Newlines are stripped before parsing, as in the original fixture setup.
+    preprocessor = BaseHtmlPreprocessor(html.replace("\n", ""))
+    body = preprocessor.soup.body
+
+    preprocessor.process_a(body.find("a"))
+
+    # Only element children are of interest; text nodes in the body are ignored.
+    elements = [node for node in body.contents if isinstance(node, Tag)]
+    assert len(elements) == 1
+
+    (heading,) = elements
+    assert isinstance(heading, Tag)
+    assert heading.name == "h2"
+    assert len(heading.contents) == 1
+
+    (inner_anchor,) = heading.contents
+    assert isinstance(inner_anchor, Tag)
+    assert inner_anchor.name == "a"
+
+    # The anchor's attributes and the heading text must survive the swap.
+    assert inner_anchor.get("href") == "#let-they-who-are-without-syn"
+    assert inner_anchor.get("id") == "let-they-who-are-without-syn"
+    assert inner_anchor.get_text(strip=True) == "Let they who are without syn…"