diff --git a/CHANGELOG.md b/CHANGELOG.md
index e982a0b..27a0f46 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,10 @@ and this project attempts to adhere to [Semantic Versioning](https://semver.org/
## [Unreleased]
+### Changed
+
+- Added normalization of anchor-wrapped headings in the base HTML preprocessor for better Markdown output
+
## [0.4.1]
### Fixed
diff --git a/noxfile.py b/noxfile.py
index 88420bf..e0be435 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -88,7 +88,7 @@ def coverage(session):
# 2 -> code coverage percent unmet
success_codes = [0, 2]
- report_cmd = ["python", "-m", "coverage", "report"]
+ report_cmd = ["python", "-m", "coverage", "report", "--show-missing"]
session.run(*report_cmd, success_codes=success_codes)
if summary := os.getenv("GITHUB_STEP_SUMMARY"):
diff --git a/src/docs2markdown/html.py b/src/docs2markdown/html.py
index e136e76..7ba1343 100644
--- a/src/docs2markdown/html.py
+++ b/src/docs2markdown/html.py
@@ -16,6 +16,49 @@ def __init__(self, html: str) -> None:
self.content_selectors: list[str] = self.get_content_selectors()
self.generic_chrome_selectors: list[str] = self.get_generic_chrome_selectors()
+ def process_a(self, tag: Tag) -> None:
+ parent = tag.parent
+
+ # normalize anchors that directly wrap a single heading element
+ if getattr(parent, "name", None) in {"h1", "h2", "h3", "h4", "h5", "h6"}:
+ return
+
+ children = [
+ child
+ for child in tag.children
+ if (getattr(child, "name", None) is not None) or str(child).strip()
+ ]
+
+ if len(children) != 1:
+ return
+
+ heading = children[0]
+ if not isinstance(heading, Tag) or heading.name not in {
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ }:
+ return
+
+ # Restructure from:
+ #