Skip to content

Commit 5c6f7af

Browse files
authored
Macro rewrite (#2702)
* Move all remaining fixText() stuff (macros, curly aposes, and em dashes) into the parser. Manual test changes are now 100% expected. * Remove stray print() * Properly revert an unmatched macro to [] characters. * No need to check for a comment start, since that's already been parsed. * Fix the regex * Don't eagerly lowercase macros on parse. Recursively replace HTML-parsed macros. * Remove the zwsp from the em-dash, as browsers already allow a break opportunity there anyway. * Pipe previous token into the context, so apostrophe handling can be done after an element's end tag. * Pull out the 'turn metadata into properly-parsed text' into a function, invoke it on all the macros that need it. * Add parseTitle() for generating <title>-safe content, and use it (and parseText()) in more metadata. Add a printNodeTree() debugging tool. Rebase some tests, whose changes should all be known-good. * Actually create Doctype nodes, so I don't accidentally kick docs into quirks mode. * Rebase all the tests that look expected so far. * Whoops, restore header/footer addition. * Handle lists in HTML trees. * Correctly handle otherMetadata so it doesn't double-wrap with <dd> * update docs * Correct the line numbers downstream when I remove a newline while handling em-dashes. * rebase tests * Rebase tests that have expected changes * Switch parser functions to taking a ParseConfig * Whoops, give Note: paragraphs a line number. * Make multi-line start tags emit IncrementLineCountChar characters, which increment the offset for Lines and are removed from the output. Switch em-dash line correction to use them. Store macro start/end chars as named constants, too. * Rather than eagerly adding ilccs, only generate them *on request when stringifying* if the reported line span (endLine - line) is different from the actual line span * Rebase tests with fixed line numbers * Add line-count test to exercise the errors more directly. 
* Add a test for accidental raw text line count * Parse char references in text (and properly escape text). * Instead of stripping comments for Markdown, replace them with a recognizable comment string. Then, if they're the only thing on the line, Markdown can just drop them. * rebase tests * Rebase more tests. Don't output escaped text, but do output char refs as the charref, not the underlying character. Explicitly handle whole-line comments in datablocks, now that I don't strip them early. * lint * Regen docs * rebase one final test
1 parent 29c533f commit 5c6f7af

File tree

305 files changed

+2963
-2188
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

305 files changed

+2963
-2188
lines changed

bikeshed/InputSource.py

+22-2
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import requests
1414
import tenacity
1515

16-
from . import config, line, t
16+
from . import config, constants, line, t
1717
from . import messages as m
1818

1919

@@ -24,7 +24,27 @@ class InputContent:
2424

2525
@property
2626
def lines(self) -> list[line.Line]:
27-
return [line.Line(lineNo, text) for lineNo, text in enumerate(self.rawLines, 1)]
27+
ret = []
28+
offset = 0
29+
for i, text in enumerate(self.rawLines, 1):
30+
lineNo = i + offset
31+
# The early HTML parser runs before Markdown,
32+
# and in some cases removes linebreaks that were present
33+
# in the source. When properly invoked, it inserts
34+
# a special PUA char for each of these omitted linebreaks,
35+
# so I can remove them here and properly increment the
36+
# line number.
37+
# Current known causes of this:
38+
# * line-ending -- turned into em dashes
39+
# * multi-line start tags
40+
ilcc = constants.incrementLineCountChar
41+
if ilcc in text:
42+
offset += text.count(ilcc)
43+
text = text.replace(ilcc, "")
44+
45+
ret.append(line.Line(lineNo, text))
46+
47+
return ret
2848

2949
@property
3050
def content(self) -> str:

bikeshed/Spec.py

+4-17
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import sys
99
from collections import OrderedDict, defaultdict
1010
from datetime import datetime
11-
from functools import partial as curry
1211

1312
from . import (
1413
InputSource,
@@ -146,7 +145,7 @@ def earlyParse(self, inputContent: InputSource.InputContent) -> list[l.Line]:
146145
)
147146
self.md = metadata.join(self.mdBaseline, self.mdDefaults, self.mdDocument, self.mdCommandLine)
148147

149-
text = h.strFromNodes(h.initialDocumentParse(inputContent.content, doc=self))
148+
text = h.strFromNodes(h.initialDocumentParse(inputContent.content, h.ParseConfig.fromSpec(self)), withIlcc=True)
150149
inputContent.rawLines = [x + "\n" for x in text.split("\n")]
151150
return inputContent.lines
152151

@@ -167,7 +166,6 @@ def assembleDocument(self) -> Spec:
167166
u.stripBOM(self)
168167
if self.lineNumbers:
169168
self.lines = u.hackyLineNumbers(self.lines)
170-
self.lines = markdown.stripComments(self.lines)
171169
self.recordDependencies(self.inputSource)
172170
# Extract and process metadata
173171
self.lines, self.mdDocument = metadata.parse(lines=self.lines)
@@ -182,7 +180,7 @@ def assembleDocument(self) -> Spec:
182180
# Using all of that, load up the text macros so I can sub them into the computed-metadata file.
183181
self.md.fillTextMacros(self.macros, doc=self)
184182
jsonEscapedMacros = {k: json.dumps(v)[1:-1] for k, v in self.macros.items()}
185-
computedMdText = h.replaceMacros(
183+
computedMdText = h.replaceMacrosTextly(
186184
retrieve.retrieveBoilerplateFile(self, "computed-metadata", error=True),
187185
macros=jsonEscapedMacros,
188186
)
@@ -235,7 +233,7 @@ def assembleDocument(self) -> Spec:
235233
# Convert to a single string of html now, for convenience.
236234
self.html = "".join(x.text for x in self.lines)
237235
boilerplate.addHeaderFooter(self)
238-
self.html = self.fixText(self.html)
236+
self.html = h.replaceMacros(self.html, self.macros)
239237

240238
# Build the document
241239
self.document = h.parseDocument(self.html)
@@ -470,20 +468,9 @@ def log_message(self, format: t.Any, *args: t.Any) -> None:
470468
except Exception as e:
471469
m.die(f"Something went wrong while watching the file:\n{e}")
472470

473-
def fixText(self, text: str, moreMacros: dict[str, str] | None = None) -> str:
474-
# Do several textual replacements that need to happen *before* the document is parsed as h.
475-
476-
# If markdown shorthands are on, remove all `foo`s while processing,
477-
# so their contents don't accidentally trigger other stuff.
478-
# Also handle markdown escapes.
479-
if moreMacros is None:
480-
moreMacros = {}
471+
def fixText(self, text: str) -> str:
481472
textFunctor: func.Functor = func.Functor(text)
482-
483-
macros = dict(self.macros, **moreMacros)
484-
textFunctor = textFunctor.map(curry(h.replaceMacros, macros=macros))
485473
textFunctor = textFunctor.map(h.fixTypography)
486-
487474
return t.cast(str, textFunctor.extract())
488475

489476
def printTargets(self) -> None:

bikeshed/boilerplate.py

+19-5
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020

2121
def boilerplateFromHtml(doc: t.SpecT, htmlString: str) -> t.NodesT:
22+
htmlString = h.parseText(htmlString, h.ParseConfig.fromSpec(doc))
23+
htmlString = h.replaceMacros(htmlString, doc.macros)
2224
htmlString = doc.fixText(htmlString)
2325
bp = h.E.div({}, h.parseHTML(htmlString))
2426
conditional.processConditionals(doc, bp)
@@ -121,7 +123,9 @@ def addHeaderFooter(doc: t.SpecT) -> None:
121123
header = retrieve.retrieveBoilerplateFile(doc, "header") if "header" in doc.md.boilerplate else ""
122124
footer = retrieve.retrieveBoilerplateFile(doc, "footer") if "footer" in doc.md.boilerplate else ""
123125

124-
doc.html = "\n".join([header, doc.html, footer])
126+
doc.html = "\n".join(
127+
[h.parseText(header, h.ParseConfig.fromSpec(doc)), doc.html, h.parseText(footer, h.ParseConfig.fromSpec(doc))],
128+
)
125129

126130

127131
def fillWith(tag: str, newElements: t.NodesT, doc: t.SpecT) -> None:
@@ -213,14 +217,16 @@ def addAtRisk(doc: t.SpecT) -> None:
213217
return
214218
html = "<p>The following features are at-risk, and may be dropped during the CR period:\n<ul>"
215219
for feature in doc.md.atRisk:
216-
html += "<li>" + doc.fixText(h.parseText(feature))
220+
html += "<li>" + doc.fixText(h.parseText(feature, h.ParseConfig.fromSpec(doc)))
217221
html += (
218222
"</ul><p>“At-risk” is a W3C Process term-of-art, and does not necessarily imply that the feature is in danger of being dropped or delayed. "
219223
+ "It means that the WG believes the feature may have difficulty being interoperably implemented in a timely manner, "
220224
+ "and marking it as such allows the WG to drop the feature if necessary when transitioning to the Proposed Rec stage, "
221225
+ "without having to publish a new Candidate Rec without the feature first."
222226
)
223-
fillWith("at-risk", h.parseHTML(html), doc=doc)
227+
html = h.replaceMacros(html, doc.macros)
228+
frag = h.parseHTML(html)
229+
fillWith("at-risk", frag, doc=doc)
224230

225231

226232
def addStyles(doc: t.SpecT) -> None:
@@ -1031,7 +1037,15 @@ def printPreviousVersion(v: dict[str, str]) -> t.ElementT | None:
10311037
# and upgrade html-text values into real elements
10321038
otherMd: OrderedDict[str, list[MetadataValueT]] = OrderedDict()
10331039
for k, vs in doc.md.otherMetadata.items():
1034-
parsed: list[t.NodesT] = [h.parseHTML(doc.fixText(v)) if isinstance(v, str) else v for v in vs]
1040+
parsed: list[t.NodesT] = []
1041+
for v in vs:
1042+
if isinstance(v, str):
1043+
htmlText = h.parseText(v, h.ParseConfig.fromSpec(doc))
1044+
htmlText = h.replaceMacros(htmlText, doc.macros)
1045+
htmlText = doc.fixText(htmlText)
1046+
parsed.append(h.parseHTML(htmlText))
1047+
else:
1048+
parsed.append(v)
10351049
if k in md:
10361050
md[k].extend(parsed)
10371051
else:
@@ -1075,7 +1089,7 @@ def createMdEntry(key: str, dirtyVals: t.Sequence[MetadataValueT], doc: t.SpecT)
10751089
ret = [h.E.dt(displayKey, ":")]
10761090
# Add all the values, wrapping in a <dd> if necessary.
10771091
for val in vals:
1078-
if h.isElement(val) and val.tag == "dd":
1092+
if h.isElement(val) and h.tagName(val) == "dd":
10791093
ret.append(val)
10801094
else:
10811095
ret.append(h.E.dd({}, val))

bikeshed/config/status.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"DRAFT-FINDING": "Draft Finding",
1515
"FINDING": "Finding",
1616
"whatwg/RD": "Review Draft",
17-
"w3c/ED": "Editor's Draft",
17+
"w3c/ED": "Editors Draft",
1818
"w3c/WD": "W3C Working Draft",
1919
"w3c/FPWD": "W3C First Public Working Draft",
2020
"w3c/LCWD": "W3C Last Call Working Draft",
@@ -26,7 +26,7 @@
2626
"w3c/WG-NOTE": "W3C Group Note",
2727
"w3c/IG-NOTE": "W3C Group Note",
2828
"w3c/NOTE": "W3C Group Note",
29-
"w3c/NOTE-ED": "Editor's Draft",
29+
"w3c/NOTE-ED": "Editors Draft",
3030
"w3c/NOTE-WD": "W3C Group Draft Note",
3131
"w3c/NOTE-FPWD": "W3C Group Draft Note",
3232
"w3c/DRY": "W3C Draft Registry",
@@ -49,7 +49,7 @@
4949
"iso/MEET": "Meeting Announcements",
5050
"iso/RESP": "Records of Response",
5151
"iso/MIN": "Minutes",
52-
"iso/ER": "Editor's Report",
52+
"iso/ER": "Editors Report",
5353
"iso/SD": "Standing Document",
5454
"iso/PWI": "Preliminary Work Item",
5555
"iso/NP": "New Proposal",
@@ -80,13 +80,13 @@
8080
"iso/FD-AMD": "Final Draft Amendment",
8181
"iso/PRF-AMD": "Proof Amendment",
8282
"iso/AMD": "Amendment",
83-
"fido/ED": "Editor's Draft",
83+
"fido/ED": "Editors Draft",
8484
"fido/WD": "Working Draft",
8585
"fido/RD": "Review Draft",
8686
"fido/ID": "Implementation Draft",
8787
"fido/PS": "Proposed Standard",
8888
"fido/FD": "Final Document",
89-
"khronos/ED": "Editor's Draft",
89+
"khronos/ED": "Editors Draft",
9090
"aom/PD": "Pre-Draft",
9191
"aom/WGD": "AOM Working Group Draft",
9292
"aom/WGA": "AOM Working Group Approved Draft",

bikeshed/constants.py

+5
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,8 @@
77
biblioDisplay: StringEnum = StringEnum("index", "inline", "direct")
88
chroot: bool = True
99
executeCode: bool = False
10+
11+
macroStartChar = "\uebbb"
12+
macroEndChar = "\uebbc"
13+
incrementLineCountChar = "\uebbd"
14+
bsComment = "<!--\uebbe-->"

bikeshed/datablocks.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import attr
99

10-
from . import biblio, config, h, printjson, refs, t
10+
from . import biblio, config, constants, h, printjson, refs, t
1111
from . import messages as m
1212
from .line import Line
1313

@@ -91,6 +91,8 @@ def transformDataBlocks(doc: t.SpecT, lines: list[Line] | list[str]) -> list[Lin
9191
blockLines: list[Line] = []
9292
newLines: list[Line] = []
9393
for line in _lines:
94+
if line.text.strip() == constants.bsComment:
95+
continue
9496
# Look for the start of a block.
9597
match = re.match(r"\s*<(pre|xmp)[\s>]", line.text, re.I)
9698
# Note that, by design, I don't pay attention to anything on the same line as the start tag,
@@ -611,7 +613,7 @@ def parseDefBlock(
611613
else:
612614
vals[key] = val
613615
for key, val in vals.items():
614-
vals[key] = h.parseText(val)
616+
vals[key] = h.parseText(val, h.ParseConfig.fromSpec(doc))
615617
return vals
616618

617619

@@ -1059,6 +1061,10 @@ def extendData(datas: InfoTreeT, infoLevels: InfoTreeT) -> None:
10591061
thisLine = None
10601062
if line.strip() == "":
10611063
continue
1064+
if re.match(r"^\s*<!--.*-->\s*$", line):
1065+
# HTML comment filling the whole line,
1066+
# go ahead and strip it
1067+
continue
10621068
ws, text = t.cast("re.Match", re.match(r"(\s*)(.*)", line)).groups()
10631069
if text.startswith("#"): # comment
10641070
continue

bikeshed/h/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -54,12 +54,14 @@
5454
parseHTML,
5555
prependChild,
5656
previousElements,
57+
printNodeTree,
5758
relevantHeadings,
5859
removeAttr,
5960
removeClass,
6061
removeNode,
6162
replaceContents,
6263
replaceMacros,
64+
replaceMacrosTextly,
6365
replaceNode,
6466
replaceWithContents,
6567
safeID,
@@ -80,6 +82,7 @@
8082
Comment,
8183
EndTag,
8284
Failure,
85+
ParseConfig,
8386
ParseFailure,
8487
Result,
8588
StartTag,
@@ -109,6 +112,7 @@
109112
parseStyleToEnd,
110113
parseTagName,
111114
parseText,
115+
parseTitle,
112116
parseUnquotedAttrValue,
113117
parseWhitespace,
114118
parseXmpToEnd,

bikeshed/h/dom.py

+55-5
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from lxml.cssselect import CSSSelector
1111
from lxml.html import tostring
1212

13-
from .. import t
13+
from .. import constants, t
1414
from ..messages import die, warn
1515

1616
if t.TYPE_CHECKING:
@@ -155,6 +155,27 @@ def outerHTML(el: t.NodesT | None, literal: bool = False, with_tail: bool = Fals
155155
return t.cast(str, tostring(el, with_tail=with_tail, encoding="unicode"))
156156

157157

158+
def printNodeTree(node: t.NodeT | str) -> str:
159+
# Debugging tool
160+
if isinstance(node, str):
161+
return "#text: " + repr(node)
162+
if isinstance(node, list):
163+
s = "[]"
164+
else:
165+
s = f"{serializeTag(node)}"
166+
linesPerChild = [printNodeTree(child).split("\n") for child in childNodes(node)]
167+
if linesPerChild:
168+
for childLines in linesPerChild[:-1]:
169+
childLines[0] = " ├" + childLines[0]
170+
childLines[1:] = [" │" + line for line in childLines[1:]]
171+
s += "\n" + "\n".join(childLines)
172+
childLines = linesPerChild[-1]
173+
childLines[0] = " ╰" + childLines[0]
174+
childLines[1:] = [" " + line for line in childLines[1:]]
175+
s += "\n" + "\n".join(childLines)
176+
return s
177+
178+
158179
def linkTextsFromElement(el: t.ElementT) -> list[str]:
159180
if el.get("data-lt") == "":
160181
return []
@@ -787,10 +808,6 @@ def hasOnlyChild(el: t.ElementT, wsAllowed: bool = True) -> t.ElementT | None:
787808

788809
def fixTypography(text: str) -> str:
789810
# Replace straight aposes with curly quotes for possessives and contractions.
790-
text = re.sub(r"([\w])'([\w])", r"\1’\2", text)
791-
text = re.sub(r"(</[\w]+>)'([\w])", r"\1’\2", text)
792-
# Fix line-ending em dashes, or --, by moving the previous line up, so no space.
793-
text = re.sub(r"([^<][^!])(—|--)\r?\n\s*(\S)", r"\1—<wbr>\3", text)
794811
return text
795812

796813

@@ -834,6 +851,39 @@ def replaceMacros(text: str, macros: t.Mapping[str, str]) -> str:
834851
# Macro syntax is [FOO], where FOO is /[A-Z0-9-]+/
835852
# If written as [FOO?], failure to find a matching macro just replaced it with nothing;
836853
# otherwise, it throws a fatal error.
854+
855+
def macroReplacer(match: re.Match) -> str:
856+
text = match.group(1).lower().strip()
857+
if text.endswith("?"):
858+
text = text[:-1].strip()
859+
optional = True
860+
else:
861+
optional = False
862+
if text in macros:
863+
# For some reason I store all the macros in lowercase,
864+
# despite requiring them to be spelled with uppercase.
865+
return str(macros[text])
866+
# Nothing has matched, so start failing the macros.
867+
if optional:
868+
return ""
869+
die(
870+
f"Found unmatched text macro [{match.group(1)}]. Correct the macro, or escape it somehow (leading backslash, html escape, etc).",
871+
)
872+
return t.cast(str, "[" + match.group(0)[1:-1] + "]")
873+
874+
while "\uebbb" in text:
875+
# Loop, as macros might expand to more macros
876+
# (which hopefully were HTML-parsed).
877+
ms = constants.macroStartChar
878+
me = constants.macroEndChar
879+
text = re.sub(f"{ms}(.+?){me}", macroReplacer, text)
880+
return text
881+
882+
883+
def replaceMacrosTextly(text: str, macros: t.Mapping[str, str]) -> str:
884+
# Same as replaceMacros(), but does the substitution
885+
# directly on the text, rather than relying on the
886+
# html parser to have preparsed the macro syntax
837887
def macroReplacer(match: re.Match) -> str:
838888
fullText = t.cast(str, match.group(0))
839889
innerText = match.group(2).lower() or ""

0 commit comments

Comments
 (0)