|
10 | 10 | from lxml.cssselect import CSSSelector
|
11 | 11 | from lxml.html import tostring
|
12 | 12 |
|
13 |
| -from .. import t |
| 13 | +from .. import constants, t |
14 | 14 | from ..messages import die, warn
|
15 | 15 |
|
16 | 16 | if t.TYPE_CHECKING:
|
@@ -155,6 +155,27 @@ def outerHTML(el: t.NodesT | None, literal: bool = False, with_tail: bool = Fals
|
155 | 155 | return t.cast(str, tostring(el, with_tail=with_tail, encoding="unicode"))
|
156 | 156 |
|
157 | 157 |
|
def printNodeTree(node: t.NodeT | str) -> str:
    """Render `node` and its descendants as a multi-line text tree.

    Debugging tool. A string is rendered as ``#text: <repr>``, a list as
    ``[]``, and anything else via serializeTag(). The children reported by
    childNodes() are rendered recursively beneath their parent and joined
    to it with box-drawing connectors.

    Returns the rendering as a single newline-separated string.
    """
    if isinstance(node, str):
        return "#text: " + repr(node)

    label = "[]" if isinstance(node, list) else f"{serializeTag(node)}"
    linesPerChild = [printNodeTree(child).split("\n") for child in childNodes(node)]

    rendered = [label]
    lastIndex = len(linesPerChild) - 1
    for i, childLines in enumerate(linesPerChild):
        # Every prefix is exactly two columns wide, so that the subtree
        # under the last child lines up with the subtrees of its siblings.
        if i == lastIndex:
            firstPrefix, restPrefix = " ╰", "  "
        else:
            firstPrefix, restPrefix = " ├", " │"
        rendered.append(firstPrefix + childLines[0])
        rendered.extend(restPrefix + line for line in childLines[1:])
    return "\n".join(rendered)
| 177 | + |
| 178 | + |
158 | 179 | def linkTextsFromElement(el: t.ElementT) -> list[str]:
|
159 | 180 | if el.get("data-lt") == "":
|
160 | 181 | return []
|
@@ -787,10 +808,6 @@ def hasOnlyChild(el: t.ElementT, wsAllowed: bool = True) -> t.ElementT | None:
|
787 | 808 |
|
def fixTypography(text: str) -> str:
    """Return `text` unchanged.

    The substitutions that used to live here (straight apostrophes to
    curly quotes for possessives and contractions, and joining lines that
    end in an em dash or ``--``) have been removed, so this function is
    now a pass-through.

    NOTE(review): presumably kept so that existing callers keep working;
    confirm whether the function can be deleted outright.
    """
    return text
|
795 | 812 |
|
796 | 813 |
|
def replaceMacros(text: str, macros: t.Mapping[str, str]) -> str:
    """Expand text macros in `text` using the `macros` mapping.

    Macro spans are recognised by their delimiters,
    constants.macroStartChar and constants.macroEndChar; the text between
    them is lowercased and stripped before being looked up in `macros`.

    A macro name ending in ``?`` is optional: if it has no entry in
    `macros` it is replaced with the empty string. Any other unmatched
    macro is reported through die() and replaced with its bracketed
    source text.

    Returns the text with all recognised macros expanded.
    """
    # Macro syntax is [FOO], where FOO is /[A-Z0-9-]+/
    # If written as [FOO?], failure to find a matching macro just replaces it with nothing;
    # otherwise, it throws a fatal error.

    def macroReplacer(match: re.Match) -> str:
        name = match.group(1).lower().strip()
        optional = name.endswith("?")
        if optional:
            name = name[:-1].strip()
        if name in macros:
            # For some reason I store all the macros in lowercase,
            # despite requiring them to be spelled with uppercase.
            return str(macros[name])
        # Nothing has matched, so start failing the macros.
        if optional:
            return ""
        die(
            f"Found unmatched text macro [{match.group(1)}]. Correct the macro, or escape it somehow (leading backslash, html escape, etc).",
        )
        # Emit the macro in plain bracket form, without the delimiter
        # characters, so that the loop below does not match it again.
        return "[" + match.group(0)[1:-1] + "]"

    ms = constants.macroStartChar
    me = constants.macroEndChar
    # Loop-invariant, so compile it once rather than on every pass.
    macroRe = re.compile(f"{ms}(.+?){me}")

    # NOTE(review): "\uebbb" is presumably the value of
    # constants.macroStartChar; confirm and use the constant here.
    while "\uebbb" in text:
        # Loop, as macros might expand to more macros
        # (which hopefully were HTML-parsed).
        text, count = macroRe.subn(macroReplacer, text)
        if count == 0:
            # A start marker with no matching end marker can never be
            # replaced; stop instead of spinning forever.
            break
    return text
| 881 | + |
| 882 | + |
| 883 | +def replaceMacrosTextly(text: str, macros: t.Mapping[str, str]) -> str: |
| 884 | + # Same as replaceMacros(), but does the substitution |
| 885 | + # directly on the text, rather than relying on the |
| 886 | + # html parser to have preparsed the macro syntax |
837 | 887 | def macroReplacer(match: re.Match) -> str:
|
838 | 888 | fullText = t.cast(str, match.group(0))
|
839 | 889 | innerText = match.group(2).lower() or ""
|
|
0 commit comments