Skip to content

Commit ba5e222

Browse files
authored
use compiled regex for escaping patterns (#194)
Signed-off-by: chrispy <[email protected]>
1 parent 6984dca commit ba5e222

File tree

1 file changed

+29
-16
lines changed

1 file changed

+29
-16
lines changed

Diff for: markdownify/__init__.py

+29 -16
Original file line number | Diff line number | Diff line change
@@ -4,16 +4,38 @@
44
import six
55

66

7+
# General-purpose regex patterns
8+
re_convert_heading = re.compile(r'convert_h(\d+)')
79
re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE)
810
re_whitespace = re.compile(r'[\t ]+')
911
re_all_whitespace = re.compile(r'[\t \r\n]+')
1012
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
1113
re_html_heading = re.compile(r'h(\d+)')
1214

13-
# extract (leading_nl, content, trailing_nl) from a string
15+
# Pattern for creating convert_<tag> function names from tag names
16+
re_make_convert_fn_name = re.compile(r'[\[\]:-]')
17+
18+
# Extract (leading_nl, content, trailing_nl) from a string
1419
# (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here)
1520
re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)
1621

22+
# Escape miscellaneous special Markdown characters
23+
re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])')
24+
25+
# Escape sequence of one or more consecutive '-', preceded
26+
# and followed by whitespace or start/end of fragment, as it
27+
# might be confused with an underline of a header, or with a
28+
# list marker
29+
re_escape_misc_dash_sequences = re.compile(r'(\s|^)(-+(?:\s|$))')
30+
31+
# Escape sequence of up to six consecutive '#', preceded
32+
# and followed by whitespace or start/end of fragment, as
33+
# it might be confused with an ATX heading
34+
re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')
35+
36+
# Escape '.' or ')' preceded by up to nine digits, as it might be
37+
# confused with a list item
38+
re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')
1739

1840
# Heading styles
1941
ATX = 'atx'
@@ -346,7 +368,7 @@ def get_conv_fn(self, tag_name):
346368
return lambda el, text, parent_tags: self._convert_hn(n, el, text, parent_tags)
347369

348370
# For other tags, look up their conversion function by tag name
349-
convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", tag_name)
371+
convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub('_', tag_name)
350372
convert_fn = getattr(self, convert_fn_name, None)
351373
return convert_fn
352374

@@ -365,20 +387,11 @@ def escape(self, text, parent_tags):
365387
if not text:
366388
return ''
367389
if self.options['escape_misc']:
368-
text = re.sub(r'([]\\&<`[>~=+|])', r'\\\1', text)
369-
# A sequence of one or more consecutive '-', preceded and
370-
# followed by whitespace or start/end of fragment, might
371-
# be confused with an underline of a header, or with a
372-
# list marker.
373-
text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text)
374-
# A sequence of up to six consecutive '#', preceded and
375-
# followed by whitespace or start/end of fragment, might
376-
# be confused with an ATX heading.
377-
text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text)
378-
# '.' or ')' preceded by up to nine digits might be
379-
# confused with a list item.
380-
text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2',
381-
text)
390+
text = re_escape_misc_chars.sub(r'\\\1', text)
391+
text = re_escape_misc_dash_sequences.sub(r'\1\\\2', text)
392+
text = re_escape_misc_hashes.sub(r'\1\\\2', text)
393+
text = re_escape_misc_list_items.sub(r'\1\\\2', text)
394+
382395
if self.options['escape_asterisks']:
383396
text = text.replace('*', r'\*')
384397
if self.options['escape_underscores']:

0 commit comments

Comments (0)