From cd94b4b6f2b9b4af7591542eaf0bd98b680bad50 Mon Sep 17 00:00:00 2001
From: chrispy
Date: Wed, 19 Feb 2025 20:12:58 -0500
Subject: [PATCH] use compiled regex for escaping patterns

Signed-off-by: chrispy
---
 markdownify/__init__.py | 44 ++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index a1c6d9a..151ea5b 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -4,6 +4,7 @@
 import six
 
 
+# General-purpose regex patterns
 re_convert_heading = re.compile(r'convert_h(\d+)')
 re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE)
 re_whitespace = re.compile(r'[\t ]+')
@@ -11,10 +12,30 @@
 re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
 re_html_heading = re.compile(r'h[1-6]')
 
-# extract (leading_nl, content, trailing_nl) from a string
+# Pattern for creating convert_ function names from tag names
+re_make_convert_fn_name = re.compile(r'[\[\]:-]')
+
+# Extract (leading_nl, content, trailing_nl) from a string
 # (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here)
 re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)
 
+# Escape miscellaneous special Markdown characters
+re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])')
+
+# Escape sequence of one or more consecutive '-', preceded
+# and followed by whitespace or start/end of fragment, as it
+# might be confused with an underline of a header, or with a
+# list marker
+re_escape_misc_dash_sequences = re.compile(r'(\s|^)(-+(?:\s|$))')
+
+# Escape sequence of up to six consecutive '#', preceded
+# and followed by whitespace or start/end of fragment, as
+# it might be confused with an ATX heading
+re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')
+
+# Escape '.' or ')' preceded by up to nine digits, as it might be
+# confused with a list item
+re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')
 
 # Heading styles
 ATX = 'atx'
@@ -266,7 +287,7 @@ def _can_ignore(el):
         text = ''.join(child_strings)
 
         # apply this tag's final conversion function
-        convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
+        convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub('_', node.name)
         convert_fn = getattr(self, convert_fn_name, None)
         if convert_fn and self.should_convert_tag(node.name):
             text = convert_fn(node, text, parent_tags=parent_tags)
@@ -351,20 +372,11 @@ def escape(self, text, parent_tags):
         if not text:
             return ''
         if self.options['escape_misc']:
-            text = re.sub(r'([]\\&<`[>~=+|])', r'\\\1', text)
-            # A sequence of one or more consecutive '-', preceded and
-            # followed by whitespace or start/end of fragment, might
-            # be confused with an underline of a header, or with a
-            # list marker.
-            text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text)
-            # A sequence of up to six consecutive '#', preceded and
-            # followed by whitespace or start/end of fragment, might
-            # be confused with an ATX heading.
-            text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text)
-            # '.' or ')' preceded by up to nine digits might be
-            # confused with a list item.
-            text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2',
-                          text)
+            text = re_escape_misc_chars.sub(r'\\\1', text)
+            text = re_escape_misc_dash_sequences.sub(r'\1\\\2', text)
+            text = re_escape_misc_hashes.sub(r'\1\\\2', text)
+            text = re_escape_misc_list_items.sub(r'\1\\\2', text)
+
         if self.options['escape_asterisks']:
             text = text.replace('*', r'\*')
         if self.options['escape_underscores']: