propagate parent tag context downward to improve runtime (#191)

chrispy-snps · web-flow · commit 5655f27208d2 · 2025-02-18T16:35:36.000-05:00
diff --git a/README.rst b/README.rst
@@ -180,7 +180,7 @@ If you have a special usecase that calls for a special conversion, you can
 always inherit from ``MarkdownConverter`` and override the method you want to
 change.
 The function that handles a HTML tag named ``abc`` is called
-``convert_abc(self, el, text, convert_as_inline)`` and returns a string
+``convert_abc(self, el, text, parent_tags)`` and returns a string
 containing the converted HTML tag.
 The ``MarkdownConverter`` object will handle the conversion based on the
 function names:
@@ -193,8 +193,8 @@ function names:
         """
         Create a custom MarkdownConverter that adds two newlines after an image
         """
-        def convert_img(self, el, text, convert_as_inline):
-            return super().convert_img(el, text, convert_as_inline) + '\n\n'
+        def convert_img(self, el, text, parent_tags):
+            return super().convert_img(el, text, parent_tags) + '\n\n'
 
     # Create shorthand method for conversion
     def md(html, **options):
@@ -208,7 +208,7 @@ function names:
         """
         Create a custom MarkdownConverter that ignores paragraphs
         """
-        def convert_p(self, el, text, convert_as_inline):
+        def convert_p(self, el, text, parent_tags):
             return ''
 
     # Create shorthand method for conversion
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
@@ -57,13 +57,13 @@ def abstract_inline_conversion(markup_fn):
     the text if it looks like an HTML tag. markup_fn is necessary to allow for
     references to self.strong_em_symbol etc.
     """
-    def implementation(self, el, text, convert_as_inline):
+    def implementation(self, el, text, parent_tags):
         markup_prefix = markup_fn(self)
         if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
             markup_suffix = '</' + markup_prefix[1:]
         else:
             markup_suffix = markup_prefix
-        if el.find_parent(['pre', 'code', 'kbd', 'samp']):
+        if '_noformat' in parent_tags:
             return text
         prefix, suffix, text = chomp(text)
         if not text:
@@ -170,24 +170,18 @@ def convert(self, html):
         return self.convert_soup(soup)
 
     def convert_soup(self, soup):
-        return self.process_tag(soup, convert_as_inline=False)
+        return self.process_tag(soup, parent_tags=set())
 
-    def process_element(self, node, convert_as_inline):
+    def process_element(self, node, parent_tags=None):
         if isinstance(node, NavigableString):
-            return self.process_text(node)
+            return self.process_text(node, parent_tags=parent_tags)
         else:
-            return self.process_tag(node, convert_as_inline)
+            return self.process_tag(node, parent_tags=parent_tags)
 
-    def process_tag(self, node, convert_as_inline):
-        text = ''
-
-        # For Markdown headings and table cells, convert children as inline
-        # (so that block element children do not produce newlines).
-        convert_children_as_inline = (
-            convert_as_inline  # propagated from parent
-            or html_heading_re.match(node.name) is not None  # headings
-            or node.name in ['td', 'th']  # table cells
-        )
+    def process_tag(self, node, parent_tags=None):
+        # When no parent context is supplied (parent_tags is None), start from an empty set.
+        if parent_tags is None:
+            parent_tags = set()
 
         # Collect child elements to process, ignoring whitespace-only text elements
         # adjacent to the inner/outer boundaries of block elements.
@@ -220,8 +214,27 @@ def _can_ignore(el):
 
         children_to_convert = [el for el in node.children if not _can_ignore(el)]
 
+        # Create a copy of this tag's parent context, then update it to include this tag
+        # to propagate down into the children.
+        parent_tags_for_children = set(parent_tags)
+        parent_tags_for_children.add(node.name)
+
+        # If this tag is a heading or table cell, add an '_inline' parent pseudo-tag so children convert inline.
+        if (
+            html_heading_re.match(node.name) is not None  # headings
+            or node.name in {'td', 'th'}  # table cells
+        ):
+            parent_tags_for_children.add('_inline')
+
+        # If this tag is a pre, code, kbd, or samp element, add a '_noformat' parent pseudo-tag.
+        if node.name in {'pre', 'code', 'kbd', 'samp'}:
+            parent_tags_for_children.add('_noformat')
+
         # Convert the children elements into a list of result strings.
-        child_strings = [self.process_element(el, convert_children_as_inline) for el in children_to_convert]
+        child_strings = [
+            self.process_element(el, parent_tags=parent_tags_for_children)
+            for el in children_to_convert
+        ]
 
         # Remove empty string values.
         child_strings = [s for s in child_strings if s]
@@ -256,11 +269,11 @@ def _can_ignore(el):
         convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
         convert_fn = getattr(self, convert_fn_name, None)
         if convert_fn and self.should_convert_tag(node.name):
-            text = convert_fn(node, text, convert_as_inline)
+            text = convert_fn(node, text, parent_tags=parent_tags)
 
         return text
 
-    def convert__document_(self, el, text, convert_as_inline):
+    def convert__document_(self, el, text, parent_tags):
         """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
         if self.options['strip_document'] == LSTRIP:
             text = text.lstrip('\n')  # remove leading separation newlines
@@ -275,19 +288,23 @@ def convert__document_(self, el, text, convert_as_inline):
 
         return text
 
-    def process_text(self, el):
+    def process_text(self, el, parent_tags=None):
+        # When no parent context is supplied (parent_tags is None), start from an empty set.
+        if parent_tags is None:
+            parent_tags = set()
+
         text = six.text_type(el) or ''
 
         # normalize whitespace if we're not inside a preformatted element
-        if not el.find_parent('pre'):
+        if 'pre' not in parent_tags:
             if self.options['wrap']:
                 text = all_whitespace_re.sub(' ', text)
             else:
                 text = newline_whitespace_re.sub('\n', text)
                 text = whitespace_re.sub(' ', text)
 
         # escape special characters if we're not inside a preformatted or code element
-        if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
+        if '_noformat' not in parent_tags:
             text = self.escape(text)
 
         # remove leading whitespace at the start or just after a
@@ -310,8 +327,8 @@ def __getattr__(self, attr):
         if m:
             n = int(m.group(1))
 
-            def convert_tag(el, text, convert_as_inline):
-                return self._convert_hn(n, el, text, convert_as_inline)
+            def convert_tag(el, text, parent_tags):
+                return self._convert_hn(n, el, text, parent_tags)
 
             convert_tag.__name__ = 'convert_h%s' % n
             setattr(self, convert_tag.__name__, convert_tag)
@@ -358,8 +375,8 @@ def underline(self, text, pad_char):
         text = (text or '').rstrip()
         return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
 
-    def convert_a(self, el, text, convert_as_inline):
-        if el.find_parent(['pre', 'code', 'kbd', 'samp']):
+    def convert_a(self, el, text, parent_tags):
+        if '_noformat' in parent_tags:
             return text
         prefix, suffix, text = chomp(text)
         if not text:
@@ -380,10 +397,10 @@ def convert_a(self, el, text, convert_as_inline):
 
     convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
 
-    def convert_blockquote(self, el, text, convert_as_inline):
+    def convert_blockquote(self, el, text, parent_tags):
         # handle some early-exit scenarios
         text = (text or '').strip()
-        if convert_as_inline:
+        if '_inline' in parent_tags:
             return ' ' + text + ' '
         if not text:
             return "\n"
@@ -396,25 +413,25 @@ def _indent_for_blockquote(match):
 
         return '\n' + text + '\n\n'
 
-    def convert_br(self, el, text, convert_as_inline):
-        if convert_as_inline:
+    def convert_br(self, el, text, parent_tags):
+        if '_inline' in parent_tags:
             return ""
 
         if self.options['newline_style'].lower() == BACKSLASH:
             return '\\\n'
         else:
             return '  \n'
 
-    def convert_code(self, el, text, convert_as_inline):
-        if el.parent.name == 'pre':
+    def convert_code(self, el, text, parent_tags):
+        if 'pre' in parent_tags:
             return text
         converter = abstract_inline_conversion(lambda self: '`')
-        return converter(self, el, text, convert_as_inline)
+        return converter(self, el, text, parent_tags)
 
     convert_del = abstract_inline_conversion(lambda self: '~~')
 
-    def convert_div(self, el, text, convert_as_inline):
-        if convert_as_inline:
+    def convert_div(self, el, text, parent_tags):
+        if '_inline' in parent_tags:
             return ' ' + text.strip() + ' '
         text = text.strip()
         return '\n\n%s\n\n' % text if text else ''
@@ -427,9 +444,9 @@ def convert_div(self, el, text, convert_as_inline):
 
     convert_kbd = convert_code
 
-    def convert_dd(self, el, text, convert_as_inline):
+    def convert_dd(self, el, text, parent_tags):
         text = (text or '').strip()
-        if convert_as_inline:
+        if '_inline' in parent_tags:
             return ' ' + text + ' '
         if not text:
             return '\n'
@@ -445,11 +462,11 @@ def _indent_for_dd(match):
 
         return '%s\n' % text
 
-    def convert_dt(self, el, text, convert_as_inline):
+    def convert_dt(self, el, text, parent_tags):
         # remove newlines from term text
         text = (text or '').strip()
         text = all_whitespace_re.sub(' ', text)
-        if convert_as_inline:
+        if '_inline' in parent_tags:
             return ' ' + text + ' '
         if not text:
             return '\n'
@@ -459,9 +476,9 @@ def convert_dt(self, el, text, convert_as_inline):
 
         return '\n%s\n' % text
 
-    def _convert_hn(self, n, el, text, convert_as_inline):
+    def _convert_hn(self, n, el, text, parent_tags):
         """ Method name prefixed with _ to prevent <hn> to call this """
-        if convert_as_inline:
+        if '_inline' in parent_tags:
             return text
 
         # prevent MemoryErrors in case of very large n
@@ -478,46 +495,40 @@ def _convert_hn(self, n, el, text, convert_as_inline):
             return '\n\n%s %s %s\n\n' % (hashes, text, hashes)
         return '\n\n%s %s\n\n' % (hashes, text)
 
-    def convert_hr(self, el, text, convert_as_inline):
+    def convert_hr(self, el, text, parent_tags):
         return '\n\n---\n\n'
 
     convert_i = convert_em
 
-    def convert_img(self, el, text, convert_as_inline):
+    def convert_img(self, el, text, parent_tags):
         alt = el.attrs.get('alt', None) or ''
         src = el.attrs.get('src', None) or ''
         title = el.attrs.get('title', None) or ''
         title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
-        if (convert_as_inline
+        if ('_inline' in parent_tags
                 and el.parent.name not in self.options['keep_inline_images_in']):
             return alt
 
         return '![%s](%s%s)' % (alt, src, title_part)
 
-    def convert_list(self, el, text, convert_as_inline):
+    def convert_list(self, el, text, parent_tags):
 
         # Converting a list to inline is undefined.
-        # Ignoring convert_to_inline for list.
+        # The '_inline' parent pseudo-tag is therefore ignored for lists.
 
-        nested = False
         before_paragraph = False
         next_sibling = _next_block_content_sibling(el)
         if next_sibling and next_sibling.name not in ['ul', 'ol']:
             before_paragraph = True
-        while el:
-            if el.name == 'li':
-                nested = True
-                break
-            el = el.parent
-        if nested:
-            # remove trailing newline if nested
+        if 'li' in parent_tags:
+            # remove trailing newline if we're in a nested list
             return '\n' + text.rstrip()
         return '\n\n' + text + ('\n' if before_paragraph else '')
 
     convert_ul = convert_list
     convert_ol = convert_list
 
-    def convert_li(self, el, text, convert_as_inline):
+    def convert_li(self, el, text, parent_tags):
         # handle some early-exit scenarios
         text = (text or '').strip()
         if not text:
@@ -554,8 +565,8 @@ def _indent_for_li(match):
 
         return '%s\n' % text
 
-    def convert_p(self, el, text, convert_as_inline):
-        if convert_as_inline:
+    def convert_p(self, el, text, parent_tags):
+        if '_inline' in parent_tags:
             return ' ' + text.strip() + ' '
         text = text.strip()
         if self.options['wrap']:
@@ -577,7 +588,7 @@ def convert_p(self, el, text, convert_as_inline):
                 text = '\n'.join(new_lines)
         return '\n\n%s\n\n' % text if text else ''
 
-    def convert_pre(self, el, text, convert_as_inline):
+    def convert_pre(self, el, text, parent_tags):
         if not text:
             return ''
         code_language = self.options['code_language']
@@ -587,10 +598,10 @@ def convert_pre(self, el, text, convert_as_inline):
 
         return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
 
-    def convert_script(self, el, text, convert_as_inline):
+    def convert_script(self, el, text, parent_tags):
         return ''
 
-    def convert_style(self, el, text, convert_as_inline):
+    def convert_style(self, el, text, parent_tags):
         return ''
 
     convert_s = convert_del
@@ -603,28 +614,28 @@ def convert_style(self, el, text, convert_as_inline):
 
     convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
 
-    def convert_table(self, el, text, convert_as_inline):
+    def convert_table(self, el, text, parent_tags):
         return '\n\n' + text.strip() + '\n\n'
 
-    def convert_caption(self, el, text, convert_as_inline):
+    def convert_caption(self, el, text, parent_tags):
         return text.strip() + '\n\n'
 
-    def convert_figcaption(self, el, text, convert_as_inline):
+    def convert_figcaption(self, el, text, parent_tags):
         return '\n\n' + text.strip() + '\n\n'
 
-    def convert_td(self, el, text, convert_as_inline):
+    def convert_td(self, el, text, parent_tags):
         colspan = 1
         if 'colspan' in el.attrs and el['colspan'].isdigit():
             colspan = int(el['colspan'])
         return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
 
-    def convert_th(self, el, text, convert_as_inline):
+    def convert_th(self, el, text, parent_tags):
         colspan = 1
         if 'colspan' in el.attrs and el['colspan'].isdigit():
             colspan = int(el['colspan'])
         return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
 
-    def convert_tr(self, el, text, convert_as_inline):
+    def convert_tr(self, el, text, parent_tags):
         cells = el.find_all(['td', 'th'])
         is_first_row = el.find_previous_sibling() is None
         is_headrow = (
diff --git a/tests/test_custom_converter.py b/tests/test_custom_converter.py
@@ -6,11 +6,11 @@ class UnitTestConverter(MarkdownConverter):
     """
     Create a custom MarkdownConverter for unit tests
     """
-    def convert_img(self, el, text, convert_as_inline):
+    def convert_img(self, el, text, parent_tags):
         """Add two newlines after an image"""
-        return super().convert_img(el, text, convert_as_inline) + '\n\n'
+        return super().convert_img(el, text, parent_tags) + '\n\n'
 
-    def convert_custom_tag(self, el, text, convert_as_inline):
+    def convert_custom_tag(self, el, text, parent_tags):
         """Ensure conversion function is found for tags with special characters in name"""
         return "FUNCTION USED: %s" % text