Skip to content

Commit 5655f27

Browse files
authored
propagate parent tag context downward to improve runtime (#191)
1 parent c52ba47 commit 5655f27

File tree

3 files changed

+84
-73
lines changed

3 files changed

+84
-73
lines changed

Diff for: README.rst

+4-4
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ If you have a special use case that calls for a special conversion, you can
180180
always inherit from ``MarkdownConverter`` and override the method you want to
181181
change.
182182
The function that handles a HTML tag named ``abc`` is called
183-
``convert_abc(self, el, text, convert_as_inline)`` and returns a string
183+
``convert_abc(self, el, text, parent_tags)`` and returns a string
184184
containing the converted HTML tag.
185185
The ``MarkdownConverter`` object will handle the conversion based on the
186186
function names:
@@ -193,8 +193,8 @@ function names:
193193
"""
194194
Create a custom MarkdownConverter that adds two newlines after an image
195195
"""
196-
def convert_img(self, el, text, convert_as_inline):
197-
return super().convert_img(el, text, convert_as_inline) + '\n\n'
196+
def convert_img(self, el, text, parent_tags):
197+
return super().convert_img(el, text, parent_tags) + '\n\n'
198198
199199
# Create shorthand method for conversion
200200
def md(html, **options):
@@ -208,7 +208,7 @@ function names:
208208
"""
209209
Create a custom MarkdownConverter that ignores paragraphs
210210
"""
211-
def convert_p(self, el, text, convert_as_inline):
211+
def convert_p(self, el, text, parent_tags):
212212
return ''
213213
214214
# Create shorthand method for conversion

Diff for: markdownify/__init__.py

+77-66
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,13 @@ def abstract_inline_conversion(markup_fn):
5757
the text if it looks like an HTML tag. markup_fn is necessary to allow for
5858
references to self.strong_em_symbol etc.
5959
"""
60-
def implementation(self, el, text, convert_as_inline):
60+
def implementation(self, el, text, parent_tags):
6161
markup_prefix = markup_fn(self)
6262
if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
6363
markup_suffix = '</' + markup_prefix[1:]
6464
else:
6565
markup_suffix = markup_prefix
66-
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
66+
if '_noformat' in parent_tags:
6767
return text
6868
prefix, suffix, text = chomp(text)
6969
if not text:
@@ -170,24 +170,18 @@ def convert(self, html):
170170
return self.convert_soup(soup)
171171

172172
def convert_soup(self, soup):
173-
return self.process_tag(soup, convert_as_inline=False)
173+
return self.process_tag(soup, parent_tags=set())
174174

175-
def process_element(self, node, convert_as_inline):
175+
def process_element(self, node, parent_tags=None):
176176
if isinstance(node, NavigableString):
177-
return self.process_text(node)
177+
return self.process_text(node, parent_tags=parent_tags)
178178
else:
179-
return self.process_tag(node, convert_as_inline)
179+
return self.process_tag(node, parent_tags=parent_tags)
180180

181-
def process_tag(self, node, convert_as_inline):
182-
text = ''
183-
184-
# For Markdown headings and table cells, convert children as inline
185-
# (so that block element children do not produce newlines).
186-
convert_children_as_inline = (
187-
convert_as_inline # propagated from parent
188-
or html_heading_re.match(node.name) is not None # headings
189-
or node.name in ['td', 'th'] # table cells
190-
)
181+
def process_tag(self, node, parent_tags=None):
182+
# For the top-level element, initialize the parent context with an empty set.
183+
if parent_tags is None:
184+
parent_tags = set()
191185

192186
# Collect child elements to process, ignoring whitespace-only text elements
193187
# adjacent to the inner/outer boundaries of block elements.
@@ -220,8 +214,27 @@ def _can_ignore(el):
220214

221215
children_to_convert = [el for el in node.children if not _can_ignore(el)]
222216

217+
# Create a copy of this tag's parent context, then update it to include this tag
218+
# to propagate down into the children.
219+
parent_tags_for_children = set(parent_tags)
220+
parent_tags_for_children.add(node.name)
221+
222+
# if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
223+
if (
224+
html_heading_re.match(node.name) is not None # headings
225+
or node.name in {'td', 'th'} # table cells
226+
):
227+
parent_tags_for_children.add('_inline')
228+
229+
# if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
230+
if node.name in {'pre', 'code', 'kbd', 'samp'}:
231+
parent_tags_for_children.add('_noformat')
232+
223233
# Convert the children elements into a list of result strings.
224-
child_strings = [self.process_element(el, convert_children_as_inline) for el in children_to_convert]
234+
child_strings = [
235+
self.process_element(el, parent_tags=parent_tags_for_children)
236+
for el in children_to_convert
237+
]
225238

226239
# Remove empty string values.
227240
child_strings = [s for s in child_strings if s]
@@ -256,11 +269,11 @@ def _can_ignore(el):
256269
convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
257270
convert_fn = getattr(self, convert_fn_name, None)
258271
if convert_fn and self.should_convert_tag(node.name):
259-
text = convert_fn(node, text, convert_as_inline)
272+
text = convert_fn(node, text, parent_tags=parent_tags)
260273

261274
return text
262275

263-
def convert__document_(self, el, text, convert_as_inline):
276+
def convert__document_(self, el, text, parent_tags):
264277
"""Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
265278
if self.options['strip_document'] == LSTRIP:
266279
text = text.lstrip('\n') # remove leading separation newlines
@@ -275,19 +288,23 @@ def convert__document_(self, el, text, convert_as_inline):
275288

276289
return text
277290

278-
def process_text(self, el):
291+
def process_text(self, el, parent_tags=None):
292+
# For the top-level element, initialize the parent context with an empty set.
293+
if parent_tags is None:
294+
parent_tags = set()
295+
279296
text = six.text_type(el) or ''
280297

281298
# normalize whitespace if we're not inside a preformatted element
282-
if not el.find_parent('pre'):
299+
if 'pre' not in parent_tags:
283300
if self.options['wrap']:
284301
text = all_whitespace_re.sub(' ', text)
285302
else:
286303
text = newline_whitespace_re.sub('\n', text)
287304
text = whitespace_re.sub(' ', text)
288305

289306
# escape special characters if we're not inside a preformatted or code element
290-
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
307+
if '_noformat' not in parent_tags:
291308
text = self.escape(text)
292309

293310
# remove leading whitespace at the start or just after a
@@ -310,8 +327,8 @@ def __getattr__(self, attr):
310327
if m:
311328
n = int(m.group(1))
312329

313-
def convert_tag(el, text, convert_as_inline):
314-
return self._convert_hn(n, el, text, convert_as_inline)
330+
def convert_tag(el, text, parent_tags):
331+
return self._convert_hn(n, el, text, parent_tags)
315332

316333
convert_tag.__name__ = 'convert_h%s' % n
317334
setattr(self, convert_tag.__name__, convert_tag)
@@ -358,8 +375,8 @@ def underline(self, text, pad_char):
358375
text = (text or '').rstrip()
359376
return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
360377

361-
def convert_a(self, el, text, convert_as_inline):
362-
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
378+
def convert_a(self, el, text, parent_tags):
379+
if '_noformat' in parent_tags:
363380
return text
364381
prefix, suffix, text = chomp(text)
365382
if not text:
@@ -380,10 +397,10 @@ def convert_a(self, el, text, convert_as_inline):
380397

381398
convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
382399

383-
def convert_blockquote(self, el, text, convert_as_inline):
400+
def convert_blockquote(self, el, text, parent_tags):
384401
# handle some early-exit scenarios
385402
text = (text or '').strip()
386-
if convert_as_inline:
403+
if '_inline' in parent_tags:
387404
return ' ' + text + ' '
388405
if not text:
389406
return "\n"
@@ -396,25 +413,25 @@ def _indent_for_blockquote(match):
396413

397414
return '\n' + text + '\n\n'
398415

399-
def convert_br(self, el, text, convert_as_inline):
400-
if convert_as_inline:
416+
def convert_br(self, el, text, parent_tags):
417+
if '_inline' in parent_tags:
401418
return ""
402419

403420
if self.options['newline_style'].lower() == BACKSLASH:
404421
return '\\\n'
405422
else:
406423
return ' \n'
407424

408-
def convert_code(self, el, text, convert_as_inline):
409-
if el.parent.name == 'pre':
425+
def convert_code(self, el, text, parent_tags):
426+
if 'pre' in parent_tags:
410427
return text
411428
converter = abstract_inline_conversion(lambda self: '`')
412-
return converter(self, el, text, convert_as_inline)
429+
return converter(self, el, text, parent_tags)
413430

414431
convert_del = abstract_inline_conversion(lambda self: '~~')
415432

416-
def convert_div(self, el, text, convert_as_inline):
417-
if convert_as_inline:
433+
def convert_div(self, el, text, parent_tags):
434+
if '_inline' in parent_tags:
418435
return ' ' + text.strip() + ' '
419436
text = text.strip()
420437
return '\n\n%s\n\n' % text if text else ''
@@ -427,9 +444,9 @@ def convert_div(self, el, text, convert_as_inline):
427444

428445
convert_kbd = convert_code
429446

430-
def convert_dd(self, el, text, convert_as_inline):
447+
def convert_dd(self, el, text, parent_tags):
431448
text = (text or '').strip()
432-
if convert_as_inline:
449+
if '_inline' in parent_tags:
433450
return ' ' + text + ' '
434451
if not text:
435452
return '\n'
@@ -445,11 +462,11 @@ def _indent_for_dd(match):
445462

446463
return '%s\n' % text
447464

448-
def convert_dt(self, el, text, convert_as_inline):
465+
def convert_dt(self, el, text, parent_tags):
449466
# remove newlines from term text
450467
text = (text or '').strip()
451468
text = all_whitespace_re.sub(' ', text)
452-
if convert_as_inline:
469+
if '_inline' in parent_tags:
453470
return ' ' + text + ' '
454471
if not text:
455472
return '\n'
@@ -459,9 +476,9 @@ def convert_dt(self, el, text, convert_as_inline):
459476

460477
return '\n%s\n' % text
461478

462-
def _convert_hn(self, n, el, text, convert_as_inline):
479+
def _convert_hn(self, n, el, text, parent_tags):
463480
""" Method name prefixed with _ to prevent <hn> to call this """
464-
if convert_as_inline:
481+
if '_inline' in parent_tags:
465482
return text
466483

467484
# prevent MemoryErrors in case of very large n
@@ -478,46 +495,40 @@ def _convert_hn(self, n, el, text, convert_as_inline):
478495
return '\n\n%s %s %s\n\n' % (hashes, text, hashes)
479496
return '\n\n%s %s\n\n' % (hashes, text)
480497

481-
def convert_hr(self, el, text, convert_as_inline):
498+
def convert_hr(self, el, text, parent_tags):
482499
return '\n\n---\n\n'
483500

484501
convert_i = convert_em
485502

486-
def convert_img(self, el, text, convert_as_inline):
503+
def convert_img(self, el, text, parent_tags):
487504
alt = el.attrs.get('alt', None) or ''
488505
src = el.attrs.get('src', None) or ''
489506
title = el.attrs.get('title', None) or ''
490507
title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
491-
if (convert_as_inline
508+
if ('_inline' in parent_tags
492509
and el.parent.name not in self.options['keep_inline_images_in']):
493510
return alt
494511

495512
return '![%s](%s%s)' % (alt, src, title_part)
496513

497-
def convert_list(self, el, text, convert_as_inline):
514+
def convert_list(self, el, text, parent_tags):
498515

499516
# Converting a list to inline is undefined.
500-
# Ignoring convert_to_inline for list.
517+
# Ignoring inline conversion parents for list.
501518

502-
nested = False
503519
before_paragraph = False
504520
next_sibling = _next_block_content_sibling(el)
505521
if next_sibling and next_sibling.name not in ['ul', 'ol']:
506522
before_paragraph = True
507-
while el:
508-
if el.name == 'li':
509-
nested = True
510-
break
511-
el = el.parent
512-
if nested:
513-
# remove trailing newline if nested
523+
if 'li' in parent_tags:
524+
# remove trailing newline if we're in a nested list
514525
return '\n' + text.rstrip()
515526
return '\n\n' + text + ('\n' if before_paragraph else '')
516527

517528
convert_ul = convert_list
518529
convert_ol = convert_list
519530

520-
def convert_li(self, el, text, convert_as_inline):
531+
def convert_li(self, el, text, parent_tags):
521532
# handle some early-exit scenarios
522533
text = (text or '').strip()
523534
if not text:
@@ -554,8 +565,8 @@ def _indent_for_li(match):
554565

555566
return '%s\n' % text
556567

557-
def convert_p(self, el, text, convert_as_inline):
558-
if convert_as_inline:
568+
def convert_p(self, el, text, parent_tags):
569+
if '_inline' in parent_tags:
559570
return ' ' + text.strip() + ' '
560571
text = text.strip()
561572
if self.options['wrap']:
@@ -577,7 +588,7 @@ def convert_p(self, el, text, convert_as_inline):
577588
text = '\n'.join(new_lines)
578589
return '\n\n%s\n\n' % text if text else ''
579590

580-
def convert_pre(self, el, text, convert_as_inline):
591+
def convert_pre(self, el, text, parent_tags):
581592
if not text:
582593
return ''
583594
code_language = self.options['code_language']
@@ -587,10 +598,10 @@ def convert_pre(self, el, text, convert_as_inline):
587598

588599
return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
589600

590-
def convert_script(self, el, text, convert_as_inline):
601+
def convert_script(self, el, text, parent_tags):
591602
return ''
592603

593-
def convert_style(self, el, text, convert_as_inline):
604+
def convert_style(self, el, text, parent_tags):
594605
return ''
595606

596607
convert_s = convert_del
@@ -603,28 +614,28 @@ def convert_style(self, el, text, convert_as_inline):
603614

604615
convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
605616

606-
def convert_table(self, el, text, convert_as_inline):
617+
def convert_table(self, el, text, parent_tags):
607618
return '\n\n' + text.strip() + '\n\n'
608619

609-
def convert_caption(self, el, text, convert_as_inline):
620+
def convert_caption(self, el, text, parent_tags):
610621
return text.strip() + '\n\n'
611622

612-
def convert_figcaption(self, el, text, convert_as_inline):
623+
def convert_figcaption(self, el, text, parent_tags):
613624
return '\n\n' + text.strip() + '\n\n'
614625

615-
def convert_td(self, el, text, convert_as_inline):
626+
def convert_td(self, el, text, parent_tags):
616627
colspan = 1
617628
if 'colspan' in el.attrs and el['colspan'].isdigit():
618629
colspan = int(el['colspan'])
619630
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
620631

621-
def convert_th(self, el, text, convert_as_inline):
632+
def convert_th(self, el, text, parent_tags):
622633
colspan = 1
623634
if 'colspan' in el.attrs and el['colspan'].isdigit():
624635
colspan = int(el['colspan'])
625636
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
626637

627-
def convert_tr(self, el, text, convert_as_inline):
638+
def convert_tr(self, el, text, parent_tags):
628639
cells = el.find_all(['td', 'th'])
629640
is_first_row = el.find_previous_sibling() is None
630641
is_headrow = (

Diff for: tests/test_custom_converter.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ class UnitTestConverter(MarkdownConverter):
66
"""
77
Create a custom MarkdownConverter for unit tests
88
"""
9-
def convert_img(self, el, text, convert_as_inline):
9+
def convert_img(self, el, text, parent_tags):
1010
"""Add two newlines after an image"""
11-
return super().convert_img(el, text, convert_as_inline) + '\n\n'
11+
return super().convert_img(el, text, parent_tags) + '\n\n'
1212

13-
def convert_custom_tag(self, el, text, convert_as_inline):
13+
def convert_custom_tag(self, el, text, parent_tags):
1414
"""Ensure conversion function is found for tags with special characters in name"""
1515
return "FUNCTION USED: %s" % text
1616

0 commit comments

Comments
 (0)