@@ -57,13 +57,13 @@ def abstract_inline_conversion(markup_fn):
57
57
the text if it looks like an HTML tag. markup_fn is necessary to allow for
58
58
references to self.strong_em_symbol etc.
59
59
"""
60
- def implementation (self , el , text , convert_as_inline ):
60
+ def implementation (self , el , text , parent_tags ):
61
61
markup_prefix = markup_fn (self )
62
62
if markup_prefix .startswith ('<' ) and markup_prefix .endswith ('>' ):
63
63
markup_suffix = '</' + markup_prefix [1 :]
64
64
else :
65
65
markup_suffix = markup_prefix
66
- if el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
66
+ if '_noformat' in parent_tags :
67
67
return text
68
68
prefix , suffix , text = chomp (text )
69
69
if not text :
@@ -170,24 +170,18 @@ def convert(self, html):
170
170
return self .convert_soup (soup )
171
171
172
172
def convert_soup(self, soup):
    """Convert an already-parsed soup object to Markdown text.

    The soup is processed as the top-level tag, so it starts with an empty
    parent context (no ancestor tag names and no pseudo-tags).
    """
    return self.process_tag(soup, parent_tags=set())
174
174
175
def process_element(self, node, parent_tags=None):
    """Dispatch a node to the text or tag processor.

    NavigableString nodes go to process_text(); everything else goes to
    process_tag(). parent_tags is the parent context (ancestor tag names
    plus pseudo-tags) and is forwarded unchanged; both callees accept None
    and replace it with an empty set.
    """
    if isinstance(node, NavigableString):
        return self.process_text(node, parent_tags=parent_tags)
    else:
        return self.process_tag(node, parent_tags=parent_tags)
180
180
181
- def process_tag (self , node , convert_as_inline ):
182
- text = ''
183
-
184
- # For Markdown headings and table cells, convert children as inline
185
- # (so that block element children do not produce newlines).
186
- convert_children_as_inline = (
187
- convert_as_inline # propagated from parent
188
- or html_heading_re .match (node .name ) is not None # headings
189
- or node .name in ['td' , 'th' ] # table cells
190
- )
181
+ def process_tag (self , node , parent_tags = None ):
182
+ # For the top-level element, initialize the parent context with an empty set.
183
+ if parent_tags is None :
184
+ parent_tags = set ()
191
185
192
186
# Collect child elements to process, ignoring whitespace-only text elements
193
187
# adjacent to the inner/outer boundaries of block elements.
@@ -220,8 +214,27 @@ def _can_ignore(el):
220
214
221
215
children_to_convert = [el for el in node .children if not _can_ignore (el )]
222
216
217
+ # Create a copy of this tag's parent context, then update it to include this tag
218
+ # to propagate down into the children.
219
+ parent_tags_for_children = set (parent_tags )
220
+ parent_tags_for_children .add (node .name )
221
+
222
+ # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
223
+ if (
224
+ html_heading_re .match (node .name ) is not None # headings
225
+ or node .name in {'td' , 'th' } # table cells
226
+ ):
227
+ parent_tags_for_children .add ('_inline' )
228
+
229
+ # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
230
+ if node .name in {'pre' , 'code' , 'kbd' , 'samp' }:
231
+ parent_tags_for_children .add ('_noformat' )
232
+
223
233
# Convert the children elements into a list of result strings.
224
- child_strings = [self .process_element (el , convert_children_as_inline ) for el in children_to_convert ]
234
+ child_strings = [
235
+ self .process_element (el , parent_tags = parent_tags_for_children )
236
+ for el in children_to_convert
237
+ ]
225
238
226
239
# Remove empty string values.
227
240
child_strings = [s for s in child_strings if s ]
@@ -256,11 +269,11 @@ def _can_ignore(el):
256
269
convert_fn_name = "convert_%s" % re .sub (r"[\[\]:-]" , "_" , node .name )
257
270
convert_fn = getattr (self , convert_fn_name , None )
258
271
if convert_fn and self .should_convert_tag (node .name ):
259
- text = convert_fn (node , text , convert_as_inline )
272
+ text = convert_fn (node , text , parent_tags = parent_tags )
260
273
261
274
return text
262
275
263
- def convert__document_ (self , el , text , convert_as_inline ):
276
+ def convert__document_ (self , el , text , parent_tags ):
264
277
"""Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
265
278
if self .options ['strip_document' ] == LSTRIP :
266
279
text = text .lstrip ('\n ' ) # remove leading separation newlines
@@ -275,19 +288,23 @@ def convert__document_(self, el, text, convert_as_inline):
275
288
276
289
return text
277
290
278
- def process_text (self , el ):
291
+ def process_text (self , el , parent_tags = None ):
292
+ # For the top-level element, initialize the parent context with an empty set.
293
+ if parent_tags is None :
294
+ parent_tags = set ()
295
+
279
296
text = six .text_type (el ) or ''
280
297
281
298
# normalize whitespace if we're not inside a preformatted element
282
- if not el . find_parent ( 'pre' ) :
299
+ if 'pre' not in parent_tags :
283
300
if self .options ['wrap' ]:
284
301
text = all_whitespace_re .sub (' ' , text )
285
302
else :
286
303
text = newline_whitespace_re .sub ('\n ' , text )
287
304
text = whitespace_re .sub (' ' , text )
288
305
289
306
# escape special characters if we're not inside a preformatted or code element
290
- if not el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
307
+ if '_noformat' not in parent_tags :
291
308
text = self .escape (text )
292
309
293
310
# remove leading whitespace at the start or just after a
@@ -310,8 +327,8 @@ def __getattr__(self, attr):
310
327
if m :
311
328
n = int (m .group (1 ))
312
329
313
- def convert_tag (el , text , convert_as_inline ):
314
- return self ._convert_hn (n , el , text , convert_as_inline )
330
+ def convert_tag (el , text , parent_tags ):
331
+ return self ._convert_hn (n , el , text , parent_tags )
315
332
316
333
convert_tag .__name__ = 'convert_h%s' % n
317
334
setattr (self , convert_tag .__name__ , convert_tag )
@@ -358,8 +375,8 @@ def underline(self, text, pad_char):
358
375
text = (text or '' ).rstrip ()
359
376
return '\n \n %s\n %s\n \n ' % (text , pad_char * len (text )) if text else ''
360
377
361
- def convert_a (self , el , text , convert_as_inline ):
362
- if el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
378
+ def convert_a (self , el , text , parent_tags ):
379
+ if '_noformat' in parent_tags :
363
380
return text
364
381
prefix , suffix , text = chomp (text )
365
382
if not text :
@@ -380,10 +397,10 @@ def convert_a(self, el, text, convert_as_inline):
380
397
381
398
convert_b = abstract_inline_conversion (lambda self : 2 * self .options ['strong_em_symbol' ])
382
399
383
- def convert_blockquote (self , el , text , convert_as_inline ):
400
+ def convert_blockquote (self , el , text , parent_tags ):
384
401
# handle some early-exit scenarios
385
402
text = (text or '' ).strip ()
386
- if convert_as_inline :
403
+ if '_inline' in parent_tags :
387
404
return ' ' + text + ' '
388
405
if not text :
389
406
return "\n "
@@ -396,25 +413,25 @@ def _indent_for_blockquote(match):
396
413
397
414
return '\n ' + text + '\n \n '
398
415
399
def convert_br(self, el, text, parent_tags):
    """Convert a <br> element to a Markdown hard line break.

    Inside an inline-only context (the '_inline' pseudo-tag is present in
    parent_tags) a line break cannot be represented, so nothing is emitted.
    Otherwise the 'newline_style' option selects the form of the break.
    """
    if '_inline' in parent_tags:
        return ""

    if self.options['newline_style'].lower() == BACKSLASH:
        # Backslash immediately followed by a newline.
        return '\\\n'
    else:
        # Two trailing spaces followed by a newline.
        return '  \n'
407
424
408
def convert_code(self, el, text, parent_tags):
    """Convert a <code> element to a backtick-delimited inline code span.

    When a 'pre' tag is among the ancestors, the text is returned untouched.
    Otherwise the shared inline-conversion helper wraps it in backticks.
    """
    if 'pre' in parent_tags:
        return text
    converter = abstract_inline_conversion(lambda self: '`')
    return converter(self, el, text, parent_tags)
413
430
414
431
convert_del = abstract_inline_conversion (lambda self : '~~' )
415
432
416
def convert_div(self, el, text, parent_tags):
    """Convert a <div> element.

    In an inline-only context ('_inline' in parent_tags) the stripped text
    is returned padded with a single space on each side. Otherwise the
    stripped text is emitted as its own block, surrounded by blank lines;
    an empty div produces an empty string.
    """
    if '_inline' in parent_tags:
        return ' ' + text.strip() + ' '
    text = text.strip()
    return '\n\n%s\n\n' % text if text else ''
@@ -427,9 +444,9 @@ def convert_div(self, el, text, convert_as_inline):
427
444
428
445
convert_kbd = convert_code
429
446
430
- def convert_dd (self , el , text , convert_as_inline ):
447
+ def convert_dd (self , el , text , parent_tags ):
431
448
text = (text or '' ).strip ()
432
- if convert_as_inline :
449
+ if '_inline' in parent_tags :
433
450
return ' ' + text + ' '
434
451
if not text :
435
452
return '\n '
@@ -445,11 +462,11 @@ def _indent_for_dd(match):
445
462
446
463
return '%s\n ' % text
447
464
448
- def convert_dt (self , el , text , convert_as_inline ):
465
+ def convert_dt (self , el , text , parent_tags ):
449
466
# remove newlines from term text
450
467
text = (text or '' ).strip ()
451
468
text = all_whitespace_re .sub (' ' , text )
452
- if convert_as_inline :
469
+ if '_inline' in parent_tags :
453
470
return ' ' + text + ' '
454
471
if not text :
455
472
return '\n '
@@ -459,9 +476,9 @@ def convert_dt(self, el, text, convert_as_inline):
459
476
460
477
return '\n %s\n ' % text
461
478
462
- def _convert_hn (self , n , el , text , convert_as_inline ):
479
+ def _convert_hn (self , n , el , text , parent_tags ):
463
480
""" Method name prefixed with _ to prevent <hn> to call this """
464
- if convert_as_inline :
481
+ if '_inline' in parent_tags :
465
482
return text
466
483
467
484
# prevent MemoryErrors in case of very large n
@@ -478,46 +495,40 @@ def _convert_hn(self, n, el, text, convert_as_inline):
478
495
return '\n \n %s %s %s\n \n ' % (hashes , text , hashes )
479
496
return '\n \n %s %s\n \n ' % (hashes , text )
480
497
481
def convert_hr(self, el, text, parent_tags):
    """Convert an <hr> element to a '---' rule set off by blank lines."""
    return '\n\n---\n\n'
483
500
484
501
convert_i = convert_em
485
502
486
def convert_img(self, el, text, parent_tags):
    """Convert an <img> element to Markdown image syntax.

    Emits ![alt](src "title"); the title part is omitted when the element
    has no title, and double quotes inside the title are backslash-escaped.
    In an inline-only context ('_inline' in parent_tags) only the alt text
    is returned, unless the image's direct parent tag is listed in the
    'keep_inline_images_in' option.
    """
    alt = el.attrs.get('alt', None) or ''
    src = el.attrs.get('src', None) or ''
    title = el.attrs.get('title', None) or ''
    title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
    if ('_inline' in parent_tags
            and el.parent.name not in self.options['keep_inline_images_in']):
        return alt

    # The format string must consume alt, src and the optional title part;
    # an empty format string raises TypeError on every call.
    return '![%s](%s%s)' % (alt, src, title_part)
496
513
497
def convert_list(self, el, text, parent_tags):
    """Convert a <ul>/<ol> element whose items are already rendered in text.

    Converting a list to inline is undefined, so inline-conversion parents
    are ignored for lists.
    """
    if 'li' in parent_tags:
        # Nested list: start on a new line and drop the trailing newline.
        return '\n' + text.rstrip()

    # Top-level list: add one extra newline when the next block-content
    # sibling exists and is not itself a list.
    next_sibling = _next_block_content_sibling(el)
    before_paragraph = bool(next_sibling) and next_sibling.name not in ['ul', 'ol']
    return '\n\n' + text + ('\n' if before_paragraph else '')
516
527
517
528
convert_ul = convert_list
518
529
convert_ol = convert_list
519
530
520
- def convert_li (self , el , text , convert_as_inline ):
531
+ def convert_li (self , el , text , parent_tags ):
521
532
# handle some early-exit scenarios
522
533
text = (text or '' ).strip ()
523
534
if not text :
@@ -554,8 +565,8 @@ def _indent_for_li(match):
554
565
555
566
return '%s\n ' % text
556
567
557
- def convert_p (self , el , text , convert_as_inline ):
558
- if convert_as_inline :
568
+ def convert_p (self , el , text , parent_tags ):
569
+ if '_inline' in parent_tags :
559
570
return ' ' + text .strip () + ' '
560
571
text = text .strip ()
561
572
if self .options ['wrap' ]:
@@ -577,7 +588,7 @@ def convert_p(self, el, text, convert_as_inline):
577
588
text = '\n ' .join (new_lines )
578
589
return '\n \n %s\n \n ' % text if text else ''
579
590
580
- def convert_pre (self , el , text , convert_as_inline ):
591
+ def convert_pre (self , el , text , parent_tags ):
581
592
if not text :
582
593
return ''
583
594
code_language = self .options ['code_language' ]
@@ -587,10 +598,10 @@ def convert_pre(self, el, text, convert_as_inline):
587
598
588
599
return '\n \n ```%s\n %s\n ```\n \n ' % (code_language , text )
589
600
590
def convert_script(self, el, text, parent_tags):
    """Drop <script> elements: their content never reaches the output."""
    return ''
592
603
593
def convert_style(self, el, text, parent_tags):
    """Drop <style> elements: their content never reaches the output."""
    return ''
595
606
596
607
convert_s = convert_del
@@ -603,28 +614,28 @@ def convert_style(self, el, text, convert_as_inline):
603
614
604
615
convert_sup = abstract_inline_conversion (lambda self : self .options ['sup_symbol' ])
605
616
606
def convert_table(self, el, text, parent_tags):
    """Emit the rendered table rows as a block surrounded by blank lines."""
    return '\n\n' + text.strip() + '\n\n'
608
619
609
def convert_caption(self, el, text, parent_tags):
    """Emit a <caption> as stripped text followed by a blank line."""
    return text.strip() + '\n\n'
611
622
612
def convert_figcaption(self, el, text, parent_tags):
    """Emit a <figcaption> as stripped text surrounded by blank lines."""
    return '\n\n' + text.strip() + '\n\n'
614
625
615
def convert_td(self, el, text, parent_tags):
    """Render a <td> cell as ' text |', repeating the delimiter per colspan.

    The cell text is stripped and its newlines are replaced with spaces so
    the whole table row stays on one line.
    """
    colspan = 1
    # isdecimal() only admits strings int() can parse; isdigit() also admits
    # characters such as superscript digits, which make int() raise.
    if 'colspan' in el.attrs and el['colspan'].isdecimal():
        # A colspan of 0 would emit no cell delimiter at all, so fall back
        # to 1; the upper clamp of 1000 bounds the output size.
        colspan = max(1, min(1000, int(el['colspan'])))
    return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
620
631
621
def convert_th(self, el, text, parent_tags):
    """Render a <th> cell as ' text |', repeating the delimiter per colspan.

    The cell text is stripped and its newlines are replaced with spaces so
    the whole table row stays on one line.
    """
    colspan = 1
    # isdecimal() only admits strings int() can parse; isdigit() also admits
    # characters such as superscript digits, which make int() raise.
    if 'colspan' in el.attrs and el['colspan'].isdecimal():
        # A colspan of 0 would emit no cell delimiter at all, so fall back
        # to 1; the upper clamp of 1000 bounds the output size.
        colspan = max(1, min(1000, int(el['colspan'])))
    return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
626
637
627
- def convert_tr (self , el , text , convert_as_inline ):
638
+ def convert_tr (self , el , text , parent_tags ):
628
639
cells = el .find_all (['td' , 'th' ])
629
640
is_first_row = el .find_previous_sibling () is None
630
641
is_headrow = (
0 commit comments