1
- from bs4 import BeautifulSoup , NavigableString , Comment , Doctype
1
+ from bs4 import BeautifulSoup , Comment , Doctype , NavigableString , Tag
2
2
from textwrap import fill
3
3
import re
4
4
import six
@@ -79,6 +79,7 @@ def should_remove_whitespace_inside(el):
79
79
if html_heading_re .match (el .name ) is not None :
80
80
return True
81
81
return el .name in ('p' , 'blockquote' ,
82
+ 'article' , 'div' , 'section' ,
82
83
'ol' , 'ul' , 'li' ,
83
84
'table' , 'thead' , 'tbody' , 'tfoot' ,
84
85
'tr' , 'td' , 'th' )
@@ -89,6 +90,41 @@ def should_remove_whitespace_outside(el):
89
90
return should_remove_whitespace_inside (el ) or (el and el .name == 'pre' )
90
91
91
92
93
def _is_block_content_element(el):
    """
    Report whether ``el`` counts as content in a block context.

    Returns True for any Tag and for text that contains at least one
    non-whitespace character. Returns False for Comment and Doctype nodes,
    for whitespace-only text, and for anything else (including None).
    """
    # Comment and Doctype are subclasses of NavigableString, so they have
    # to be ruled out before the generic text check further down.
    if isinstance(el, (Comment, Doctype)):
        return False
    if isinstance(el, Tag):
        return True
    if isinstance(el, NavigableString):
        return el.strip() != ''
    return False
108
+
109
+
110
+ def _prev_block_content_sibling (el ):
111
+ """Returns the first previous sibling that is a content element, else None."""
112
+ while el is not None :
113
+ el = el .previous_sibling
114
+ if _is_block_content_element (el ):
115
+ return el
116
+ return None
117
+
118
+
119
+ def _next_block_content_sibling (el ):
120
+ """Returns the first next sibling that is a content element, else None."""
121
+ while el is not None :
122
+ el = el .next_sibling
123
+ if _is_block_content_element (el ):
124
+ return el
125
+ return None
126
+
127
+
92
128
class MarkdownConverter (object ):
93
129
class DefaultOptions :
94
130
autolinks = True
@@ -143,29 +179,38 @@ def process_tag(self, node, convert_as_inline):
143
179
or node .name in ['td' , 'th' ] # table cells
144
180
)
145
181
146
- # Remove whitespace-only textnodes just before, after or
147
- # inside block-level elements.
182
+ # Collect child elements to process, ignoring whitespace-only text elements
183
+ # adjacent to the inner/outer boundaries of block elements.
148
184
should_remove_inside = should_remove_whitespace_inside (node )
149
- for el in node .children :
150
- # Only extract (remove) whitespace-only text node if any of the
151
- # conditions is true:
152
- # - el is the first element in its parent (block-level)
153
- # - el is the last element in its parent (block-level)
154
- # - el is adjacent to a block-level node
155
- can_extract = (should_remove_inside and (not el .previous_sibling
156
- or not el .next_sibling )
157
- or should_remove_whitespace_outside (el .previous_sibling )
158
- or should_remove_whitespace_outside (el .next_sibling ))
159
- if (isinstance (el , NavigableString )
160
- and six .text_type (el ).strip () == ''
161
- and can_extract ):
162
- el .extract ()
163
185
164
- # Convert the children first
165
- for el in node .children :
166
- if isinstance (el , Comment ) or isinstance (el , Doctype ):
167
- continue
186
def _can_ignore(el):
    """Return True when child ``el`` should be left out of conversion.

    Reads ``should_remove_inside`` from the enclosing scope. Raises
    ValueError for a child that is neither a Tag nor a NavigableString.
    """
    # Tags are always converted.
    if isinstance(el, Tag):
        return False

    # Comment and Doctype nodes are never converted. They are subclasses
    # of NavigableString, so this test must come before the text test.
    if isinstance(el, (Comment, Doctype)):
        return True

    if not isinstance(el, NavigableString):
        raise ValueError('Unexpected element type: %s' % type(el))

    # Text carrying any non-whitespace character is always converted.
    if six.text_type(el).strip() != '':
        return False

    # Whitespace-only text from here on.
    # Inside a block element (excluding <pre>): drop it when it is the
    # first or last child.
    if should_remove_inside and (not el.previous_sibling or not el.next_sibling):
        return True

    # Outside a block element (including <pre>): drop it when either
    # neighbour is such an element.
    return bool(
        should_remove_whitespace_outside(el.previous_sibling)
        or should_remove_whitespace_outside(el.next_sibling)
    )
208
+
209
+ children_to_convert = [child for child in node .children if not _can_ignore (child )]
210
+
211
+ # Convert the children first
212
+ for el in children_to_convert :
213
+ if isinstance (el , NavigableString ):
169
214
text += self .process_text (el )
170
215
else :
171
216
text_strip = text .rstrip ('\n ' )
@@ -337,6 +382,16 @@ def convert_code(self, el, text, convert_as_inline):
337
382
338
383
convert_del = abstract_inline_conversion (lambda self : '~~' )
339
384
385
def convert_div(self, el, text, convert_as_inline):
    """Render the already-converted content of a generic container element.

    In inline mode the stripped text is returned with one space on each
    side. Otherwise the stripped text is wrapped in leading and trailing
    line breaks, or an empty string is returned when nothing remains after
    stripping.
    """
    stripped = text.strip()
    if convert_as_inline:
        return ' ' + stripped + ' '
    if not stripped:
        return ''
    # NOTE(review): this literal is reproduced exactly as it appears in the
    # reviewed text, including the space after each newline - confirm that
    # those spaces are intended and not an artefact of how the text was
    # captured.
    return '\n \n %s\n \n ' % stripped
390
+
391
+ convert_article = convert_div
392
+
393
+ convert_section = convert_div
394
+
340
395
convert_em = abstract_inline_conversion (lambda self : self .options ['strong_em_symbol' ])
341
396
342
397
convert_kbd = convert_code
@@ -415,7 +470,8 @@ def convert_list(self, el, text, convert_as_inline):
415
470
416
471
nested = False
417
472
before_paragraph = False
418
- if el .next_sibling and el .next_sibling .name not in ['ul' , 'ol' ]:
473
+ next_sibling = _next_block_content_sibling (el )
474
+ if next_sibling and next_sibling .name not in ['ul' , 'ol' ]:
419
475
before_paragraph = True
420
476
while el :
421
477
if el .name == 'li' :
@@ -539,22 +595,23 @@ def convert_th(self, el, text, convert_as_inline):
539
595
540
596
def convert_tr (self , el , text , convert_as_inline ):
541
597
cells = el .find_all (['td' , 'th' ])
598
+ is_first_row = el .find_previous_sibling () is None
542
599
is_headrow = (
543
600
all ([cell .name == 'th' for cell in cells ])
544
601
or (el .parent .name == 'thead'
545
602
# avoid multiple tr in thead
546
603
and len (el .parent .find_all ('tr' )) == 1 )
547
604
)
548
605
is_head_row_missing = (
549
- (not el . previous_sibling and not el .parent .name == 'tbody' )
550
- or (not el . previous_sibling and el .parent .name == 'tbody' and len (el .parent .parent .find_all (['thead' ])) < 1 )
606
+ (is_first_row and not el .parent .name == 'tbody' )
607
+ or (is_first_row and el .parent .name == 'tbody' and len (el .parent .parent .find_all (['thead' ])) < 1 )
551
608
)
552
609
overline = ''
553
610
underline = ''
554
611
if ((is_headrow
555
612
or (is_head_row_missing
556
613
and self .options ['table_infer_header' ]))
557
- and not el . previous_sibling ):
614
+ and is_first_row ):
558
615
# first row and:
559
616
# - is headline or
560
617
# - headline is missing and header inference is enabled
@@ -568,10 +625,10 @@ def convert_tr(self, el, text, convert_as_inline):
568
625
underline += '| ' + ' | ' .join (['---' ] * full_colspan ) + ' |' + '\n '
569
626
elif ((is_head_row_missing
570
627
and not self .options ['table_infer_header' ])
571
- or (not el . previous_sibling
628
+ or (is_first_row
572
629
and (el .parent .name == 'table'
573
630
or (el .parent .name == 'tbody'
574
- and not el .parent .previous_sibling )))):
631
+ and not el .parent .find_previous_sibling () )))):
575
632
# headline is missing and header inference is disabled or:
576
633
# first row, not headline, and:
577
634
# - the parent is table or
0 commit comments