Skip to content

Commit 3026602

Browse files
authored
make conversion non-destructive to soup; improve div/article/section handling (#184)
Signed-off-by: chrispy <[email protected]>
1 parent c52a50e commit 3026602

File tree

3 files changed

+98
-30
lines changed

3 files changed

+98
-30
lines changed

Diff for: markdownify/__init__.py

+84-27
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from bs4 import BeautifulSoup, NavigableString, Comment, Doctype
1+
from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
22
from textwrap import fill
33
import re
44
import six
@@ -79,6 +79,7 @@ def should_remove_whitespace_inside(el):
7979
if html_heading_re.match(el.name) is not None:
8080
return True
8181
return el.name in ('p', 'blockquote',
82+
'article', 'div', 'section',
8283
'ol', 'ul', 'li',
8384
'table', 'thead', 'tbody', 'tfoot',
8485
'tr', 'td', 'th')
@@ -89,6 +90,41 @@ def should_remove_whitespace_outside(el):
8990
return should_remove_whitespace_inside(el) or (el and el.name == 'pre')
9091

9192

93+
def _is_block_content_element(el):
    """
    Report whether ``el`` counts as content in a block context.

    Returns:

    - True for content elements (tags and non-whitespace text)
    - False for non-content elements (whitespace-only text, comments,
      doctypes) and for anything else, including None
    """
    if isinstance(el, Tag):
        return True
    # Comment and Doctype are subclasses of NavigableString, so they must be
    # tested before the generic NavigableString check below.
    if isinstance(el, (Comment, Doctype)):
        return False
    if isinstance(el, NavigableString):
        # Text only counts as content when something is left after stripping.
        return el.strip() != ''
    return False
108+
109+
110+
def _prev_block_content_sibling(el):
    """
    Return the nearest previous sibling of ``el`` that is a content element.

    Siblings for which ``_is_block_content_element`` is false are skipped.
    Returns None when no such sibling exists (or when ``el`` is None).
    """
    while el is not None:
        # Step backwards first so that ``el`` itself is never returned.
        el = el.previous_sibling
        if _is_block_content_element(el):
            return el
    return None
117+
118+
119+
def _next_block_content_sibling(el):
    """
    Return the nearest following sibling of ``el`` that is a content element.

    Siblings for which ``_is_block_content_element`` is false are skipped.
    Returns None when no such sibling exists (or when ``el`` is None).
    """
    while el is not None:
        # Step forwards first so that ``el`` itself is never returned.
        el = el.next_sibling
        if _is_block_content_element(el):
            return el
    return None
126+
127+
92128
class MarkdownConverter(object):
93129
class DefaultOptions:
94130
autolinks = True
@@ -143,29 +179,38 @@ def process_tag(self, node, convert_as_inline):
143179
or node.name in ['td', 'th'] # table cells
144180
)
145181

146-
# Remove whitespace-only textnodes just before, after or
147-
# inside block-level elements.
182+
# Collect child elements to process, ignoring whitespace-only text elements
183+
# adjacent to the inner/outer boundaries of block elements.
148184
should_remove_inside = should_remove_whitespace_inside(node)
149-
for el in node.children:
150-
# Only extract (remove) whitespace-only text node if any of the
151-
# conditions is true:
152-
# - el is the first element in its parent (block-level)
153-
# - el is the last element in its parent (block-level)
154-
# - el is adjacent to a block-level node
155-
can_extract = (should_remove_inside and (not el.previous_sibling
156-
or not el.next_sibling)
157-
or should_remove_whitespace_outside(el.previous_sibling)
158-
or should_remove_whitespace_outside(el.next_sibling))
159-
if (isinstance(el, NavigableString)
160-
and six.text_type(el).strip() == ''
161-
and can_extract):
162-
el.extract()
163185

164-
# Convert the children first
165-
for el in node.children:
166-
if isinstance(el, Comment) or isinstance(el, Doctype):
167-
continue
186+
def _can_ignore(el):
187+
if isinstance(el, Tag):
188+
# Tags are always processed.
189+
return False
190+
elif isinstance(el, (Comment, Doctype)):
191+
# Comment and Doctype elements are always ignored.
192+
# (subclasses of NavigableString, must test first)
193+
return True
168194
elif isinstance(el, NavigableString):
195+
if six.text_type(el).strip() != '':
196+
# Non-whitespace text nodes are always processed.
197+
return False
198+
elif should_remove_inside and (not el.previous_sibling or not el.next_sibling):
199+
# Inside block elements (excluding <pre>), ignore adjacent whitespace elements.
200+
return True
201+
elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
202+
# Outside block elements (including <pre>), ignore adjacent whitespace elements.
203+
return True
204+
else:
205+
return False
206+
else:
207+
raise ValueError('Unexpected element type: %s' % type(el))
208+
209+
children_to_convert = [child for child in node.children if not _can_ignore(child)]
210+
211+
# Convert the children first
212+
for el in children_to_convert:
213+
if isinstance(el, NavigableString):
169214
text += self.process_text(el)
170215
else:
171216
text_strip = text.rstrip('\n')
@@ -337,6 +382,16 @@ def convert_code(self, el, text, convert_as_inline):
337382

338383
convert_del = abstract_inline_conversion(lambda self: '~~')
339384

385+
def convert_div(self, el, text, convert_as_inline):
    """
    Convert the already-converted contents of a block container.

    ``text`` is stripped of surrounding whitespace. In an inline context it
    is returned padded with a single space on each side; otherwise it is
    returned surrounded by blank lines, or as an empty string when nothing
    remains after stripping. ``el`` is not inspected.
    """
    stripped = text.strip()
    if convert_as_inline:
        return ' ' + stripped + ' '
    # An empty container contributes nothing in a block context.
    return '\n\n%s\n\n' % stripped if stripped else ''
390+
391+
convert_article = convert_div
392+
393+
convert_section = convert_div
394+
340395
convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])
341396

342397
convert_kbd = convert_code
@@ -415,7 +470,8 @@ def convert_list(self, el, text, convert_as_inline):
415470

416471
nested = False
417472
before_paragraph = False
418-
if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
473+
next_sibling = _next_block_content_sibling(el)
474+
if next_sibling and next_sibling.name not in ['ul', 'ol']:
419475
before_paragraph = True
420476
while el:
421477
if el.name == 'li':
@@ -539,22 +595,23 @@ def convert_th(self, el, text, convert_as_inline):
539595

540596
def convert_tr(self, el, text, convert_as_inline):
541597
cells = el.find_all(['td', 'th'])
598+
is_first_row = el.find_previous_sibling() is None
542599
is_headrow = (
543600
all([cell.name == 'th' for cell in cells])
544601
or (el.parent.name == 'thead'
545602
# avoid multiple tr in thead
546603
and len(el.parent.find_all('tr')) == 1)
547604
)
548605
is_head_row_missing = (
549-
(not el.previous_sibling and not el.parent.name == 'tbody')
550-
or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
606+
(is_first_row and not el.parent.name == 'tbody')
607+
or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
551608
)
552609
overline = ''
553610
underline = ''
554611
if ((is_headrow
555612
or (is_head_row_missing
556613
and self.options['table_infer_header']))
557-
and not el.previous_sibling):
614+
and is_first_row):
558615
# first row and:
559616
# - is headline or
560617
# - headline is missing and header inference is enabled
@@ -568,10 +625,10 @@ def convert_tr(self, el, text, convert_as_inline):
568625
underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
569626
elif ((is_head_row_missing
570627
and not self.options['table_infer_header'])
571-
or (not el.previous_sibling
628+
or (is_first_row
572629
and (el.parent.name == 'table'
573630
or (el.parent.name == 'tbody'
574-
and not el.parent.previous_sibling)))):
631+
and not el.parent.find_previous_sibling())))):
575632
# headline is missing and header inference is disabled or:
576633
# first row, not headline, and:
577634
# - the parent is table or

Diff for: tests/test_basic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ def test_single_tag():
66

77

88
def test_soup():
9-
assert md('<div><span>Hello</div></span>') == 'Hello'
9+
assert md('<div><span>Hello</div></span>') == '\n\nHello\n\n'
1010

1111

1212
def test_whitespace():

Diff for: tests/test_conversions.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,19 @@ def test_del():
114114
inline_tests('del', '~~')
115115

116116

117-
def test_div():
118-
assert md('Hello</div> World') == 'Hello World'
117+
def test_div_section_article():
    """Check that div, section and article are all converted as block containers."""
    for tag in ['div', 'section', 'article']:
        # Bare container, then container between inline text.
        assert md(f'<{tag}>456</{tag}>') == '\n\n456\n\n'
        assert md(f'123<{tag}>456</{tag}>789') == '123\n\n456\n\n789'
        assert md(f'123<{tag}>\n 456 \n</{tag}>789') == '123\n\n456\n\n789'
        # Nested paragraph, with and without whitespace around it.
        assert md(f'123<{tag}><p>456</p></{tag}>789') == '123\n\n456\n\n789'
        assert md(f'123<{tag}>\n<p>456</p>\n</{tag}>789') == '123\n\n456\n\n789'
        # Nested <pre>, with and without whitespace around it.
        assert md(f'123<{tag}><pre>4 5 6</pre></{tag}>789') == '123\n\n```\n4 5 6\n```\n\n789'
        assert md(f'123<{tag}>\n<pre>4 5 6</pre>\n</{tag}>789') == '123\n\n```\n4 5 6\n```\n\n789'
        # Newlines inside the text content.
        assert md(f'123<{tag}>4\n5\n6</{tag}>789') == '123\n\n4\n5\n6\n\n789'
        assert md(f'123<{tag}>\n4\n5\n6\n</{tag}>789') == '123\n\n4\n5\n6\n\n789'
        assert md(f'123<{tag}>\n<p>\n4\n5\n6\n</p>\n</{tag}>789') == '123\n\n4\n5\n6\n\n789'
        # Heading followed by body text; the closing tag must be interpolated
        # (single braces) so that the input is well-formed markup.
        assert md(f'<{tag}><h1>title</h1>body</{tag}>', heading_style=ATX) == '\n\n# title\n\nbody\n\n'
119130

120131

121132
def test_em():

0 commit comments

Comments
 (0)