Skip to content

Commit 60d8666

Browse files
committed
More carefully separate inline text from block content
There are various cases in which inline text is not separated from adjacent block content by sufficiently many newlines. A paragraph needs a blank line (two newlines) separating it from prior text, as does an underlined header; an ATX header needs only a single newline separating it from prior text. A list needs at least one newline separating it from prior text, but in general two newlines are required: an ordered list starting at a number other than 1 is only recognized as a list when there is a blank line before it.

To avoid accumulating more newlines than necessary, take care when concatenating the results of converting consecutive tags to remove redundant newlines, keeping the greater of the number of newlines ending the prior text and the number of newlines starting the subsequent text.

This is thus an alternative to matthewwithanm#108 that tries to avoid the excess newline accumulation that was a concern there, as well as fixing more cases than just paragraphs, and updating tests.

Fixes matthewwithanm#92
Fixes matthewwithanm#98
1 parent 43dbe20 commit 60d8666

File tree

4 files changed

+58
-47
lines changed

4 files changed

+58
-47
lines changed

markdownify/__init__.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,13 @@ def is_nested_node(el):
143143
elif isinstance(el, NavigableString):
144144
text += self.process_text(el)
145145
else:
146-
text += self.process_tag(el, convert_children_as_inline)
146+
text_strip = text.rstrip('\n')
147+
newlines_left = len(text) - len(text_strip)
148+
next_text = self.process_tag(el, convert_children_as_inline)
149+
next_text_strip = next_text.lstrip('\n')
150+
newlines_right = len(next_text) - len(next_text_strip)
151+
newlines = '\n' * max(newlines_left, newlines_right)
152+
text = text_strip + newlines + next_text_strip
147153

148154
if not children_only:
149155
convert_fn = getattr(self, 'convert_%s' % node.name, None)
@@ -216,7 +222,7 @@ def indent(self, text, level):
216222

217223
def underline(self, text, pad_char):
218224
text = (text or '').rstrip()
219-
return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
225+
return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
220226

221227
def convert_a(self, el, text, convert_as_inline):
222228
prefix, suffix, text = chomp(text)
@@ -277,8 +283,8 @@ def convert_hn(self, n, el, text, convert_as_inline):
277283
return self.underline(text, line)
278284
hashes = '#' * n
279285
if style == ATX_CLOSED:
280-
return '%s %s %s\n\n' % (hashes, text, hashes)
281-
return '%s %s\n\n' % (hashes, text)
286+
return '\n%s %s %s\n\n' % (hashes, text, hashes)
287+
return '\n%s %s\n\n' % (hashes, text)
282288

283289
def convert_hr(self, el, text, convert_as_inline):
284290
return '\n\n---\n\n'
@@ -313,7 +319,7 @@ def convert_list(self, el, text, convert_as_inline):
313319
if nested:
314320
# remove trailing newline if nested
315321
return '\n' + self.indent(text, 1).rstrip()
316-
return text + ('\n' if before_paragraph else '')
322+
return '\n\n' + text + ('\n' if before_paragraph else '')
317323

318324
convert_ul = convert_list
319325
convert_ol = convert_list
@@ -344,7 +350,7 @@ def convert_p(self, el, text, convert_as_inline):
344350
width=self.options['wrap_width'],
345351
break_long_words=False,
346352
break_on_hyphens=False)
347-
return '%s\n\n' % text if text else ''
353+
return '\n\n%s\n\n' % text if text else ''
348354

349355
def convert_pre(self, el, text, convert_as_inline):
350356
if not text:

tests/test_advanced.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def test_chomp():
1414

1515
def test_nested():
1616
text = md('<p>This is an <a href="http://example.com/">example link</a>.</p>')
17-
assert text == 'This is an [example link](http://example.com/).\n\n'
17+
assert text == '\n\nThis is an [example link](http://example.com/).\n\n'
1818

1919

2020
def test_ignore_comments():

tests/test_conversions.py

Lines changed: 34 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -112,36 +112,38 @@ def test_em():
112112

113113

114114
def test_header_with_space():
115-
assert md('<h3>\n\nHello</h3>') == '### Hello\n\n'
116-
assert md('<h4>\n\nHello</h4>') == '#### Hello\n\n'
117-
assert md('<h5>\n\nHello</h5>') == '##### Hello\n\n'
118-
assert md('<h5>\n\nHello\n\n</h5>') == '##### Hello\n\n'
119-
assert md('<h5>\n\nHello \n\n</h5>') == '##### Hello\n\n'
115+
assert md('<h3>\n\nHello</h3>') == '\n### Hello\n\n'
116+
assert md('<h4>\n\nHello</h4>') == '\n#### Hello\n\n'
117+
assert md('<h5>\n\nHello</h5>') == '\n##### Hello\n\n'
118+
assert md('<h5>\n\nHello\n\n</h5>') == '\n##### Hello\n\n'
119+
assert md('<h5>\n\nHello \n\n</h5>') == '\n##### Hello\n\n'
120120

121121

122122
def test_h1():
123-
assert md('<h1>Hello</h1>') == 'Hello\n=====\n\n'
123+
assert md('<h1>Hello</h1>') == '\n\nHello\n=====\n\n'
124124

125125

126126
def test_h2():
127-
assert md('<h2>Hello</h2>') == 'Hello\n-----\n\n'
127+
assert md('<h2>Hello</h2>') == '\n\nHello\n-----\n\n'
128128

129129

130130
def test_hn():
131-
assert md('<h3>Hello</h3>') == '### Hello\n\n'
132-
assert md('<h4>Hello</h4>') == '#### Hello\n\n'
133-
assert md('<h5>Hello</h5>') == '##### Hello\n\n'
134-
assert md('<h6>Hello</h6>') == '###### Hello\n\n'
131+
assert md('<h3>Hello</h3>') == '\n### Hello\n\n'
132+
assert md('<h4>Hello</h4>') == '\n#### Hello\n\n'
133+
assert md('<h5>Hello</h5>') == '\n##### Hello\n\n'
134+
assert md('<h6>Hello</h6>') == '\n###### Hello\n\n'
135135

136136

137137
def test_hn_chained():
138-
assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '# First\n\n\n## Second\n\n\n### Third\n\n'
139-
assert md('X<h1>First</h1>', heading_style=ATX) == 'X# First\n\n'
138+
assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '\n# First\n\n\n## Second\n\n\n### Third\n\n'
139+
assert md('X<h1>First</h1>', heading_style=ATX) == 'X\n# First\n\n'
140+
assert md('X<h1>First</h1>', heading_style=ATX_CLOSED) == 'X\n# First #\n\n'
141+
assert md('X<h1>First</h1>') == 'X\n\nFirst\n=====\n\n'
140142

141143

142144
def test_hn_nested_tag_heading_style():
143-
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '# A P C #\n\n'
144-
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '# A P C\n\n'
145+
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '\n# A P C #\n\n'
146+
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '\n# A P C\n\n'
145147

146148

147149
def test_hn_nested_simple_tag():
@@ -157,12 +159,12 @@ def test_hn_nested_simple_tag():
157159
]
158160

159161
for tag, markdown in tag_to_markdown:
160-
assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '### A ' + markdown + ' B\n\n'
162+
assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '\n### A ' + markdown + ' B\n\n'
161163

162-
assert md('<h3>A <br>B</h3>', heading_style=ATX) == '### A B\n\n'
164+
assert md('<h3>A <br>B</h3>', heading_style=ATX) == '\n### A B\n\n'
163165

164166
# Nested lists not supported
165-
# assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '### A li1 li2 B\n\n'
167+
# assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '\n### A li1 li2 B\n\n'
166168

167169

168170
def test_hn_nested_img():
@@ -172,18 +174,18 @@ def test_hn_nested_img():
172174
("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""),
173175
]
174176
for image_attributes, markdown, title in image_attributes_to_markdown:
175-
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '### A ' + markdown + ' B\n\n'
176-
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'
177+
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '\n### A ' + markdown + ' B\n\n'
178+
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'
177179

178180

179181
def test_hn_atx_headings():
180-
assert md('<h1>Hello</h1>', heading_style=ATX) == '# Hello\n\n'
181-
assert md('<h2>Hello</h2>', heading_style=ATX) == '## Hello\n\n'
182+
assert md('<h1>Hello</h1>', heading_style=ATX) == '\n# Hello\n\n'
183+
assert md('<h2>Hello</h2>', heading_style=ATX) == '\n## Hello\n\n'
182184

183185

184186
def test_hn_atx_closed_headings():
185-
assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '# Hello #\n\n'
186-
assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '## Hello ##\n\n'
187+
assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '\n# Hello #\n\n'
188+
assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '\n## Hello ##\n\n'
187189

188190

189191
def test_head():
@@ -193,7 +195,7 @@ def test_head():
193195
def test_hr():
194196
assert md('Hello<hr>World') == 'Hello\n\n---\n\nWorld'
195197
assert md('Hello<hr />World') == 'Hello\n\n---\n\nWorld'
196-
assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n'
198+
assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == '\n\nHello\n\n\n---\n\n\nWorld\n\n'
197199

198200

199201
def test_i():
@@ -210,12 +212,13 @@ def test_kbd():
210212

211213

212214
def test_p():
213-
assert md('<p>hello</p>') == 'hello\n\n'
214-
assert md('<p>123456789 123456789</p>') == '123456789 123456789\n\n'
215-
assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '123456789\n123456789\n\n'
216-
assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '[Some long\nlink](https://example.com)\n\n'
217-
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345\\\n67890\n\n'
218-
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345678901\\\n12345\n\n'
215+
assert md('<p>hello</p>') == '\n\nhello\n\n'
216+
assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
217+
assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n'
218+
assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n'
219+
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
220+
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
221+
assert md('First<p>Second</p><p>Third</p>Fourth') == 'First\n\nSecond\n\nThird\n\nFourth'
219222

220223

221224
def test_pre():

tests/test_lists.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,41 +41,43 @@
4141

4242

4343
def test_ol():
44-
assert md('<ol><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
45-
assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '3. a\n4. b\n'
44+
assert md('<ol><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
45+
assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '\n\n3. a\n4. b\n'
46+
assert md('foo<ol start="3"><li>a</li><li>b</li></ol>bar') == 'foo\n\n3. a\n4. b\n\nbar'
4647

4748

4849
def test_nested_ols():
49-
assert md(nested_ols) == '\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n'
50+
assert md(nested_ols) == '\n\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n'
5051

5152

5253
def test_ul():
53-
assert md('<ul><li>a</li><li>b</li></ul>') == '* a\n* b\n'
54+
assert md('<ul><li>a</li><li>b</li></ul>') == '\n\n* a\n* b\n'
5455
assert md("""<ul>
5556
<li>
5657
a
5758
</li>
5859
<li> b </li>
5960
<li> c
6061
</li>
61-
</ul>""") == '* a\n* b\n* c\n'
62+
</ul>""") == '\n\n* a\n* b\n* c\n'
6263

6364

6465
def test_inline_ul():
65-
assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == 'foo\n\n* a\n* b\n\nbar\n\n'
66+
assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == '\n\nfoo\n\n* a\n* b\n\nbar\n\n'
67+
assert md('foo<ul><li>bar</li></ul>baz') == 'foo\n\n* bar\n\nbaz'
6668

6769

6870
def test_nested_uls():
6971
"""
7072
Nested ULs should alternate bullet characters.
7173
7274
"""
73-
assert md(nested_uls) == '\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n'
75+
assert md(nested_uls) == '\n\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n'
7476

7577

7678
def test_bullets():
77-
assert md(nested_uls, bullets='-') == '\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n'
79+
assert md(nested_uls, bullets='-') == '\n\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n'
7880

7981

8082
def test_li_text():
81-
assert md('<ul><li>foo <a href="#">bar</a></li><li>foo bar </li><li>foo <b>bar</b> <i>space</i>.</ul>') == '* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n'
83+
assert md('<ul><li>foo <a href="#">bar</a></li><li>foo bar </li><li>foo <b>bar</b> <i>space</i>.</ul>') == '\n\n* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n'

0 commit comments

Comments
 (0)