Skip to content

Commit ae0597d

Browse files
authored
remove superfluous leading/trailing whitespace (#181)
1 parent dbb5988 commit ae0597d

11 files changed

+67
-19
lines changed

Diff for: README.rst

+7
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,13 @@ wrap, wrap_width
150150
Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs.
151151
A ``wrap_width`` value of ``None`` reflows lines to unlimited line length.
152152

153+
strip_document
154+
Controls whether leading and/or trailing separation newlines are removed from
155+
the final converted document. Supported values are ``LSTRIP`` (leading),
156+
``RSTRIP`` (trailing), ``STRIP`` (both), and ``None`` (neither). Newlines
157+
within the document are unaffected.
158+
Defaults to ``STRIP``.
159+
153160
Options may be specified as kwargs to the ``markdownify`` function, or as a
154161
nested ``Options`` class in ``MarkdownConverter`` subclasses.
155162

Diff for: markdownify/__init__.py

+22-4
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@
2626
ASTERISK = '*'
2727
UNDERSCORE = '_'
2828

29+
# Document strip styles
30+
LSTRIP = 'lstrip'
31+
RSTRIP = 'rstrip'
32+
STRIP = 'strip'
33+
2934

3035
def chomp(text):
3136
"""
@@ -99,6 +104,7 @@ class DefaultOptions:
99104
keep_inline_images_in = []
100105
newline_style = SPACES
101106
strip = None
107+
strip_document = STRIP
102108
strong_em_symbol = ASTERISK
103109
sub_symbol = ''
104110
sup_symbol = ''
@@ -180,7 +186,18 @@ def process_tag(self, node, convert_as_inline):
180186
return text
181187

182188
def convert__document_(self, el, text, convert_as_inline):
183-
# for BeautifulSoup objects (where node.name == "[document]"), return content results as-is
189+
"""Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
190+
if self.options['strip_document'] == LSTRIP:
191+
text = text.lstrip('\n') # remove leading separation newlines
192+
elif self.options['strip_document'] == RSTRIP:
193+
text = text.rstrip('\n') # remove trailing separation newlines
194+
elif self.options['strip_document'] == STRIP:
195+
text = text.strip('\n') # remove leading and trailing separation newlines
196+
elif self.options['strip_document'] is None:
197+
pass # leave leading and trailing separation newlines as-is
198+
else:
199+
raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])
200+
184201
return text
185202

186203
def process_text(self, el):
@@ -454,6 +471,7 @@ def _indent_for_li(match):
454471
def convert_p(self, el, text, convert_as_inline):
455472
if convert_as_inline:
456473
return ' ' + text.strip() + ' '
474+
text = text.strip()
457475
if self.options['wrap']:
458476
# Preserve newlines (and preceding whitespace) resulting
459477
# from <br> tags. Newlines in the input have already been
@@ -500,13 +518,13 @@ def convert_style(self, el, text, convert_as_inline):
500518
convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
501519

502520
def convert_table(self, el, text, convert_as_inline):
503-
return '\n\n' + text + '\n'
521+
return '\n\n' + text.strip() + '\n\n'
504522

505523
def convert_caption(self, el, text, convert_as_inline):
506-
return text + '\n\n'
524+
return text.strip() + '\n\n'
507525

508526
def convert_figcaption(self, el, text, convert_as_inline):
509-
return '\n\n' + text + '\n\n'
527+
return '\n\n' + text.strip() + '\n\n'
510528

511529
def convert_td(self, el, text, convert_as_inline):
512530
colspan = 1

Diff for: tests/test_advanced.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from markdownify import markdownify as md
1+
from .utils import md
22

33

44
def test_chomp():

Diff for: tests/test_args.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
Test whitelisting/blacklisting of specific tags.
33
44
"""
5-
from markdownify import markdownify as md
5+
from markdownify import markdownify, LSTRIP, RSTRIP, STRIP
6+
from .utils import md
67

78

89
def test_strip():
@@ -23,3 +24,11 @@ def test_convert():
2324
def test_do_not_convert():
2425
text = md('<a href="https://github.com/matthewwithanm">Some Text</a>', convert=[])
2526
assert text == 'Some Text'
27+
28+
29+
def test_strip_document():
30+
assert markdownify("<p>Hello</p>") == "Hello" # test default of STRIP
31+
assert markdownify("<p>Hello</p>", strip_document=LSTRIP) == "Hello\n\n"
32+
assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
33+
assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
34+
assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"

Diff for: tests/test_basic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from markdownify import markdownify as md
1+
from .utils import md
22

33

44
def test_single_tag():

Diff for: tests/test_conversions.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
1+
from markdownify import ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
2+
from .utils import md
23

34

45
def inline_tests(tag, markup):
@@ -79,11 +80,6 @@ def test_br():
7980
assert md('a<br />b<br />c', newline_style=BACKSLASH) == 'a\\\nb\\\nc'
8081

8182

82-
def test_caption():
83-
assert md('TEXT<figure><figcaption>Caption</figcaption><span>SPAN</span></figure>') == 'TEXT\n\nCaption\n\nSPAN'
84-
assert md('<figure><span>SPAN</span><figcaption>Caption</figcaption></figure>TEXT') == 'SPAN\n\nCaption\n\nTEXT'
85-
86-
8783
def test_code():
8884
inline_tests('code', '`')
8985
assert md('<code>*this_should_not_escape*</code>') == '`*this_should_not_escape*`'
@@ -126,6 +122,11 @@ def test_em():
126122
inline_tests('em', '*')
127123

128124

125+
def test_figcaption():
126+
assert (md("TEXT<figure><figcaption>\nCaption\n</figcaption><span>SPAN</span></figure>") == "TEXT\n\nCaption\n\nSPAN")
127+
assert (md("<figure><span>SPAN</span><figcaption>\nCaption\n</figcaption></figure>TEXT") == "SPAN\n\nCaption\n\nTEXT")
128+
129+
129130
def test_header_with_space():
130131
assert md('<h3>\n\nHello</h3>') == '\n\n### Hello\n\n'
131132
assert md('<h3>Hello\n\n\nWorld</h3>') == '\n\n### Hello World\n\n'
@@ -236,6 +237,7 @@ def test_kbd():
236237

237238
def test_p():
238239
assert md('<p>hello</p>') == '\n\nhello\n\n'
240+
assert md("<p><p>hello</p></p>") == "\n\nhello\n\n"
239241
assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
240242
assert md('<p>123456789\n\n\n123456789</p>') == '\n\n123456789\n123456789\n\n'
241243
assert md('<p>123456789\n\n\n123456789</p>', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n'

Diff for: tests/test_custom_converter.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ def test_custom_conversion_functions():
2020
def md(html, **options):
2121
return UnitTestConverter(**options).convert(html)
2222

23-
assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />') == '![Alt text](/path/to/img.jpg "Optional title")\n\n'
24-
assert md('<img src="/path/to/img.jpg" alt="Alt text" />') == '![Alt text](/path/to/img.jpg)\n\n'
23+
assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />text') == '![Alt text](/path/to/img.jpg "Optional title")\n\ntext'
24+
assert md('<img src="/path/to/img.jpg" alt="Alt text" />text') == '![Alt text](/path/to/img.jpg)\n\ntext'
2525

2626
assert md("<custom-tag>text</custom-tag>") == "FUNCTION USED: text"
2727

Diff for: tests/test_escaping.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import warnings
22
from bs4 import MarkupResemblesLocatorWarning
3-
from markdownify import markdownify as md
3+
from .utils import md
44

55

66
def test_asterisks():

Diff for: tests/test_lists.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from markdownify import markdownify as md
1+
from .utils import md
22

33

44
nested_uls = """

Diff for: tests/test_tables.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from markdownify import markdownify as md
1+
from .utils import md
22

33

44
table = """<table>
@@ -228,7 +228,10 @@
228228
</tbody>
229229
</table>"""
230230

231-
table_with_caption = """TEXT<table><caption>Caption</caption>
231+
table_with_caption = """TEXT<table>
232+
<caption>
233+
Caption
234+
</caption>
232235
<tbody><tr><td>Firstname</td>
233236
<td>Lastname</td>
234237
<td>Age</td>

Diff for: tests/utils.py

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from markdownify import MarkdownConverter
2+
3+
4+
# for unit testing, disable document-level stripping by default so that
5+
# separation newlines are included in testing
6+
def md(html, **options):
7+
options = {"strip_document": None, **options}
8+
9+
return MarkdownConverter(**options).convert(html)

0 commit comments

Comments
 (0)