From 60d86663d7f3cb5113e55ecede817f1a306420a2 Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Tue, 9 Apr 2024 16:47:15 +0000 Subject: [PATCH 1/2] More carefully separate inline text from block content There are various cases in which inline text fails to be separated by (sufficiently many) newlines from adjacent block content. A paragraph needs a blank line (two newlines) separating it from prior text, as does an underlined header; an ATX header needs a single newline separating it from prior text. A list needs at least one newline separating it from prior text, but in general two newlines (for an ordered list starting other than at 1, which will only be recognized given a blank line before). To avoid accumulation of more newlines than necessary, take care when concatenating the results of converting consecutive tags to remove redundant newlines (keeping the greater of the number ending the prior text and the number starting the subsequent text). This is thus an alternative to #108 that tries to avoid the excess newline accumulation that was a concern there, as well as fixing more cases than just paragraphs, and updating tests. 
Fixes #92 Fixes #98 --- markdownify/__init__.py | 18 +++++++---- tests/test_advanced.py | 2 +- tests/test_conversions.py | 65 ++++++++++++++++++++------------------- tests/test_lists.py | 20 ++++++------ 4 files changed, 58 insertions(+), 47 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index eaa6ded..d0da098 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -143,7 +143,13 @@ def is_nested_node(el): elif isinstance(el, NavigableString): text += self.process_text(el) else: - text += self.process_tag(el, convert_children_as_inline) + text_strip = text.rstrip('\n') + newlines_left = len(text) - len(text_strip) + next_text = self.process_tag(el, convert_children_as_inline) + next_text_strip = next_text.lstrip('\n') + newlines_right = len(next_text) - len(next_text_strip) + newlines = '\n' * max(newlines_left, newlines_right) + text = text_strip + newlines + next_text_strip if not children_only: convert_fn = getattr(self, 'convert_%s' % node.name, None) @@ -216,7 +222,7 @@ def indent(self, text, level): def underline(self, text, pad_char): text = (text or '').rstrip() - return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else '' + return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else '' def convert_a(self, el, text, convert_as_inline): prefix, suffix, text = chomp(text) @@ -277,8 +283,8 @@ def convert_hn(self, n, el, text, convert_as_inline): return self.underline(text, line) hashes = '#' * n if style == ATX_CLOSED: - return '%s %s %s\n\n' % (hashes, text, hashes) - return '%s %s\n\n' % (hashes, text) + return '\n%s %s %s\n\n' % (hashes, text, hashes) + return '\n%s %s\n\n' % (hashes, text) def convert_hr(self, el, text, convert_as_inline): return '\n\n---\n\n' @@ -313,7 +319,7 @@ def convert_list(self, el, text, convert_as_inline): if nested: # remove trailing newline if nested return '\n' + self.indent(text, 1).rstrip() - return text + ('\n' if before_paragraph else '') + return '\n\n' + text + 
('\n' if before_paragraph else '') convert_ul = convert_list convert_ol = convert_list @@ -344,7 +350,7 @@ def convert_p(self, el, text, convert_as_inline): width=self.options['wrap_width'], break_long_words=False, break_on_hyphens=False) - return '%s\n\n' % text if text else '' + return '\n\n%s\n\n' % text if text else '' def convert_pre(self, el, text, convert_as_inline): if not text: diff --git a/tests/test_advanced.py b/tests/test_advanced.py index 14bf3cd..a3a5fda 100644 --- a/tests/test_advanced.py +++ b/tests/test_advanced.py @@ -14,7 +14,7 @@ def test_chomp(): def test_nested(): text = md('

This is an example link.

') - assert text == 'This is an [example link](http://example.com/).\n\n' + assert text == '\n\nThis is an [example link](http://example.com/).\n\n' def test_ignore_comments(): diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 9652143..e2c172a 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -112,36 +112,38 @@ def test_em(): def test_header_with_space(): - assert md('

\n\nHello

') == '### Hello\n\n' - assert md('

\n\nHello

') == '#### Hello\n\n' - assert md('
\n\nHello
') == '##### Hello\n\n' - assert md('
\n\nHello\n\n
') == '##### Hello\n\n' - assert md('
\n\nHello \n\n
') == '##### Hello\n\n' + assert md('

\n\nHello

') == '\n### Hello\n\n' + assert md('

\n\nHello

') == '\n#### Hello\n\n' + assert md('
\n\nHello
') == '\n##### Hello\n\n' + assert md('
\n\nHello\n\n
') == '\n##### Hello\n\n' + assert md('
\n\nHello \n\n
') == '\n##### Hello\n\n' def test_h1(): - assert md('

Hello

') == 'Hello\n=====\n\n' + assert md('

Hello

') == '\n\nHello\n=====\n\n' def test_h2(): - assert md('

Hello

') == 'Hello\n-----\n\n' + assert md('

Hello

') == '\n\nHello\n-----\n\n' def test_hn(): - assert md('

Hello

') == '### Hello\n\n' - assert md('

Hello

') == '#### Hello\n\n' - assert md('
Hello
') == '##### Hello\n\n' - assert md('
Hello
') == '###### Hello\n\n' + assert md('

Hello

') == '\n### Hello\n\n' + assert md('

Hello

') == '\n#### Hello\n\n' + assert md('
Hello
') == '\n##### Hello\n\n' + assert md('
Hello
') == '\n###### Hello\n\n' def test_hn_chained(): - assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '# First\n\n\n## Second\n\n\n### Third\n\n' - assert md('X

First

', heading_style=ATX) == 'X# First\n\n' + assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '\n# First\n\n\n## Second\n\n\n### Third\n\n' + assert md('X

First

', heading_style=ATX) == 'X\n# First\n\n' + assert md('X

First

', heading_style=ATX_CLOSED) == 'X\n# First #\n\n' + assert md('X

First

') == 'X\n\nFirst\n=====\n\n' def test_hn_nested_tag_heading_style(): - assert md('

A

P

C

', heading_style=ATX_CLOSED) == '# A P C #\n\n' - assert md('

A

P

C

', heading_style=ATX) == '# A P C\n\n' + assert md('

A

P

C

', heading_style=ATX_CLOSED) == '\n# A P C #\n\n' + assert md('

A

P

C

', heading_style=ATX) == '\n# A P C\n\n' def test_hn_nested_simple_tag(): @@ -157,12 +159,12 @@ def test_hn_nested_simple_tag(): ] for tag, markdown in tag_to_markdown: - assert md('

A <' + tag + '>' + tag + ' B

') == '### A ' + markdown + ' B\n\n' + assert md('

A <' + tag + '>' + tag + ' B

') == '\n### A ' + markdown + ' B\n\n' - assert md('

A
B

', heading_style=ATX) == '### A B\n\n' + assert md('

A
B

', heading_style=ATX) == '\n### A B\n\n' # Nested lists not supported - # assert md('

A

', heading_style=ATX) == '### A li1 li2 B\n\n' + # assert md('

A

', heading_style=ATX) == '\n### A li1 li2 B\n\n' def test_hn_nested_img(): @@ -172,18 +174,18 @@ def test_hn_nested_img(): ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""), ] for image_attributes, markdown, title in image_attributes_to_markdown: - assert md('

A B

') == '### A ' + markdown + ' B\n\n' - assert md('

A B

', keep_inline_images_in=['h3']) == '### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n' + assert md('

A B

') == '\n### A ' + markdown + ' B\n\n' + assert md('

A B

', keep_inline_images_in=['h3']) == '\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n' def test_hn_atx_headings(): - assert md('

Hello

', heading_style=ATX) == '# Hello\n\n' - assert md('

Hello

', heading_style=ATX) == '## Hello\n\n' + assert md('

Hello

', heading_style=ATX) == '\n# Hello\n\n' + assert md('

Hello

', heading_style=ATX) == '\n## Hello\n\n' def test_hn_atx_closed_headings(): - assert md('

Hello

', heading_style=ATX_CLOSED) == '# Hello #\n\n' - assert md('

Hello

', heading_style=ATX_CLOSED) == '## Hello ##\n\n' + assert md('

Hello

', heading_style=ATX_CLOSED) == '\n# Hello #\n\n' + assert md('

Hello

', heading_style=ATX_CLOSED) == '\n## Hello ##\n\n' def test_head(): @@ -193,7 +195,7 @@ def test_head(): def test_hr(): assert md('Hello
World') == 'Hello\n\n---\n\nWorld' assert md('Hello
World') == 'Hello\n\n---\n\nWorld' - assert md('

Hello

\n
\n

World

') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n' + assert md('

Hello

\n
\n

World

') == '\n\nHello\n\n\n---\n\n\nWorld\n\n' def test_i(): @@ -210,12 +212,13 @@ def test_kbd(): def test_p(): - assert md('

hello

') == 'hello\n\n' - assert md('

123456789 123456789

') == '123456789 123456789\n\n' - assert md('

123456789 123456789

', wrap=True, wrap_width=10) == '123456789\n123456789\n\n' - assert md('

Some long link

', wrap=True, wrap_width=10) == '[Some long\nlink](https://example.com)\n\n' - assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345\\\n67890\n\n' - assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345678901\\\n12345\n\n' + assert md('

hello

') == '\n\nhello\n\n' + assert md('

123456789 123456789

') == '\n\n123456789 123456789\n\n' + assert md('

123456789 123456789

', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n' + assert md('

Some long link

', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' + assert md('First

Second

Third

Fourth') == 'First\n\nSecond\n\nThird\n\nFourth' def test_pre(): diff --git a/tests/test_lists.py b/tests/test_lists.py index 5a04430..0b23179 100644 --- a/tests/test_lists.py +++ b/tests/test_lists.py @@ -41,16 +41,17 @@ def test_ol(): - assert md('
  1. a
  2. b
') == '1. a\n2. b\n' - assert md('
  1. a
  2. b
') == '3. a\n4. b\n' + assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' + assert md('
  1. a
  2. b
') == '\n\n3. a\n4. b\n' + assert md('foo
  1. a
  2. b
bar') == 'foo\n\n3. a\n4. b\n\nbar' def test_nested_ols(): - assert md(nested_ols) == '\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n' + assert md(nested_ols) == '\n\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n' def test_ul(): - assert md('') == '* a\n* b\n' + assert md('') == '\n\n* a\n* b\n' assert md("""""") == '* a\n* b\n* c\n' + """) == '\n\n* a\n* b\n* c\n' def test_inline_ul(): - assert md('

foo

bar

') == 'foo\n\n* a\n* b\n\nbar\n\n' + assert md('

foo

bar

') == '\n\nfoo\n\n* a\n* b\n\nbar\n\n' + assert md('foobaz') == 'foo\n\n* bar\n\nbaz' def test_nested_uls(): @@ -70,12 +72,12 @@ def test_nested_uls(): Nested ULs should alternate bullet characters. """ - assert md(nested_uls) == '\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n' + assert md(nested_uls) == '\n\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n' def test_bullets(): - assert md(nested_uls, bullets='-') == '\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n' + assert md(nested_uls, bullets='-') == '\n\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n' def test_li_text(): - assert md('') == '* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n' + assert md('') == '\n\n* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n' From c2ffe46e858b3d98ed73f6c7107c88e186e3d43d Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Thu, 3 Oct 2024 00:30:50 +0000 Subject: [PATCH 2/2] Fix whitespace issues around wrapping This fixes various issues relating to how input whitespace is handled and how wrapping handles whitespace resulting from hard line breaks. This PR uses a branch based on that for #120 to avoid conflicts with the fixes and associated test changes there. My suggestion is thus first to merge #120 (which fixes two open issues), then to merge the remaining changes from this PR. Wrapping paragraphs has the effect of losing all newlines including those from `
<br>` tags, contrary to HTML semantics (wrapping should be a matter of pretty-printing the output; input whitespace from the HTML input should be normalized, but `<br>
` should remain as a hard line break). To fix this, we need to wrap the portions of a paragraph between hard line breaks separately. For this to work, ensure that when wrapping, all input whitespace is normalized at an early stage, including turning newlines into spaces. (Only ASCII whitespace is handled this way; `\s` is not used as it's not clear Unicode whitespace should get such normalization.) When not wrapping, there is still too much input whitespace preservation. If the input contains a blank line, that ends up as a paragraph break in the output, or breaks the header formatting when appearing in a header tag, though in terms of HTML semantics such a blank line is no different from a space. In the case of an ATX header, even a single newline appearing in the output breaks the Markdown. Thus, when not wrapping, arrange for input whitespace containing at least one `\r` or `\n` to be normalized to a single newline, and in the ATX header case, normalize to a space. Fixes #130 (probably, not sure exactly what the HTML input there is) Fixes #88 (a related case, anyway; the actual input in #88 has already been fixed) --- markdownify/__init__.py | 29 +++++++++++++++++++++++------ tests/test_basic.py | 1 + tests/test_conversions.py | 15 +++++++++++++-- tests/test_tables.py | 2 +- 4 files changed, 38 insertions(+), 9 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index efb2d15..a37f870 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -7,7 +7,8 @@ convert_heading_re = re.compile(r'convert_h(\d+)') line_beginning_re = re.compile(r'^', re.MULTILINE) whitespace_re = re.compile(r'[\t ]+') -all_whitespace_re = re.compile(r'[\s]+') +all_whitespace_re = re.compile(r'[\t \r\n]+') +newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') html_heading_re = re.compile(r'h[1-6]') @@ -168,7 +169,11 @@ def process_text(self, el): # normalize whitespace if we're not inside a preformatted element if not el.find_parent('pre'): 
- text = whitespace_re.sub(' ', text) + if self.options['wrap']: + text = all_whitespace_re.sub(' ', text) + else: + text = newline_whitespace_re.sub('\n', text) + text = whitespace_re.sub(' ', text) # escape special characters if we're not inside a preformatted or code element if not el.find_parent(['pre', 'code', 'kbd', 'samp']): @@ -286,6 +291,7 @@ def convert_hn(self, n, el, text, convert_as_inline): if style == UNDERLINED and n <= 2: line = '=' if n == 1 else '-' return self.underline(text, line) + text = all_whitespace_re.sub(' ', text) hashes = '#' * n if style == ATX_CLOSED: return '\n%s %s %s\n\n' % (hashes, text, hashes) @@ -351,10 +357,21 @@ def convert_p(self, el, text, convert_as_inline): if convert_as_inline: return text if self.options['wrap']: - text = fill(text, - width=self.options['wrap_width'], - break_long_words=False, - break_on_hyphens=False) + # Preserve newlines (and preceding whitespace) resulting + # from
tags. Newlines in the input have already been + # replaced by spaces. + lines = text.split('\n') + new_lines = [] + for line in lines: + line = line.lstrip() + line_no_trailing = line.rstrip() + trailing = line[len(line_no_trailing):] + line = fill(line, + width=self.options['wrap_width'], + break_long_words=False, + break_on_hyphens=False) + new_lines.append(line + trailing) + text = '\n'.join(new_lines) return '\n\n%s\n\n' % text if text else '' def convert_pre(self, el, text, convert_as_inline): diff --git a/tests/test_basic.py b/tests/test_basic.py index bf25ee0..66f8b6c 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -11,3 +11,4 @@ def test_soup(): def test_whitespace(): assert md(' a b \t\t c ') == ' a b c ' + assert md(' a b \n\n c ') == ' a b\nc ' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index baa294b..9c1edc3 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE +from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE def inline_tests(tag, markup): @@ -113,6 +113,7 @@ def test_em(): def test_header_with_space(): assert md('

\n\nHello

') == '\n### Hello\n\n' + assert md('

Hello\n\n\nWorld

') == '\n### Hello World\n\n' assert md('

\n\nHello

') == '\n#### Hello\n\n' assert md('
\n\nHello
') == '\n##### Hello\n\n' assert md('
\n\nHello\n\n
') == '\n##### Hello\n\n' @@ -174,7 +175,7 @@ def test_hn_nested_img(): ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""), ] for image_attributes, markdown, title in image_attributes_to_markdown: - assert md('

A B

') == '\n### A ' + markdown + ' B\n\n' + assert md('

A B

') == '\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n' assert md('

A B

', keep_inline_images_in=['h3']) == '\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n' @@ -214,10 +215,20 @@ def test_kbd(): def test_p(): assert md('

hello

') == '\n\nhello\n\n' assert md('

123456789 123456789

') == '\n\n123456789 123456789\n\n' + assert md('

123456789\n\n\n123456789

') == '\n\n123456789\n123456789\n\n' + assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n' assert md('

123456789 123456789

', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n' assert md('

Some long link

', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n' assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345 \n67890\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345 \n67890\n\n' assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n' + assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n' + assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012 \n67890\n\n' assert md('First

Second

Third

Fourth') == 'First\n\nSecond\n\nThird\n\nFourth' diff --git a/tests/test_tables.py b/tests/test_tables.py index 594e5bf..fc6eee6 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -242,7 +242,7 @@ def test_table(): assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_paragraphs) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' - assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' + assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'