fix: blockquote code continuation (#3264)

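BREAKING CHANGE: a separate `space` token is now emitted after a blockquote or hr that is followed by multiple newlines, because the trailing newlines are no longer part of the blockquote/hr token's `raw`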
markedjs · Jun 12, 2024 · 7ab8185
1 parent 0e40783
commit 7ab8185
Show file tree

Hide file tree

Showing 5 changed files with 86 additions and 24 deletions.
diff --git a/src/Lexer.ts b/src/Lexer.ts
@@ -101,9 +101,9 @@ export class _Lexer {
   /**
    * Lexing
    */
-  blockTokens(src: string, tokens?: Token[]): Token[];
-  blockTokens(src: string, tokens?: TokensList): TokensList;
-  blockTokens(src: string, tokens: Token[] = []) {
+  blockTokens(src: string, tokens?: Token[], lastParagraphClipped?: boolean): Token[];
+  blockTokens(src: string, tokens?: TokensList, lastParagraphClipped?: boolean): TokensList;
+  blockTokens(src: string, tokens: Token[] = [], lastParagraphClipped = false) {
     if (this.options.pedantic) {
       src = src.replace(/\t/g, '    ').replace(/^ +$/gm, '');
     } else {
@@ -115,7 +115,6 @@ export class _Lexer {
     let token: Tokens.Generic | undefined;
     let lastToken;
     let cutSrc;
-    let lastParagraphClipped;
 
     while (src) {
       if (this.options.extensions
@@ -249,7 +248,7 @@ export class _Lexer {
       }
       if (this.state.top && (token = this.tokenizer.paragraph(cutSrc))) {
         lastToken = tokens[tokens.length - 1];
-        if (lastParagraphClipped && lastToken.type === 'paragraph') {
+        if (lastParagraphClipped && lastToken?.type === 'paragraph') {
           lastToken.raw += '\n' + token.raw;
           lastToken.text += '\n' + token.text;
           this.inlineQueue.pop();

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
@@ -7,7 +7,7 @@ import {
 } from './helpers.ts';
 import type { Rules } from './rules.ts';
 import type { _Lexer } from './Lexer.ts';
-import type { Links, Tokens } from './Tokens.ts';
+import type { Links, Tokens, Token } from './Tokens.ts';
 import type { MarkedOptions } from './MarkedOptions.ts';
 
 function outputLink(cap: string[], link: Pick<Tokens.Link, 'href' | 'title'>, raw: string, lexer: _Lexer): Tokens.Link | Tokens.Image {
@@ -148,24 +148,89 @@ export class _Tokenizer {
     if (cap) {
       return {
         type: 'hr',
-        raw: cap[0]
+        raw: rtrim(cap[0], '\n')
       };
     }
   }
 
   blockquote(src: string): Tokens.Blockquote | undefined {
     const cap = this.rules.block.blockquote.exec(src);
     if (cap) {
-      // precede setext continuation with 4 spaces so it isn't a setext
-      let text = cap[0].replace(/\n {0,3}((?:=+|-+) *)(?=\n|$)/g, '\n    $1');
-      text = rtrim(text.replace(/^ *>[ \t]?/gm, ''), '\n');
-      const top = this.lexer.state.top;
-      this.lexer.state.top = true;
-      const tokens = this.lexer.blockTokens(text);
-      this.lexer.state.top = top;
+      let lines = rtrim(cap[0], '\n').split('\n');
+      let raw = '';
+      let text = '';
+      const tokens: Token[] = [];
+
+      while (lines.length > 0) {
+        let inBlockquote = false;
+        const currentLines = [];
+
+        let i;
+        for (i = 0; i < lines.length; i++) {
+          // get lines up to a continuation
+          if (/^ {0,3}>/.test(lines[i])) {
+            currentLines.push(lines[i]);
+            inBlockquote = true;
+          } else if (!inBlockquote) {
+            currentLines.push(lines[i]);
+          } else {
+            break;
+          }
+        }
+        lines = lines.slice(i);
+
+        const currentRaw = currentLines.join('\n');
+        const currentText = currentRaw
+          // precede setext continuation with 4 spaces so it isn't a setext
+          .replace(/\n {0,3}((?:=+|-+) *)(?=\n|$)/g, '\n    $1')
+          .replace(/^ {0,3}>[ \t]?/gm, '');
+        raw = raw ? `${raw}\n${currentRaw}` : currentRaw;
+        text = text ? `${text}\n${currentText}` : currentText;
+
+        // parse blockquote lines as top level tokens
+        // merge paragraphs if this is a continuation
+        const top = this.lexer.state.top;
+        this.lexer.state.top = true;
+        this.lexer.blockTokens(currentText, tokens, true);
+        this.lexer.state.top = top;
+
+        // if there is no continuation then we are done
+        if (lines.length === 0) {
+          break;
+        }
+
+        const lastToken = tokens[tokens.length - 1];
+
+        if (lastToken?.type === 'code') {
+          // blockquote continuation cannot be preceded by a code block
+          break;
+        } else if (lastToken?.type === 'blockquote') {
+          // include continuation in nested blockquote
+          const oldToken = lastToken as Tokens.Blockquote;
+          const newText = oldToken.raw + '\n' + lines.join('\n');
+          const newToken = this.blockquote(newText)!;
+          tokens[tokens.length - 1] = newToken;
+
+          raw = raw.substring(0, raw.length - oldToken.raw.length) + newToken.raw;
+          text = text.substring(0, text.length - oldToken.text.length) + newToken.text;
+          break;
+        } else if (lastToken?.type === 'list') {
+          // include continuation in nested list
+          const oldToken = lastToken as Tokens.List;
+          const newText = oldToken.raw + '\n' + lines.join('\n');
+          const newToken = this.list(newText)!;
+          tokens[tokens.length - 1] = newToken;
+
+          raw = raw.substring(0, raw.length - lastToken.raw.length) + newToken.raw;
+          text = text.substring(0, text.length - oldToken.raw.length) + newToken.raw;
+          lines = newText.substring(tokens[tokens.length - 1].raw.length).split('\n');
+          continue;
+        }
+      }
+
       return {
         type: 'blockquote',
-        raw: cap[0],
+        raw,
         tokens,
         text
       };

diff --git a/test/specs/commonmark/commonmark.0.31.json b/test/specs/commonmark/commonmark.0.31.json
@@ -1887,17 +1887,15 @@
     "example": 236,
     "start_line": 3838,
     "end_line": 3848,
-    "section": "Block quotes",
-    "shouldFail": true
+    "section": "Block quotes"
   },
   {
     "markdown": "> ```\nfoo\n```\n",
     "html": "<blockquote>\n<pre><code></code></pre>\n</blockquote>\n<p>foo</p>\n<pre><code></code></pre>\n",
     "example": 237,
     "start_line": 3851,
     "end_line": 3861,
-    "section": "Block quotes",
-    "shouldFail": true
+    "section": "Block quotes"
   },
   {
     "markdown": "> foo\n    - bar\n",

diff --git a/test/specs/gfm/commonmark.0.31.json b/test/specs/gfm/commonmark.0.31.json
@@ -1887,17 +1887,15 @@
     "example": 236,
     "start_line": 3838,
     "end_line": 3848,
-    "section": "Block quotes",
-    "shouldFail": true
+    "section": "Block quotes"
   },
   {
     "markdown": "> ```\nfoo\n```\n",
     "html": "<blockquote>\n<pre><code></code></pre>\n</blockquote>\n<p>foo</p>\n<pre><code></code></pre>\n",
     "example": 237,
     "start_line": 3851,
     "end_line": 3861,
-    "section": "Block quotes",
-    "shouldFail": true
+    "section": "Block quotes"
   },
   {
     "markdown": "> foo\n    - bar\n",

diff --git a/test/unit/marked.test.js b/test/unit/marked.test.js
@@ -18,7 +18,7 @@ describe('marked unit', () => {
 
       assert.strictEqual(tokens[0].type, 'paragraph');
       assert.strictEqual(tokens[2].tokens[0].type, 'paragraph');
-      assert.strictEqual(tokens[3].items[0].tokens[0].type, 'text');
+      assert.strictEqual(tokens[4].items[0].tokens[0].type, 'text');
     });
   });
 
@@ -910,6 +910,7 @@ br
         ['text', 'paragraph'],
         ['space', ''],
         ['hr', '---'],
+        ['space', ''],
         ['heading', '# heading'],
         ['text', 'heading'],
         ['code', '```code```'],
@@ -924,6 +925,7 @@ br
         ['blockquote', '> blockquote'],
         ['paragraph', 'blockquote'],
         ['text', 'blockquote'],
+        ['space', ''],
         ['list', '- list'],
         ['list_item', '- list'],
         ['text', 'list'],