From 701a86dfd998e68b615cd7f265eff88cec839d0c Mon Sep 17 00:00:00 2001
From: Tony Brix <tony@brix.ninja>
Date: Sat, 20 Apr 2024 17:45:50 -0600
Subject: [PATCH 1/4] fix: fix code continuation in blockquote

---
 src/Lexer.ts                               |  9 ++-
 src/Tokenizer.ts                           | 73 +++++++++++++++++++---
 test/specs/commonmark/commonmark.0.31.json |  6 +-
 test/specs/gfm/commonmark.0.31.json        |  6 +-
 test/unit/marked.test.js                   |  3 +-
 5 files changed, 74 insertions(+), 23 deletions(-)

diff --git a/src/Lexer.ts b/src/Lexer.ts
index d6be1afd3c..dc2cea1231 100644
--- a/src/Lexer.ts
+++ b/src/Lexer.ts
@@ -101,9 +101,9 @@ export class _Lexer {
   /**
    * Lexing
    */
-  blockTokens(src: string, tokens?: Token[]): Token[];
-  blockTokens(src: string, tokens?: TokensList): TokensList;
-  blockTokens(src: string, tokens: Token[] = []) {
+  blockTokens(src: string, tokens?: Token[], lastParagraphClipped?: boolean): Token[];
+  blockTokens(src: string, tokens?: TokensList, lastParagraphClipped?: boolean): TokensList;
+  blockTokens(src: string, tokens: Token[] = [], lastParagraphClipped = false) {
     if (this.options.pedantic) {
       src = src.replace(/\t/g, '    ').replace(/^ +$/gm, '');
     } else {
@@ -115,7 +115,6 @@ export class _Lexer {
     let token: Tokens.Generic | undefined;
     let lastToken;
     let cutSrc;
-    let lastParagraphClipped;
 
     while (src) {
       if (this.options.extensions
@@ -249,7 +248,7 @@ export class _Lexer {
       }
       if (this.state.top && (token = this.tokenizer.paragraph(cutSrc))) {
         lastToken = tokens[tokens.length - 1];
-        if (lastParagraphClipped && lastToken.type === 'paragraph') {
+        if (lastParagraphClipped && lastToken?.type === 'paragraph') {
           lastToken.raw += '\n' + token.raw;
           lastToken.text += '\n' + token.text;
           this.inlineQueue.pop();
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 10b31454e2..77ed0b5d27 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -7,7 +7,7 @@ import {
 } from './helpers.ts';
 import type { Rules } from './rules.ts';
 import type { _Lexer } from './Lexer.ts';
-import type { Links, Tokens } from './Tokens.ts';
+import type { Links, Tokens, Token } from './Tokens.ts';
 import type { MarkedOptions } from './MarkedOptions.ts';
 
 function outputLink(cap: string[], link: Pick<Tokens.Link, 'href' | 'title'>, raw: string, lexer: _Lexer): Tokens.Link | Tokens.Image {
@@ -156,16 +156,71 @@ export class _Tokenizer {
   blockquote(src: string): Tokens.Blockquote | undefined {
     const cap = this.rules.block.blockquote.exec(src);
     if (cap) {
-      // precede setext continuation with 4 spaces so it isn't a setext
-      let text = cap[0].replace(/\n {0,3}((?:=+|-+) *)(?=\n|$)/g, '\n    $1');
-      text = rtrim(text.replace(/^ *>[ \t]?/gm, ''), '\n');
-      const top = this.lexer.state.top;
-      this.lexer.state.top = true;
-      const tokens = this.lexer.blockTokens(text);
-      this.lexer.state.top = top;
+      let lines = rtrim(cap[0], '\n').split('\n');
+      let raw = '';
+      let text = '';
+      const tokens: Token[] = [];
+
+      while (lines.length > 0) {
+        let inBlockquote = false;
+        const currentLines = [];
+
+        while (lines.length > 0) {
+          if (/^ {0,3}>/.test(lines[0])) {
+            currentLines.push(lines.shift());
+            inBlockquote = true;
+          } else if (!inBlockquote) {
+            currentLines.push(lines.shift());
+          } else {
+            break;
+          }
+        }
+
+        const currentRaw = currentLines.join('\n');
+        const currentText = currentRaw
+        // precede setext continuation with 4 spaces so it isn't a setext
+          .replace(/\n {0,3}((?:=+|-+) *)(?=\n|$)/g, '\n    $1')
+          .replace(/^ {0,3}>[ \t]?/gm, '');
+        raw = raw ? `${raw}\n${currentRaw}` : currentRaw;
+        text = text ? `${text}\n${currentText}` : currentText;
+        const top = this.lexer.state.top;
+        this.lexer.state.top = true;
+        this.lexer.blockTokens(currentText, tokens, true);
+        this.lexer.state.top = top;
+
+        if (lines.length === 0) {
+          break;
+        }
+
+        const lastToken = tokens[tokens.length - 1];
+
+        if (lastToken?.type === 'code') {
+          break;
+        } else if (lastToken?.type === 'blockquote') {
+          const oldBlockquoteToken = lastToken as Tokens.Blockquote;
+          const newText = oldBlockquoteToken.raw + '\n' + lines.join('\n');
+          const newBlockquoteToken = this.blockquote(newText)!;
+          tokens[tokens.length - 1] = newBlockquoteToken;
+
+          raw = raw.substring(0, raw.length - oldBlockquoteToken.raw.length) + newBlockquoteToken.raw;
+          text = text.substring(0, text.length - oldBlockquoteToken.text.length) + newBlockquoteToken.text;
+          break;
+        } else if (lastToken?.type === 'list') {
+          const oldListToken = lastToken as Tokens.List;
+          const newText = oldListToken.raw + '\n' + lines.join('\n');
+          const newListToken = this.list(newText)!;
+          tokens[tokens.length - 1] = newListToken;
+
+          raw = raw.substring(0, raw.length - lastToken.raw.length) + newListToken.raw;
+          text = text.substring(0, text.length - oldListToken.raw.length) + newListToken.raw;
+          lines = newText.substring(tokens[tokens.length - 1].raw.length).split('\n');
+          continue;
+        }
+      }
+
       return {
         type: 'blockquote',
-        raw: cap[0],
+        raw,
         tokens,
         text
       };
diff --git a/test/specs/commonmark/commonmark.0.31.json b/test/specs/commonmark/commonmark.0.31.json
index f9b5db2e61..cd0d9991e0 100644
--- a/test/specs/commonmark/commonmark.0.31.json
+++ b/test/specs/commonmark/commonmark.0.31.json
@@ -1887,8 +1887,7 @@
     "example": 236,
     "start_line": 3838,
     "end_line": 3848,
-    "section": "Block quotes",
-    "shouldFail": true
+    "section": "Block quotes"
   },
   {
     "markdown": "> ```\nfoo\n```\n",
@@ -1896,8 +1895,7 @@
     "example": 237,
     "start_line": 3851,
     "end_line": 3861,
-    "section": "Block quotes",
-    "shouldFail": true
+    "section": "Block quotes"
   },
   {
     "markdown": "> foo\n    - bar\n",
diff --git a/test/specs/gfm/commonmark.0.31.json b/test/specs/gfm/commonmark.0.31.json
index ff36b9f4d8..8cef7c132f 100644
--- a/test/specs/gfm/commonmark.0.31.json
+++ b/test/specs/gfm/commonmark.0.31.json
@@ -1887,8 +1887,7 @@
     "example": 236,
     "start_line": 3838,
     "end_line": 3848,
-    "section": "Block quotes",
-    "shouldFail": true
+    "section": "Block quotes"
   },
   {
     "markdown": "> ```\nfoo\n```\n",
@@ -1896,8 +1895,7 @@
     "example": 237,
     "start_line": 3851,
     "end_line": 3861,
-    "section": "Block quotes",
-    "shouldFail": true
+    "section": "Block quotes"
   },
   {
     "markdown": "> foo\n    - bar\n",
diff --git a/test/unit/marked.test.js b/test/unit/marked.test.js
index c9c0925a12..f885908bb9 100644
--- a/test/unit/marked.test.js
+++ b/test/unit/marked.test.js
@@ -18,7 +18,7 @@ describe('marked unit', () => {
 
       assert.strictEqual(tokens[0].type, 'paragraph');
       assert.strictEqual(tokens[2].tokens[0].type, 'paragraph');
-      assert.strictEqual(tokens[3].items[0].tokens[0].type, 'text');
+      assert.strictEqual(tokens[4].items[0].tokens[0].type, 'text');
     });
   });
 
@@ -924,6 +924,7 @@ br
         ['blockquote', '> blockquote'],
         ['paragraph', 'blockquote'],
         ['text', 'blockquote'],
+        ['space', ''],
         ['list', '- list'],
         ['list_item', '- list'],
         ['text', 'list'],

From ce4f6968d935bdc6d9ccc06a0a8add08da05a93b Mon Sep 17 00:00:00 2001
From: Tony Brix <tony@brix.ninja>
Date: Sat, 20 Apr 2024 22:39:37 -0600
Subject: [PATCH 2/4] fix space after hr

---
 src/Tokenizer.ts         | 2 +-
 test/unit/marked.test.js | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 77ed0b5d27..2b1dd44fe9 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -148,7 +148,7 @@ export class _Tokenizer {
     if (cap) {
       return {
         type: 'hr',
-        raw: cap[0]
+        raw: rtrim(cap[0], '\n')
       };
     }
   }
diff --git a/test/unit/marked.test.js b/test/unit/marked.test.js
index f885908bb9..fee341557b 100644
--- a/test/unit/marked.test.js
+++ b/test/unit/marked.test.js
@@ -910,6 +910,7 @@ br
         ['text', 'paragraph'],
         ['space', ''],
         ['hr', '---'],
+        ['space', ''],
         ['heading', '# heading'],
         ['text', 'heading'],
         ['code', '```code```'],

From 2f58989b42cf93c10b741aebd0b9e08cbbae351e Mon Sep 17 00:00:00 2001
From: Tony Brix <tony@brix.ninja>
Date: Sun, 21 Apr 2024 22:12:21 -0600
Subject: [PATCH 3/4] add explanatory comments to blockquote tokenizer

---
 src/Tokenizer.ts | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 2b1dd44fe9..2bdd9fa0cf 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -166,6 +166,7 @@ export class _Tokenizer {
         const currentLines = [];
 
         while (lines.length > 0) {
+          // get lines up to a continuation
           if (/^ {0,3}>/.test(lines[0])) {
             currentLines.push(lines.shift());
             inBlockquote = true;
@@ -178,16 +179,20 @@ export class _Tokenizer {
 
         const currentRaw = currentLines.join('\n');
         const currentText = currentRaw
-        // precede setext continuation with 4 spaces so it isn't a setext
+          // precede setext continuation with 4 spaces so it isn't a setext
           .replace(/\n {0,3}((?:=+|-+) *)(?=\n|$)/g, '\n    $1')
           .replace(/^ {0,3}>[ \t]?/gm, '');
         raw = raw ? `${raw}\n${currentRaw}` : currentRaw;
         text = text ? `${text}\n${currentText}` : currentText;
+
+        // parse blockquote lines as top level tokens
+        // merge paragraphs if this is a continuation
         const top = this.lexer.state.top;
         this.lexer.state.top = true;
         this.lexer.blockTokens(currentText, tokens, true);
         this.lexer.state.top = top;
 
+        // if there is no continuation then we are done
         if (lines.length === 0) {
           break;
         }
@@ -195,8 +200,10 @@ export class _Tokenizer {
         const lastToken = tokens[tokens.length - 1];
 
         if (lastToken?.type === 'code') {
+          // blockquote continuation cannot be preceded by a code block
           break;
         } else if (lastToken?.type === 'blockquote') {
+          // include continuation in nested blockquote
           const oldBlockquoteToken = lastToken as Tokens.Blockquote;
           const newText = oldBlockquoteToken.raw + '\n' + lines.join('\n');
           const newBlockquoteToken = this.blockquote(newText)!;
@@ -206,6 +213,7 @@ export class _Tokenizer {
           text = text.substring(0, text.length - oldBlockquoteToken.text.length) + newBlockquoteToken.text;
           break;
         } else if (lastToken?.type === 'list') {
+          // include continuation in nested list
           const oldListToken = lastToken as Tokens.List;
           const newText = oldListToken.raw + '\n' + lines.join('\n');
           const newListToken = this.list(newText)!;

From 738b877adf8f1adc6586573bc94ae6cda2db10a6 Mon Sep 17 00:00:00 2001
From: Tony Brix <tony@brix.ninja>
Date: Fri, 26 Apr 2024 09:41:46 -0600
Subject: [PATCH 4/4] fixes: scan blockquote lines by index and simplify token names

---
 src/Tokenizer.ts | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 2bdd9fa0cf..63156d7e82 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -165,17 +165,19 @@ export class _Tokenizer {
         let inBlockquote = false;
         const currentLines = [];
 
-        while (lines.length > 0) {
+        let i;
+        for (i = 0; i < lines.length; i++) {
           // get lines up to a continuation
-          if (/^ {0,3}>/.test(lines[0])) {
-            currentLines.push(lines.shift());
+          if (/^ {0,3}>/.test(lines[i])) {
+            currentLines.push(lines[i]);
             inBlockquote = true;
           } else if (!inBlockquote) {
-            currentLines.push(lines.shift());
+            currentLines.push(lines[i]);
           } else {
             break;
           }
         }
+        lines = lines.slice(i);
 
         const currentRaw = currentLines.join('\n');
         const currentText = currentRaw
@@ -204,23 +206,23 @@ export class _Tokenizer {
           break;
         } else if (lastToken?.type === 'blockquote') {
           // include continuation in nested blockquote
-          const oldBlockquoteToken = lastToken as Tokens.Blockquote;
-          const newText = oldBlockquoteToken.raw + '\n' + lines.join('\n');
-          const newBlockquoteToken = this.blockquote(newText)!;
-          tokens[tokens.length - 1] = newBlockquoteToken;
+          const oldToken = lastToken as Tokens.Blockquote;
+          const newText = oldToken.raw + '\n' + lines.join('\n');
+          const newToken = this.blockquote(newText)!;
+          tokens[tokens.length - 1] = newToken;
 
-          raw = raw.substring(0, raw.length - oldBlockquoteToken.raw.length) + newBlockquoteToken.raw;
-          text = text.substring(0, text.length - oldBlockquoteToken.text.length) + newBlockquoteToken.text;
+          raw = raw.substring(0, raw.length - oldToken.raw.length) + newToken.raw;
+          text = text.substring(0, text.length - oldToken.text.length) + newToken.text;
           break;
         } else if (lastToken?.type === 'list') {
           // include continuation in nested list
-          const oldListToken = lastToken as Tokens.List;
-          const newText = oldListToken.raw + '\n' + lines.join('\n');
-          const newListToken = this.list(newText)!;
-          tokens[tokens.length - 1] = newListToken;
+          const oldToken = lastToken as Tokens.List;
+          const newText = oldToken.raw + '\n' + lines.join('\n');
+          const newToken = this.list(newText)!;
+          tokens[tokens.length - 1] = newToken;
 
-          raw = raw.substring(0, raw.length - lastToken.raw.length) + newListToken.raw;
-          text = text.substring(0, text.length - oldListToken.raw.length) + newListToken.raw;
+          raw = raw.substring(0, raw.length - lastToken.raw.length) + newToken.raw;
+          text = text.substring(0, text.length - oldToken.raw.length) + newToken.raw;
           lines = newText.substring(tokens[tokens.length - 1].raw.length).split('\n');
           continue;
         }