From 75037c6514e99c9b4fa300f62f04913fee4ea0e2 Mon Sep 17 00:00:00 2001 From: Alex Kocharin Date: Thu, 14 Apr 2022 19:00:44 +0300 Subject: [PATCH] Put escape sequences into separate token --- CHANGELOG.md | 17 +++++ lib/parser_core.js | 5 +- lib/parser_inline.js | 9 ++- lib/presets/commonmark.js | 5 +- lib/presets/zero.js | 5 +- lib/rules_core/text_join.js | 45 +++++++++++++ lib/rules_inline/escape.js | 67 ++++++++++++------- .../{text_collapse.js => fragments_join.js} | 2 +- test/fixtures/markdown-it/smartquotes.txt | 13 ++++ test/fixtures/markdown-it/typographer.txt | 17 +++++ test/misc.js | 8 +++ 11 files changed, 162 insertions(+), 31 deletions(-) create mode 100644 lib/rules_core/text_join.js rename lib/rules_inline/{text_collapse.js => fragments_join.js} (96%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b744166f..95f4ab967 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [13.0.0] - WIP +### Added +- Added a new token type `text_special` to store escaped characters, same as `text` but + unaffected by replacement plugins (smartquotes, typographer, linkifier, etc.). +- Added a new rule `text_join` in `core` ruler. Text replacement plugins may choose to + insert themselves before it. + +### Changed +- `text_collapse` rule is renamed to `fragments_join`. + +### Fixed +- Smartquotes, typographic replacements and plain text links can now be escaped + with backslash (e.g. `\(c)` or `google\.com` are no longer replaced). + + ## [12.3.2] - 2022-01-08 ### Security - Fix possible ReDOS in newline rule. Thanks to @MakeNowJust. @@ -592,6 +608,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Renamed presets folder (configs -> presets). 
+[13.0.0]: https://github.com/markdown-it/markdown-it/compare/12.3.2...13.0.0 [12.3.2]: https://github.com/markdown-it/markdown-it/compare/12.3.1...12.3.2 [12.3.1]: https://github.com/markdown-it/markdown-it/compare/12.3.0...12.3.1 [12.3.0]: https://github.com/markdown-it/markdown-it/compare/12.2.0...12.3.0 diff --git a/lib/parser_core.js b/lib/parser_core.js index 1eaa2b085..a8831fba5 100644 --- a/lib/parser_core.js +++ b/lib/parser_core.js @@ -16,7 +16,10 @@ var _rules = [ [ 'inline', require('./rules_core/inline') ], [ 'linkify', require('./rules_core/linkify') ], [ 'replacements', require('./rules_core/replacements') ], - [ 'smartquotes', require('./rules_core/smartquotes') ] + [ 'smartquotes', require('./rules_core/smartquotes') ], + // `text_join` finds `text_special` tokens (for escape sequences) + // and joins them with the rest of the text + [ 'text_join', require('./rules_core/text_join') ] ]; diff --git a/lib/parser_inline.js b/lib/parser_inline.js index c8e66d306..49fea64c5 100644 --- a/lib/parser_inline.js +++ b/lib/parser_inline.js @@ -26,11 +26,18 @@ var _rules = [ [ 'entity', require('./rules_inline/entity') ] ]; +// `rule2` ruleset was created specifically for emphasis/strikethrough +// post-processing and may be changed in the future. +// +// Don't use this for anything except pairs (plugins working with `balance_pairs`). 
+// var _rules2 = [ [ 'balance_pairs', require('./rules_inline/balance_pairs') ], [ 'strikethrough', require('./rules_inline/strikethrough').postProcess ], [ 'emphasis', require('./rules_inline/emphasis').postProcess ], - [ 'text_collapse', require('./rules_inline/text_collapse') ] + // rules for pairs separate '**' into its own text tokens, which may be left unused, + // rule below merges unused segments back with the rest of the text + [ 'fragments_join', require('./rules_inline/fragments_join') ] ]; diff --git a/lib/presets/commonmark.js b/lib/presets/commonmark.js index 706655375..c0a8275ce 100644 --- a/lib/presets/commonmark.js +++ b/lib/presets/commonmark.js @@ -38,7 +38,8 @@ module.exports = { rules: [ 'normalize', 'block', - 'inline' + 'inline', + 'text_join' ] }, @@ -73,7 +74,7 @@ module.exports = { rules2: [ 'balance_pairs', 'emphasis', - 'text_collapse' + 'fragments_join' ] } } diff --git a/lib/presets/zero.js b/lib/presets/zero.js index 5da413ca8..fc90aace0 100644 --- a/lib/presets/zero.js +++ b/lib/presets/zero.js @@ -39,7 +39,8 @@ module.exports = { rules: [ 'normalize', 'block', - 'inline' + 'inline', + 'text_join' ] }, @@ -55,7 +56,7 @@ module.exports = { ], rules2: [ 'balance_pairs', - 'text_collapse' + 'fragments_join' ] } } diff --git a/lib/rules_core/text_join.js b/lib/rules_core/text_join.js new file mode 100644 index 000000000..a0c083ac8 --- /dev/null +++ b/lib/rules_core/text_join.js @@ -0,0 +1,45 @@ +// Join raw text tokens with the rest of the text +// +// This is a separate core rule so that plugins can run text replacements after the inline +// `fragments_join` step, but before escaped (`text_special`) tokens are merged into plain text. +// +// For example, `\:)` shouldn't be replaced with an emoji. 
+// +'use strict'; + + +module.exports = function text_join(state) { + var j, l, tokens, curr, max, last, + blockTokens = state.tokens; + + for (j = 0, l = blockTokens.length; j < l; j++) { + if (blockTokens[j].type !== 'inline') continue; + + tokens = blockTokens[j].children; + max = tokens.length; + + for (curr = 0; curr < max; curr++) { + if (tokens[curr].type === 'text_special') { + tokens[curr].type = 'text'; + } + } + + for (curr = last = 0; curr < max; curr++) { + if (tokens[curr].type === 'text' && + curr + 1 < max && + tokens[curr + 1].type === 'text') { + + // collapse two adjacent text nodes + tokens[curr + 1].content = tokens[curr].content + tokens[curr + 1].content; + } else { + if (curr !== last) { tokens[last] = tokens[curr]; } + + last++; + } + } + + if (curr !== last) { + tokens.length = last; + } + } +}; diff --git a/lib/rules_inline/escape.js b/lib/rules_inline/escape.js index 229ead0a2..8adda0c3b 100644 --- a/lib/rules_inline/escape.js +++ b/lib/rules_inline/escape.js @@ -13,40 +13,59 @@ for (var i = 0; i < 256; i++) { ESCAPED.push(0); } module.exports = function escape(state, silent) { - var ch, pos = state.pos, max = state.posMax; - - if (state.src.charCodeAt(pos) !== 0x5C/* \ */) { return false; } + var ch1, ch2, origStr, escapedStr, token, pos = state.pos, max = state.posMax; + if (state.src.charCodeAt(pos) !== 0x5C/* \ */) return false; pos++; - if (pos < max) { - ch = state.src.charCodeAt(pos); + // '\' at the end of the inline block + if (pos >= max) return false; + + ch1 = state.src.charCodeAt(pos); - if (ch < 256 && ESCAPED[ch] !== 0) { - if (!silent) { state.pending += state.src[pos]; } - state.pos += 2; - return true; + if (ch1 === 0x0A) { + if (!silent) { + state.push('hardbreak', 'br', 0); } - if (ch === 0x0A) { - if (!silent) { - state.push('hardbreak', 'br', 0); - } + pos++; + // skip leading whitespaces from next line + while (pos < max) { + ch1 = state.src.charCodeAt(pos); + if (!isSpace(ch1)) break; + pos++; + } + + state.pos = 
pos; + return true; + } + + escapedStr = state.src[pos]; + if (ch1 >= 0xD800 && ch1 <= 0xDBFF && pos + 1 < max) { + ch2 = state.src.charCodeAt(pos + 1); + + if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { + escapedStr += state.src[pos + 1]; pos++; - // skip leading whitespaces from next line - while (pos < max) { - ch = state.src.charCodeAt(pos); - if (!isSpace(ch)) { break; } - pos++; - } - - state.pos = pos; - return true; } } - if (!silent) { state.pending += '\\'; } - state.pos++; + origStr = '\\' + escapedStr; + + if (!silent) { + token = state.push('text_special', '', 0); + + if (ch1 < 256 && ESCAPED[ch1] !== 0) { + token.content = escapedStr; + } else { + token.content = origStr; + } + + token.markup = origStr; + token.info = 'escape'; + } + + state.pos = pos + 1; return true; }; diff --git a/lib/rules_inline/text_collapse.js b/lib/rules_inline/fragments_join.js similarity index 96% rename from lib/rules_inline/text_collapse.js rename to lib/rules_inline/fragments_join.js index 390b0fe5f..0eafb45fe 100644 --- a/lib/rules_inline/text_collapse.js +++ b/lib/rules_inline/fragments_join.js @@ -9,7 +9,7 @@ 'use strict'; -module.exports = function text_collapse(state) { +module.exports = function fragments_join(state) { var curr, last, level = 0, tokens = state.tokens, diff --git a/test/fixtures/markdown-it/smartquotes.txt b/test/fixtures/markdown-it/smartquotes.txt index fdeb883ba..72bb2dcc5 100644 --- a/test/fixtures/markdown-it/smartquotes.txt +++ b/test/fixtures/markdown-it/smartquotes.txt @@ -164,3 +164,16 @@ Should parse quotes adjacent to inline html, #677:

“test


test”

. + +Should be escapable: +. +"foo" + +\"foo" + +"foo\" +. +

“foo”

+

"foo"

+

"foo"

+. diff --git a/test/fixtures/markdown-it/typographer.txt b/test/fixtures/markdown-it/typographer.txt index e9e2bfee5..ca3cc0af5 100644 --- a/test/fixtures/markdown-it/typographer.txt +++ b/test/fixtures/markdown-it/typographer.txt @@ -60,6 +60,13 @@ dupes

!!! ??? ,

. +copyright should be escapable +. +\(c) +. +

(c)

+. + dashes . @@ -80,6 +87,16 @@ markdownit--awesome

markdownit–awesome

. +dashes should be escapable +. +foo \-- bar + +foo -\- bar +. +

foo -- bar

+

foo -- bar

+. + regression tests for #624 . 1---2---3 diff --git a/test/misc.js b/test/misc.js index 6850a16f0..fb0999c11 100644 --- a/test/misc.js +++ b/test/misc.js @@ -254,6 +254,14 @@ describe('Misc', function () { md.render('# test\n\n - hello\n - world\n') ); }); + + it('Should escape surrogate pairs (coverage)', function () { + var md = markdownit(); + + assert.strictEqual(md.render('\\\uD835\uDC9C'), '

\\\uD835\uDC9C

\n'); + assert.strictEqual(md.render('\\\uD835x'), '

\\\uD835x

\n'); + assert.strictEqual(md.render('\\\uD835'), '

\\\uD835

\n'); + }); });