diff --git a/packages/tailwindcss-language-service/src/util/doc.test.ts b/packages/tailwindcss-language-service/src/util/doc.test.ts new file mode 100644 index 00000000..bd5a6607 --- /dev/null +++ b/packages/tailwindcss-language-service/src/util/doc.test.ts @@ -0,0 +1,811 @@ +import { getTextWithoutComments } from './doc' +import { test, expect } from 'vitest' + +test('Cleans JS', () => { + let input = ` + /* Single-line block comment */ + const a = 1; + + // Single-line comment + const b = 2; + + /* + * Multi-line block comment + * with asterisks on each line + * spanning multiple lines + */ + const c = 3; + + /** + * JSDoc style comment + * @param {string} name - The name parameter + * @returns {void} + */ + function greet(name) { + console.log("Hello, " + name); + } + + // Double-quoted strings + const str1 = "This is a double-quoted string"; + const str2 = "String with \\"escaped\\" quotes"; + const str3 = "String with // fake comment inside"; + const str4 = "String with /* fake block comment */ inside"; + + // Single-quoted strings + const str5 = 'This is a single-quoted string'; + const str6 = 'String with \\'escaped\\' quotes'; + const str7 = 'String with // fake comment inside'; + const str8 = 'String with /* fake block comment */ inside'; + + // Template literals - single line + const tmpl1 = \`Simple template literal\`; + const tmpl2 = \`Template with \${expression} interpolation\`; + const tmpl3 = \`Template with // fake comment\`; + const tmpl4 = \`Template with /* fake block */ comment\`; + + // Template literals - multi-line + const tmpl5 = \` + Multi-line template literal + spanning several lines + with \${nested} expressions + \`; + + const tmpl6 = \` + Template with \${ + // This is a real comment inside interpolation + someValue + } complex interpolation + \`; + + // Regex patterns - various flags + const regex1 = /simple/; + const regex2 = /with-flags/gi; + const regex3 = /multi-flag/gimsuy; + const regex4 = /pattern\\/with\\/slashes/g; + const 
regex5 = /pattern with spaces/; + const regex6 = /[a-z]+(foo|bar)*\\d{2,4}/i; + + // Regex that looks like comments + const regex7 = /\\/\\/ not a comment/; + const regex8 = /\\/\\* also not a comment \\*\\//; + + // Division vs regex ambiguity + const division = 10 / 2 / 1; + const afterParen = (x) => /regex-after-arrow/g; + const inCondition = /test/.test(str) ? /yes/g : /no/i; + + // Nested structures + const obj = { + // Comment inside object + key: "value", /* inline comment */ + nested: { + /* deeply nested comment */ + deep: \`template \${ + // comment in template + value /* another */ + }\` + } + }; + + // Class with various comment styles + class Example { + // Property comment + prop = "value"; + + /** + * Method JSDoc + */ + method() { + // Method body comment + return /* inline */ true; + } + } + + // Arrow functions with comments + const arrow1 = () => /* comment */ 42; + const arrow2 = (/* param comment */) => {}; + const arrow3 = (a /* inline */, b) => a + b; + + // Edge cases + const empty = ""; + const emptyTemplate = \`\`; + const emptyRegex = /(?:)/; + + // URL-like strings (contains //) + const url = "https://example.com/path"; + const protocol = 'file://localhost/'; + + // Consecutive comments + // First comment + // Second comment + /* Block one */ /* Block two */ + const afterComments = true; + + // Mixed quotes and escapes + const mixed = "It's a \\"quoted\\" string"; + const mixed2 = 'It\\'s a "quoted" string'; + const mixed3 = \`It's a "quoted" \\\`template\\\`\`; + + // Tagged template literals + const tagged = html\`
content
\`; + const css = css\` + .selector { + /* CSS comment inside template */ + color: red; // not a JS comment + } + \`; + + // Regex with special characters + const specialRegex = /[\\[\\]{}()*+?.,\\\\^$|#\\s]/g; + const unicodeRegex = /\\p{Script=Latin}/u; + + // Comments at end of lines with various content + const withTrailing = 123; // trailing comment + const withBlock = 456; /* trailing block */ + const both = 789; // line /* nested */ + + // String concatenation that looks tricky + const concat = "start" + /* comment */ "end"; + const concat2 = 'a' + 'b' + /* c */ 'd'; + ` + + let result = getTextWithoutComments(input, 'js') + expect(result).toMatchInlineSnapshot(` + " + + const a = 1; + + + const b = 2; + + + + + + + const c = 3; + + + + + + + function greet(name) { + console.log("Hello, " + name); + } + + + const str1 = "This is a double-quoted string"; + const str2 = "String with \\"escaped\\" quotes"; + const str3 = "String with // fake comment inside"; + const str4 = "String with /* fake block comment */ inside"; + + + const str5 = 'This is a single-quoted string'; + const str6 = 'String with \\'escaped\\' quotes'; + const str7 = 'String with // fake comment inside'; + const str8 = 'String with /* fake block comment */ inside'; + + + const tmpl1 = \`Simple template literal\`; + const tmpl2 = \`Template with \${expression} interpolation\`; + const tmpl3 = \`Template with // fake comment\`; + const tmpl4 = \`Template with /* fake block */ comment\`; + + + const tmpl5 = \` + Multi-line template literal + spanning several lines + with \${nested} expressions + \`; + + const tmpl6 = \` + Template with \${ + // This is a real comment inside interpolation + someValue + } complex interpolation + \`; + + + const regex1 = ; + const regex2 = gi; + const regex3 = gimsuy; + const regex4 = g; + const regex5 = ; + const regex6 = i; + + + const regex7 = ; + const regex8 = ; + + + const division = 10 / 2 / 1; + const afterParen = (x) => g; + const inCondition = .test(str) 
? g : i; + + + const obj = { + + key: "value", + nested: { + + deep: \`template \${ + // comment in template + value /* another */ + }\` + } + }; + + + class Example { + + prop = "value"; + + + + + method() { + + return true; + } + } + + + const arrow1 = () => 42; + const arrow2 = ( ) => {}; + const arrow3 = (a , b) => a + b; + + + const empty = ""; + const emptyTemplate = \`\`; + const emptyRegex = ; + + + const url = "https://example.com/path"; + const protocol = 'file://localhost/'; + + + + + + const afterComments = true; + + + const mixed = "It's a \\"quoted\\" string"; + const mixed2 = 'It\\'s a "quoted" string'; + const mixed3 = \`It's a "quoted" \\\`template\\\`\`; + + + const tagged = html\`
content
\`; + const css = css\` + .selector { + /* CSS comment inside template */ + color: red; // not a JS comment + } + \`; + + + const specialRegex = g; + const unicodeRegex = u; + + + const withTrailing = 123; + const withBlock = 456; + const both = 789; + + + const concat = "start" + "end"; + const concat2 = 'a' + 'b' + 'd'; + " + `) +}) + +test('Cleans HTML', () => { + let input = ` + +
+ + +

Some text

+ + + More text + + +
Content
+ + +
Content
+ + +
Content
+ + +
Content
+ + +
Content
+ + +
Content
+ + +
Empty comment above
+ + +
Content
+ + +
Content
+ + +
Content
+ + +
Content
+ +
+ +
Content
+ + +
Content
+ + +
Content
+ + +
Content
+ + + Content for non-IE browsers + +
Content
+ + +
+ class="test" + >Content
+ + +
Content
+ + +
Content
+ + +
Content
+ + +
Content
+ + + + + + + ` + + let result = getTextWithoutComments(input, 'html') + expect(result).toMatchInlineSnapshot(` + " + +
+ + + + +

Some text

+ + + + + + More text + + +
Content
+ + +
Content
+ + +
Content
+ + +
Content
+ + +
Content
+ + +
Content
+ + +
Empty comment above
+ + +
Content
+ + +
Content
+ + +
Content
+ + +
Content
+ +
+ +
Content
+ + +
Content
+ + +
Content
+ + + + +
Content
+ + + Content for non-IE browsers + +
Content
+ + +
Content
+ + +
Content
+ + + + + + + + +
Content
+ + +
Content
+ + +
Content
+ + + + + + + " + `) +}) + +test('Cleans CSS', () => { + let input = ` + /* Simple single-line block comment */ + .class1 { color: red; } + + /* + * Multi-line block comment + * with asterisks on each line + * spanning multiple lines + */ + .class2 { color: blue; } + + /** + * Doc-style comment + * Often used for documentation + */ + .class3 { color: green; } + + /* Comment with "double quotes" inside */ + .class4 { color: yellow; } + + /* Comment with 'single quotes' inside */ + .class5 { color: orange; } + + /* Comment with special chars: < > & */ + .class6 { color: purple; } + + /* Comment with CSS-like content: .fake { color: red; } */ + .class7 { color: pink; } + + /* Comment with URL-like content: https://example.com */ + .class8 { color: brown; } + + /* Comment with // which is not a line comment in CSS */ + .class9 { color: gray; } + + /* Consecutive */ /* Comments */ + .class10 { color: teal; } + + .inline { color: /* inline comment */ red; } + + .property /* comment between property */ : /* and value */ blue; + + /* Empty comment: *//**/ + .class11 { color: navy; } + + /* Comment with asterisks **** inside *** */ + .class12 { color: olive; } + + /* Comment ending with multiple asterisks ***/ + .class13 { color: maroon; } + + /*** Comment starting with multiple asterisks */ + .class14 { color: lime; } + + /* Comment with nested /* fake opening */ + .class15 { color: aqua; } + + /* Comment with HTML:
content
*/ + .class16 { color: fuchsia; } + + /* Comment with JS: const x = 1; // not a comment */ + .class17 { color: silver; } + + /* Comment with escaped content: \\*/ still in comment */ + .class18 { color: black; } + + /* Multi-line + comment without + asterisks on + each line */ + .class19 { color: white; } + + .url-property { + background: url("image.png"); /* comment after url */ + background: url('image.png'); /* another comment */ + background: url(image.png); /* unquoted url */ + } + + .string-property { + content: "String with /* fake comment */ inside"; + content: 'String with /* fake comment */ inside'; + content: "String with // not a comment"; + } + + .data-uri { + background: url("data:image/svg+xml,/* not a comment */"); + } + + /* Comment before at-rule */ + @media screen and (min-width: 768px) { + /* Comment inside at-rule */ + .responsive { color: red; } + } + + /* Comment before keyframes */ + @keyframes spin { + /* Comment at start */ + 0% { transform: rotate(0deg); } + /* Comment between keyframes */ + 100% { transform: rotate(360deg); } + /* Comment at end */ + } + + :root { + --my-var: red; /* Comment after custom property */ + /* Comment before custom property */ + --another-var: blue; + } + + .calc-property { + width: calc(100% - 20px); /* Comment after calc */ + height: calc(/* comment in calc */ 50vh - 10px); + } + + /* Comment with unicode: 你好 مرحبا 🎉 */ + .unicode { color: red; } + + /* Very long comment that goes on and on and on and on and on and on and on and on and on and on and on and on and on */ + .long { color: red; } + + /* + Comment with various whitespace: + tabs + spaces + + blank lines + */ + .whitespace { color: red; } + + /* Comment immediately before brace */{ + color: red; + } + + .selector/* comment in selector */.chained { color: red; } + + /* Final comment at end of file */ + ` + + let result = getTextWithoutComments(input, 'css') + expect(result).toMatchInlineSnapshot(` + " + + .class1 { color: red; } + + + + + + + 
.class2 { color: blue; } + + + + + + .class3 { color: green; } + + + .class4 { color: yellow; } + + + .class5 { color: orange; } + + + .class6 { color: purple; } + + + .class7 { color: pink; } + + + .class8 { color: brown; } + + + .class9 { color: gray; } + + + .class10 { color: teal; } + + .inline { color: red; } + + .property : blue; + + + .class11 { color: navy; } + + + .class12 { color: olive; } + + + .class13 { color: maroon; } + + + .class14 { color: lime; } + + + .class15 { color: aqua; } + + + .class16 { color: fuchsia; } + + + .class17 { color: silver; } + + + .class18 { color: black; } + + + + + + .class19 { color: white; } + + .url-property { + background: url("image.png"); + background: url('image.png'); + background: url(image.png); + } + + .string-property { + content: "String with /* fake comment */ inside"; + content: 'String with /* fake comment */ inside'; + content: "String with // not a comment"; + } + + .data-uri { + background: url("data:image/svg+xml,/* not a comment */"); + } + + + @media screen and (min-width: 768px) { + + .responsive { color: red; } + } + + + @keyframes spin { + + 0% { transform: rotate(0deg); } + + 100% { transform: rotate(360deg); } + + } + + :root { + --my-var: red; + + --another-var: blue; + } + + .calc-property { + width: calc(100% - 20px); + height: calc( 50vh - 10px); + } + + + .unicode { color: red; } + + + .long { color: red; } + + + + + + + + + .whitespace { color: red; } + + { + color: red; + } + + .selector .chained { color: red; } + + + " + `) +}) + +test('Cleans multibyte CSS', () => { + let input = `/* Comment with unicode: 你好 مرحبا 🎉 */` + + let result = getTextWithoutComments(input, 'css') + expect(input.length).toEqual(result.length) + expect(result).toEqual(' ') +}) diff --git a/packages/tailwindcss-language-service/src/util/doc.ts b/packages/tailwindcss-language-service/src/util/doc.ts index 8cf0dd66..d3f6a79a 100644 --- a/packages/tailwindcss-language-service/src/util/doc.ts +++ 
import type { Range } from 'vscode-languageserver'
import type { TextDocument } from 'vscode-languageserver-textdocument'

const BACKSLASH = 0x5c // \
const SLASH = 0x2f // /
const LINE_BREAK = 0x0a // \n
const COMMA = 0x2c // ,
const COLON = 0x3a // :
const EQUALS = 0x3d // =
const SEMICOLON = 0x3b // ;
const BRACKET_OPEN = 0x5b // [
const BRACKET_CLOSE = 0x5d // ]
const QUESTION_MARK = 0x3f // ?
const PAREN_OPEN = 0x28 // (
const CURLY_OPEN = 0x7b // {
const DOUBLE_QUOTE = 0x22 // "
const SINGLE_QUOTE = 0x27 // '
const BACKTICK = 0x60 // `
const ASTERISK = 0x2a // *
const SPACE = 0x20 // " "
const TAB = 0x09 // \t
const GREATER_THAN = 0x3e // >
const LESS_THAN = 0x3c // <
const EXCLAMATION_MARK = 0x21 // !
const DASH = 0x2d // -

// `ignoreBOM: true` is required: by default a `TextDecoder` silently drops a
// leading U+FEFF, which would make the output one code unit shorter than the
// input and shift every offset after it.
//
// NOTE: the `utf-16` label decodes as UTF-16LE while `Uint16Array` uses the
// platform byte order, so this assumes a little-endian host.
const decoder = new TextDecoder('utf-16', { ignoreBOM: true })

export function getTextWithoutComments(
  doc: TextDocument,
  type: 'html' | 'js' | 'css',
  range?: Range,
): string

export function getTextWithoutComments(text: string, type: 'html' | 'js' | 'css'): string
/**
 * Cleanup the given document and/or code for analysis
 *
 * We preprocess text to ensure we don't look inside comments for class lists,
 * `@apply` directives, or embedded documents.
 *
 * The following are replaced with whitespace while preserving line breaks:
 * - Single line comments
 * - Multi line comments
 * - Regex literals (where applicable)
 *
 * Preservation of line breaks is critical for mapping positions back to the
 * original source code.
 *
 * @param input - A text document or a raw string to clean
 * @param type - Which cleaner to run over the text
 * @param range - When `input` is a document, only this range of it is read
 * @returns A string with exactly as many UTF-16 code units as the text that
 * was read from `input`
 */
export function getTextWithoutComments(
  input: TextDocument | string,
  type: 'html' | 'js' | 'css',
  range?: Range,
): string {
  let text = typeof input === 'string' ? input : input.getText(range)

  // We want to replace "unnecessary" or "uninteresting" substrings with
  // whitespace. Notably, we must do this without changing character offsets
  // or the length of the resulting string. This is critical for mapping
  // offsets and positions back to the original, unprocessed document.
  //
  // We can simplify the replacement process by using a mutable view of the
  // string which eliminates bookkeeping and intermediate allocations.
  //
  // We cannot use the builtin `TextEncoder` as it only outputs UTF-8 bytes and
  // using that would mean that in-place replacements of multi-byte characters
  // with spaces changes the length of the string and any following offsets.
  //
  // Building up a typed array of UTF-16 code units manually is quick, gives us
  // a mutable view of the string, and can be very quickly turned into a string
  // by using `TextDecoder` with a UTF-16 encoding.
  let bytes = new Uint16Array(text.length)

  for (let i = 0; i < text.length; i++) {
    bytes[i] = text.charCodeAt(i)
  }

  if (type === 'js') {
    cleanJS(bytes)
  } else if (type === 'css') {
    cleanCSS(bytes)
  } else if (type === 'html') {
    cleanHTML(bytes)
  }

  return decoder.decode(bytes)
}
/**
 * Clean CSS, SCSS, Less, or similar CSS-like code
 *
 * Walks the UTF-16 code units in `bytes` and hands every block comment to
 * `replaceWithWhitespace`. Quoted strings are stepped over untouched so that
 * comment-looking text inside them survives. A backslash always escapes the
 * code unit that follows it, inside comments as well.
 */
function cleanCSS(bytes: Uint16Array): void {
  let length = bytes.length
  let pos = 0

  while (pos < length) {
    let code = bytes[pos]

    // An escape: step over the backslash and whatever it escapes
    if (code === BACKSLASH) {
      pos += 2
      continue
    }

    // A string: find where it stops and resume right after that point
    if (code === SINGLE_QUOTE || code === DOUBLE_QUOTE) {
      let cursor = pos + 1

      while (cursor < length) {
        let next = bytes[cursor]

        if (next === BACKSLASH) {
          cursor += 2
          continue
        }

        // Strings stop at the matching quote or at the end of the line
        if (next === code || next === LINE_BREAK) {
          pos = cursor
          break
        }

        cursor += 1
      }

      // When nothing closed the string only the opening quote is skipped
      pos += 1
      continue
    }

    // A comment: locate the `/` of the closing `*/`
    if (code === SLASH && bytes[pos + 1] === ASTERISK) {
      // An unterminated comment runs to the end of the input
      let end = length
      let cursor = pos + 2

      while (cursor < length) {
        let next = bytes[cursor]

        if (next === BACKSLASH) {
          cursor += 2
          continue
        }

        if (next === ASTERISK && bytes[cursor + 1] === SLASH) {
          end = cursor + 1
          break
        }

        cursor += 1
      }

      replaceWithWhitespace(bytes, pos, end)

      pos = end + 1
      continue
    }

    pos += 1
  }
}
/**
 * Clean JS, TS, or similar JS-like code
 *
 * Walks the UTF-16 code units in `bytes` and hands line comments, block
 * comments, and regex literals to `replaceWithWhitespace`. String and
 * template literals are stepped over untouched.
 *
 * Whether a `/` starts a regex literal or is a division is decided from the
 * last code unit seen that was not a space or tab.
 */
function cleanJS(bytes: Uint16Array): void {
  // `NaN` means nothing significant has been seen yet (start of input)
  let prevNonWS = NaN

  // Based on the oxc_parser crate
  // https://github.com/oxc-project/oxc/blob/5f97f28ddbd2cd303a306f7fb0092b0e54bda43c/crates/oxc_parser/src/lexer/regex.rs#L29
  for (let i = 0; i < bytes.length; ++i) {
    let char = bytes[i]
    let peek = bytes[i + 1]

    // Escaped characters
    if (char === BACKSLASH) {
      i += 1
    }

    // Skip over strings using single or double quotes
    else if (char === SINGLE_QUOTE || char === DOUBLE_QUOTE) {
      for (let j = i + 1; j < bytes.length; ++j) {
        let next = bytes[j]
        if (next === BACKSLASH) {
          j += 1
        } else if (next === char || next === LINE_BREAK) {
          i = j
          break
        }
      }

      // Record where the string stopped (closing quote or line break). A `/`
      // right after a string is a division or the end of a self-closing JSX
      // tag (`className="a" />`), never the start of a regex literal.
      prevNonWS = bytes[i]
    }

    // Skip over template literals
    else if (char === BACKTICK) {
      for (let j = i + 1; j < bytes.length; ++j) {
        let next = bytes[j]
        if (next === BACKSLASH) {
          j += 1
        } else if (next === BACKTICK) {
          i = j
          break
        }
      }

      // Same reasoning as for quoted strings above
      prevNonWS = bytes[i]
    }

    // Replace single line comments with whitespace
    else if (char === SLASH && peek === SLASH) {
      // A comment on the last line runs to the end of the input
      let end = bytes.length
      for (let j = i + 2; j < bytes.length; ++j) {
        if (bytes[j] === LINE_BREAK) {
          end = j
          break
        }
      }

      replaceWithWhitespace(bytes, i, end)

      i = end
    }

    // Replace multi line comments with whitespace but preserve line breaks
    else if (char === SLASH && peek === ASTERISK) {
      // An unterminated comment runs to the end of the input
      let end = bytes.length
      for (let j = i + 2; j < bytes.length; ++j) {
        if (bytes[j] === ASTERISK && bytes[j + 1] === SLASH) {
          end = j + 1
          break
        }
      }

      replaceWithWhitespace(bytes, i, end)

      i = end
    }

    // Replace regex literals with whitespace
    else if (char === SLASH) {
      let prev = prevNonWS

      // NOTE(review): the SEMICOLON and BRACKET_OPEN checks were reconstructed
      // from the constant list — confirm against the full file.
      let canStartRegex =
        prev === COMMA ||
        prev === COLON ||
        prev === EQUALS ||
        prev === SEMICOLON ||
        prev === BRACKET_OPEN ||
        prev === QUESTION_MARK ||
        prev === PAREN_OPEN ||
        prev === CURLY_OPEN ||
        prev === GREATER_THAN ||
        prev === LINE_BREAK ||
        isNaN(prev)

      if (!canStartRegex) continue

      // Tracked per literal so that a candidate cut short inside `[...]`
      // cannot affect how later literals are terminated
      let inCharacterClass = false
      let end = -1

      for (let j = i + 1; j < bytes.length; ++j) {
        let next = bytes[j]
        if (next === LINE_BREAK) {
          end = j
          break
        } else if (next === BACKSLASH) {
          j += 1
        } else if (next === SLASH && !inCharacterClass) {
          end = j
          break
        } else if (next === BRACKET_OPEN) {
          inCharacterClass = true
        } else if (next === BRACKET_CLOSE) {
          inCharacterClass = false
        }
      }

      // This is likely an unterminated regex literal
      // We'll skip the regex `/` character if this happens and proceed
      // as if it were not there
      if (end === -1) continue

      replaceWithWhitespace(bytes, i, end)
    }

    // Whitespace can be left as is
    else if (char === SPACE || char === TAB) {
      //
    }

    // We want to capture the previous non-whitespace character
    else {
      prevNonWS = char
    }
  }
}

/**
 * Clean HTML or HTML-like code
 *
 * We *intentionally* don't try to skip comments inside "raw text" HTML tags:
 * -