+
+
+
+
+
+
+ `
+
+ let result = getTextWithoutComments(input, 'html')
+ expect(result).toMatchInlineSnapshot(`
+ "
+
+
+
+
+
+
+
Some text
+
+
+
+
+
+
More text
+
+
+
Content
+
+
+
Content
+
+
+
Content
+
+
+
Content
+
+
+
Content
+
+
+
Content
+
+
+
Empty comment above
+
+
+
Content
+
+
+
Content
+
+
+
Content
+
+
+
Content
+
+
+
+
+
+
+
Content
+
+
+
Content
+
+
+
+
+
Content
+
+
+ Content for non-IE browsers
+
+
Content
+
+
+
Content
+
+
+
Content
+
+
+
+
+
+
+
+
+
Content
+
+
+
Content
+
+
+
Content
+
+
+
+
+
+
+ "
+ `)
+})
+
+test('Cleans CSS', () => {
+ let input = `
+ /* Simple single-line block comment */
+ .class1 { color: red; }
+
+ /*
+ * Multi-line block comment
+ * with asterisks on each line
+ * spanning multiple lines
+ */
+ .class2 { color: blue; }
+
+ /**
+ * Doc-style comment
+ * Often used for documentation
+ */
+ .class3 { color: green; }
+
+ /* Comment with "double quotes" inside */
+ .class4 { color: yellow; }
+
+ /* Comment with 'single quotes' inside */
+ .class5 { color: orange; }
+
+ /* Comment with special chars: < > & */
+ .class6 { color: purple; }
+
+ /* Comment with CSS-like content: .fake { color: red; } */
+ .class7 { color: pink; }
+
+ /* Comment with URL-like content: https://example.com */
+ .class8 { color: brown; }
+
+ /* Comment with // which is not a line comment in CSS */
+ .class9 { color: gray; }
+
+ /* Consecutive */ /* Comments */
+ .class10 { color: teal; }
+
+ .inline { color: /* inline comment */ red; }
+
+ .property /* comment between property */ : /* and value */ blue;
+
+ /* Empty comment: *//**/
+ .class11 { color: navy; }
+
+ /* Comment with asterisks **** inside *** */
+ .class12 { color: olive; }
+
+ /* Comment ending with multiple asterisks ***/
+ .class13 { color: maroon; }
+
+ /*** Comment starting with multiple asterisks */
+ .class14 { color: lime; }
+
+ /* Comment with nested /* fake opening */
+ .class15 { color: aqua; }
+
+ /* Comment with HTML:
content
*/
+ .class16 { color: fuchsia; }
+
+ /* Comment with JS: const x = 1; // not a comment */
+ .class17 { color: silver; }
+
+ /* Comment with escaped content: \\*/ still in comment */
+ .class18 { color: black; }
+
+ /* Multi-line
+ comment without
+ asterisks on
+ each line */
+ .class19 { color: white; }
+
+ .url-property {
+ background: url("image.png"); /* comment after url */
+ background: url('image.png'); /* another comment */
+ background: url(image.png); /* unquoted url */
+ }
+
+ .string-property {
+ content: "String with /* fake comment */ inside";
+ content: 'String with /* fake comment */ inside';
+ content: "String with // not a comment";
+ }
+
+ .data-uri {
+ background: url("data:image/svg+xml,
");
+ }
+
+ /* Comment before at-rule */
+ @media screen and (min-width: 768px) {
+ /* Comment inside at-rule */
+ .responsive { color: red; }
+ }
+
+ /* Comment before keyframes */
+ @keyframes spin {
+ /* Comment at start */
+ 0% { transform: rotate(0deg); }
+ /* Comment between keyframes */
+ 100% { transform: rotate(360deg); }
+ /* Comment at end */
+ }
+
+ :root {
+ --my-var: red; /* Comment after custom property */
+ /* Comment before custom property */
+ --another-var: blue;
+ }
+
+ .calc-property {
+ width: calc(100% - 20px); /* Comment after calc */
+ height: calc(/* comment in calc */ 50vh - 10px);
+ }
+
+ /* Comment with unicode: 你好 مرحبا 🎉 */
+ .unicode { color: red; }
+
+ /* Very long comment that goes on and on and on and on and on and on and on and on and on and on and on and on and on */
+ .long { color: red; }
+
+ /*
+ Comment with various whitespace:
+ tabs
+ spaces
+
+ blank lines
+ */
+ .whitespace { color: red; }
+
+ /* Comment immediately before brace */{
+ color: red;
+ }
+
+ .selector/* comment in selector */.chained { color: red; }
+
+ /* Final comment at end of file */
+ `
+
+ let result = getTextWithoutComments(input, 'css')
+ expect(result).toMatchInlineSnapshot(`
+ "
+
+ .class1 { color: red; }
+
+
+
+
+
+
+ .class2 { color: blue; }
+
+
+
+
+
+ .class3 { color: green; }
+
+
+ .class4 { color: yellow; }
+
+
+ .class5 { color: orange; }
+
+
+ .class6 { color: purple; }
+
+
+ .class7 { color: pink; }
+
+
+ .class8 { color: brown; }
+
+
+ .class9 { color: gray; }
+
+
+ .class10 { color: teal; }
+
+ .inline { color: red; }
+
+ .property : blue;
+
+
+ .class11 { color: navy; }
+
+
+ .class12 { color: olive; }
+
+
+ .class13 { color: maroon; }
+
+
+ .class14 { color: lime; }
+
+
+ .class15 { color: aqua; }
+
+
+ .class16 { color: fuchsia; }
+
+
+ .class17 { color: silver; }
+
+
+ .class18 { color: black; }
+
+
+
+
+
+ .class19 { color: white; }
+
+ .url-property {
+ background: url("image.png");
+ background: url('image.png');
+ background: url(image.png);
+ }
+
+ .string-property {
+ content: "String with /* fake comment */ inside";
+ content: 'String with /* fake comment */ inside';
+ content: "String with // not a comment";
+ }
+
+ .data-uri {
+ background: url("data:image/svg+xml,
");
+ }
+
+
+ @media screen and (min-width: 768px) {
+
+ .responsive { color: red; }
+ }
+
+
+ @keyframes spin {
+
+ 0% { transform: rotate(0deg); }
+
+ 100% { transform: rotate(360deg); }
+
+ }
+
+ :root {
+ --my-var: red;
+
+ --another-var: blue;
+ }
+
+ .calc-property {
+ width: calc(100% - 20px);
+ height: calc( 50vh - 10px);
+ }
+
+
+ .unicode { color: red; }
+
+
+ .long { color: red; }
+
+
+
+
+
+
+
+
+ .whitespace { color: red; }
+
+ {
+ color: red;
+ }
+
+ .selector .chained { color: red; }
+
+
+ "
+ `)
+})
+
+test('Cleans multibyte CSS', () => {
+ let input = `/* Comment with unicode: 你好 مرحبا 🎉 */`
+
+ let result = getTextWithoutComments(input, 'css')
+ expect(input.length).toEqual(result.length)
+ expect(result).toEqual(' ')
+})
diff --git a/packages/tailwindcss-language-service/src/util/doc.ts b/packages/tailwindcss-language-service/src/util/doc.ts
index 8cf0dd66..d3f6a79a 100644
--- a/packages/tailwindcss-language-service/src/util/doc.ts
+++ b/packages/tailwindcss-language-service/src/util/doc.ts
@@ -1,255 +1,250 @@
import type { Range } from 'vscode-languageserver'
import type { TextDocument } from 'vscode-languageserver-textdocument'
-import moo from 'moo'
-import { spliceChangesIntoString, StringChange } from './splice-changes-into-string'
+
+const BACKSLASH = 0x5c // \
+const SLASH = 0x2f // /
+const LINE_BREAK = 0x0a // \n
+const COMMA = 0x2c // ,
+const COLON = 0x3a // :
+const EQUALS = 0x3d // =
+const SEMICOLON = 0x3b // ;
+const BRACKET_OPEN = 0x5b // [
+const BRACKET_CLOSE = 0x5d // ]
+const QUESTION_MARK = 0x3f // ?
+const PAREN_OPEN = 0x28 // (
+const CURLY_OPEN = 0x7b // {
+const DOUBLE_QUOTE = 0x22 // "
+const SINGLE_QUOTE = 0x27 // '
+const BACKTICK = 0x60 // `
+const ASTERISK = 0x2a // *
+const SPACE = 0x20 // " "
+const TAB = 0x09 // \t
+const GREATER_THAN = 0x3e // >
+const LESS_THAN = 0x3c // <
+const EXCLAMATION_MARK = 0x21 // !
+const DASH = 0x2d // -
+
+const decoder = new TextDecoder('utf-16')
export function getTextWithoutComments(
doc: TextDocument,
type: 'html' | 'js' | 'css',
range?: Range,
): string
+
export function getTextWithoutComments(text: string, type: 'html' | 'js' | 'css'): string
+/**
+ * Cleanup the given document and/or code for analysis
+ *
+ * We preprocess text to ensure we don't look inside comments for class lists,
+ * `@apply` directives, or embedded documents.
+ *
+ * The following are replaced with whitespace while preserving line breaks:
+ * - Single line comments
+ * - Multi line comments
+ * - Regex literals (where applicable)
+ *
+ * Preservation of line breaks is critical for mapping positions back to the
+ * original source code.
+ */
export function getTextWithoutComments(
- docOrText: TextDocument | string,
+ input: TextDocument | string,
type: 'html' | 'js' | 'css',
range?: Range,
): string {
- let text = typeof docOrText === 'string' ? docOrText : docOrText.getText(range)
-
- if (type === 'js') {
- return getJsWithoutComments(text)
+ let text = typeof input === 'string' ? input : input.getText(range)
+
+ // We want to replace "unnecessary" or "uninteresting" substrings with
+ // whitespace. Notably, we must do this without changing character offsets
+ // or the length of the resulting string. This is critical for mapping
+ // offsets and positions back to the original, unprocessed document.
+ //
+ // We can simplify the replacement process by using a mutable view of the
+ // string which eliminates bookkeeping and intermediate allocations.
+ //
+ // We cannot use the builtin `TextEncoder` as it only outputs UTF-8 bytes and
+ // using that would mean that in-place replacements of multi-byte characters
+ // with spaces change the length of the string and any following offsets.
+ //
+ // Building up a typed array of UTF-16 code units manually is quick, gives us
+ // a mutable view of the string, and can be very quickly turned into a string
+ // by using `TextDecoder` with a UTF-16 encoding.
+ let bytes = new Uint16Array(text.length)
+
+ for (let i = 0; i < text.length; i++) {
+ bytes[i] = text.charCodeAt(i)
}
- if (type === 'css') {
- return getCssWithoutComments(text)
+ if (type === 'js') {
+ cleanJS(bytes)
+ } else if (type === 'css') {
+ cleanCSS(bytes)
+ } else if (type === 'html') {
+ cleanHTML(bytes)
}
- return text.replace(//gs, replace)
+ return decoder.decode(bytes)
}
-function getCssWithoutComments(input: string) {
- const DOUBLE_QUOTE = 0x22 // "
- const SINGLE_QUOTE = 0x27 // '
- const BACKSLASH = 0x5c // \
- const SLASH = 0x2f // /
- const ASTERISK = 0x2a // *
- const LINE_BREAK = 0x0a // \n
-
- let changes: StringChange[] = []
-
- // Collect ranges for every comment in the input.
- for (let i = 0; i < input.length; ++i) {
- let currentChar = input.charCodeAt(i)
+/**
+ * Clean CSS, SCSS, Less, or similar CSS-like code
+ */
+function cleanCSS(bytes: Uint16Array): void {
+ for (let i = 0; i < bytes.length; ++i) {
+ let currentChar = bytes[i]
if (currentChar === BACKSLASH) {
i += 1
}
- // Skip over strings — they are to be left untouched
+ // Skip over strings
else if (currentChar === SINGLE_QUOTE || currentChar === DOUBLE_QUOTE) {
- for (let j = i + 1; j < input.length; ++j) {
- let peekChar = input.charCodeAt(j)
+ for (let j = i + 1; j < bytes.length; ++j) {
+ let peek = bytes[j]
// Current character is a `\` therefore the next character is escaped.
- if (peekChar === BACKSLASH) {
+ if (peek === BACKSLASH) {
j += 1
}
// End of the string.
- else if (peekChar === currentChar) {
+ else if (peek === currentChar) {
i = j
break
- } else if (peekChar === LINE_BREAK) {
+ } else if (peek === LINE_BREAK) {
i = j
break
}
}
- } else if (currentChar === SLASH && input.charCodeAt(i + 1) === ASTERISK) {
- let start = i
+ }
- for (let j = i + 2; j < input.length; j++) {
- let peekChar = input.charCodeAt(j)
+ // Replace comments with whitespace
+ else if (currentChar === SLASH && bytes[i + 1] === ASTERISK) {
+ let end = bytes.length
+
+ for (let j = i + 2; j < bytes.length; j++) {
+ let peek = bytes[j]
// Current character is a `\` therefore the next character is escaped.
- if (peekChar === BACKSLASH) {
+ if (peek === BACKSLASH) {
j += 1
}
// End of the comment
- else if (peekChar === ASTERISK && input.charCodeAt(j + 1) === SLASH) {
- i = j + 1
+ else if (peek === ASTERISK && bytes[j + 1] === SLASH) {
+ end = j + 1
break
}
}
- changes.push({
- start,
- end: i + 1,
- replacement: replace(input.slice(start, i + 1)),
- })
- }
- }
-
- return spliceChangesIntoString(input, changes)
-}
-
-function replace(match: string): string {
- return match.replace(/./gs, (char) => (char === '\n' ? '\n' : ' '))
-}
+ replaceWithWhitespace(bytes, i, end)
-let jsLexer: moo.Lexer
-
-function getJsWithoutComments(text: string): string {
- if (!jsLexer) {
- jsLexer = moo.states({
- main: {
- commentLine: /\/\/.*?$/,
- commentBlock: { match: /\/\*[^]*?\*\//, lineBreaks: true },
- stringDouble: /"(?:[^"\\]|\\.)*"/,
- stringSingle: /'(?:[^'\\]|\\.)*'/,
- stringBacktick: /`(?:[^`\\]|\\.)*`/,
- other: { match: /[^]/, lineBreaks: true },
- },
- })
- }
-
- let str = ''
- jsLexer.reset(text)
-
- for (let token of jsLexer) {
- if (token.type === 'commentLine') {
- str += ' '.repeat(token.value.length)
- } else if (token.type === 'commentBlock') {
- str += token.value.replace(/./g, ' ')
- } else {
- str += token.value
+ i = end
}
}
-
- str = stripRegexLiterals(str)
-
- return str
}
-function stripRegexLiterals(input: string) {
- const BACKSLASH = 0x5c // \
- const SLASH = 0x2f // /
- const LINE_BREAK = 0x0a // \n
- const COMMA = 0x2c // ,
- const COLON = 0x3a // :
- const EQUALS = 0x3d // =
- const SEMICOLON = 0x3b // ;
- const BRACKET_OPEN = 0x5b // [
- const BRACKET_CLOSE = 0x5d // ]
- const QUESTION_MARK = 0x3f // ?
- const PAREN_OPEN = 0x28 // (
- const CURLY_OPEN = 0x7b // {
- const DOUBLE_QUOTE = 0x22 // "
- const SINGLE_QUOTE = 0x27 // '
- const BACKTICK = 0x60 // `
-
- let SPACE = 0x20 // " "
- let TAB = 0x09 // \t
-
- // Top level; or
- // after comma
- // after colon
- // after equals
- // after semicolon
- // after square bracket (arrays, object property expressions)
- // after question mark
- // after open paren
- // after curly (jsx only)
-
- let inRegex = false
- let inEscape = false
+/**
+ * Clean JS, TS, or similar JS-like code
+ */
+function cleanJS(bytes: Uint16Array): void {
let inCharacterClass = false
-
- let regexStart = -1
- let regexEnd = -1
+ let prevNonWS = NaN
// Based on the oxc_parser crate
// https://github.com/oxc-project/oxc/blob/5f97f28ddbd2cd303a306f7fb0092b0e54bda43c/crates/oxc_parser/src/lexer/regex.rs#L29
- let prev = null
- for (let i = 0; i < input.length; ++i) {
- let c = input.charCodeAt(i)
+ for (let i = 0; i < bytes.length; ++i) {
+ let char = bytes[i]
+ let peek = bytes[i + 1]
- if (inRegex) {
- if (c === LINE_BREAK) {
- break
- } else if (inEscape) {
- inEscape = false
- } else if (c === SLASH && !inCharacterClass) {
- inRegex = false
- regexEnd = i
- break
- } else if (c === BRACKET_OPEN) {
- inCharacterClass = true
- } else if (c === BACKSLASH) {
- inEscape = true
- } else if (c === BRACKET_CLOSE) {
- inCharacterClass = false
- }
-
- continue
+ // Escaped characters
+ if (char === BACKSLASH) {
+ i += 1
}
- // Skip over strings
- if (c === SINGLE_QUOTE) {
- for (let j = i; j < input.length; ++j) {
- let peekChar = input.charCodeAt(j)
-
- if (peekChar === BACKSLASH) {
+ // Skip over strings using single quotes
+ else if (char === SINGLE_QUOTE) {
+ for (let j = i + 1; j < bytes.length; ++j) {
+ let peek = bytes[j]
+ if (peek === BACKSLASH) {
j += 1
- } else if (peekChar === SINGLE_QUOTE) {
+ } else if (peek === SINGLE_QUOTE) {
i = j
break
- } else if (peekChar === LINE_BREAK) {
+ } else if (peek === LINE_BREAK) {
i = j
break
}
}
}
- //
- else if (c === DOUBLE_QUOTE) {
- for (let j = i; j < input.length; ++j) {
- let peekChar = input.charCodeAt(j)
- if (peekChar === BACKSLASH) {
+ // Skip over strings using double quotes
+ else if (char === DOUBLE_QUOTE) {
+ for (let j = i + 1; j < bytes.length; ++j) {
+ let peek = bytes[j]
+ if (peek === BACKSLASH) {
j += 1
- } else if (peekChar === DOUBLE_QUOTE) {
+ } else if (peek === DOUBLE_QUOTE) {
i = j
break
- } else if (peekChar === LINE_BREAK) {
+ } else if (peek === LINE_BREAK) {
i = j
break
}
}
}
- //
- else if (c === BACKTICK) {
- for (let j = i; j < input.length; ++j) {
- let peekChar = input.charCodeAt(j)
- if (peekChar === BACKSLASH) {
+ // Skip over template literals
+ else if (char === BACKTICK) {
+ for (let j = i + 1; j < bytes.length; ++j) {
+ let peek = bytes[j]
+ if (peek === BACKSLASH) {
j += 1
- } else if (peekChar === BACKTICK) {
+ } else if (peek === BACKTICK) {
i = j
break
- } else if (peekChar === LINE_BREAK) {
- i = j
+ }
+ }
+ }
+
+ // Replace single line comments with whitespace
+ else if (char === SLASH && peek === SLASH) {
+ let end = bytes.length
+ for (let j = i + 2; j < bytes.length; ++j) {
+ let peek = bytes[j]
+ if (peek === LINE_BREAK) {
+ end = j
break
}
}
+
+ replaceWithWhitespace(bytes, i, end)
+
+ i = end
}
- //
- else if (c === SPACE || c === TAB) {
- // do nothing
+
+ // Replace multi line comments with whitespace but preserve line breaks
+ else if (char === SLASH && peek === ASTERISK) {
+ let end = bytes.length
+ for (let j = i + 2; j < bytes.length; ++j) {
+ let curr = bytes[j]
+ let peek = bytes[j + 1]
+ if (curr === ASTERISK && peek === SLASH) {
+ end = j + 1
+ break
+ }
+ }
+
+ replaceWithWhitespace(bytes, i, end)
+
+ i = end
}
+
//
- else if (c === SLASH) {
- if (
+ else if (char === SLASH) {
+ let prev = prevNonWS
+ let canStartRegex =
prev === COMMA ||
prev === COLON ||
prev === EQUALS ||
@@ -258,24 +253,128 @@ function stripRegexLiterals(input: string) {
prev === QUESTION_MARK ||
prev === PAREN_OPEN ||
prev === CURLY_OPEN ||
- prev === LINE_BREAK
- ) {
- inRegex = true
- regexStart = i
+ prev === GREATER_THAN ||
+ prev === LINE_BREAK ||
+ prev === SPACE ||
+ prev === TAB ||
+ isNaN(prev)
+
+ if (!canStartRegex) continue
+
+ let end = -1
+
+ for (let j = i + 1; j < bytes.length; ++j) {
+ let peek = bytes[j]
+ if (peek === LINE_BREAK) {
+ end = j
+ break
+ } else if (peek === BACKSLASH) {
+ j += 1
+ } else if (peek === SLASH && !inCharacterClass) {
+ end = j
+ break
+ } else if (peek === BRACKET_OPEN) {
+ inCharacterClass = true
+ } else if (peek === BRACKET_CLOSE) {
+ inCharacterClass = false
+ }
}
+
+ // This is likely an unterminated regex literal
+ // We'll skip the regex `/` character if this happens and proceed
+ // as if it were not there
+ if (end === -1) continue
+
+ replaceWithWhitespace(bytes, i, end)
}
- //
+
+ // Whitespace can be left as is
+ else if (char === SPACE || char === TAB) {
+ // Intentionally empty: spaces and tabs must not update `prevNonWS`
+ }
+
+ // We want to capture the previous non-whitespace character
else {
- prev = c
+ prevNonWS = char
}
}
+}
+
+/**
+ * Clean HTML or HTML-like code
+ *
+ * We *intentionally* don't try to skip comments inside "raw text" HTML tags:
+ * -