diff --git a/apps/oxlint/src-js/plugins/comments.ts b/apps/oxlint/src-js/plugins/comments.ts index f8696f1d06cf1..d4e8eef8eb980 100644 --- a/apps/oxlint/src-js/plugins/comments.ts +++ b/apps/oxlint/src-js/plugins/comments.ts @@ -55,6 +55,23 @@ let previousComments: Comment[] = []; // Comments whose `loc` property has been accessed, and therefore needs clearing on reset. const commentsWithLoc: Comment[] = []; +// Tracks indices of deserialized comments so their `value` can be cleared on reset, +// preventing source text strings from being held alive by stale `value` slices. +// +// Pre-allocated in `initCommentsBuffer` to avoid growth during deserialization. +// `Uint32Array` rather than `Array` to avoid GC tracing and write barriers. +// +// `deserializedCommentsLen` is the number of deserialized comments in current file. +// If all comments have been deserialized (`allCommentsDeserialized === true`), `deserializedCommentsLen` is 0, +// and no further indexes are written to `deserializedCommentIndexes`. `resetComments` will reset all comments, +// up to `commentsLen`. +let deserializedCommentIndexes = new Uint32Array(0); +let deserializedCommentsLen = 0; + +// Minimum capacity (in `u32`s) of `deserializedCommentIndexes`, when not empty. +// 16 elements = 64 bytes = 1 cache line. +const DESERIALIZED_COMMENT_INDEXES_MIN_CAPACITY = 16; + // Empty comments array. // Reused for all files which don't have any comments. Frozen to avoid rules mutating it. const EMPTY_COMMENTS: CommentType[] = Object.freeze([]) as unknown as CommentType[]; @@ -69,6 +86,10 @@ debugAssert(COMMENT_SIZE === 1 << COMMENT_SIZE_SHIFT); // Reset `#loc` field on a `Comment` class instance. let resetCommentLoc: (comment: Comment) => void; +// Get `#loc` field on a `Comment` class instance. +// Only used in debug build (tests). +let getCommentPrivateLoc: (comment: Comment) => Location | null; + /** * Comment class. 
* @@ -99,6 +120,8 @@ class Comment implements Span { resetCommentLoc = (comment: Comment) => { comment.#loc = null; }; + + if (DEBUG) getCommentPrivateLoc = (comment: Comment) => comment.#loc; } } @@ -148,6 +171,8 @@ export function deserializeComments(): void { } allCommentsDeserialized = true; + // No need to count any more, since all comments have been deserialized + deserializedCommentsLen = 0; debugCheckDeserializedComments(); } @@ -196,9 +221,25 @@ export function initCommentsBuffer(): void { commentsUint8 = new Uint8Array(arrayBuffer, absolutePos, commentsLen * COMMENT_SIZE); commentsUint32 = new Uint32Array(arrayBuffer, absolutePos, commentsLen * (COMMENT_SIZE >> 2)); - // Grow cache if needed (one-time cost as cache warms up) - while (cachedComments.length < commentsLen) { - cachedComments.push(new Comment()); + // Grow caches if needed. After first few files, caches should have grown large enough to service all files. + // Later files will skip this step, and allocations stop. + if (cachedComments.length < commentsLen) { + do { + cachedComments.push(new Comment()); + } while (cachedComments.length < commentsLen); + + // Grow `deserializedCommentIndexes` if needed. + // `Uint32Array`s can't grow in place, so allocate a new one. + // First allocation uses minimum capacity. Subsequent growths double, to avoid frequent reallocations. + const indexesLen = deserializedCommentIndexes.length; + if (indexesLen < commentsLen) { + deserializedCommentIndexes = new Uint32Array( + Math.max( + commentsLen, + indexesLen === 0 ? DESERIALIZED_COMMENT_INDEXES_MIN_CAPACITY : indexesLen << 1, + ), + ); + } } // If file has a hashbang, eagerly deserialize the first comment, and set its type to `Shebang`. @@ -223,8 +264,23 @@ export function initCommentsBuffer(): void { * @returns Deserialized comment */ export function getComment(index: number): CommentType { - const comment = deserializeCommentIfNeeded(index); - return comment === null ? 
cachedComments[index] : comment; + // Skip all other checks if all comments have been deserialized + if (!allCommentsDeserialized) { + const comment = deserializeCommentIfNeeded(index); + + if (comment !== null) { + // Comment was newly deserialized. + // Record the comment so its `value` can be cleared on reset, preventing source text strings + // from being held alive by stale `value` slices. + // This is in `getComment` rather than `deserializeCommentIfNeeded` so the bulk path + // (`deserializeComments`) skips the tracking - it uses `allCommentsDeserialized` instead. + deserializedCommentIndexes[deserializedCommentsLen++] = index; + return comment; + } + } + + // Comment was already deserialized + return cachedComments[index]; } /** @@ -235,7 +291,7 @@ export function getComment(index: number): CommentType { * @param index - Comment index in the comments buffer * @returns `Comment` object if newly deserialized, or `null` if already deserialized */ -export function deserializeCommentIfNeeded(index: number): Comment | null { +function deserializeCommentIfNeeded(index: number): Comment | null { const pos = index << COMMENT_SIZE_SHIFT; // Fast path: If already deserialized, exit @@ -324,14 +380,66 @@ function debugCheckDeserializedComments(): void { * will recalculate it when the comment is reused for a different file. */ export function resetComments(): void { + // Early exit if comments were never accessed (e.g. no rules used comments-related methods) + if (commentsUint32 === null) { + debugAssertAllCommentsCleared(); + return; + } + + // Clear `value` property of deserialized comments to release source text string slices. + // Without this, V8's SlicedString optimization keeps the entire source text alive + // as long as any comment's `value` (which is a slice of it) exists. 
+ if (allCommentsDeserialized === false) { + // Only a subset of comments have been deserialized, so clear only those + for (let i = 0; i < deserializedCommentsLen; i++) { + cachedComments[deserializedCommentIndexes[i]].value = null!; + } + + deserializedCommentsLen = 0; + } else { + // All comments have been deserialized, so clear them all + for (let i = 0; i < commentsLen; i++) { + cachedComments[i].value = null!; + } + + allCommentsDeserialized = false; + + debugAssert( + deserializedCommentsLen === 0, + "Deserialized comments counter should have been reset to 0", + ); + } + + // Reset `#loc` on comments where `loc` has been accessed for (let i = 0, len = commentsWithLoc.length; i < len; i++) { resetCommentLoc(commentsWithLoc[i]); } + commentsWithLoc.length = 0; + // Reset other state comments = null; commentsUint8 = null; commentsUint32 = null; commentsLen = 0; - allCommentsDeserialized = false; + + debugAssertAllCommentsCleared(); +} + +/** + * Check that all comment objects have been cleared. + * + * Only runs in debug build (tests). In release build, this function is entirely removed by minifier. 
+ */
+function debugAssertAllCommentsCleared(): void {
+  if (!DEBUG) return;
+
+  // Check all cached comments have `value: null` and `#loc: null`
+  for (let i = 0; i < cachedComments.length; i++) {
+    const comment = cachedComments[i];
+    if (comment.value !== null) throw new Error(`Comment ${i} has not had \`value\` cleared`);
+    if (getCommentPrivateLoc(comment) !== null) {
+      throw new Error(`Comment ${i} has not had \`#loc\` cleared`);
+    }
+  }
}
diff --git a/apps/oxlint/src-js/plugins/comments_methods.ts b/apps/oxlint/src-js/plugins/comments_methods.ts
index f272fd21fba2c..aa87036214b4b 100644
--- a/apps/oxlint/src-js/plugins/comments_methods.ts
+++ b/apps/oxlint/src-js/plugins/comments_methods.ts
@@ -7,9 +7,9 @@ import {
   comments,
   commentsUint32,
   commentsLen,
+  getComment,
   initComments,
   initCommentsBuffer,
-  deserializeCommentIfNeeded,
 } from "./comments.ts";
 import {
   initTokensAndCommentsBuffer,
@@ -97,7 +97,7 @@ export function getCommentsBefore(nodeOrToken: NodeOrToken): Comment[] {
       const sliceEnd = sliceStart + (count32 >> MERGED_SIZE32_SHIFT);
 
       for (let i = sliceStart; i < sliceEnd; i++) {
-        deserializeCommentIfNeeded(i);
+        getComment(i);
       }
       return cachedComments.slice(sliceStart, sliceEnd);
     }
@@ -159,7 +159,7 @@ export function getCommentsAfter(nodeOrToken: NodeOrToken): Comment[] {
       const sliceEnd = sliceStart + (count32 >> MERGED_SIZE32_SHIFT);
 
       for (let i = sliceStart; i < sliceEnd; i++) {
-        deserializeCommentIfNeeded(i);
+        getComment(i);
       }
       return cachedComments.slice(sliceStart, sliceEnd);
     }
@@ -188,7 +188,7 @@ export function getCommentsInside(node: Node): Comment[] {
 
   // Deserialize only the comments we're returning
   for (let i = sliceStart; i < sliceEnd; i++) {
-    deserializeCommentIfNeeded(i);
+    getComment(i);
   }
   return cachedComments.slice(sliceStart, sliceEnd);
 }
diff --git a/apps/oxlint/src-js/plugins/tokens.ts b/apps/oxlint/src-js/plugins/tokens.ts
index 107f4b18ad0d0..028d2e935124e 100644
--- a/apps/oxlint/src-js/plugins/tokens.ts
+++ 
b/apps/oxlint/src-js/plugins/tokens.ts @@ -129,9 +129,32 @@ const regexObjects: Regex[] = []; // for the next regex descriptor object which can be reused. const tokensWithRegex: Token[] = []; +// Tracks indices of deserialized tokens so their `value` can be cleared on reset, +// preventing source text strings from being held alive by stale `value` slices. +// +// Pre-allocated in `initTokensBuffer` to avoid growth during deserialization. +// `Uint32Array` rather than `Array` to avoid GC tracing and write barriers. +// +// `deserializedTokensLen` is the number of deserialized tokens in current file. +// If all tokens have been deserialized (`allTokensDeserialized === true`), `deserializedTokensLen` is 0, +// and no further indexes are written to `deserializedTokenIndexes`. `resetTokens` will reset all tokens, +// up to `tokensLen`. +let deserializedTokenIndexes = new Uint32Array(0); +let deserializedTokensLen = 0; + +// Minimum capacity (in `u32`s) of `deserializedTokenIndexes`, when not empty. +// 16 elements = 64 bytes = 1 cache line. +// Note that this default aims to be a reasonable minimum for number of *deserialized* tokens, +// not *total* number of tokens. +const DESERIALIZED_TOKEN_INDEXES_MIN_CAPACITY = 16; + // Reset `#loc` field on a `Token` class instance let resetLoc: (token: Token) => void; +// Get `#loc` field on a `Token` class instance. +// Only used in debug build (tests). +let getTokenPrivateLoc: (token: Token) => Location | null; + /** * Token implementation with lazy `loc` caching via private field. 
* @@ -162,6 +185,8 @@ class Token { resetLoc = (token: Token) => { token.#loc = null; }; + + if (DEBUG) getTokenPrivateLoc = (token: Token) => token.#loc; } } @@ -245,6 +270,8 @@ export function deserializeTokens(): void { } allTokensDeserialized = true; + // No need to count any more, since all tokens have been deserialized + deserializedTokensLen = 0; debugCheckDeserializedTokens(); } @@ -277,9 +304,25 @@ export function initTokensBuffer(): void { tokensUint8 = new Uint8Array(arrayBuffer, absolutePos, tokensLen << TOKEN_SIZE_SHIFT); tokensUint32 = new Uint32Array(arrayBuffer, absolutePos, tokensLen << (TOKEN_SIZE_SHIFT - 2)); - // Grow cache if needed (one-time cost as cache warms up) - while (cachedTokens.length < tokensLen) { - cachedTokens.push(new Token()); + // Grow caches if needed. After first few files, caches should have grown large enough to service all files. + // Later files will skip this step, and allocations stop. + if (cachedTokens.length < tokensLen) { + do { + cachedTokens.push(new Token()); + } while (cachedTokens.length < tokensLen); + + // Grow `deserializedTokenIndexes` if needed. + // `Uint32Array`s can't grow in place, so allocate a new one. + // First allocation uses minimum capacity. Subsequent growths double, to avoid frequent reallocations. + const indexesLen = deserializedTokenIndexes.length; + if (indexesLen < tokensLen) { + deserializedTokenIndexes = new Uint32Array( + Math.max( + tokensLen, + indexesLen === 0 ? DESERIALIZED_TOKEN_INDEXES_MIN_CAPACITY : indexesLen << 1, + ), + ); + } } // Check buffer data has valid ranges and ascending order @@ -295,8 +338,23 @@ export function initTokensBuffer(): void { * @returns Deserialized token */ export function getToken(index: number): TokenType { - const token = deserializeTokenIfNeeded(index); - return (token === null ? 
cachedTokens[index] : token) as TokenType; + // Skip all other checks if all tokens have been deserialized + if (!allTokensDeserialized) { + const token = deserializeTokenIfNeeded(index); + + if (token !== null) { + // Token was newly deserialized. + // Record the token so its `value` can be cleared on reset, preventing source text strings + // from being held alive by stale `value` slices. + // This is in `getToken` rather than `deserializeTokenIfNeeded` so the bulk path + // (`deserializeTokens`) skips the tracking - it uses `allTokensDeserialized` instead. + deserializedTokenIndexes[deserializedTokensLen++] = index; + return token as TokenType; + } + } + + // Token was already deserialized + return cachedTokens[index] as TokenType; } /** @@ -307,7 +365,7 @@ export function getToken(index: number): TokenType { * @param index - Token index in the tokens buffer * @returns `Token` object if newly deserialized, or `null` if already deserialized */ -export function deserializeTokenIfNeeded(index: number): Token | null { +function deserializeTokenIfNeeded(index: number): Token | null { const pos = index << TOKEN_SIZE_SHIFT; // Fast path: If already deserialized, exit @@ -343,7 +401,7 @@ export function deserializeTokenIfNeeded(index: number): Token | null { if (regexObjects.length > regexIndex) { regex = regexObjects[regexIndex]; } else { - regexObjects.push((regex = { pattern: "", flags: "" })); + regexObjects.push((regex = { pattern: null!, flags: null! })); } token.regex = regex; @@ -431,19 +489,89 @@ function debugCheckDeserializedTokens(): void { * will recalculate it when the token is reused for a different file. */ export function resetTokens() { + // Early exit if tokens were never accessed (e.g. no rules used tokens-related methods) + if (tokensUint32 === null) { + debugAssertAllTokensCleared(); + return; + } + + // Clear `value` property of deserialized tokens to release source text string slices. 
+ // Without this, V8's SlicedString optimization keeps the entire source text alive + // as long as any token's `value` (which is a slice of it) exists. + if (allTokensDeserialized === false) { + // Only a subset of tokens have been deserialized, so clear only those + for (let i = 0; i < deserializedTokensLen; i++) { + cachedTokens[deserializedTokenIndexes[i]].value = null!; + } + + deserializedTokensLen = 0; + } else { + // All tokens have been deserialized, so clear them all + for (let i = 0; i < tokensLen; i++) { + cachedTokens[i].value = null!; + } + + allTokensDeserialized = false; + + debugAssert( + deserializedTokensLen === 0, + "Deserialized tokens counter should have been reset to 0", + ); + } + + // Reset `#loc` on tokens where `loc` has been accessed for (let i = 0, len = tokensWithLoc.length; i < len; i++) { resetLoc(tokensWithLoc[i]); } + tokensWithLoc.length = 0; + // Reset `regex` property on regex tokens. + // This avoids having to set it for every non-regex token in `deserializeTokenIfNeeded`. + // Clear `pattern` and `flags` fields of `Regex` object, to release source text string slices (see above). for (let i = 0, len = tokensWithRegex.length; i < len; i++) { - tokensWithRegex[i].regex = undefined; + const token = tokensWithRegex[i]; + + const regex = token.regex!; + regex.pattern = null!; + regex.flags = null!; + + token.regex = undefined; } + tokensWithRegex.length = 0; + // Clear other state tokens = null; tokensUint8 = null; tokensUint32 = null; tokensLen = 0; - allTokensDeserialized = false; + + debugAssertAllTokensCleared(); +} + +/** + * Check that all token and regex objects have been cleared. + * + * Only runs in debug build (tests). In release build, this function is entirely removed by minifier. 
+ */ +function debugAssertAllTokensCleared(): void { + if (!DEBUG) return; + + // Check all cached tokens have `value: null`, `#loc: null`, and `regex: undefined` + for (let i = 0; i < cachedTokens.length; i++) { + const token = cachedTokens[i]; + if (token.value !== null) throw new Error(`Token ${i} has not had \`value\` cleared`); + if (getTokenPrivateLoc(token) !== null) { + throw new Error(`Token ${i} has not had \`#loc\` cleared`); + } + if (token.regex !== undefined) throw new Error(`Token ${i} has not had \`regex\` cleared`); + } + + // Check all regex objects have `pattern: null` and `flags: null` + for (let i = 0; i < regexObjects.length; i++) { + const regex = regexObjects[i]; + if (regex.pattern !== null) throw new Error(`Regex ${i} has not had \`pattern\` cleared`); + if (regex.flags !== null) throw new Error(`Regex ${i} has not had \`flags\` cleared`); + } } diff --git a/apps/oxlint/src-js/plugins/tokens_methods.ts b/apps/oxlint/src-js/plugins/tokens_methods.ts index 21bf31bb41482..8f954531db38d 100644 --- a/apps/oxlint/src-js/plugins/tokens_methods.ts +++ b/apps/oxlint/src-js/plugins/tokens_methods.ts @@ -2,14 +2,7 @@ * `SourceCode` methods related to tokens. */ -import { - cachedTokens, - tokensUint32, - tokensLen, - initTokensBuffer, - getToken, - deserializeTokenIfNeeded, -} from "./tokens.ts"; +import { cachedTokens, tokensUint32, tokensLen, initTokensBuffer, getToken } from "./tokens.ts"; import { tokensAndCommentsUint32, tokensAndCommentsLen, @@ -1508,7 +1501,7 @@ function collectEntries( if (includeComments === false) { // Batch-deserialize tokens in range, then slice from `cachedTokens` array for (let i = startIndex; i < endIndex; i++) { - deserializeTokenIfNeeded(i); + getToken(i); } return cachedTokens!.slice(startIndex, endIndex) as TokenOrComment[]; }