-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
perf(core): Eliminate stat() syscall and lazy-load encoding libraries in fileRead #1399
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,20 @@ | ||
| import * as fs from 'node:fs/promises'; | ||
| import iconv from 'iconv-lite'; | ||
| import isBinaryPath from 'is-binary-path'; | ||
| import { isBinaryFile } from 'isbinaryfile'; | ||
| import jschardet from 'jschardet'; | ||
| import { logger } from '../../shared/logger.js'; | ||
|
|
||
| // Lazy-load encoding detection libraries to avoid their ~25ms combined import cost. | ||
| // The fast UTF-8 path (covers ~99% of source code files) never needs these; | ||
| // they are only loaded when a file fails UTF-8 decoding. | ||
| let _jschardet: typeof import('jschardet') | undefined; | ||
| let _iconv: typeof import('iconv-lite') | undefined; | ||
| const getEncodingDeps = async () => { | ||
| if (!_jschardet || !_iconv) { | ||
| [_jschardet, _iconv] = await Promise.all([import('jschardet'), import('iconv-lite')]); | ||
| } | ||
| return { jschardet: _jschardet, iconv: _iconv }; | ||
| }; | ||
|
|
||
| export type FileSkipReason = 'binary-extension' | 'binary-content' | 'size-limit' | 'encoding-error'; | ||
|
|
||
| export interface FileReadResult { | ||
|
|
@@ -20,25 +30,26 @@ export interface FileReadResult { | |
| */ | ||
| export const readRawFile = async (filePath: string, maxFileSize: number): Promise<FileReadResult> => { | ||
| try { | ||
| // Check binary extension first (no I/O needed) to skip stat + read for binary files | ||
| // Check binary extension first (no I/O needed) to skip read for binary files | ||
| if (isBinaryPath(filePath)) { | ||
| logger.debug(`Skipping binary file: ${filePath}`); | ||
| return { content: null, skippedReason: 'binary-extension' }; | ||
| } | ||
|
|
||
| const stats = await fs.stat(filePath); | ||
| logger.trace(`Reading file: ${filePath}`); | ||
|
|
||
| // Read the file directly and check size afterward, avoiding a separate stat() syscall. | ||
| // This halves the number of I/O operations per file. | ||
| // Files exceeding maxFileSize are rare, so the occasional oversized read is acceptable. | ||
| const buffer = await fs.readFile(filePath); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Removing the `stat()` call […] Consider if the performance gain of one less `stat()` syscall […] |
||
|
|
||
| if (stats.size > maxFileSize) { | ||
| const sizeKB = (stats.size / 1024).toFixed(1); | ||
| if (buffer.length > maxFileSize) { | ||
| const sizeKB = (buffer.length / 1024).toFixed(1); | ||
| const maxSizeKB = (maxFileSize / 1024).toFixed(1); | ||
| logger.trace(`File exceeds size limit: ${sizeKB}KB > ${maxSizeKB}KB (${filePath})`); | ||
| return { content: null, skippedReason: 'size-limit' }; | ||
| } | ||
|
|
||
| logger.trace(`Reading file: ${filePath}`); | ||
|
|
||
| const buffer = await fs.readFile(filePath); | ||
|
|
||
| if (await isBinaryFile(buffer)) { | ||
| logger.debug(`Skipping binary file (content check): ${filePath}`); | ||
| return { content: null, skippedReason: 'binary-content' }; | ||
|
|
@@ -58,9 +69,11 @@ export const readRawFile = async (filePath: string, maxFileSize: number): Promis | |
| } | ||
|
|
||
| // Slow path: Detect encoding with jschardet for non-UTF-8 files (e.g., Shift-JIS, EUC-KR) | ||
| const { encoding: detectedEncoding } = jschardet.detect(buffer) ?? {}; | ||
| const encoding = detectedEncoding && iconv.encodingExists(detectedEncoding) ? detectedEncoding : 'utf-8'; | ||
| const content = iconv.decode(buffer, encoding, { stripBOM: true }); | ||
| const encodingDeps = await getEncodingDeps(); | ||
| const { encoding: detectedEncoding } = encodingDeps.jschardet.detect(buffer) ?? {}; | ||
| const encoding = | ||
| detectedEncoding && encodingDeps.iconv.encodingExists(detectedEncoding) ? detectedEncoding : 'utf-8'; | ||
| const content = encodingDeps.iconv.decode(buffer, encoding, { stripBOM: true }); | ||
|
|
||
| if (content.includes('\uFFFD')) { | ||
| logger.debug(`Skipping file due to encoding errors (detected: ${encoding}): ${filePath}`); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The current implementation of `getEncodingDeps` has a potential race condition. If multiple concurrent calls to `readRawFile` trigger the slow path (non-UTF-8 files) before the first `import()` completes, the modules will be imported multiple times. Additionally, the return type of the function includes `undefined` for both dependencies, which will cause TypeScript errors or runtime crashes when calling `encodingDeps.jschardet.detect()` at line 73.
Using a single promise to manage the lazy loading ensures that the modules are only loaded once and provides a clean, non-nullable return type for the caller.