From 7dcdbae24da6efc226c1e296a6289e3e16899433 Mon Sep 17 00:00:00 2001 From: Kazuki Yamada Date: Wed, 11 Feb 2026 01:17:03 +0900 Subject: [PATCH 1/3] perf(core): Add UTF-8 fast path to skip expensive jschardet encoding detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, every file went through jschardet.detect() which scans the entire buffer through multiple encoding probers (MBCS, SBCS, Latin1) with frequency table lookups — the most expensive CPU operation in file collection. Since ~99% of source code files are UTF-8, we now try TextDecoder('utf-8', { fatal: true }) first. If it succeeds, jschardet and iconv are skipped entirely. Non-UTF-8 files (e.g., Shift-JIS, EUC-KR) fall back to the original detection path. Additionally, set concurrentTasksPerWorker=3 for fileCollect workers to better overlap I/O waits within each worker thread. Benchmark results (838 files, 10 CPU cores): - Before: ~616ms - After: ~108ms (5.7x faster) --- src/core/file/fileCollect.ts | 1 + src/core/file/fileRead.ts | 27 ++++++++++++++------------- src/shared/processConcurrency.ts | 4 +++- tests/core/file/fileCollect.test.ts | 20 ++++++++++---------- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/src/core/file/fileCollect.ts b/src/core/file/fileCollect.ts index 07191634f..1d4915728 100644 --- a/src/core/file/fileCollect.ts +++ b/src/core/file/fileCollect.ts @@ -27,6 +27,7 @@ export const collectFiles = async ( numOfTasks: filePaths.length, workerType: 'fileCollect', runtime: 'worker_threads', + concurrentTasksPerWorker: 3, }); const tasks = filePaths.map( (filePath) => diff --git a/src/core/file/fileRead.ts b/src/core/file/fileRead.ts index 85166354d..0d1a78f87 100644 --- a/src/core/file/fileRead.ts +++ b/src/core/file/fileRead.ts @@ -43,24 +43,25 @@ export const readRawFile = async (filePath: string, maxFileSize: number): Promis return { content: null, skippedReason: 'binary-content' }; } + // Fast path: Try UTF-8 
decoding first (covers ~99% of source code files) + // This skips the expensive jschardet.detect() which scans the entire buffer + // through multiple encoding probers with frequency table lookups + try { + let content = new TextDecoder('utf-8', { fatal: true }).decode(buffer); + if (content.charCodeAt(0) === 0xfeff) { + content = content.slice(1); // strip UTF-8 BOM + } + return { content }; + } catch { + // Not valid UTF-8, fall through to encoding detection + } + + // Slow path: Detect encoding with jschardet for non-UTF-8 files (e.g., Shift-JIS, EUC-KR) const { encoding: detectedEncoding } = jschardet.detect(buffer) ?? {}; const encoding = detectedEncoding && iconv.encodingExists(detectedEncoding) ? detectedEncoding : 'utf-8'; - const content = iconv.decode(buffer, encoding, { stripBOM: true }); - // Only skip if there are actual decode errors (U+FFFD replacement characters) - // Don't rely on jschardet confidence as it can return low values for valid UTF-8/ASCII files if (content.includes('\uFFFD')) { - // For UTF-8, distinguish invalid byte sequences from a legitimate U+FFFD in the source - if (encoding.toLowerCase() === 'utf-8') { - try { - let utf8 = new TextDecoder('utf-8', { fatal: true }).decode(buffer); - if (utf8.charCodeAt(0) === 0xfeff) utf8 = utf8.slice(1); // strip UTF-8 BOM - return { content: utf8 }; - } catch { - // fall through to skip below - } - } logger.debug(`Skipping file due to encoding errors (detected: ${encoding}): ${filePath}`); return { content: null, skippedReason: 'encoding-error' }; } diff --git a/src/shared/processConcurrency.ts b/src/shared/processConcurrency.ts index 4c973e32f..2fe1cb4f8 100644 --- a/src/shared/processConcurrency.ts +++ b/src/shared/processConcurrency.ts @@ -12,6 +12,7 @@ export interface WorkerOptions { numOfTasks: number; workerType: WorkerType; runtime: WorkerRuntime; + concurrentTasksPerWorker?: number; } /** @@ -64,7 +65,7 @@ export const getWorkerThreadCount = (numOfTasks: number): { minThreads: number; 
}; export const createWorkerPool = (options: WorkerOptions): Tinypool => { - const { numOfTasks, workerType, runtime = 'child_process' } = options; + const { numOfTasks, workerType, runtime = 'child_process', concurrentTasksPerWorker } = options; const { minThreads, maxThreads } = getWorkerThreadCount(numOfTasks); // Get worker path - uses unified worker in bundled env, individual files otherwise @@ -83,6 +84,7 @@ export const createWorkerPool = (options: WorkerOptions): Tinypool => { maxThreads, idleTimeout: 5000, teardown: 'onWorkerTermination', + ...(concurrentTasksPerWorker != null && { concurrentTasksPerWorker }), workerData: { workerType, logLevel: logger.getLogLevel(), diff --git a/tests/core/file/fileCollect.test.ts b/tests/core/file/fileCollect.test.ts index 3afb8a87c..56cae27e8 100644 --- a/tests/core/file/fileCollect.test.ts +++ b/tests/core/file/fileCollect.test.ts @@ -70,7 +70,7 @@ describe('fileCollect', () => { vi.mocked(isBinaryFile).mockResolvedValue(false); vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('decoded content'); + vi.mocked(iconv.decode).mockReturnValue('file content'); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { initTaskRunner: mockInitTaskRunner, @@ -78,8 +78,8 @@ describe('fileCollect', () => { expect(result).toEqual({ rawFiles: [ - { path: 'file1.txt', content: 'decoded content' }, - { path: 'file2.txt', content: 'decoded content' }, + { path: 'file1.txt', content: 'file content' }, + { path: 'file2.txt', content: 'file content' }, ], skippedFiles: [], }); @@ -96,14 +96,14 @@ describe('fileCollect', () => { vi.mocked(isBinaryFile).mockResolvedValue(false); vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - 
vi.mocked(iconv.decode).mockReturnValue('decoded content'); + vi.mocked(iconv.decode).mockReturnValue('file content'); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { initTaskRunner: mockInitTaskRunner, }); expect(result).toEqual({ - rawFiles: [{ path: 'text.txt', content: 'decoded content' }], + rawFiles: [{ path: 'text.txt', content: 'file content' }], skippedFiles: [{ path: 'binary.bin', reason: 'binary-extension' }], }); expect(logger.debug).toHaveBeenCalledWith(`Skipping binary file: ${path.resolve('/root/binary.bin')}`); @@ -130,14 +130,14 @@ describe('fileCollect', () => { vi.mocked(isBinaryFile).mockResolvedValue(false); vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('decoded content'); + vi.mocked(iconv.decode).mockReturnValue('file content'); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { initTaskRunner: mockInitTaskRunner, }); expect(result).toEqual({ - rawFiles: [{ path: 'normal.txt', content: 'decoded content' }], + rawFiles: [{ path: 'normal.txt', content: 'file content' }], skippedFiles: [{ path: 'large.txt', reason: 'size-limit' }], }); expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('File exceeds size limit:')); @@ -174,14 +174,14 @@ describe('fileCollect', () => { vi.mocked(isBinaryFile).mockResolvedValue(false); vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('decoded content'); + vi.mocked(iconv.decode).mockReturnValue('file content'); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { initTaskRunner: mockInitTaskRunner, }); expect(result).toEqual({ - rawFiles: [{ path: 'small.txt', content: 'decoded content' }], + 
rawFiles: [{ path: 'small.txt', content: 'file content' }], skippedFiles: [{ path: 'medium.txt', reason: 'size-limit' }], }); expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('File exceeds size limit:')); @@ -225,7 +225,7 @@ describe('fileCollect', () => { vi.mocked(isBinaryFile).mockResolvedValue(false); vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('decoded content'); + vi.mocked(iconv.decode).mockReturnValue('file content'); await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { initTaskRunner: mockInitTaskRunner, From e97691dd360909731098d56aef04a984ce2d65ea Mon Sep 17 00:00:00 2001 From: Kazuki Yamada Date: Wed, 11 Feb 2026 01:34:18 +0900 Subject: [PATCH 2/3] perf(core): Replace worker threads with promise pool for file collection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the UTF-8 fast path optimization eliminated the CPU-heavy jschardet bottleneck, file collection became I/O-bound. Worker threads now add pure overhead (Tinypool init, structured clone, IPC) without benefit. 
Benchmark (954 files, M2 Pro 10-core): - Worker Threads: ~108ms → Promise Pool (c=50): ~37ms (2.9x faster) Changes: - Replace Tinypool worker dispatch with a simple promise pool (c=50) - Inject readRawFile via deps for testability - Remove unused concurrentTasksPerWorker from WorkerOptions - Simplify tests to use readRawFile mock instead of 5+ module mocks --- src/core/file/fileCollect.ts | 117 ++++++++------- src/shared/processConcurrency.ts | 4 +- tests/core/file/fileCollect.test.ts | 180 ++++++----------------- tests/integration-tests/packager.test.ts | 19 +-- 4 files changed, 107 insertions(+), 213 deletions(-) diff --git a/src/core/file/fileCollect.ts b/src/core/file/fileCollect.ts index 1d4915728..5a3b9792a 100644 --- a/src/core/file/fileCollect.ts +++ b/src/core/file/fileCollect.ts @@ -1,18 +1,40 @@ +import path from 'node:path'; import pc from 'picocolors'; import type { RepomixConfigMerged } from '../../config/configSchema.js'; import { logger } from '../../shared/logger.js'; -import { initTaskRunner } from '../../shared/processConcurrency.js'; import type { RepomixProgressCallback } from '../../shared/types.js'; +import { readRawFile as defaultReadRawFile, type FileSkipReason } from './fileRead.js'; import type { RawFile } from './fileTypes.js'; -import type { FileCollectResult, FileCollectTask, SkippedFileInfo } from './workers/fileCollectWorker.js'; + +// Concurrency limit for parallel file reads on the main thread. +// 50 balances I/O throughput with FD/memory safety across different machines. 
+const FILE_COLLECT_CONCURRENCY = 50; + +export interface SkippedFileInfo { + path: string; + reason: FileSkipReason; +} export interface FileCollectResults { rawFiles: RawFile[]; skippedFiles: SkippedFileInfo[]; } -// Re-export SkippedFileInfo for external use -export type { SkippedFileInfo } from './workers/fileCollectWorker.js'; +const promisePool = async <T, R>(items: T[], concurrency: number, fn: (item: T) => Promise<R>): Promise<R[]> => { + const results: R[] = Array.from({ length: items.length }); + let nextIndex = 0; + + const worker = async () => { + while (nextIndex < items.length) { + const i = nextIndex++; + results[i] = await fn(items[i]); + } + }; + + await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, () => worker())); + + return results; +}; export const collectFiles = async ( filePaths: string[], @@ -20,64 +42,41 @@ config: RepomixConfigMerged, progressCallback: RepomixProgressCallback = () => {}, deps = { - initTaskRunner, + readRawFile: defaultReadRawFile, }, ): Promise<FileCollectResults> => { - const taskRunner = deps.initTaskRunner({ - numOfTasks: filePaths.length, - workerType: 'fileCollect', - runtime: 'worker_threads', - concurrentTasksPerWorker: 3, + const startTime = process.hrtime.bigint(); + logger.trace(`Starting file collection for ${filePaths.length} files`); + + let completedTasks = 0; + const totalTasks = filePaths.length; + const maxFileSize = config.input.maxFileSize; + + const results = await promisePool(filePaths, FILE_COLLECT_CONCURRENCY, async (filePath) => { + const fullPath = path.resolve(rootDir, filePath); + const result = await deps.readRawFile(fullPath, maxFileSize); + + completedTasks++; + progressCallback(`Collect file... (${completedTasks}/${totalTasks}) ${pc.dim(filePath)}`); + logger.trace(`Collect files... 
(${completedTasks}/${totalTasks}) ${filePath}`); + + return { filePath, result }; }); - const tasks = filePaths.map( - (filePath) => - ({ - filePath, - rootDir, - maxFileSize: config.input.maxFileSize, - }) satisfies FileCollectTask, - ); - - try { - const startTime = process.hrtime.bigint(); - logger.trace(`Starting file collection for ${filePaths.length} files using worker pool`); - - let completedTasks = 0; - const totalTasks = tasks.length; - - const results = await Promise.all( - tasks.map((task) => - taskRunner.run(task).then((result) => { - completedTasks++; - progressCallback(`Collect file... (${completedTasks}/${totalTasks}) ${pc.dim(task.filePath)}`); - logger.trace(`Collect files... (${completedTasks}/${totalTasks}) ${task.filePath}`); - return result; - }), - ), - ); - - const endTime = process.hrtime.bigint(); - const duration = Number(endTime - startTime) / 1e6; - logger.trace(`File collection completed in ${duration.toFixed(2)}ms`); - - const rawFiles: RawFile[] = []; - const skippedFiles: SkippedFileInfo[] = []; - - for (const result of results) { - if (result.rawFile) { - rawFiles.push(result.rawFile); - } - if (result.skippedFile) { - skippedFiles.push(result.skippedFile); - } - } - return { rawFiles, skippedFiles }; - } catch (error) { - logger.error('Error during file collection:', error); - throw error; - } finally { - // Always cleanup worker pool - await taskRunner.cleanup(); + const rawFiles: RawFile[] = []; + const skippedFiles: SkippedFileInfo[] = []; + + for (const { filePath, result } of results) { + if (result.content !== null) { + rawFiles.push({ path: filePath, content: result.content }); + } else if (result.skippedReason) { + skippedFiles.push({ path: filePath, reason: result.skippedReason }); + } } + + const endTime = process.hrtime.bigint(); + const duration = Number(endTime - startTime) / 1e6; + logger.trace(`File collection completed in ${duration.toFixed(2)}ms`); + + return { rawFiles, skippedFiles }; }; diff --git 
a/src/shared/processConcurrency.ts b/src/shared/processConcurrency.ts index 2fe1cb4f8..4c973e32f 100644 --- a/src/shared/processConcurrency.ts +++ b/src/shared/processConcurrency.ts @@ -12,7 +12,6 @@ export interface WorkerOptions { numOfTasks: number; workerType: WorkerType; runtime: WorkerRuntime; - concurrentTasksPerWorker?: number; } /** @@ -65,7 +64,7 @@ export const getWorkerThreadCount = (numOfTasks: number): { minThreads: number; }; export const createWorkerPool = (options: WorkerOptions): Tinypool => { - const { numOfTasks, workerType, runtime = 'child_process', concurrentTasksPerWorker } = options; + const { numOfTasks, workerType, runtime = 'child_process' } = options; const { minThreads, maxThreads } = getWorkerThreadCount(numOfTasks); // Get worker path - uses unified worker in bundled env, individual files otherwise @@ -84,7 +83,6 @@ export const createWorkerPool = (options: WorkerOptions): Tinypool => { maxThreads, idleTimeout: 5000, teardown: 'onWorkerTermination', - ...(concurrentTasksPerWorker != null && { concurrentTasksPerWorker }), workerData: { workerType, logLevel: logger.getLogLevel(), diff --git a/tests/core/file/fileCollect.test.ts b/tests/core/file/fileCollect.test.ts index 56cae27e8..20bdeb67c 100644 --- a/tests/core/file/fileCollect.test.ts +++ b/tests/core/file/fileCollect.test.ts @@ -1,64 +1,17 @@ -import type { Stats } from 'node:fs'; -import * as fs from 'node:fs/promises'; import path from 'node:path'; -import iconv from 'iconv-lite'; -import isBinaryPath from 'is-binary-path'; -import { isBinaryFile } from 'isbinaryfile'; -import jschardet from 'jschardet'; -import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { beforeEach, describe, expect, it, type Mock, vi } from 'vitest'; import { collectFiles } from '../../../src/core/file/fileCollect.js'; -import type { FileCollectTask } from '../../../src/core/file/workers/fileCollectWorker.js'; -import fileCollectWorker from 
'../../../src/core/file/workers/fileCollectWorker.js'; -import { logger } from '../../../src/shared/logger.js'; -import type { WorkerOptions, WorkerRuntime } from '../../../src/shared/processConcurrency.js'; +import type { FileReadResult } from '../../../src/core/file/fileRead.js'; import { createMockConfig } from '../../testing/testUtils.js'; // Define the max file size constant for tests const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB -vi.mock('node:fs/promises'); -vi.mock('is-binary-path'); -vi.mock('isbinaryfile'); -vi.mock('jschardet'); -vi.mock('iconv-lite'); -vi.mock('../../../src/shared/logger'); - -interface MockInitTaskRunner { - <T, R>( - options: WorkerOptions, - ): { - run: (task: T) => Promise<R>; - cleanup: () => Promise<void>; - }; - lastRuntime?: WorkerRuntime; -} - -const mockInitTaskRunner = <T, R>(options: WorkerOptions) => { - // Store runtime for verification in tests - (mockInitTaskRunner as MockInitTaskRunner).lastRuntime = options.runtime; - return { - run: async (task: T) => { - return (await fileCollectWorker(task as FileCollectTask)) as R; - }, - cleanup: async () => { - // Mock cleanup - no-op for tests - }, - }; -}; - describe('fileCollect', () => { - beforeEach(() => { - vi.resetAllMocks(); - - // Setup basic file size mock to fix stat - vi.mocked(fs.stat).mockResolvedValue({ - size: 1024, - isFile: () => true, - } as Stats); - }); + let mockReadRawFile: Mock<(filePath: string, maxFileSize: number) => Promise<FileReadResult>>; - afterEach(() => { - vi.resetAllMocks(); + beforeEach(() => { + mockReadRawFile = vi.fn(); }); it('should collect non-binary files', async () => { @@ -66,14 +19,10 @@ const mockRootDir = '/root'; const mockConfig = createMockConfig(); - vi.mocked(isBinaryPath).mockReturnValue(false); - vi.mocked(isBinaryFile).mockResolvedValue(false); - vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); - vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - 
vi.mocked(iconv.decode).mockReturnValue('file content'); + mockReadRawFile.mockResolvedValue({ content: 'file content' }); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { - initTaskRunner: mockInitTaskRunner, + readRawFile: mockReadRawFile, }); expect(result).toEqual({ @@ -83,6 +32,9 @@ describe('fileCollect', () => { ], skippedFiles: [], }); + expect(mockReadRawFile).toHaveBeenCalledTimes(2); + expect(mockReadRawFile).toHaveBeenCalledWith(path.resolve('/root/file1.txt'), MAX_FILE_SIZE); + expect(mockReadRawFile).toHaveBeenCalledWith(path.resolve('/root/file2.txt'), MAX_FILE_SIZE); }); it('should skip binary files', async () => { @@ -90,62 +42,43 @@ describe('fileCollect', () => { const mockRootDir = '/root'; const mockConfig = createMockConfig(); - vi.mocked(isBinaryPath) - .mockReturnValueOnce(true) // for binary.bin (skip by extension) - .mockReturnValueOnce(false); // for text.txt - vi.mocked(isBinaryFile).mockResolvedValue(false); - vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); - vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('file content'); + mockReadRawFile.mockImplementation(async (filePath) => { + if (filePath.endsWith('binary.bin')) { + return { content: null, skippedReason: 'binary-extension' }; + } + return { content: 'file content' }; + }); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { - initTaskRunner: mockInitTaskRunner, + readRawFile: mockReadRawFile, }); expect(result).toEqual({ rawFiles: [{ path: 'text.txt', content: 'file content' }], skippedFiles: [{ path: 'binary.bin', reason: 'binary-extension' }], }); - expect(logger.debug).toHaveBeenCalledWith(`Skipping binary file: ${path.resolve('/root/binary.bin')}`); }); it('should skip large files based on default maxFileSize', async () => { const mockFilePaths = ['large.txt', 'normal.txt']; const mockRootDir = '/root'; 
const mockConfig = createMockConfig(); - const largePath = path.resolve('/root/large.txt'); - - vi.mocked(fs.stat) - .mockResolvedValueOnce({ - // for large.txt - size: MAX_FILE_SIZE + 1024, // Slightly over limit - isFile: () => true, - } as Stats) - .mockResolvedValueOnce({ - // for normal.txt - size: 1024, - isFile: () => true, - } as Stats); - vi.mocked(isBinaryPath).mockReturnValue(false); - vi.mocked(isBinaryFile).mockResolvedValue(false); - vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); - vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('file content'); + + mockReadRawFile.mockImplementation(async (filePath) => { + if (filePath.endsWith('large.txt')) { + return { content: null, skippedReason: 'size-limit' }; + } + return { content: 'file content' }; + }); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { - initTaskRunner: mockInitTaskRunner, + readRawFile: mockReadRawFile, }); expect(result).toEqual({ rawFiles: [{ path: 'normal.txt', content: 'file content' }], skippedFiles: [{ path: 'large.txt', reason: 'size-limit' }], }); - expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('File exceeds size limit:')); - expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining(largePath)); - - // Verify fs.readFile is not called for the large file - expect(fs.readFile).not.toHaveBeenCalledWith(largePath); - expect(fs.readFile).toHaveBeenCalledTimes(1); }); it('should respect custom maxFileSize setting', async () => { @@ -157,40 +90,26 @@ describe('fileCollect', () => { maxFileSize: customMaxFileSize, }, }); - const mediumPath = path.resolve('/root/medium.txt'); - - vi.mocked(fs.stat) - .mockResolvedValueOnce({ - // for medium.txt - size: 10 * 1024 * 1024, // 10MB (exceeds custom 5MB limit) - isFile: () => true, - } as Stats) - .mockResolvedValueOnce({ - // for small.txt - size: 1024, // 1KB (within limit) - 
isFile: () => true, - } as Stats); - vi.mocked(isBinaryPath).mockReturnValue(false); - vi.mocked(isBinaryFile).mockResolvedValue(false); - vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); - vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('file content'); + + mockReadRawFile.mockImplementation(async (filePath) => { + if (filePath.endsWith('medium.txt')) { + return { content: null, skippedReason: 'size-limit' }; + } + return { content: 'file content' }; + }); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { - initTaskRunner: mockInitTaskRunner, + readRawFile: mockReadRawFile, }); expect(result).toEqual({ rawFiles: [{ path: 'small.txt', content: 'file content' }], skippedFiles: [{ path: 'medium.txt', reason: 'size-limit' }], }); - expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('File exceeds size limit:')); - expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('10240.0KB > 5120.0KB')); - expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining(mediumPath)); - // Verify fs.readFile is not called for the medium file - expect(fs.readFile).not.toHaveBeenCalledWith(mediumPath); - expect(fs.readFile).toHaveBeenCalledTimes(1); + // Verify readRawFile is called with custom maxFileSize + expect(mockReadRawFile).toHaveBeenCalledWith(path.resolve('/root/medium.txt'), customMaxFileSize); + expect(mockReadRawFile).toHaveBeenCalledWith(path.resolve('/root/small.txt'), customMaxFileSize); }); it('should handle file read errors', async () => { @@ -198,39 +117,30 @@ describe('fileCollect', () => { const mockRootDir = '/root'; const mockConfig = createMockConfig(); - vi.mocked(isBinaryPath).mockReturnValue(false); - vi.mocked(isBinaryFile).mockResolvedValue(false); - vi.mocked(fs.readFile).mockRejectedValue(new Error('Read error')); + mockReadRawFile.mockResolvedValue({ content: null, skippedReason: 
'encoding-error' }); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { - initTaskRunner: mockInitTaskRunner, + readRawFile: mockReadRawFile, }); expect(result).toEqual({ rawFiles: [], skippedFiles: [{ path: 'error.txt', reason: 'encoding-error' }], }); - expect(logger.warn).toHaveBeenCalledWith( - `Failed to read file: ${path.resolve('/root/error.txt')}`, - expect.any(Error), - ); }); - it('should use worker_threads runtime when calling initTaskRunner', async () => { - const mockFilePaths = ['test.txt']; + it('should call progressCallback for each file', async () => { + const mockFilePaths = ['file1.txt', 'file2.txt']; const mockRootDir = '/root'; const mockConfig = createMockConfig(); + const mockProgress = vi.fn(); - vi.mocked(isBinaryPath).mockReturnValue(false); - vi.mocked(isBinaryFile).mockResolvedValue(false); - vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); - vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('file content'); + mockReadRawFile.mockResolvedValue({ content: 'file content' }); - await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { - initTaskRunner: mockInitTaskRunner, + await collectFiles(mockFilePaths, mockRootDir, mockConfig, mockProgress, { + readRawFile: mockReadRawFile, }); - expect((mockInitTaskRunner as MockInitTaskRunner).lastRuntime).toBe('worker_threads'); + expect(mockProgress).toHaveBeenCalledTimes(2); }); }); diff --git a/tests/integration-tests/packager.test.ts b/tests/integration-tests/packager.test.ts index e8c5aafcb..214de2fe0 100644 --- a/tests/integration-tests/packager.test.ts +++ b/tests/integration-tests/packager.test.ts @@ -9,35 +9,22 @@ import { afterEach, beforeEach, describe, expect, test } from 'vitest'; import { loadFileConfig, mergeConfigs } from '../../src/config/configLoad.js'; import type { RepomixConfigFile, RepomixConfigMerged, RepomixOutputStyle } from 
'../../src/config/configSchema.js'; import { collectFiles } from '../../src/core/file/fileCollect.js'; +import { readRawFile } from '../../src/core/file/fileRead.js'; import { searchFiles } from '../../src/core/file/fileSearch.js'; import type { ProcessedFile } from '../../src/core/file/fileTypes.js'; - -import type { FileCollectTask } from '../../src/core/file/workers/fileCollectWorker.js'; -import fileCollectWorker from '../../src/core/file/workers/fileCollectWorker.js'; import fileProcessWorker from '../../src/core/file/workers/fileProcessWorker.js'; import type { GitDiffResult } from '../../src/core/git/gitDiffHandle.js'; import { produceOutput } from '../../src/core/packager/produceOutput.js'; import { pack } from '../../src/core/packager.js'; import { filterOutUntrustedFiles } from '../../src/core/security/filterOutUntrustedFiles.js'; import { validateFileSafety } from '../../src/core/security/validateFileSafety.js'; -import type { WorkerOptions } from '../../src/shared/processConcurrency.js'; + import { isWindows } from '../testing/testUtils.js'; const fixturesDir = path.join(__dirname, 'fixtures', 'packager'); const inputsDir = path.join(fixturesDir, 'inputs'); const outputsDir = path.join(fixturesDir, 'outputs'); -const mockCollectFileInitTaskRunner = (_options: WorkerOptions) => { - return { - run: async (task: T) => { - return (await fileCollectWorker(task as FileCollectTask)) as R; - }, - cleanup: async () => { - // Mock cleanup - no-op for tests - }, - }; -}; - describe.runIf(!isWindows)('packager integration', () => { const testCases = [ { @@ -107,7 +94,7 @@ describe.runIf(!isWindows)('packager integration', () => { sortPaths: (filePaths) => filePaths, collectFiles: (filePaths, rootDir, config, progressCallback) => { return collectFiles(filePaths, rootDir, config, progressCallback, { - initTaskRunner: mockCollectFileInitTaskRunner, + readRawFile, }); }, processFiles: async (rawFiles, config, _progressCallback) => { From 
05f11f46c705edd64594106de3a0d1a69cce06ad Mon Sep 17 00:00:00 2001 From: Kazuki Yamada Date: Sat, 14 Feb 2026 11:29:47 +0900 Subject: [PATCH 3/3] refactor(core): Remove unused fileCollect worker infrastructure File collection was replaced with a promise pool approach in 96ff05dc, but the worker-related code remained. This removes the now-unused fileCollectWorker and all references to it from the worker system. --- src/core/file/workers/fileCollectWorker.ts | 56 ---------------------- src/shared/processConcurrency.ts | 2 - src/shared/unifiedWorker.ts | 12 +---- tests/shared/processConcurrency.test.ts | 6 +-- tests/shared/unifiedWorker.test.ts | 18 ------- 5 files changed, 4 insertions(+), 90 deletions(-) delete mode 100644 src/core/file/workers/fileCollectWorker.ts diff --git a/src/core/file/workers/fileCollectWorker.ts b/src/core/file/workers/fileCollectWorker.ts deleted file mode 100644 index 7da9d0ab4..000000000 --- a/src/core/file/workers/fileCollectWorker.ts +++ /dev/null @@ -1,56 +0,0 @@ -import path from 'node:path'; -import { setLogLevelByWorkerData } from '../../../shared/logger.js'; -import { type FileSkipReason, readRawFile } from '../fileRead.js'; -import type { RawFile } from '../fileTypes.js'; - -// Initialize logger configuration from workerData at module load time -// This must be called before any logging operations in the worker -setLogLevelByWorkerData(); - -export interface FileCollectTask { - filePath: string; - rootDir: string; - maxFileSize: number; -} - -export interface SkippedFileInfo { - path: string; - reason: FileSkipReason; -} - -export interface FileCollectResult { - rawFile?: RawFile; - skippedFile?: SkippedFileInfo; -} - -export default async ({ filePath, rootDir, maxFileSize }: FileCollectTask): Promise => { - const fullPath = path.resolve(rootDir, filePath); - const result = await readRawFile(fullPath, maxFileSize); - - if (result.content !== null) { - return { - rawFile: { - path: filePath, - content: result.content, - }, - }; - } 
- - if (result.skippedReason) { - return { - skippedFile: { - path: filePath, - reason: result.skippedReason, - }, - }; - } - - throw new Error( - `File processing for ${filePath} resulted in an unexpected state: content is null but no skip reason was provided.`, - ); -}; - -// Export cleanup function for Tinypool teardown (no cleanup needed for this worker) -export const onWorkerTermination = async (): Promise => { - // No cleanup needed for file collection worker -}; diff --git a/src/shared/processConcurrency.ts b/src/shared/processConcurrency.ts index 4c973e32f..0507131a2 100644 --- a/src/shared/processConcurrency.ts +++ b/src/shared/processConcurrency.ts @@ -27,8 +27,6 @@ const getWorkerPath = (workerType: WorkerType): string => { // Non-bundled environment: use individual worker files switch (workerType) { - case 'fileCollect': - return new URL('../core/file/workers/fileCollectWorker.js', import.meta.url).href; case 'fileProcess': return new URL('../core/file/workers/fileProcessWorker.js', import.meta.url).href; case 'securityCheck': diff --git a/src/shared/unifiedWorker.ts b/src/shared/unifiedWorker.ts index 1adec3c91..32594f4c6 100644 --- a/src/shared/unifiedWorker.ts +++ b/src/shared/unifiedWorker.ts @@ -12,7 +12,7 @@ import { workerData } from 'node:worker_threads'; // Worker type definitions -export type WorkerType = 'fileCollect' | 'fileProcess' | 'securityCheck' | 'calculateMetrics' | 'defaultAction'; +export type WorkerType = 'fileProcess' | 'securityCheck' | 'calculateMetrics' | 'defaultAction'; // Worker handler type - uses 'any' to accommodate different worker signatures // biome-ignore lint/suspicious/noExplicitAny: Worker handlers have varying signatures @@ -39,11 +39,6 @@ const loadWorkerHandler = async ( let result: { handler: WorkerHandler; cleanup?: WorkerCleanup }; switch (workerType) { - case 'fileCollect': { - const module = await import('../core/file/workers/fileCollectWorker.js'); - result = { handler: module.default as WorkerHandler, 
cleanup: module.onWorkerTermination }; - break; - } case 'fileProcess': { const module = await import('../core/file/workers/fileProcessWorker.js'); result = { handler: module.default as WorkerHandler, cleanup: module.onWorkerTermination }; @@ -95,11 +90,6 @@ const inferWorkerTypeFromTask = (task: unknown): WorkerType | null => { return 'defaultAction'; } - // fileCollect: has filePath, rootDir, maxFileSize - if ('filePath' in taskObj && 'rootDir' in taskObj && 'maxFileSize' in taskObj) { - return 'fileCollect'; - } - // fileProcess: has rawFile (nested object) and config if ('rawFile' in taskObj && 'config' in taskObj) { return 'fileProcess'; diff --git a/tests/shared/processConcurrency.test.ts b/tests/shared/processConcurrency.test.ts index 940ed2abe..289406ff6 100644 --- a/tests/shared/processConcurrency.test.ts +++ b/tests/shared/processConcurrency.test.ts @@ -91,17 +91,17 @@ describe('processConcurrency', () => { }); it('should initialize Tinypool with correct configuration', () => { - const tinypool = createWorkerPool({ numOfTasks: 500, workerType: 'fileCollect', runtime: 'child_process' }); + const tinypool = createWorkerPool({ numOfTasks: 500, workerType: 'fileProcess', runtime: 'child_process' }); expect(Tinypool).toHaveBeenCalledWith({ - filename: expect.stringContaining('fileCollectWorker.js'), + filename: expect.stringContaining('fileProcessWorker.js'), runtime: 'child_process', minThreads: 1, maxThreads: 4, // Math.min(4, 500/100) = 4 idleTimeout: 5000, teardown: 'onWorkerTermination', workerData: { - workerType: 'fileCollect', + workerType: 'fileProcess', logLevel: 2, }, env: expect.objectContaining({ diff --git a/tests/shared/unifiedWorker.test.ts b/tests/shared/unifiedWorker.test.ts index ee1823adb..7f3247b35 100644 --- a/tests/shared/unifiedWorker.test.ts +++ b/tests/shared/unifiedWorker.test.ts @@ -2,10 +2,6 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; // We need to test the internal functions, so we'll test through the module 
behavior // Mock all worker modules -vi.mock('../../src/core/file/workers/fileCollectWorker.js', () => ({ - default: vi.fn().mockResolvedValue({ collected: true }), - onWorkerTermination: vi.fn(), -})); vi.mock('../../src/core/file/workers/fileProcessWorker.js', () => ({ default: vi.fn().mockResolvedValue({ processed: true }), onWorkerTermination: vi.fn(), @@ -61,20 +57,6 @@ describe('unifiedWorker', () => { expect(defaultActionWorker.default).toHaveBeenCalledWith(task); }); - it('should infer fileCollect from task with filePath, rootDir, maxFileSize', async () => { - const { default: handler } = await import('../../src/shared/unifiedWorker.js'); - const task = { - filePath: 'test.ts', - rootDir: '/root', - maxFileSize: 1000, - }; - - await handler(task); - - const fileCollectWorker = await import('../../src/core/file/workers/fileCollectWorker.js'); - expect(fileCollectWorker.default).toHaveBeenCalledWith(task); - }); - it('should infer fileProcess from task with rawFile and config', async () => { const { default: handler } = await import('../../src/shared/unifiedWorker.js'); const task = {