From 7dcdbae24da6efc226c1e296a6289e3e16899433 Mon Sep 17 00:00:00 2001 From: Kazuki Yamada Date: Wed, 11 Feb 2026 01:17:03 +0900 Subject: [PATCH 1/3] perf(core): Add UTF-8 fast path to skip expensive jschardet encoding detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, every file went through jschardet.detect() which scans the entire buffer through multiple encoding probers (MBCS, SBCS, Latin1) with frequency table lookups — the most expensive CPU operation in file collection. Since ~99% of source code files are UTF-8, we now try TextDecoder('utf-8', { fatal: true }) first. If it succeeds, jschardet and iconv are skipped entirely. Non-UTF-8 files (e.g., Shift-JIS, EUC-KR) fall back to the original detection path. Additionally, set concurrentTasksPerWorker=3 for fileCollect workers to better overlap I/O waits within each worker thread. Benchmark results (838 files, 10 CPU cores): - Before: ~616ms - After: ~108ms (5.7x faster) --- src/core/file/fileCollect.ts | 1 + src/core/file/fileRead.ts | 27 ++++++++++++++------------- src/shared/processConcurrency.ts | 4 +++- tests/core/file/fileCollect.test.ts | 20 ++++++++++---------- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/src/core/file/fileCollect.ts b/src/core/file/fileCollect.ts index 07191634f..1d4915728 100644 --- a/src/core/file/fileCollect.ts +++ b/src/core/file/fileCollect.ts @@ -27,6 +27,7 @@ export const collectFiles = async ( numOfTasks: filePaths.length, workerType: 'fileCollect', runtime: 'worker_threads', + concurrentTasksPerWorker: 3, }); const tasks = filePaths.map( (filePath) => diff --git a/src/core/file/fileRead.ts b/src/core/file/fileRead.ts index 85166354d..0d1a78f87 100644 --- a/src/core/file/fileRead.ts +++ b/src/core/file/fileRead.ts @@ -43,24 +43,25 @@ export const readRawFile = async (filePath: string, maxFileSize: number): Promis return { content: null, skippedReason: 'binary-content' }; } + // Fast path: Try UTF-8 
decoding first (covers ~99% of source code files) + // This skips the expensive jschardet.detect() which scans the entire buffer + // through multiple encoding probers with frequency table lookups + try { + let content = new TextDecoder('utf-8', { fatal: true }).decode(buffer); + if (content.charCodeAt(0) === 0xfeff) { + content = content.slice(1); // strip UTF-8 BOM + } + return { content }; + } catch { + // Not valid UTF-8, fall through to encoding detection + } + + // Slow path: Detect encoding with jschardet for non-UTF-8 files (e.g., Shift-JIS, EUC-KR) const { encoding: detectedEncoding } = jschardet.detect(buffer) ?? {}; const encoding = detectedEncoding && iconv.encodingExists(detectedEncoding) ? detectedEncoding : 'utf-8'; - const content = iconv.decode(buffer, encoding, { stripBOM: true }); - // Only skip if there are actual decode errors (U+FFFD replacement characters) - // Don't rely on jschardet confidence as it can return low values for valid UTF-8/ASCII files if (content.includes('\uFFFD')) { - // For UTF-8, distinguish invalid byte sequences from a legitimate U+FFFD in the source - if (encoding.toLowerCase() === 'utf-8') { - try { - let utf8 = new TextDecoder('utf-8', { fatal: true }).decode(buffer); - if (utf8.charCodeAt(0) === 0xfeff) utf8 = utf8.slice(1); // strip UTF-8 BOM - return { content: utf8 }; - } catch { - // fall through to skip below - } - } logger.debug(`Skipping file due to encoding errors (detected: ${encoding}): ${filePath}`); return { content: null, skippedReason: 'encoding-error' }; } diff --git a/src/shared/processConcurrency.ts b/src/shared/processConcurrency.ts index 4c973e32f..2fe1cb4f8 100644 --- a/src/shared/processConcurrency.ts +++ b/src/shared/processConcurrency.ts @@ -12,6 +12,7 @@ export interface WorkerOptions { numOfTasks: number; workerType: WorkerType; runtime: WorkerRuntime; + concurrentTasksPerWorker?: number; } /** @@ -64,7 +65,7 @@ export const getWorkerThreadCount = (numOfTasks: number): { minThreads: number; 
}; export const createWorkerPool = (options: WorkerOptions): Tinypool => { - const { numOfTasks, workerType, runtime = 'child_process' } = options; + const { numOfTasks, workerType, runtime = 'child_process', concurrentTasksPerWorker } = options; const { minThreads, maxThreads } = getWorkerThreadCount(numOfTasks); // Get worker path - uses unified worker in bundled env, individual files otherwise @@ -83,6 +84,7 @@ export const createWorkerPool = (options: WorkerOptions): Tinypool => { maxThreads, idleTimeout: 5000, teardown: 'onWorkerTermination', + ...(concurrentTasksPerWorker != null && { concurrentTasksPerWorker }), workerData: { workerType, logLevel: logger.getLogLevel(), diff --git a/tests/core/file/fileCollect.test.ts b/tests/core/file/fileCollect.test.ts index 3afb8a87c..56cae27e8 100644 --- a/tests/core/file/fileCollect.test.ts +++ b/tests/core/file/fileCollect.test.ts @@ -70,7 +70,7 @@ describe('fileCollect', () => { vi.mocked(isBinaryFile).mockResolvedValue(false); vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('decoded content'); + vi.mocked(iconv.decode).mockReturnValue('file content'); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { initTaskRunner: mockInitTaskRunner, @@ -78,8 +78,8 @@ describe('fileCollect', () => { expect(result).toEqual({ rawFiles: [ - { path: 'file1.txt', content: 'decoded content' }, - { path: 'file2.txt', content: 'decoded content' }, + { path: 'file1.txt', content: 'file content' }, + { path: 'file2.txt', content: 'file content' }, ], skippedFiles: [], }); @@ -96,14 +96,14 @@ describe('fileCollect', () => { vi.mocked(isBinaryFile).mockResolvedValue(false); vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - 
vi.mocked(iconv.decode).mockReturnValue('decoded content'); + vi.mocked(iconv.decode).mockReturnValue('file content'); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { initTaskRunner: mockInitTaskRunner, }); expect(result).toEqual({ - rawFiles: [{ path: 'text.txt', content: 'decoded content' }], + rawFiles: [{ path: 'text.txt', content: 'file content' }], skippedFiles: [{ path: 'binary.bin', reason: 'binary-extension' }], }); expect(logger.debug).toHaveBeenCalledWith(`Skipping binary file: ${path.resolve('/root/binary.bin')}`); @@ -130,14 +130,14 @@ describe('fileCollect', () => { vi.mocked(isBinaryFile).mockResolvedValue(false); vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('decoded content'); + vi.mocked(iconv.decode).mockReturnValue('file content'); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { initTaskRunner: mockInitTaskRunner, }); expect(result).toEqual({ - rawFiles: [{ path: 'normal.txt', content: 'decoded content' }], + rawFiles: [{ path: 'normal.txt', content: 'file content' }], skippedFiles: [{ path: 'large.txt', reason: 'size-limit' }], }); expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('File exceeds size limit:')); @@ -174,14 +174,14 @@ describe('fileCollect', () => { vi.mocked(isBinaryFile).mockResolvedValue(false); vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('decoded content'); + vi.mocked(iconv.decode).mockReturnValue('file content'); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { initTaskRunner: mockInitTaskRunner, }); expect(result).toEqual({ - rawFiles: [{ path: 'small.txt', content: 'decoded content' }], + 
rawFiles: [{ path: 'small.txt', content: 'file content' }], skippedFiles: [{ path: 'medium.txt', reason: 'size-limit' }], }); expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('File exceeds size limit:')); @@ -225,7 +225,7 @@ describe('fileCollect', () => { vi.mocked(isBinaryFile).mockResolvedValue(false); vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('decoded content'); + vi.mocked(iconv.decode).mockReturnValue('file content'); await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { initTaskRunner: mockInitTaskRunner, From e97691dd360909731098d56aef04a984ce2d65ea Mon Sep 17 00:00:00 2001 From: Kazuki Yamada Date: Wed, 11 Feb 2026 01:34:18 +0900 Subject: [PATCH 2/3] perf(core): Replace worker threads with promise pool for file collection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the UTF-8 fast path optimization eliminated the CPU-heavy jschardet bottleneck, file collection became I/O-bound. Worker threads now add pure overhead (Tinypool init, structured clone, IPC) without benefit. 
Benchmark (954 files, M2 Pro 10-core): - Worker Threads: ~108ms → Promise Pool (c=50): ~37ms (2.9x faster) Changes: - Replace Tinypool worker dispatch with a simple promise pool (c=50) - Inject readRawFile via deps for testability - Remove unused concurrentTasksPerWorker from WorkerOptions - Simplify tests to use readRawFile mock instead of 5+ module mocks --- src/core/file/fileCollect.ts | 117 ++++++++------- src/shared/processConcurrency.ts | 4 +- tests/core/file/fileCollect.test.ts | 180 ++++++----------------- tests/integration-tests/packager.test.ts | 19 +-- 4 files changed, 107 insertions(+), 213 deletions(-) diff --git a/src/core/file/fileCollect.ts b/src/core/file/fileCollect.ts index 1d4915728..5a3b9792a 100644 --- a/src/core/file/fileCollect.ts +++ b/src/core/file/fileCollect.ts @@ -1,18 +1,40 @@ +import path from 'node:path'; import pc from 'picocolors'; import type { RepomixConfigMerged } from '../../config/configSchema.js'; import { logger } from '../../shared/logger.js'; -import { initTaskRunner } from '../../shared/processConcurrency.js'; import type { RepomixProgressCallback } from '../../shared/types.js'; +import { readRawFile as defaultReadRawFile, type FileSkipReason } from './fileRead.js'; import type { RawFile } from './fileTypes.js'; -import type { FileCollectResult, FileCollectTask, SkippedFileInfo } from './workers/fileCollectWorker.js'; + +// Concurrency limit for parallel file reads on the main thread. +// 50 balances I/O throughput with FD/memory safety across different machines. 
+const FILE_COLLECT_CONCURRENCY = 50; + +export interface SkippedFileInfo { + path: string; + reason: FileSkipReason; +} export interface FileCollectResults { rawFiles: RawFile[]; skippedFiles: SkippedFileInfo[]; } -// Re-export SkippedFileInfo for external use -export type { SkippedFileInfo } from './workers/fileCollectWorker.js'; +const promisePool = async <T, R>(items: T[], concurrency: number, fn: (item: T) => Promise<R>): Promise<R[]> => { + const results: R[] = Array.from({ length: items.length }); + let nextIndex = 0; + + const worker = async () => { + while (nextIndex < items.length) { + const i = nextIndex++; + results[i] = await fn(items[i]); + } + }; + + await Promise.all(Array.from({ length: Math.min(concurrency, items.length) }, () => worker())); + + return results; +}; export const collectFiles = async ( filePaths: string[], @@ -20,64 +42,41 @@ config: RepomixConfigMerged, progressCallback: RepomixProgressCallback = () => {}, deps = { - initTaskRunner, + readRawFile: defaultReadRawFile, }, ): Promise<FileCollectResults> => { - const taskRunner = deps.initTaskRunner({ - numOfTasks: filePaths.length, - workerType: 'fileCollect', - runtime: 'worker_threads', - concurrentTasksPerWorker: 3, + const startTime = process.hrtime.bigint(); + logger.trace(`Starting file collection for ${filePaths.length} files`); + + let completedTasks = 0; + const totalTasks = filePaths.length; + const maxFileSize = config.input.maxFileSize; + + const results = await promisePool(filePaths, FILE_COLLECT_CONCURRENCY, async (filePath) => { + const fullPath = path.resolve(rootDir, filePath); + const result = await deps.readRawFile(fullPath, maxFileSize); + + completedTasks++; + progressCallback(`Collect file... (${completedTasks}/${totalTasks}) ${pc.dim(filePath)}`); + logger.trace(`Collect files... 
(${completedTasks}/${totalTasks}) ${filePath}`); + + return { filePath, result }; }); - const tasks = filePaths.map( - (filePath) => - ({ - filePath, - rootDir, - maxFileSize: config.input.maxFileSize, - }) satisfies FileCollectTask, - ); - - try { - const startTime = process.hrtime.bigint(); - logger.trace(`Starting file collection for ${filePaths.length} files using worker pool`); - - let completedTasks = 0; - const totalTasks = tasks.length; - - const results = await Promise.all( - tasks.map((task) => - taskRunner.run(task).then((result) => { - completedTasks++; - progressCallback(`Collect file... (${completedTasks}/${totalTasks}) ${pc.dim(task.filePath)}`); - logger.trace(`Collect files... (${completedTasks}/${totalTasks}) ${task.filePath}`); - return result; - }), - ), - ); - - const endTime = process.hrtime.bigint(); - const duration = Number(endTime - startTime) / 1e6; - logger.trace(`File collection completed in ${duration.toFixed(2)}ms`); - - const rawFiles: RawFile[] = []; - const skippedFiles: SkippedFileInfo[] = []; - - for (const result of results) { - if (result.rawFile) { - rawFiles.push(result.rawFile); - } - if (result.skippedFile) { - skippedFiles.push(result.skippedFile); - } - } - return { rawFiles, skippedFiles }; - } catch (error) { - logger.error('Error during file collection:', error); - throw error; - } finally { - // Always cleanup worker pool - await taskRunner.cleanup(); + const rawFiles: RawFile[] = []; + const skippedFiles: SkippedFileInfo[] = []; + + for (const { filePath, result } of results) { + if (result.content !== null) { + rawFiles.push({ path: filePath, content: result.content }); + } else if (result.skippedReason) { + skippedFiles.push({ path: filePath, reason: result.skippedReason }); + } } + + const endTime = process.hrtime.bigint(); + const duration = Number(endTime - startTime) / 1e6; + logger.trace(`File collection completed in ${duration.toFixed(2)}ms`); + + return { rawFiles, skippedFiles }; }; diff --git 
a/src/shared/processConcurrency.ts b/src/shared/processConcurrency.ts index 2fe1cb4f8..4c973e32f 100644 --- a/src/shared/processConcurrency.ts +++ b/src/shared/processConcurrency.ts @@ -12,7 +12,6 @@ export interface WorkerOptions { numOfTasks: number; workerType: WorkerType; runtime: WorkerRuntime; - concurrentTasksPerWorker?: number; } /** @@ -65,7 +64,7 @@ export const getWorkerThreadCount = (numOfTasks: number): { minThreads: number; }; export const createWorkerPool = (options: WorkerOptions): Tinypool => { - const { numOfTasks, workerType, runtime = 'child_process', concurrentTasksPerWorker } = options; + const { numOfTasks, workerType, runtime = 'child_process' } = options; const { minThreads, maxThreads } = getWorkerThreadCount(numOfTasks); // Get worker path - uses unified worker in bundled env, individual files otherwise @@ -84,7 +83,6 @@ export const createWorkerPool = (options: WorkerOptions): Tinypool => { maxThreads, idleTimeout: 5000, teardown: 'onWorkerTermination', - ...(concurrentTasksPerWorker != null && { concurrentTasksPerWorker }), workerData: { workerType, logLevel: logger.getLogLevel(), diff --git a/tests/core/file/fileCollect.test.ts b/tests/core/file/fileCollect.test.ts index 56cae27e8..20bdeb67c 100644 --- a/tests/core/file/fileCollect.test.ts +++ b/tests/core/file/fileCollect.test.ts @@ -1,64 +1,17 @@ -import type { Stats } from 'node:fs'; -import * as fs from 'node:fs/promises'; import path from 'node:path'; -import iconv from 'iconv-lite'; -import isBinaryPath from 'is-binary-path'; -import { isBinaryFile } from 'isbinaryfile'; -import jschardet from 'jschardet'; -import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { beforeEach, describe, expect, it, type Mock, vi } from 'vitest'; import { collectFiles } from '../../../src/core/file/fileCollect.js'; -import type { FileCollectTask } from '../../../src/core/file/workers/fileCollectWorker.js'; -import fileCollectWorker from 
'../../../src/core/file/workers/fileCollectWorker.js'; -import { logger } from '../../../src/shared/logger.js'; -import type { WorkerOptions, WorkerRuntime } from '../../../src/shared/processConcurrency.js'; +import type { FileReadResult } from '../../../src/core/file/fileRead.js'; import { createMockConfig } from '../../testing/testUtils.js'; // Define the max file size constant for tests const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB -vi.mock('node:fs/promises'); -vi.mock('is-binary-path'); -vi.mock('isbinaryfile'); -vi.mock('jschardet'); -vi.mock('iconv-lite'); -vi.mock('../../../src/shared/logger'); - -interface MockInitTaskRunner { - <T, R>( - options: WorkerOptions, - ): { - run: (task: T) => Promise<R>; - cleanup: () => Promise<void>; - }; - lastRuntime?: WorkerRuntime; -} - -const mockInitTaskRunner = <T, R>(options: WorkerOptions) => { - // Store runtime for verification in tests - (mockInitTaskRunner as MockInitTaskRunner).lastRuntime = options.runtime; - return { - run: async (task: T) => { - return (await fileCollectWorker(task as FileCollectTask)) as R; - }, - cleanup: async () => { - // Mock cleanup - no-op for tests - }, - }; -}; - describe('fileCollect', () => { - beforeEach(() => { - vi.resetAllMocks(); - - // Setup basic file size mock to fix stat - vi.mocked(fs.stat).mockResolvedValue({ - size: 1024, - isFile: () => true, - } as Stats); - }); + let mockReadRawFile: Mock<(filePath: string, maxFileSize: number) => Promise<FileReadResult>>; - afterEach(() => { - vi.resetAllMocks(); + beforeEach(() => { + mockReadRawFile = vi.fn(); }); it('should collect non-binary files', async () => { @@ -66,14 +19,10 @@ const mockRootDir = '/root'; const mockConfig = createMockConfig(); - vi.mocked(isBinaryPath).mockReturnValue(false); - vi.mocked(isBinaryFile).mockResolvedValue(false); - vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); - vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - 
vi.mocked(iconv.decode).mockReturnValue('file content'); + mockReadRawFile.mockResolvedValue({ content: 'file content' }); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { - initTaskRunner: mockInitTaskRunner, + readRawFile: mockReadRawFile, }); expect(result).toEqual({ @@ -83,6 +32,9 @@ describe('fileCollect', () => { ], skippedFiles: [], }); + expect(mockReadRawFile).toHaveBeenCalledTimes(2); + expect(mockReadRawFile).toHaveBeenCalledWith(path.resolve('/root/file1.txt'), MAX_FILE_SIZE); + expect(mockReadRawFile).toHaveBeenCalledWith(path.resolve('/root/file2.txt'), MAX_FILE_SIZE); }); it('should skip binary files', async () => { @@ -90,62 +42,43 @@ describe('fileCollect', () => { const mockRootDir = '/root'; const mockConfig = createMockConfig(); - vi.mocked(isBinaryPath) - .mockReturnValueOnce(true) // for binary.bin (skip by extension) - .mockReturnValueOnce(false); // for text.txt - vi.mocked(isBinaryFile).mockResolvedValue(false); - vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); - vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('file content'); + mockReadRawFile.mockImplementation(async (filePath) => { + if (filePath.endsWith('binary.bin')) { + return { content: null, skippedReason: 'binary-extension' }; + } + return { content: 'file content' }; + }); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { - initTaskRunner: mockInitTaskRunner, + readRawFile: mockReadRawFile, }); expect(result).toEqual({ rawFiles: [{ path: 'text.txt', content: 'file content' }], skippedFiles: [{ path: 'binary.bin', reason: 'binary-extension' }], }); - expect(logger.debug).toHaveBeenCalledWith(`Skipping binary file: ${path.resolve('/root/binary.bin')}`); }); it('should skip large files based on default maxFileSize', async () => { const mockFilePaths = ['large.txt', 'normal.txt']; const mockRootDir = '/root'; 
const mockConfig = createMockConfig(); - const largePath = path.resolve('/root/large.txt'); - - vi.mocked(fs.stat) - .mockResolvedValueOnce({ - // for large.txt - size: MAX_FILE_SIZE + 1024, // Slightly over limit - isFile: () => true, - } as Stats) - .mockResolvedValueOnce({ - // for normal.txt - size: 1024, - isFile: () => true, - } as Stats); - vi.mocked(isBinaryPath).mockReturnValue(false); - vi.mocked(isBinaryFile).mockResolvedValue(false); - vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); - vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('file content'); + + mockReadRawFile.mockImplementation(async (filePath) => { + if (filePath.endsWith('large.txt')) { + return { content: null, skippedReason: 'size-limit' }; + } + return { content: 'file content' }; + }); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { - initTaskRunner: mockInitTaskRunner, + readRawFile: mockReadRawFile, }); expect(result).toEqual({ rawFiles: [{ path: 'normal.txt', content: 'file content' }], skippedFiles: [{ path: 'large.txt', reason: 'size-limit' }], }); - expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('File exceeds size limit:')); - expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining(largePath)); - - // Verify fs.readFile is not called for the large file - expect(fs.readFile).not.toHaveBeenCalledWith(largePath); - expect(fs.readFile).toHaveBeenCalledTimes(1); }); it('should respect custom maxFileSize setting', async () => { @@ -157,40 +90,26 @@ describe('fileCollect', () => { maxFileSize: customMaxFileSize, }, }); - const mediumPath = path.resolve('/root/medium.txt'); - - vi.mocked(fs.stat) - .mockResolvedValueOnce({ - // for medium.txt - size: 10 * 1024 * 1024, // 10MB (exceeds custom 5MB limit) - isFile: () => true, - } as Stats) - .mockResolvedValueOnce({ - // for small.txt - size: 1024, // 1KB (within limit) - 
isFile: () => true, - } as Stats); - vi.mocked(isBinaryPath).mockReturnValue(false); - vi.mocked(isBinaryFile).mockResolvedValue(false); - vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); - vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('file content'); + + mockReadRawFile.mockImplementation(async (filePath) => { + if (filePath.endsWith('medium.txt')) { + return { content: null, skippedReason: 'size-limit' }; + } + return { content: 'file content' }; + }); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { - initTaskRunner: mockInitTaskRunner, + readRawFile: mockReadRawFile, }); expect(result).toEqual({ rawFiles: [{ path: 'small.txt', content: 'file content' }], skippedFiles: [{ path: 'medium.txt', reason: 'size-limit' }], }); - expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('File exceeds size limit:')); - expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining('10240.0KB > 5120.0KB')); - expect(logger.trace).toHaveBeenCalledWith(expect.stringContaining(mediumPath)); - // Verify fs.readFile is not called for the medium file - expect(fs.readFile).not.toHaveBeenCalledWith(mediumPath); - expect(fs.readFile).toHaveBeenCalledTimes(1); + // Verify readRawFile is called with custom maxFileSize + expect(mockReadRawFile).toHaveBeenCalledWith(path.resolve('/root/medium.txt'), customMaxFileSize); + expect(mockReadRawFile).toHaveBeenCalledWith(path.resolve('/root/small.txt'), customMaxFileSize); }); it('should handle file read errors', async () => { @@ -198,39 +117,30 @@ describe('fileCollect', () => { const mockRootDir = '/root'; const mockConfig = createMockConfig(); - vi.mocked(isBinaryPath).mockReturnValue(false); - vi.mocked(isBinaryFile).mockResolvedValue(false); - vi.mocked(fs.readFile).mockRejectedValue(new Error('Read error')); + mockReadRawFile.mockResolvedValue({ content: null, skippedReason: 
'encoding-error' }); const result = await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { - initTaskRunner: mockInitTaskRunner, + readRawFile: mockReadRawFile, }); expect(result).toEqual({ rawFiles: [], skippedFiles: [{ path: 'error.txt', reason: 'encoding-error' }], }); - expect(logger.warn).toHaveBeenCalledWith( - `Failed to read file: ${path.resolve('/root/error.txt')}`, - expect.any(Error), - ); }); - it('should use worker_threads runtime when calling initTaskRunner', async () => { - const mockFilePaths = ['test.txt']; + it('should call progressCallback for each file', async () => { + const mockFilePaths = ['file1.txt', 'file2.txt']; const mockRootDir = '/root'; const mockConfig = createMockConfig(); + const mockProgress = vi.fn(); - vi.mocked(isBinaryPath).mockReturnValue(false); - vi.mocked(isBinaryFile).mockResolvedValue(false); - vi.mocked(fs.readFile).mockResolvedValue(Buffer.from('file content')); - vi.mocked(jschardet.detect).mockReturnValue({ encoding: 'utf-8', confidence: 0.99 }); - vi.mocked(iconv.decode).mockReturnValue('file content'); + mockReadRawFile.mockResolvedValue({ content: 'file content' }); - await collectFiles(mockFilePaths, mockRootDir, mockConfig, () => {}, { - initTaskRunner: mockInitTaskRunner, + await collectFiles(mockFilePaths, mockRootDir, mockConfig, mockProgress, { + readRawFile: mockReadRawFile, }); - expect((mockInitTaskRunner as MockInitTaskRunner).lastRuntime).toBe('worker_threads'); + expect(mockProgress).toHaveBeenCalledTimes(2); }); }); diff --git a/tests/integration-tests/packager.test.ts b/tests/integration-tests/packager.test.ts index e8c5aafcb..214de2fe0 100644 --- a/tests/integration-tests/packager.test.ts +++ b/tests/integration-tests/packager.test.ts @@ -9,35 +9,22 @@ import { afterEach, beforeEach, describe, expect, test } from 'vitest'; import { loadFileConfig, mergeConfigs } from '../../src/config/configLoad.js'; import type { RepomixConfigFile, RepomixConfigMerged, RepomixOutputStyle } from 
'../../src/config/configSchema.js'; import { collectFiles } from '../../src/core/file/fileCollect.js'; +import { readRawFile } from '../../src/core/file/fileRead.js'; import { searchFiles } from '../../src/core/file/fileSearch.js'; import type { ProcessedFile } from '../../src/core/file/fileTypes.js'; - -import type { FileCollectTask } from '../../src/core/file/workers/fileCollectWorker.js'; -import fileCollectWorker from '../../src/core/file/workers/fileCollectWorker.js'; import fileProcessWorker from '../../src/core/file/workers/fileProcessWorker.js'; import type { GitDiffResult } from '../../src/core/git/gitDiffHandle.js'; import { produceOutput } from '../../src/core/packager/produceOutput.js'; import { pack } from '../../src/core/packager.js'; import { filterOutUntrustedFiles } from '../../src/core/security/filterOutUntrustedFiles.js'; import { validateFileSafety } from '../../src/core/security/validateFileSafety.js'; -import type { WorkerOptions } from '../../src/shared/processConcurrency.js'; + import { isWindows } from '../testing/testUtils.js'; const fixturesDir = path.join(__dirname, 'fixtures', 'packager'); const inputsDir = path.join(fixturesDir, 'inputs'); const outputsDir = path.join(fixturesDir, 'outputs'); -const mockCollectFileInitTaskRunner = (_options: WorkerOptions) => { - return { - run: async (task: T) => { - return (await fileCollectWorker(task as FileCollectTask)) as R; - }, - cleanup: async () => { - // Mock cleanup - no-op for tests - }, - }; -}; - describe.runIf(!isWindows)('packager integration', () => { const testCases = [ { @@ -107,7 +94,7 @@ describe.runIf(!isWindows)('packager integration', () => { sortPaths: (filePaths) => filePaths, collectFiles: (filePaths, rootDir, config, progressCallback) => { return collectFiles(filePaths, rootDir, config, progressCallback, { - initTaskRunner: mockCollectFileInitTaskRunner, + readRawFile, }); }, processFiles: async (rawFiles, config, _progressCallback) => { From 
05f11f46c705edd64594106de3a0d1a69cce06ad Mon Sep 17 00:00:00 2001 From: Kazuki Yamada Date: Sat, 14 Feb 2026 11:29:47 +0900 Subject: [PATCH 3/3] refactor(core): Remove unused fileCollect worker infrastructure File collection was replaced with a promise pool approach in 96ff05dc, but the worker-related code remained. This removes the now-unused fileCollectWorker and all references to it from the worker system. --- src/core/file/workers/fileCollectWorker.ts | 56 ---------------------- src/shared/processConcurrency.ts | 2 - src/shared/unifiedWorker.ts | 12 +---- tests/shared/processConcurrency.test.ts | 6 +-- tests/shared/unifiedWorker.test.ts | 18 ------- 5 files changed, 4 insertions(+), 90 deletions(-) delete mode 100644 src/core/file/workers/fileCollectWorker.ts diff --git a/src/core/file/workers/fileCollectWorker.ts b/src/core/file/workers/fileCollectWorker.ts deleted file mode 100644 index 7da9d0ab4..000000000 --- a/src/core/file/workers/fileCollectWorker.ts +++ /dev/null @@ -1,56 +0,0 @@ -import path from 'node:path'; -import { setLogLevelByWorkerData } from '../../../shared/logger.js'; -import { type FileSkipReason, readRawFile } from '../fileRead.js'; -import type { RawFile } from '../fileTypes.js'; - -// Initialize logger configuration from workerData at module load time -// This must be called before any logging operations in the worker -setLogLevelByWorkerData(); - -export interface FileCollectTask { - filePath: string; - rootDir: string; - maxFileSize: number; -} - -export interface SkippedFileInfo { - path: string; - reason: FileSkipReason; -} - -export interface FileCollectResult { - rawFile?: RawFile; - skippedFile?: SkippedFileInfo; -} - -export default async ({ filePath, rootDir, maxFileSize }: FileCollectTask): Promise => { - const fullPath = path.resolve(rootDir, filePath); - const result = await readRawFile(fullPath, maxFileSize); - - if (result.content !== null) { - return { - rawFile: { - path: filePath, - content: result.content, - }, - }; - } 
- - if (result.skippedReason) { - return { - skippedFile: { - path: filePath, - reason: result.skippedReason, - }, - }; - } - - throw new Error( - `File processing for ${filePath} resulted in an unexpected state: content is null but no skip reason was provided.`, - ); -}; - -// Export cleanup function for Tinypool teardown (no cleanup needed for this worker) -export const onWorkerTermination = async (): Promise => { - // No cleanup needed for file collection worker -}; diff --git a/src/shared/processConcurrency.ts b/src/shared/processConcurrency.ts index 4c973e32f..0507131a2 100644 --- a/src/shared/processConcurrency.ts +++ b/src/shared/processConcurrency.ts @@ -27,8 +27,6 @@ const getWorkerPath = (workerType: WorkerType): string => { // Non-bundled environment: use individual worker files switch (workerType) { - case 'fileCollect': - return new URL('../core/file/workers/fileCollectWorker.js', import.meta.url).href; case 'fileProcess': return new URL('../core/file/workers/fileProcessWorker.js', import.meta.url).href; case 'securityCheck': diff --git a/src/shared/unifiedWorker.ts b/src/shared/unifiedWorker.ts index 1adec3c91..32594f4c6 100644 --- a/src/shared/unifiedWorker.ts +++ b/src/shared/unifiedWorker.ts @@ -12,7 +12,7 @@ import { workerData } from 'node:worker_threads'; // Worker type definitions -export type WorkerType = 'fileCollect' | 'fileProcess' | 'securityCheck' | 'calculateMetrics' | 'defaultAction'; +export type WorkerType = 'fileProcess' | 'securityCheck' | 'calculateMetrics' | 'defaultAction'; // Worker handler type - uses 'any' to accommodate different worker signatures // biome-ignore lint/suspicious/noExplicitAny: Worker handlers have varying signatures @@ -39,11 +39,6 @@ const loadWorkerHandler = async ( let result: { handler: WorkerHandler; cleanup?: WorkerCleanup }; switch (workerType) { - case 'fileCollect': { - const module = await import('../core/file/workers/fileCollectWorker.js'); - result = { handler: module.default as WorkerHandler, 
cleanup: module.onWorkerTermination }; - break; - } case 'fileProcess': { const module = await import('../core/file/workers/fileProcessWorker.js'); result = { handler: module.default as WorkerHandler, cleanup: module.onWorkerTermination }; @@ -95,11 +90,6 @@ const inferWorkerTypeFromTask = (task: unknown): WorkerType | null => { return 'defaultAction'; } - // fileCollect: has filePath, rootDir, maxFileSize - if ('filePath' in taskObj && 'rootDir' in taskObj && 'maxFileSize' in taskObj) { - return 'fileCollect'; - } - // fileProcess: has rawFile (nested object) and config if ('rawFile' in taskObj && 'config' in taskObj) { return 'fileProcess'; diff --git a/tests/shared/processConcurrency.test.ts b/tests/shared/processConcurrency.test.ts index 940ed2abe..289406ff6 100644 --- a/tests/shared/processConcurrency.test.ts +++ b/tests/shared/processConcurrency.test.ts @@ -91,17 +91,17 @@ describe('processConcurrency', () => { }); it('should initialize Tinypool with correct configuration', () => { - const tinypool = createWorkerPool({ numOfTasks: 500, workerType: 'fileCollect', runtime: 'child_process' }); + const tinypool = createWorkerPool({ numOfTasks: 500, workerType: 'fileProcess', runtime: 'child_process' }); expect(Tinypool).toHaveBeenCalledWith({ - filename: expect.stringContaining('fileCollectWorker.js'), + filename: expect.stringContaining('fileProcessWorker.js'), runtime: 'child_process', minThreads: 1, maxThreads: 4, // Math.min(4, 500/100) = 4 idleTimeout: 5000, teardown: 'onWorkerTermination', workerData: { - workerType: 'fileCollect', + workerType: 'fileProcess', logLevel: 2, }, env: expect.objectContaining({ diff --git a/tests/shared/unifiedWorker.test.ts b/tests/shared/unifiedWorker.test.ts index ee1823adb..7f3247b35 100644 --- a/tests/shared/unifiedWorker.test.ts +++ b/tests/shared/unifiedWorker.test.ts @@ -2,10 +2,6 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; // We need to test the internal functions, so we'll test through the module 
behavior // Mock all worker modules -vi.mock('../../src/core/file/workers/fileCollectWorker.js', () => ({ - default: vi.fn().mockResolvedValue({ collected: true }), - onWorkerTermination: vi.fn(), -})); vi.mock('../../src/core/file/workers/fileProcessWorker.js', () => ({ default: vi.fn().mockResolvedValue({ processed: true }), onWorkerTermination: vi.fn(), @@ -61,20 +57,6 @@ describe('unifiedWorker', () => { expect(defaultActionWorker.default).toHaveBeenCalledWith(task); }); - it('should infer fileCollect from task with filePath, rootDir, maxFileSize', async () => { - const { default: handler } = await import('../../src/shared/unifiedWorker.js'); - const task = { - filePath: 'test.ts', - rootDir: '/root', - maxFileSize: 1000, - }; - - await handler(task); - - const fileCollectWorker = await import('../../src/core/file/workers/fileCollectWorker.js'); - expect(fileCollectWorker.default).toHaveBeenCalledWith(task); - }); - it('should infer fileProcess from task with rawFile and config', async () => { const { default: handler } = await import('../../src/shared/unifiedWorker.js'); const task = {