Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 58 additions & 58 deletions src/core/file/fileCollect.ts
Original file line number Diff line number Diff line change
@@ -1,82 +1,82 @@
import path from 'node:path';
import pc from 'picocolors';
import type { RepomixConfigMerged } from '../../config/configSchema.js';
import { logger } from '../../shared/logger.js';
import { initTaskRunner } from '../../shared/processConcurrency.js';
import type { RepomixProgressCallback } from '../../shared/types.js';
import { readRawFile as defaultReadRawFile, type FileSkipReason } from './fileRead.js';
import type { RawFile } from './fileTypes.js';
import type { FileCollectResult, FileCollectTask, SkippedFileInfo } from './workers/fileCollectWorker.js';

// Concurrency limit for parallel file reads on the main thread.
// 50 balances I/O throughput with FD/memory safety across different machines.
const FILE_COLLECT_CONCURRENCY = 50;

/**
 * A file that was skipped during collection, together with the reason.
 *
 * NOTE(review): `SkippedFileInfo` is also imported as a type from
 * './fileRead.js' / './workers/fileCollectWorker.js' above — two declarations
 * of the same exported name will not compile; confirm the worker-era
 * import/re-export is removed so this local interface is the single source.
 */
export interface SkippedFileInfo {
path: string;
reason: FileSkipReason;
}

/**
 * Aggregate result of `collectFiles`: the files read successfully plus the
 * files that were skipped (each with a machine-readable reason).
 */
export interface FileCollectResults {
rawFiles: RawFile[];
skippedFiles: SkippedFileInfo[];
}

// (The former re-export of SkippedFileInfo from the deleted worker module is
// removed here: it duplicated the locally declared `SkippedFileInfo` interface,
// which is a duplicate-identifier compile error.)

/**
 * Runs `fn` over every item with at most `concurrency` calls in flight at once.
 *
 * Results are returned in the same order as `items`, regardless of completion
 * order. If any call rejects, the returned promise rejects (in-flight calls
 * are not cancelled).
 *
 * @param items - Work items to process.
 * @param concurrency - Maximum number of simultaneous `fn` calls; values below
 *   1 are clamped to 1 so the pool always makes progress.
 * @param fn - Async mapper applied to each item.
 * @returns Results of `fn`, index-aligned with `items`.
 */
const promisePool = async <T, R>(items: T[], concurrency: number, fn: (item: T) => Promise<R>): Promise<R[]> => {
  const results: R[] = new Array<R>(items.length);
  let nextIndex = 0;

  // Each worker pulls the next unclaimed index. Safe without locks: JS is
  // single-threaded, and `nextIndex++` happens synchronously before the await.
  const worker = async () => {
    while (nextIndex < items.length) {
      const i = nextIndex++;
      results[i] = await fn(items[i]);
    }
  };

  // Clamp to [1, items.length]: a non-positive concurrency would otherwise
  // spawn zero workers and resolve immediately with an array of holes.
  const workerCount = Math.max(1, Math.min(concurrency, items.length));
  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  return results;
};

export const collectFiles = async (
filePaths: string[],
rootDir: string,
config: RepomixConfigMerged,
progressCallback: RepomixProgressCallback = () => {},
deps = {
initTaskRunner,
readRawFile: defaultReadRawFile,
},
): Promise<FileCollectResults> => {
const taskRunner = deps.initTaskRunner<FileCollectTask, FileCollectResult>({
numOfTasks: filePaths.length,
workerType: 'fileCollect',
runtime: 'worker_threads',
const startTime = process.hrtime.bigint();
logger.trace(`Starting file collection for ${filePaths.length} files`);

let completedTasks = 0;
const totalTasks = filePaths.length;
const maxFileSize = config.input.maxFileSize;

const results = await promisePool(filePaths, FILE_COLLECT_CONCURRENCY, async (filePath) => {
const fullPath = path.resolve(rootDir, filePath);
const result = await deps.readRawFile(fullPath, maxFileSize);

completedTasks++;
progressCallback(`Collect file... (${completedTasks}/${totalTasks}) ${pc.dim(filePath)}`);
logger.trace(`Collect files... (${completedTasks}/${totalTasks}) ${filePath}`);

return { filePath, result };
});
const tasks = filePaths.map(
(filePath) =>
({
filePath,
rootDir,
maxFileSize: config.input.maxFileSize,
}) satisfies FileCollectTask,
);

try {
const startTime = process.hrtime.bigint();
logger.trace(`Starting file collection for ${filePaths.length} files using worker pool`);

let completedTasks = 0;
const totalTasks = tasks.length;

const results = await Promise.all(
tasks.map((task) =>
taskRunner.run(task).then((result) => {
completedTasks++;
progressCallback(`Collect file... (${completedTasks}/${totalTasks}) ${pc.dim(task.filePath)}`);
logger.trace(`Collect files... (${completedTasks}/${totalTasks}) ${task.filePath}`);
return result;
}),
),
);

const endTime = process.hrtime.bigint();
const duration = Number(endTime - startTime) / 1e6;
logger.trace(`File collection completed in ${duration.toFixed(2)}ms`);

const rawFiles: RawFile[] = [];
const skippedFiles: SkippedFileInfo[] = [];

for (const result of results) {
if (result.rawFile) {
rawFiles.push(result.rawFile);
}
if (result.skippedFile) {
skippedFiles.push(result.skippedFile);
}
}

return { rawFiles, skippedFiles };
} catch (error) {
logger.error('Error during file collection:', error);
throw error;
} finally {
// Always cleanup worker pool
await taskRunner.cleanup();
const rawFiles: RawFile[] = [];
const skippedFiles: SkippedFileInfo[] = [];

for (const { filePath, result } of results) {
if (result.content !== null) {
rawFiles.push({ path: filePath, content: result.content });
} else if (result.skippedReason) {
skippedFiles.push({ path: filePath, reason: result.skippedReason });
}
}

const endTime = process.hrtime.bigint();
const duration = Number(endTime - startTime) / 1e6;
logger.trace(`File collection completed in ${duration.toFixed(2)}ms`);

return { rawFiles, skippedFiles };
};
27 changes: 14 additions & 13 deletions src/core/file/fileRead.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,24 +43,25 @@ export const readRawFile = async (filePath: string, maxFileSize: number): Promis
return { content: null, skippedReason: 'binary-content' };
}

// Fast path: Try UTF-8 decoding first (covers ~99% of source code files)
// This skips the expensive jschardet.detect() which scans the entire buffer
// through multiple encoding probers with frequency table lookups
try {
let content = new TextDecoder('utf-8', { fatal: true }).decode(buffer);
if (content.charCodeAt(0) === 0xfeff) {
content = content.slice(1); // strip UTF-8 BOM
}
return { content };
} catch {
// Not valid UTF-8, fall through to encoding detection
}

// Slow path: Detect encoding with jschardet for non-UTF-8 files (e.g., Shift-JIS, EUC-KR)
const { encoding: detectedEncoding } = jschardet.detect(buffer) ?? {};
const encoding = detectedEncoding && iconv.encodingExists(detectedEncoding) ? detectedEncoding : 'utf-8';

const content = iconv.decode(buffer, encoding, { stripBOM: true });

// Only skip if there are actual decode errors (U+FFFD replacement characters)
// Don't rely on jschardet confidence as it can return low values for valid UTF-8/ASCII files
if (content.includes('\uFFFD')) {
// For UTF-8, distinguish invalid byte sequences from a legitimate U+FFFD in the source
if (encoding.toLowerCase() === 'utf-8') {
try {
let utf8 = new TextDecoder('utf-8', { fatal: true }).decode(buffer);
if (utf8.charCodeAt(0) === 0xfeff) utf8 = utf8.slice(1); // strip UTF-8 BOM
return { content: utf8 };
} catch {
// fall through to skip below
}
}
logger.debug(`Skipping file due to encoding errors (detected: ${encoding}): ${filePath}`);
return { content: null, skippedReason: 'encoding-error' };
}
Expand Down
56 changes: 0 additions & 56 deletions src/core/file/workers/fileCollectWorker.ts

This file was deleted.

2 changes: 0 additions & 2 deletions src/shared/processConcurrency.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ const getWorkerPath = (workerType: WorkerType): string => {

// Non-bundled environment: use individual worker files
switch (workerType) {
case 'fileCollect':
return new URL('../core/file/workers/fileCollectWorker.js', import.meta.url).href;
case 'fileProcess':
return new URL('../core/file/workers/fileProcessWorker.js', import.meta.url).href;
case 'securityCheck':
Expand Down
12 changes: 1 addition & 11 deletions src/shared/unifiedWorker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import { workerData } from 'node:worker_threads';

// Worker type definitions
// NOTE(review): two conflicting declarations of `WorkerType` appear here —
// one including 'fileCollect' (pre-change) and one without it (post-change).
// This is diff residue and a duplicate-identifier compile error; keep only the
// declaration matching the final state of the change (the one without
// 'fileCollect', whose worker was deleted) and delete the other.
export type WorkerType = 'fileCollect' | 'fileProcess' | 'securityCheck' | 'calculateMetrics' | 'defaultAction';
export type WorkerType = 'fileProcess' | 'securityCheck' | 'calculateMetrics' | 'defaultAction';

// Worker handler type - uses 'any' to accommodate different worker signatures
// biome-ignore lint/suspicious/noExplicitAny: Worker handlers have varying signatures
Expand All @@ -39,11 +39,6 @@ const loadWorkerHandler = async (
let result: { handler: WorkerHandler; cleanup?: WorkerCleanup };

switch (workerType) {
case 'fileCollect': {
const module = await import('../core/file/workers/fileCollectWorker.js');
result = { handler: module.default as WorkerHandler, cleanup: module.onWorkerTermination };
break;
}
case 'fileProcess': {
const module = await import('../core/file/workers/fileProcessWorker.js');
result = { handler: module.default as WorkerHandler, cleanup: module.onWorkerTermination };
Expand Down Expand Up @@ -95,11 +90,6 @@ const inferWorkerTypeFromTask = (task: unknown): WorkerType | null => {
return 'defaultAction';
}

// fileCollect: has filePath, rootDir, maxFileSize
if ('filePath' in taskObj && 'rootDir' in taskObj && 'maxFileSize' in taskObj) {
return 'fileCollect';
}

// fileProcess: has rawFile (nested object) and config
if ('rawFile' in taskObj && 'config' in taskObj) {
return 'fileProcess';
Expand Down
Loading
Loading