Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 118 additions & 43 deletions src/core/file/fileProcess.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,64 +5,139 @@ import { initTaskRunner } from '../../shared/processConcurrency.js';
import type { RepomixProgressCallback } from '../../shared/types.js';
import { type FileManipulator, getFileManipulator } from './fileManipulate.js';
import type { ProcessedFile, RawFile } from './fileTypes.js';
import { truncateBase64Content } from './truncateBase64.js';
import type { FileProcessTask } from './workers/fileProcessWorker.js';

type GetFileManipulator = (filePath: string) => FileManipulator | null;

/**
 * Apply lightweight transforms on the main thread after worker processing.
 * All lightweight transforms are centralized here to avoid duplication with workers.
 *
 * Transform order: [removeComments → compress] (worker) → truncateBase64 → removeEmptyLines → trim → showLineNumbers
 * - removeEmptyLines runs after removeComments so that empty lines created by comment removal are cleaned up.
 *
 * @param files - Files to transform ({ path, content } pairs); neither the array nor its elements are mutated.
 * @param config - Merged Repomix configuration; only the `config.output` flags are read here.
 * @param progressCallback - Invoked every 50th file and on the last file (throttled to limit UI churn).
 * @param deps - Injected manipulator lookup (dependency seam for testing).
 * @returns A new array of transformed files, in the same order as the input.
 */
export const applyLightweightTransforms = (
  files: ProcessedFile[],
  config: RepomixConfigMerged,
  progressCallback: RepomixProgressCallback,
  deps: { getFileManipulator: GetFileManipulator },
): ProcessedFile[] => {
  const totalFiles = files.length;
  // Build the result incrementally rather than `Array.from({ length }) as ProcessedFile[]`:
  // that cast lies to the checker — the array actually holds `undefined` until each slot
  // is written, so any early read would be unsound. Push keeps the type honest.
  const results: ProcessedFile[] = [];

  for (let i = 0; i < totalFiles; i++) {
    const file = files[i];
    let content = file.content;

    if (config.output.truncateBase64) {
      content = truncateBase64Content(content);
    }

    if (config.output.removeEmptyLines) {
      // Manipulator is language-specific; files without one pass through unchanged.
      const manipulator = deps.getFileManipulator(file.path);
      if (manipulator) {
        content = manipulator.removeEmptyLines(content);
      }
    }

    content = content.trim();

    // Line numbers are skipped in compress mode: compression drops/reorders lines,
    // so numbering the compressed output would be misleading.
    if (config.output.showLineNumbers && !config.output.compress) {
      const lines = content.split('\n');
      // Pad to the width of the largest line number so columns align.
      const padding = lines.length.toString().length;
      content = lines.map((line, idx) => `${(idx + 1).toString().padStart(padding)}: ${line}`).join('\n');
    }

    results.push({ path: file.path, content });

    // Throttled progress: every 50th file, plus the final file so callers always see 100%.
    if ((i + 1) % 50 === 0 || i === totalFiles - 1) {
      progressCallback(`Processing file... (${i + 1}/${totalFiles}) ${pc.dim(file.path)}`);
    }
  }

  return results;
};

/**
* Process files through a two-phase pipeline:
*
* 1. Heavy transforms (worker threads, skipped when not needed):
* removeComments → compress
*
* 2. Lightweight transforms (main thread, always applied):
* truncateBase64 → removeEmptyLines → trim → showLineNumbers
*
* removeEmptyLines intentionally runs after removeComments so that
* empty lines created by comment removal are cleaned up.
*/
export const processFiles = async (
rawFiles: RawFile[],
config: RepomixConfigMerged,
progressCallback: RepomixProgressCallback,
deps: {
initTaskRunner: typeof initTaskRunner;
getFileManipulator: GetFileManipulator;
} = {
deps = {
initTaskRunner,
getFileManipulator,
},
): Promise<ProcessedFile[]> => {
const taskRunner = deps.initTaskRunner<FileProcessTask, ProcessedFile>({
numOfTasks: rawFiles.length,
workerType: 'fileProcess',
// High memory usage and leak risk
runtime: 'worker_threads',
});
const tasks = rawFiles.map(
(rawFile, _index) =>
({
rawFile,
config,
}) satisfies FileProcessTask,
);

try {
const startTime = process.hrtime.bigint();
const startTime = process.hrtime.bigint();
let files: ProcessedFile[];

// Only compress (tree-sitter) and removeComments (AST manipulation) justify worker thread overhead
const useWorkers = config.output.compress || config.output.removeComments;

if (useWorkers) {
// Phase 1: Heavy processing via workers (removeComments, compress)
logger.trace(`Starting file processing for ${rawFiles.length} files using worker pool`);

let completedTasks = 0;
const totalTasks = tasks.length;

const results = await Promise.all(
tasks.map((task) =>
taskRunner.run(task).then((result) => {
completedTasks++;
progressCallback(`Processing file... (${completedTasks}/${totalTasks}) ${pc.dim(task.rawFile.path)}`);
logger.trace(`Processing file... (${completedTasks}/${totalTasks}) ${task.rawFile.path}`);
return result;
}),
),
const taskRunner = deps.initTaskRunner<FileProcessTask, ProcessedFile>({
numOfTasks: rawFiles.length,
workerType: 'fileProcess',
runtime: 'worker_threads',
});

const tasks = rawFiles.map(
(rawFile) =>
({
rawFile,
config,
}) satisfies FileProcessTask,
);

const endTime = process.hrtime.bigint();
const duration = Number(endTime - startTime) / 1e6;
logger.trace(`File processing completed in ${duration.toFixed(2)}ms`);

return results;
} catch (error) {
logger.error('Error during file processing:', error);
throw error;
} finally {
// Always cleanup worker pool
await taskRunner.cleanup();
try {
let completedTasks = 0;
const totalTasks = tasks.length;

files = await Promise.all(
tasks.map((task) =>
taskRunner.run(task).then((result) => {
completedTasks++;
progressCallback(`Processing file... (${completedTasks}/${totalTasks}) ${pc.dim(task.rawFile.path)}`);
logger.trace(`Processing file... (${completedTasks}/${totalTasks}) ${task.rawFile.path}`);
return result;
}),
),
);
} catch (error) {
logger.error('Error during file processing:', error);
throw error;
} finally {
await taskRunner.cleanup();
}

// Phase 2: Lightweight transforms (no progress - already reported by workers)
files = applyLightweightTransforms(files, config, () => {}, deps);
} else {
// No heavy processing needed - apply lightweight transforms directly
logger.trace(`Starting file processing for ${rawFiles.length} files in main thread (lightweight mode)`);
const inputFiles = rawFiles.map((rawFile) => ({ path: rawFile.path, content: rawFile.content }));
files = applyLightweightTransforms(inputFiles, config, progressCallback, deps);
}

const endTime = process.hrtime.bigint();
const duration = Number(endTime - startTime) / 1e6;
logger.trace(`File processing completed in ${duration.toFixed(2)}ms`);

return files;
};
31 changes: 5 additions & 26 deletions src/core/file/fileProcessContent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,15 @@ import { logger } from '../../shared/logger.js';
import { parseFile } from '../treeSitter/parseFile.js';
import { getFileManipulator } from './fileManipulate.js';
import type { RawFile } from './fileTypes.js';
import { truncateBase64Content } from './truncateBase64.js';

/**
* Process the content of a file according to the configuration
* Applies various transformations based on the config:
* - Remove comments
* - Remove empty lines
* - Truncate base64 encoded data
* Process the content of a file for CPU-intensive operations.
* Only handles heavy transformations that benefit from worker threads:
* - Remove comments (language-specific AST manipulation)
* - Compress content using Tree-sitter
* - Add line numbers
*
* @param rawFile Raw file data containing path and content
* @param config Repomix configuration
* @returns Processed content string
* Lightweight transforms (truncateBase64, removeEmptyLines, trim, showLineNumbers)
* are applied separately on the main thread by processFiles().
*/
export const processContent = async (rawFile: RawFile, config: RepomixConfigMerged): Promise<string> => {
const processStartAt = process.hrtime.bigint();
Expand All @@ -25,20 +20,10 @@ export const processContent = async (rawFile: RawFile, config: RepomixConfigMerg

logger.trace(`Processing file: ${rawFile.path}`);

if (config.output.truncateBase64) {
processedContent = truncateBase64Content(processedContent);
}

if (manipulator && config.output.removeComments) {
processedContent = manipulator.removeComments(processedContent);
}

if (config.output.removeEmptyLines && manipulator) {
processedContent = manipulator.removeEmptyLines(processedContent);
}

processedContent = processedContent.trim();

if (config.output.compress) {
try {
const parsedContent = await parseFile(processedContent, rawFile.path, config);
Expand All @@ -49,14 +34,8 @@ export const processContent = async (rawFile: RawFile, config: RepomixConfigMerg
} catch (error: unknown) {
const message = error instanceof Error ? error.message : String(error);
logger.error(`Error parsing ${rawFile.path} in compressed mode: ${message}`);
//re-throw error
throw error;
}
} else if (config.output.showLineNumbers) {
const lines = processedContent.split('\n');
const padding = lines.length.toString().length;
const numberedLines = lines.map((line, i) => `${(i + 1).toString().padStart(padding)}: ${line}`);
processedContent = numberedLines.join('\n');
}

const processEndAt = process.hrtime.bigint();
Expand Down
19 changes: 10 additions & 9 deletions src/core/file/truncateBase64.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ const TRUNCATION_LENGTH = 32;
const MIN_CHAR_DIVERSITY = 10;
const MIN_CHAR_TYPE_COUNT = 3;

// Pre-compiled regex patterns (avoid re-creation per file)
const dataUriPattern = new RegExp(
`data:([a-zA-Z0-9\\/\\-\\+]+)(;[a-zA-Z0-9\\-=]+)*;base64,([A-Za-z0-9+/=]{${MIN_BASE64_LENGTH_DATA_URI},})`,
'g',
);
const standaloneBase64Pattern = new RegExp(`([A-Za-z0-9+/]{${MIN_BASE64_LENGTH_STANDALONE},}={0,2})`, 'g');

/**
* Truncates base64 encoded data in content to reduce file size
* Detects common base64 patterns like data URIs and standalone base64 strings
Expand All @@ -13,15 +20,9 @@ const MIN_CHAR_TYPE_COUNT = 3;
* @returns Content with base64 data truncated
*/
export const truncateBase64Content = (content: string): string => {
// Pattern to match data URIs (e.g., data:image/png;base64,...)
const dataUriPattern = new RegExp(
`data:([a-zA-Z0-9\\/\\-\\+]+)(;[a-zA-Z0-9\\-=]+)*;base64,([A-Za-z0-9+/=]{${MIN_BASE64_LENGTH_DATA_URI},})`,
'g',
);

// Pattern to match standalone base64 strings
// This matches base64 strings that are likely encoded binary data
const standaloneBase64Pattern = new RegExp(`([A-Za-z0-9+/]{${MIN_BASE64_LENGTH_STANDALONE},}={0,2})`, 'g');
// Reset lastIndex since patterns are global and reused across calls
dataUriPattern.lastIndex = 0;
standaloneBase64Pattern.lastIndex = 0;

let processedContent = content;

Expand Down
Loading
Loading