Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 118 additions & 43 deletions src/core/file/fileProcess.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,64 +5,139 @@ import { initTaskRunner } from '../../shared/processConcurrency.js';
import type { RepomixProgressCallback } from '../../shared/types.js';
import { type FileManipulator, getFileManipulator } from './fileManipulate.js';
import type { ProcessedFile, RawFile } from './fileTypes.js';
import { truncateBase64Content } from './truncateBase64.js';
import type { FileProcessTask } from './workers/fileProcessWorker.js';

type GetFileManipulator = (filePath: string) => FileManipulator | null;

/**
 * Apply lightweight transforms on the main thread after worker processing.
 * All lightweight transforms are centralized here to avoid duplication with workers.
 *
 * Transform order: [removeComments → compress] (worker) → truncateBase64 → removeEmptyLines → trim → showLineNumbers
 * - removeEmptyLines runs after removeComments so that empty lines created by comment removal are cleaned up.
 *
 * @param files - Files to transform ({ path, content } pairs); neither the array nor its elements are mutated.
 * @param config - Merged Repomix configuration; only the `config.output` flags are read here.
 * @param progressCallback - Invoked every 50th file and on the last file (throttled to limit UI churn).
 * @param deps - Injected manipulator lookup (dependency seam for testing).
 * @returns A new array of transformed files, in the same order as the input.
 */
export const applyLightweightTransforms = (
  files: ProcessedFile[],
  config: RepomixConfigMerged,
  progressCallback: RepomixProgressCallback,
  deps: { getFileManipulator: GetFileManipulator },
): ProcessedFile[] => {
  const totalFiles = files.length;
  // Build the result incrementally rather than `Array.from({ length }) as ProcessedFile[]`:
  // that cast lies to the checker — the array actually holds `undefined` until each slot
  // is written, so any early read would be unsound. Push keeps the type honest.
  const results: ProcessedFile[] = [];

  for (let i = 0; i < totalFiles; i++) {
    const file = files[i];
    let content = file.content;

    if (config.output.truncateBase64) {
      content = truncateBase64Content(content);
    }

    if (config.output.removeEmptyLines) {
      // Manipulator is language-specific; files without one pass through unchanged.
      const manipulator = deps.getFileManipulator(file.path);
      if (manipulator) {
        content = manipulator.removeEmptyLines(content);
      }
    }

    content = content.trim();

    // Line numbers are skipped in compress mode: compression drops/reorders lines,
    // so numbering the compressed output would be misleading.
    if (config.output.showLineNumbers && !config.output.compress) {
      const lines = content.split('\n');
      // Pad to the width of the largest line number so columns align.
      const padding = lines.length.toString().length;
      content = lines.map((line, idx) => `${(idx + 1).toString().padStart(padding)}: ${line}`).join('\n');
    }

    results.push({ path: file.path, content });

    // Throttled progress: every 50th file, plus the final file so callers always see 100%.
    if ((i + 1) % 50 === 0 || i === totalFiles - 1) {
      progressCallback(`Processing file... (${i + 1}/${totalFiles}) ${pc.dim(file.path)}`);
    }
  }

  return results;
};

/**
* Process files through a two-phase pipeline:
*
* 1. Heavy transforms (worker threads, skipped when not needed):
* removeComments → compress
*
* 2. Lightweight transforms (main thread, always applied):
* truncateBase64 → removeEmptyLines → trim → showLineNumbers
*
* removeEmptyLines intentionally runs after removeComments so that
* empty lines created by comment removal are cleaned up.
*/
export const processFiles = async (
rawFiles: RawFile[],
config: RepomixConfigMerged,
progressCallback: RepomixProgressCallback,
deps: {
initTaskRunner: typeof initTaskRunner;
getFileManipulator: GetFileManipulator;
} = {
deps = {
initTaskRunner,
getFileManipulator,
},
): Promise<ProcessedFile[]> => {
const taskRunner = deps.initTaskRunner<FileProcessTask, ProcessedFile>({
numOfTasks: rawFiles.length,
workerType: 'fileProcess',
// High memory usage and leak risk
runtime: 'worker_threads',
});
const tasks = rawFiles.map(
(rawFile, _index) =>
({
rawFile,
config,
}) satisfies FileProcessTask,
);

try {
const startTime = process.hrtime.bigint();
const startTime = process.hrtime.bigint();
let files: ProcessedFile[];

// Only compress (tree-sitter) and removeComments (AST manipulation) justify worker thread overhead
const useWorkers = config.output.compress || config.output.removeComments;

if (useWorkers) {
// Phase 1: Heavy processing via workers (removeComments, compress)
logger.trace(`Starting file processing for ${rawFiles.length} files using worker pool`);

let completedTasks = 0;
const totalTasks = tasks.length;

const results = await Promise.all(
tasks.map((task) =>
taskRunner.run(task).then((result) => {
completedTasks++;
progressCallback(`Processing file... (${completedTasks}/${totalTasks}) ${pc.dim(task.rawFile.path)}`);
logger.trace(`Processing file... (${completedTasks}/${totalTasks}) ${task.rawFile.path}`);
return result;
}),
),
const taskRunner = deps.initTaskRunner<FileProcessTask, ProcessedFile>({
numOfTasks: rawFiles.length,
workerType: 'fileProcess',
runtime: 'worker_threads',
});

const tasks = rawFiles.map(
(rawFile) =>
({
rawFile,
config,
}) satisfies FileProcessTask,
);

const endTime = process.hrtime.bigint();
const duration = Number(endTime - startTime) / 1e6;
logger.trace(`File processing completed in ${duration.toFixed(2)}ms`);

return results;
} catch (error) {
logger.error('Error during file processing:', error);
throw error;
} finally {
// Always cleanup worker pool
await taskRunner.cleanup();
try {
let completedTasks = 0;
const totalTasks = tasks.length;

files = await Promise.all(
tasks.map((task) =>
taskRunner.run(task).then((result) => {
completedTasks++;
progressCallback(`Processing file... (${completedTasks}/${totalTasks}) ${pc.dim(task.rawFile.path)}`);
logger.trace(`Processing file... (${completedTasks}/${totalTasks}) ${task.rawFile.path}`);
return result;
}),
),
);
} catch (error) {
logger.error('Error during file processing:', error);
throw error;
} finally {
await taskRunner.cleanup();
}

// Phase 2: Lightweight transforms (no progress - already reported by workers)
files = applyLightweightTransforms(files, config, () => {}, deps);
} else {
// No heavy processing needed - apply lightweight transforms directly
logger.trace(`Starting file processing for ${rawFiles.length} files in main thread (lightweight mode)`);
const inputFiles = rawFiles.map((rawFile) => ({ path: rawFile.path, content: rawFile.content }));
files = applyLightweightTransforms(inputFiles, config, progressCallback, deps);
}

const endTime = process.hrtime.bigint();
const duration = Number(endTime - startTime) / 1e6;
logger.trace(`File processing completed in ${duration.toFixed(2)}ms`);

return files;
};
31 changes: 5 additions & 26 deletions src/core/file/fileProcessContent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,15 @@ import { logger } from '../../shared/logger.js';
import { parseFile } from '../treeSitter/parseFile.js';
import { getFileManipulator } from './fileManipulate.js';
import type { RawFile } from './fileTypes.js';
import { truncateBase64Content } from './truncateBase64.js';

/**
* Process the content of a file according to the configuration
* Applies various transformations based on the config:
* - Remove comments
* - Remove empty lines
* - Truncate base64 encoded data
* Process the content of a file for CPU-intensive operations.
* Only handles heavy transformations that benefit from worker threads:
* - Remove comments (language-specific AST manipulation)
* - Compress content using Tree-sitter
* - Add line numbers
*
* @param rawFile Raw file data containing path and content
* @param config Repomix configuration
* @returns Processed content string
* Lightweight transforms (truncateBase64, removeEmptyLines, trim, showLineNumbers)
* are applied separately on the main thread by processFiles().
*/
export const processContent = async (rawFile: RawFile, config: RepomixConfigMerged): Promise<string> => {
const processStartAt = process.hrtime.bigint();
Expand All @@ -25,20 +20,10 @@ export const processContent = async (rawFile: RawFile, config: RepomixConfigMerg

logger.trace(`Processing file: ${rawFile.path}`);

if (config.output.truncateBase64) {
processedContent = truncateBase64Content(processedContent);
}

if (manipulator && config.output.removeComments) {
processedContent = manipulator.removeComments(processedContent);
}

if (config.output.removeEmptyLines && manipulator) {
processedContent = manipulator.removeEmptyLines(processedContent);
}

processedContent = processedContent.trim();

if (config.output.compress) {
try {
const parsedContent = await parseFile(processedContent, rawFile.path, config);
Expand All @@ -49,14 +34,8 @@ export const processContent = async (rawFile: RawFile, config: RepomixConfigMerg
} catch (error: unknown) {
const message = error instanceof Error ? error.message : String(error);
logger.error(`Error parsing ${rawFile.path} in compressed mode: ${message}`);
//re-throw error
throw error;
}
} else if (config.output.showLineNumbers) {
const lines = processedContent.split('\n');
const padding = lines.length.toString().length;
const numberedLines = lines.map((line, i) => `${(i + 1).toString().padStart(padding)}: ${line}`);
processedContent = numberedLines.join('\n');
}

const processEndAt = process.hrtime.bigint();
Expand Down
19 changes: 10 additions & 9 deletions src/core/file/truncateBase64.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ const TRUNCATION_LENGTH = 32;
const MIN_CHAR_DIVERSITY = 10;
const MIN_CHAR_TYPE_COUNT = 3;

// Pre-compiled regex patterns (avoid re-creation per file)
const dataUriPattern = new RegExp(
`data:([a-zA-Z0-9\\/\\-\\+]+)(;[a-zA-Z0-9\\-=]+)*;base64,([A-Za-z0-9+/=]{${MIN_BASE64_LENGTH_DATA_URI},})`,
'g',
);
const standaloneBase64Pattern = new RegExp(`([A-Za-z0-9+/]{${MIN_BASE64_LENGTH_STANDALONE},}={0,2})`, 'g');

/**
* Truncates base64 encoded data in content to reduce file size
* Detects common base64 patterns like data URIs and standalone base64 strings
Expand All @@ -13,15 +20,9 @@ const MIN_CHAR_TYPE_COUNT = 3;
* @returns Content with base64 data truncated
*/
export const truncateBase64Content = (content: string): string => {
// Pattern to match data URIs (e.g., data:image/png;base64,...)
const dataUriPattern = new RegExp(
`data:([a-zA-Z0-9\\/\\-\\+]+)(;[a-zA-Z0-9\\-=]+)*;base64,([A-Za-z0-9+/=]{${MIN_BASE64_LENGTH_DATA_URI},})`,
'g',
);

// Pattern to match standalone base64 strings
// This matches base64 strings that are likely encoded binary data
const standaloneBase64Pattern = new RegExp(`([A-Za-z0-9+/]{${MIN_BASE64_LENGTH_STANDALONE},}={0,2})`, 'g');
// Reset lastIndex since patterns are global and reused across calls
dataUriPattern.lastIndex = 0;
standaloneBase64Pattern.lastIndex = 0;

let processedContent = content;

Expand Down
Loading
Loading