Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions src/core/metrics/calculateGitDiffMetrics.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import type { RepomixConfigMerged } from '../../config/configSchema.js';
import { logger } from '../../shared/logger.js';
import type { TaskRunner } from '../../shared/processConcurrency.js';
import type { GitDiffResult } from '../git/gitDiffHandle.js';
import type { TokenCountTask } from './workers/calculateMetricsWorker.js';
import { type MetricsTaskRunner, runTokenCount } from './metricsWorkerRunner.js';

/**
* Calculate token count for git diffs if included
*/
export const calculateGitDiffMetrics = async (
config: RepomixConfigMerged,
gitDiffResult: GitDiffResult | undefined,
deps: { taskRunner: TaskRunner<TokenCountTask, number> },
deps: { taskRunner: MetricsTaskRunner },
): Promise<number> => {
if (!config.output.git?.includeDiffs || !gitDiffResult) {
return 0;
Expand All @@ -29,15 +28,15 @@ export const calculateGitDiffMetrics = async (

if (gitDiffResult.workTreeDiffContent) {
countPromises.push(
deps.taskRunner.run({
runTokenCount(deps.taskRunner, {
content: gitDiffResult.workTreeDiffContent,
encoding: config.tokenCount.encoding,
}),
);
}
if (gitDiffResult.stagedDiffContent) {
countPromises.push(
deps.taskRunner.run({
runTokenCount(deps.taskRunner, {
content: gitDiffResult.stagedDiffContent,
encoding: config.tokenCount.encoding,
}),
Expand Down
7 changes: 3 additions & 4 deletions src/core/metrics/calculateGitLogMetrics.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import type { RepomixConfigMerged } from '../../config/configSchema.js';
import { logger } from '../../shared/logger.js';
import type { TaskRunner } from '../../shared/processConcurrency.js';
import type { GitLogResult } from '../git/gitLogHandle.js';
import type { TokenCountTask } from './workers/calculateMetricsWorker.js';
import { type MetricsTaskRunner, runTokenCount } from './metricsWorkerRunner.js';

/**
* Calculate token count for git logs if included
*/
export const calculateGitLogMetrics = async (
config: RepomixConfigMerged,
gitLogResult: GitLogResult | undefined,
deps: { taskRunner: TaskRunner<TokenCountTask, number> },
deps: { taskRunner: MetricsTaskRunner },
): Promise<{ gitLogTokenCount: number }> => {
// Return zero token count if git logs are disabled or no result
if (!config.output.git?.includeLogs || !gitLogResult) {
Expand All @@ -30,7 +29,7 @@ export const calculateGitLogMetrics = async (
const startTime = process.hrtime.bigint();
logger.trace('Starting git log token calculation using worker');

const result = await deps.taskRunner.run({
const result = await runTokenCount(deps.taskRunner, {
content: gitLogResult.logContent,
encoding: config.tokenCount.encoding,
});
Expand Down
13 changes: 7 additions & 6 deletions src/core/metrics/calculateMetrics.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { RepomixConfigMerged } from '../../config/configSchema.js';
import { getWorkerThreadCount, initTaskRunner, type TaskRunner } from '../../shared/processConcurrency.js';
import { getWorkerThreadCount, initTaskRunner } from '../../shared/processConcurrency.js';
import type { RepomixProgressCallback } from '../../shared/types.js';
import type { ProcessedFile } from '../file/fileTypes.js';
import type { GitDiffResult } from '../git/gitDiffHandle.js';
Expand All @@ -9,8 +9,9 @@ import { calculateGitDiffMetrics } from './calculateGitDiffMetrics.js';
import { calculateGitLogMetrics } from './calculateGitLogMetrics.js';
import { calculateOutputMetrics } from './calculateOutputMetrics.js';
import { calculateSelectiveFileMetrics } from './calculateSelectiveFileMetrics.js';
import type { MetricsTaskRunner } from './metricsWorkerRunner.js';
import type { TokenEncoding } from './TokenCounter.js';
import type { TokenCountTask } from './workers/calculateMetricsWorker.js';
import type { MetricsWorkerResult, MetricsWorkerTask } from './workers/calculateMetricsWorker.js';

export interface CalculateMetricsResult {
totalFiles: number;
Expand All @@ -23,7 +24,7 @@ export interface CalculateMetricsResult {
}

export interface MetricsTaskRunnerWithWarmup {
taskRunner: TaskRunner<TokenCountTask, number>;
taskRunner: MetricsTaskRunner;
warmupPromise: Promise<unknown>;
}

Expand All @@ -34,7 +35,7 @@ export interface MetricsTaskRunnerWithWarmup {
* output generation).
*/
export const createMetricsTaskRunner = (numOfTasks: number, encoding: TokenEncoding): MetricsTaskRunnerWithWarmup => {
const taskRunner = initTaskRunner<TokenCountTask, number>({
const taskRunner = initTaskRunner<MetricsWorkerTask, MetricsWorkerResult>({
numOfTasks,
workerType: 'calculateMetrics',
runtime: 'worker_threads',
Expand All @@ -53,7 +54,7 @@ const defaultDeps = {
calculateOutputMetrics,
calculateGitDiffMetrics,
calculateGitLogMetrics,
taskRunner: undefined as TaskRunner<TokenCountTask, number> | undefined,
taskRunner: undefined as MetricsTaskRunner | undefined,
};

export const calculateMetrics = async (
Expand All @@ -72,7 +73,7 @@ export const calculateMetrics = async (
// Initialize a single task runner for all metrics calculations
const taskRunner =
deps.taskRunner ??
initTaskRunner<TokenCountTask, number>({
initTaskRunner<MetricsWorkerTask, MetricsWorkerResult>({
numOfTasks: processedFiles.length,
workerType: 'calculateMetrics',
runtime: 'worker_threads',
Expand Down
9 changes: 4 additions & 5 deletions src/core/metrics/calculateOutputMetrics.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import { logger } from '../../shared/logger.js';
import type { TaskRunner } from '../../shared/processConcurrency.js';
import { type MetricsTaskRunner, runTokenCount } from './metricsWorkerRunner.js';
import type { TokenEncoding } from './TokenCounter.js';
import type { TokenCountTask } from './workers/calculateMetricsWorker.js';

// Target ~100KB per chunk so that each worker task does meaningful tokenization work.
// Previously this was 1000 (number of chunks), which created ~1KB chunks for 1MB output,
Expand All @@ -13,7 +12,7 @@ export const calculateOutputMetrics = async (
content: string,
encoding: TokenEncoding,
path: string | undefined,
deps: { taskRunner: TaskRunner<TokenCountTask, number> },
deps: { taskRunner: MetricsTaskRunner },
): Promise<number> => {
const shouldRunInParallel = content.length > MIN_CONTENT_LENGTH_FOR_PARALLEL;

Expand All @@ -34,7 +33,7 @@ export const calculateOutputMetrics = async (
// Process chunks in parallel
const chunkResults = await Promise.all(
chunks.map(async (chunk, index) => {
return deps.taskRunner.run({
return runTokenCount(deps.taskRunner, {
content: chunk,
encoding,
path: path ? `${path}-chunk-${index}` : undefined,
Expand All @@ -46,7 +45,7 @@ export const calculateOutputMetrics = async (
result = chunkResults.reduce((sum, count) => sum + count, 0);
} else {
// Process small content directly
result = await deps.taskRunner.run({
result = await runTokenCount(deps.taskRunner, {
content,
encoding,
path,
Expand Down
53 changes: 36 additions & 17 deletions src/core/metrics/calculateSelectiveFileMetrics.ts
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
import pc from 'picocolors';
import { logger } from '../../shared/logger.js';
import type { TaskRunner } from '../../shared/processConcurrency.js';
import type { RepomixProgressCallback } from '../../shared/types.js';
import type { ProcessedFile } from '../file/fileTypes.js';
import { type MetricsTaskRunner, runBatchTokenCount } from './metricsWorkerRunner.js';
import type { TokenEncoding } from './TokenCounter.js';
import type { TokenCountTask } from './workers/calculateMetricsWorker.js';
import type { FileMetrics } from './workers/types.js';

// Batch size for grouping files into worker tasks to reduce IPC overhead.
// Each batch is sent as a single message to a worker thread, avoiding
// per-file round-trip costs (~0.5ms each) that dominate when processing many files.
// For 991 files: 991 round-trips → 20 batches, saving ~485ms of IPC overhead.
const METRICS_BATCH_SIZE = 50;
Comment thread
yamadashy marked this conversation as resolved.

export const calculateSelectiveFileMetrics = async (
processedFiles: ProcessedFile[],
targetFilePaths: string[],
tokenCounterEncoding: TokenEncoding,
progressCallback: RepomixProgressCallback,
deps: { taskRunner: TaskRunner<TokenCountTask, number> },
deps: { taskRunner: MetricsTaskRunner },
): Promise<FileMetrics[]> => {
const targetFileSet = new Set(targetFilePaths);
const filesToProcess = processedFiles.filter((file) => targetFileSet.has(file.path));
Expand All @@ -25,33 +30,47 @@ export const calculateSelectiveFileMetrics = async (
const startTime = process.hrtime.bigint();
logger.trace(`Starting selective metrics calculation for ${filesToProcess.length} files using worker pool`);

let completedTasks = 0;
const results = await Promise.all(
filesToProcess.map(async (file) => {
const tokenCount = await deps.taskRunner.run({
content: file.content,
// Split files into batches to reduce IPC round-trips
const batches: ProcessedFile[][] = [];
for (let i = 0; i < filesToProcess.length; i += METRICS_BATCH_SIZE) {
batches.push(filesToProcess.slice(i, i + METRICS_BATCH_SIZE));
}

logger.trace(`Split ${filesToProcess.length} files into ${batches.length} batches for token counting`);

let completedItems = 0;

const batchResults = await Promise.all(
batches.map(async (batch) => {
const tokenCounts = await runBatchTokenCount(deps.taskRunner, {
items: batch.map((file) => ({ content: file.content, path: file.path })),
encoding: tokenCounterEncoding,
path: file.path,
});

const result: FileMetrics = {
const results: FileMetrics[] = batch.map((file, index) => ({
path: file.path,
charCount: file.content.length,
tokenCount,
};
tokenCount: tokenCounts[index],
}));

completedItems += batch.length;
const lastFile = batch[batch.length - 1];
progressCallback(
`Calculating metrics... (${completedItems}/${filesToProcess.length}) ${pc.dim(lastFile.path)}`,
);
logger.trace(`Calculating metrics... (${completedItems}/${filesToProcess.length}) ${lastFile.path}`);

completedTasks++;
progressCallback(`Calculating metrics... (${completedTasks}/${filesToProcess.length}) ${pc.dim(file.path)}`);
logger.trace(`Calculating metrics... (${completedTasks}/${filesToProcess.length}) ${file.path}`);
return result;
return results;
}),
);

const allResults = batchResults.flat();

const endTime = process.hrtime.bigint();
const duration = Number(endTime - startTime) / 1e6;
logger.trace(`Selective metrics calculation completed in ${duration.toFixed(2)}ms`);

return results;
return allResults;
} catch (error) {
logger.error('Error during selective metrics calculation:', error);
throw error;
Expand Down
17 changes: 17 additions & 0 deletions src/core/metrics/metricsWorkerRunner.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import type { TaskRunner } from '../../shared/processConcurrency.js';
import type {
MetricsWorkerResult,
MetricsWorkerTask,
TokenCountBatchTask,
TokenCountTask,
} from './workers/calculateMetricsWorker.js';

/**
 * Task runner shared by the metrics calculations. Accepts both the
 * single-content and batch task shapes and therefore resolves with the
 * corresponding union result (number | number[]); use the typed helpers
 * below instead of calling `run` directly.
 */
export type MetricsTaskRunner = TaskRunner<MetricsWorkerTask, MetricsWorkerResult>;

/**
 * Dispatch a single-content token-count task to the metrics worker pool.
 *
 * The worker resolves single-content tasks with a plain number, so the
 * union result type is narrowed once here at the call boundary rather
 * than with a cast at every call site.
 */
export function runTokenCount(taskRunner: MetricsTaskRunner, task: TokenCountTask): Promise<number> {
  const pending = taskRunner.run(task);
  return pending as Promise<number>;
}

/**
 * Dispatch a batch token-count task (many files in one worker round-trip)
 * to the metrics worker pool.
 *
 * The worker resolves batch tasks with one count per item, in item order;
 * the union result type is narrowed once here at the call boundary.
 */
export function runBatchTokenCount(taskRunner: MetricsTaskRunner, task: TokenCountBatchTask): Promise<number[]> {
  const pending = taskRunner.run(task);
  return pending as Promise<number[]>;
}
41 changes: 36 additions & 5 deletions src/core/metrics/workers/calculateMetricsWorker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@ import type { TokenEncoding } from '../TokenCounter.js';
import { freeTokenCounters, getTokenCounter } from '../tokenCounterFactory.js';

/**
* Simple token counting worker for metrics calculation.
* Token counting worker for metrics calculation.
*
* This worker provides a focused interface for counting tokens from text content,
* using gpt-tokenizer. All complex metric calculation logic is handled
* by the calling side to maintain separation of concerns.
* Supports both single-content and batch modes. Batch mode reduces IPC overhead
* by processing multiple files per worker round-trip (~0.5ms overhead per round-trip).
* For 991 files, batching with size 50 reduces round-trips from 991 to 20.
*/

// Initialize logger configuration from workerData at module load time
Expand All @@ -20,6 +20,19 @@ export interface TokenCountTask {
path?: string;
}

/** One file's worth of text inside a batch task. */
export interface TokenCountBatchItem {
  content: string;
  // Optional identifier forwarded to the token counter alongside the content
  // (presumably used for logging/diagnostics — confirm against TokenCounter).
  path?: string;
}

/**
 * Batch-mode task: token-count every item in a single worker round-trip,
 * reducing per-task IPC overhead versus one message per file.
 */
export interface TokenCountBatchTask {
  items: TokenCountBatchItem[];
  encoding: TokenEncoding;
}

/** Union of the two task shapes this worker accepts (single vs. batch). */
export type MetricsWorkerTask = TokenCountTask | TokenCountBatchTask;
/** `number` for a TokenCountTask, `number[]` (one per item) for a TokenCountBatchTask. */
export type MetricsWorkerResult = number | number[];

export const countTokens = async (task: TokenCountTask): Promise<number> => {
const processStartAt = process.hrtime.bigint();

Expand All @@ -35,12 +48,30 @@ export const countTokens = async (task: TokenCountTask): Promise<number> => {
}
};

const countTokensBatch = async (task: TokenCountBatchTask): Promise<number[]> => {
const processStartAt = process.hrtime.bigint();

try {
const counter = await getTokenCounter(task.encoding);
const results = task.items.map((item) => counter.countTokens(item.content, item.path));

logger.trace(`Counted tokens for ${task.items.length} items. Took: ${getProcessDuration(processStartAt)}ms`);
return results;
} catch (error) {
logger.error('Error in batch token counting worker:', error);
throw error;
}
};

/**
 * Elapsed time since `startTime` (a `process.hrtime.bigint()` reading),
 * formatted as milliseconds with two decimal places.
 */
const getProcessDuration = (startTime: bigint): string => {
  const elapsedNs = process.hrtime.bigint() - startTime;
  const elapsedMs = Number(elapsedNs) / 1e6;
  return elapsedMs.toFixed(2);
};

export default async (task: TokenCountTask): Promise<number> => {
export default async (task: MetricsWorkerTask): Promise<MetricsWorkerResult> => {
if ('items' in task) {
return countTokensBatch(task);
}
return countTokens(task);
};

Expand Down
6 changes: 3 additions & 3 deletions src/shared/unifiedWorker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,12 @@ const inferWorkerTypeFromTask = (task: unknown): WorkerType | null => {
return 'fileProcess';
}

// calculateMetrics: has content, encoding (must check before securityCheck)
if ('content' in taskObj && 'encoding' in taskObj) {
// calculateMetrics: single mode has content+encoding, batch mode has items+encoding
if ('encoding' in taskObj && ('content' in taskObj || 'items' in taskObj)) {
return 'calculateMetrics';
}

// securityCheck: has items array (without encoding, which distinguishes it from calculateMetrics)
// securityCheck: has items array without encoding (distinguishes from batch calculateMetrics)
if ('items' in taskObj && !('encoding' in taskObj)) {
return 'securityCheck';
}
Expand Down
Loading
Loading