diff --git a/package-lock.json b/package-lock.json index 0013cc8c6..6a230f4dd 100644 --- a/package-lock.json +++ b/package-lock.json @@ -19,6 +19,7 @@ "fast-xml-builder": "^1.1.4", "git-url-parse": "^16.1.0", "globby": "^16.1.1", + "gpt-tokenizer": "^3.4.0", "handlebars": "^4.7.8", "iconv-lite": "^0.7.0", "is-binary-path": "^3.0.0", @@ -30,7 +31,6 @@ "picocolors": "^1.1.1", "picospinner": "^3.0.0", "tar": "^7.5.12", - "tiktoken": "^1.0.22", "tinyclip": "^0.1.12", "tinypool": "^2.1.0", "web-tree-sitter": "^0.26.7", @@ -3121,6 +3121,12 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/gpt-tokenizer": { + "version": "3.4.0", + "resolved": "https://registry.npmjs.org/gpt-tokenizer/-/gpt-tokenizer-3.4.0.tgz", + "integrity": "sha512-wxFLnhIXTDjYebd9A9pGl3e31ZpSypbpIJSOswbgop5jLte/AsZVDvjlbEuVFlsqZixVKqbcoNmRlFDf6pz/UQ==", + "license": "MIT" + }, "node_modules/handlebars": { "version": "4.7.9", "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.9.tgz", @@ -4872,12 +4878,6 @@ "url": "https://bevry.me/fund" } }, - "node_modules/tiktoken": { - "version": "1.0.22", - "resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.22.tgz", - "integrity": "sha512-PKvy1rVF1RibfF3JlXBSP0Jrcw2uq3yXdgcEXtKTYn3QJ/cBRBHDnrJ5jHky+MENZ6DIPwNUGWpkVx+7joCpNA==", - "license": "MIT" - }, "node_modules/tinybench": { "version": "2.9.0", "resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz", diff --git a/package.json b/package.json index ebe04e7e0..15611ad6f 100644 --- a/package.json +++ b/package.json @@ -85,6 +85,7 @@ "fast-xml-builder": "^1.1.4", "git-url-parse": "^16.1.0", "globby": "^16.1.1", + "gpt-tokenizer": "^3.4.0", "handlebars": "^4.7.8", "iconv-lite": "^0.7.0", "is-binary-path": "^3.0.0", @@ -96,7 +97,6 @@ "picocolors": "^1.1.1", "picospinner": "^3.0.0", "tar": "^7.5.12", - "tiktoken": "^1.0.22", "tinyclip": "^0.1.12", "tinypool": "^2.1.0", "web-tree-sitter": "^0.26.7", diff --git a/src/config/configSchema.ts 
b/src/config/configSchema.ts index dbc713d63..74c1ef432 100644 --- a/src/config/configSchema.ts +++ b/src/config/configSchema.ts @@ -1,5 +1,5 @@ -import type { TiktokenEncoding } from 'tiktoken'; import { z } from 'zod'; +import { TOKEN_ENCODINGS } from '../core/metrics/TokenCounter.js'; // Output style enum export const repomixOutputStyleSchema = z.enum(['xml', 'markdown', 'json', 'plain']); @@ -122,10 +122,7 @@ export const repomixConfigDefaultSchema = z.object({ enableSecurityCheck: z.boolean().default(true), }), tokenCount: z.object({ - encoding: z - .string() - .default('o200k_base') - .transform((val) => val as TiktokenEncoding), + encoding: z.enum(TOKEN_ENCODINGS).default('o200k_base'), }), }); diff --git a/src/core/metrics/TokenCounter.ts b/src/core/metrics/TokenCounter.ts index 7ae1dcb46..c2a340dea 100644 --- a/src/core/metrics/TokenCounter.ts +++ b/src/core/metrics/TokenCounter.ts @@ -1,48 +1,103 @@ -import { get_encoding, type Tiktoken, type TiktokenEncoding } from 'tiktoken'; import { logger } from '../../shared/logger.js'; -export class TokenCounter { - private encoding: Tiktoken; +// Supported token encoding types (compatible with tiktoken encoding names) +export const TOKEN_ENCODINGS = ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'] as const; +export type TokenEncoding = (typeof TOKEN_ENCODINGS)[number]; + +interface CountTokensOptions { + disallowedSpecial?: Set<string>; +} + +type CountTokensFn = (text: string, options?: CountTokensOptions) => number; + +// Treat all text as regular content by disallowing nothing. +// This matches the old tiktoken behavior: encode(content, [], []).length +// where special tokens like <|endoftext|> are tokenized as ordinary text. +// Also faster than the default (disallowedSpecial='all') because it skips +// the regex scan for special token patterns entirely. 
+const PLAIN_TEXT_OPTIONS: CountTokensOptions = { disallowedSpecial: new Set<string>() }; + +// Lazy-loaded countTokens functions keyed by encoding +const encodingModules = new Map<TokenEncoding, CountTokensFn>(); - constructor(encodingName: TiktokenEncoding) { - const startTime = process.hrtime.bigint(); +const loadEncoding = async (encodingName: TokenEncoding): Promise<CountTokensFn> => { + const cached = encodingModules.get(encodingName); + if (cached) { + return cached; + } + + const startTime = process.hrtime.bigint(); + + // Dynamic import of the specific encoding module from gpt-tokenizer + const mod = await import(`gpt-tokenizer/encoding/${encodingName}`); + const countFn = mod.countTokens as CountTokensFn; + encodingModules.set(encodingName, countFn); - // Setup encoding with the specified model - this.encoding = get_encoding(encodingName); + const endTime = process.hrtime.bigint(); + const initTime = Number(endTime - startTime) / 1e6; + logger.debug(`TokenCounter initialization for ${encodingName} took ${initTime.toFixed(2)}ms`); + + return countFn; +}; + +export class TokenCounter { + private countFn: CountTokensFn | null = null; + private readonly encodingName: TokenEncoding; - const endTime = process.hrtime.bigint(); - const initTime = Number(endTime - startTime) / 1e6; // Convert to milliseconds + constructor(encodingName: TokenEncoding) { + this.encodingName = encodingName; + } - logger.debug(`TokenCounter initialization took ${initTime.toFixed(2)}ms`); + async init(): Promise<void> { + this.countFn = await loadEncoding(this.encodingName); } + /** + * Count tokens using gpt-tokenizer's default config (fast path). + * Files containing special token sequences like <|endoftext|> will return 0. + * Use countTokensPlainText() to handle such files correctly. + */ public countTokens(content: string, filePath?: string): number { + if (!this.countFn) { + throw new Error('TokenCounter not initialized. 
Call init() first.'); + } + try { - // Disable special token validation to handle files that may contain - // special token sequences (e.g., tokenizer configs with <|endoftext|>). - // This treats special tokens as ordinary text rather than control tokens, - // which is appropriate for general code/text analysis where we're not - // actually sending the content to an LLM API. - return this.encoding.encode(content, [], []).length; - } catch (error) { - let message = ''; - if (error instanceof Error) { - message = error.message; + return this.countFn(content); + } catch { + if (filePath) { + logger.warn(`Failed to count tokens. path: ${filePath}`); } else { - message = String(error); + logger.warn('Failed to count tokens.'); } + return 0; + } + } + + /** + * Count tokens treating all content as plain text (no special token checking). + * Matches tiktoken's encode(content, [], []) behavior where special tokens + * like <|endoftext|> are tokenized as ordinary text. + */ + public countTokensPlainText(content: string, filePath?: string): number { + if (!this.countFn) { + throw new Error('TokenCounter not initialized. Call init() first.'); + } + + try { + return this.countFn(content, PLAIN_TEXT_OPTIONS); + } catch { if (filePath) { - logger.warn(`Failed to count tokens. path: ${filePath}, error: ${message}`); + logger.warn(`Failed to count tokens. path: ${filePath}`); } else { - logger.warn(`Failed to count tokens. 
error: ${message}`); + logger.warn('Failed to count tokens.'); } return 0; } } - public free(): void { - this.encoding.free(); - } + // No-op: gpt-tokenizer is pure JS, no WASM resources to free + public free(): void {} } diff --git a/src/core/metrics/calculateGitDiffMetrics.ts b/src/core/metrics/calculateGitDiffMetrics.ts index cbe3ec5ae..c64eaf539 100644 --- a/src/core/metrics/calculateGitDiffMetrics.ts +++ b/src/core/metrics/calculateGitDiffMetrics.ts @@ -1,8 +1,11 @@ import type { RepomixConfigMerged } from '../../config/configSchema.js'; import { logger } from '../../shared/logger.js'; -import type { TaskRunner } from '../../shared/processConcurrency.js'; import type { GitDiffResult } from '../git/gitDiffHandle.js'; -import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; +import { getTokenCounter } from './tokenCounterFactory.js'; + +const defaultDeps = { + getTokenCounter, +}; /** * Calculate token count for git diffs if included @@ -10,43 +13,40 @@ import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; export const calculateGitDiffMetrics = async ( config: RepomixConfigMerged, gitDiffResult: GitDiffResult | undefined, - deps: { taskRunner: TaskRunner }, + deps: Partial<typeof defaultDeps> = {}, ): Promise<number> => { if (!config.output.git?.includeDiffs || !gitDiffResult) { return 0; } - // Check if we have any diff content to process if (!gitDiffResult.workTreeDiffContent && !gitDiffResult.stagedDiffContent) { return 0; } + const resolvedDeps = { ...defaultDeps, ...deps }; + try { const startTime = process.hrtime.bigint(); - logger.trace('Starting git diff token calculation using worker'); + logger.trace('Starting git diff token calculation on main thread'); - const countPromises: Promise<number>[] = []; + const counter = await resolvedDeps.getTokenCounter(config.tokenCount.encoding); + let totalTokens = 0; if (gitDiffResult.workTreeDiffContent) { - countPromises.push( - deps.taskRunner.run({ - content: gitDiffResult.workTreeDiffContent, - encoding: 
config.tokenCount.encoding, - }), - ); + let count = counter.countTokens(gitDiffResult.workTreeDiffContent); + if (count === 0 && gitDiffResult.workTreeDiffContent.length > 0) { + count = counter.countTokensPlainText(gitDiffResult.workTreeDiffContent); + } + totalTokens += count; } if (gitDiffResult.stagedDiffContent) { - countPromises.push( - deps.taskRunner.run({ - content: gitDiffResult.stagedDiffContent, - encoding: config.tokenCount.encoding, - }), - ); + let count = counter.countTokens(gitDiffResult.stagedDiffContent); + if (count === 0 && gitDiffResult.stagedDiffContent.length > 0) { + count = counter.countTokensPlainText(gitDiffResult.stagedDiffContent); + } + totalTokens += count; } - const results = await Promise.all(countPromises); - const totalTokens = results.reduce((sum, count) => sum + count, 0); - const endTime = process.hrtime.bigint(); const duration = Number(endTime - startTime) / 1e6; logger.trace(`Git diff token calculation completed in ${duration.toFixed(2)}ms`); diff --git a/src/core/metrics/calculateGitLogMetrics.ts b/src/core/metrics/calculateGitLogMetrics.ts index 97e94ae95..e15196274 100644 --- a/src/core/metrics/calculateGitLogMetrics.ts +++ b/src/core/metrics/calculateGitLogMetrics.ts @@ -1,8 +1,11 @@ import type { RepomixConfigMerged } from '../../config/configSchema.js'; import { logger } from '../../shared/logger.js'; -import type { TaskRunner } from '../../shared/processConcurrency.js'; import type { GitLogResult } from '../git/gitLogHandle.js'; -import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; +import { getTokenCounter } from './tokenCounterFactory.js'; + +const defaultDeps = { + getTokenCounter, +}; /** * Calculate token count for git logs if included @@ -10,42 +13,35 @@ import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; export const calculateGitLogMetrics = async ( config: RepomixConfigMerged, gitLogResult: GitLogResult | undefined, - deps: { taskRunner: TaskRunner }, + deps: 
Partial<typeof defaultDeps> = {}, ): Promise<{ gitLogTokenCount: number }> => { - // Return zero token count if git logs are disabled or no result if (!config.output.git?.includeLogs || !gitLogResult) { - return { - gitLogTokenCount: 0, - }; + return { gitLogTokenCount: 0 }; } - // Return zero token count if no git log content if (!gitLogResult.logContent) { - return { - gitLogTokenCount: 0, - }; + return { gitLogTokenCount: 0 }; } + const resolvedDeps = { ...defaultDeps, ...deps }; + try { const startTime = process.hrtime.bigint(); - logger.trace('Starting git log token calculation using worker'); + logger.trace('Starting git log token calculation on main thread'); - const result = await deps.taskRunner.run({ - content: gitLogResult.logContent, - encoding: config.tokenCount.encoding, - }); + const counter = await resolvedDeps.getTokenCounter(config.tokenCount.encoding); + let result = counter.countTokens(gitLogResult.logContent); + if (result === 0 && gitLogResult.logContent.length > 0) { + result = counter.countTokensPlainText(gitLogResult.logContent); + } const endTime = process.hrtime.bigint(); const duration = Number(endTime - startTime) / 1e6; logger.trace(`Git log token calculation completed in ${duration.toFixed(2)}ms`); - return { - gitLogTokenCount: result, - }; + return { gitLogTokenCount: result }; } catch (error) { logger.error('Failed to calculate git log metrics:', error); - return { - gitLogTokenCount: 0, - }; + return { gitLogTokenCount: 0 }; } }; diff --git a/src/core/metrics/calculateMetrics.ts b/src/core/metrics/calculateMetrics.ts index d727f30b3..2c113caad 100644 --- a/src/core/metrics/calculateMetrics.ts +++ b/src/core/metrics/calculateMetrics.ts @@ -1,5 +1,4 @@ import type { RepomixConfigMerged } from '../../config/configSchema.js'; -import { initTaskRunner, type TaskRunner } from '../../shared/processConcurrency.js'; import type { RepomixProgressCallback } from '../../shared/types.js'; import type { ProcessedFile } from '../file/fileTypes.js'; import type { 
GitDiffResult } from '../git/gitDiffHandle.js'; @@ -9,7 +8,6 @@ import { calculateGitDiffMetrics } from './calculateGitDiffMetrics.js'; import { calculateGitLogMetrics } from './calculateGitLogMetrics.js'; import { calculateOutputMetrics } from './calculateOutputMetrics.js'; import { calculateSelectiveFileMetrics } from './calculateSelectiveFileMetrics.js'; -import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; export interface CalculateMetricsResult { totalFiles: number; @@ -21,24 +19,11 @@ export interface CalculateMetricsResult { gitLogTokenCount: number; } -/** - * Create a metrics task runner that can be pre-initialized to overlap - * tiktoken WASM loading with other pipeline stages. - */ -export const createMetricsTaskRunner = (numOfTasks: number): TaskRunner => { - return initTaskRunner({ - numOfTasks, - workerType: 'calculateMetrics', - runtime: 'worker_threads', - }); -}; - const defaultDeps = { calculateSelectiveFileMetrics, calculateOutputMetrics, calculateGitDiffMetrics, calculateGitLogMetrics, - taskRunner: undefined as TaskRunner | undefined, }; export const calculateMetrics = async ( @@ -54,82 +39,66 @@ export const calculateMetrics = async ( progressCallback('Calculating metrics...'); - // Initialize a single task runner for all metrics calculations - const taskRunner = - deps.taskRunner ?? - initTaskRunner({ - numOfTasks: processedFiles.length, - workerType: 'calculateMetrics', - runtime: 'worker_threads', - }); - - try { - const outputParts = Array.isArray(output) ? output : [output]; - // For top files display optimization: calculate token counts only for top files by character count - // However, if tokenCountTree is enabled, calculate for all files to avoid double calculation - const topFilesLength = config.output.topFilesLength; - const shouldCalculateAllFiles = !!config.output.tokenCountTree; + const outputParts = Array.isArray(output) ? 
output : [output]; + // For top files display optimization: calculate token counts only for top files by character count + // However, if tokenCountTree is enabled, calculate for all files to avoid double calculation + const topFilesLength = config.output.topFilesLength; + const shouldCalculateAllFiles = !!config.output.tokenCountTree; - // Determine which files to calculate token counts for: - // - If tokenCountTree is enabled: calculate for all files to avoid double calculation - // - Otherwise: calculate only for top files by character count for optimization - const metricsTargetPaths = shouldCalculateAllFiles - ? processedFiles.map((file) => file.path) - : [...processedFiles] - .sort((a, b) => b.content.length - a.content.length) - .slice(0, Math.min(processedFiles.length, Math.max(topFilesLength * 10, topFilesLength))) - .map((file) => file.path); + // Determine which files to calculate token counts for: + // - If tokenCountTree is enabled: calculate for all files to avoid double calculation + // - Otherwise: calculate only for top files by character count for optimization + const metricsTargetPaths = shouldCalculateAllFiles + ? processedFiles.map((file) => file.path) + : [...processedFiles] + .sort((a, b) => b.content.length - a.content.length) + .slice(0, Math.min(processedFiles.length, Math.max(topFilesLength * 10, topFilesLength))) + .map((file) => file.path); - const [selectiveFileMetrics, outputTokenCounts, gitDiffTokenCount, gitLogTokenCount] = await Promise.all([ - deps.calculateSelectiveFileMetrics( - processedFiles, - metricsTargetPaths, - config.tokenCount.encoding, - progressCallback, - { taskRunner }, - ), - Promise.all( - outputParts.map(async (part, index) => { - const partPath = - outputParts.length > 1 - ? 
buildSplitOutputFilePath(config.output.filePath, index + 1) - : config.output.filePath; - return await deps.calculateOutputMetrics(part, config.tokenCount.encoding, partPath, { taskRunner }); - }), - ), - deps.calculateGitDiffMetrics(config, gitDiffResult, { taskRunner }), - deps.calculateGitLogMetrics(config, gitLogResult, { taskRunner }), - ]); + // File metrics must run first (synchronous on main thread with gpt-tokenizer), + // then output/git metrics can run in parallel since they share the cached TokenCounter + const selectiveFileMetrics = await deps.calculateSelectiveFileMetrics( + processedFiles, + metricsTargetPaths, + config.tokenCount.encoding, + progressCallback, + ); - const totalTokens = outputTokenCounts.reduce((sum, count) => sum + count, 0); - const totalFiles = processedFiles.length; - const totalCharacters = outputParts.reduce((sum, part) => sum + part.length, 0); + const [outputTokenCounts, gitDiffTokenCount, gitLogTokenCount] = await Promise.all([ + Promise.all( + outputParts.map(async (part, index) => { + const partPath = + outputParts.length > 1 ? 
buildSplitOutputFilePath(config.output.filePath, index + 1) : config.output.filePath; + return await deps.calculateOutputMetrics(part, config.tokenCount.encoding, partPath); + }), + ), + deps.calculateGitDiffMetrics(config, gitDiffResult), + deps.calculateGitLogMetrics(config, gitLogResult), + ]); - // Build character counts for all files - const fileCharCounts: Record = {}; - for (const file of processedFiles) { - fileCharCounts[file.path] = file.content.length; - } + const totalTokens = outputTokenCounts.reduce((sum, count) => sum + count, 0); + const totalFiles = processedFiles.length; + const totalCharacters = outputParts.reduce((sum, part) => sum + part.length, 0); - // Build token counts only for top files - const fileTokenCounts: Record = {}; - for (const file of selectiveFileMetrics) { - fileTokenCounts[file.path] = file.tokenCount; - } + // Build character counts for all files + const fileCharCounts: Record = {}; + for (const file of processedFiles) { + fileCharCounts[file.path] = file.content.length; + } - return { - totalFiles, - totalCharacters, - totalTokens, - fileCharCounts, - fileTokenCounts, - gitDiffTokenCount: gitDiffTokenCount, - gitLogTokenCount: gitLogTokenCount.gitLogTokenCount, - }; - } finally { - // Cleanup the task runner after all calculations are complete (only if we created it) - if (!deps.taskRunner) { - await taskRunner.cleanup(); - } + // Build token counts only for top files + const fileTokenCounts: Record = {}; + for (const file of selectiveFileMetrics) { + fileTokenCounts[file.path] = file.tokenCount; } + + return { + totalFiles, + totalCharacters, + totalTokens, + fileCharCounts, + fileTokenCounts, + gitDiffTokenCount: gitDiffTokenCount, + gitLogTokenCount: gitLogTokenCount.gitLogTokenCount, + }; }; diff --git a/src/core/metrics/calculateOutputMetrics.ts b/src/core/metrics/calculateOutputMetrics.ts index ad41ae918..1684d804b 100644 --- a/src/core/metrics/calculateOutputMetrics.ts +++ b/src/core/metrics/calculateOutputMetrics.ts 
@@ -1,54 +1,27 @@ -import type { TiktokenEncoding } from 'tiktoken'; import { logger } from '../../shared/logger.js'; -import type { TaskRunner } from '../../shared/processConcurrency.js'; -import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; +import type { TokenEncoding } from './TokenCounter.js'; +import { getTokenCounter } from './tokenCounterFactory.js'; -const CHUNK_SIZE = 1000; -const MIN_CONTENT_LENGTH_FOR_PARALLEL = 1_000_000; // 1000KB +const defaultDeps = { + getTokenCounter, +}; export const calculateOutputMetrics = async ( content: string, - encoding: TiktokenEncoding, + encoding: TokenEncoding, path: string | undefined, - deps: { taskRunner: TaskRunner }, + deps: Partial<typeof defaultDeps> = {}, ): Promise<number> => { - const shouldRunInParallel = content.length > MIN_CONTENT_LENGTH_FOR_PARALLEL; + const resolvedDeps = { ...defaultDeps, ...deps }; try { logger.trace(`Starting output token count for ${path || 'output'}`); const startTime = process.hrtime.bigint(); - let result: number; - - if (shouldRunInParallel) { - // Split content into chunks for parallel processing - const chunkSize = Math.ceil(content.length / CHUNK_SIZE); - const chunks: string[] = []; - - for (let i = 0; i < content.length; i += chunkSize) { - chunks.push(content.slice(i, i + chunkSize)); - } - - // Process chunks in parallel - const chunkResults = await Promise.all( - chunks.map(async (chunk, index) => { - return deps.taskRunner.run({ - content: chunk, - encoding, - path: path ? 
`${path}-chunk-${index}` : undefined, - }); - }), - ); - - // Sum up the results - result = chunkResults.reduce((sum, count) => sum + count, 0); - } else { - // Process small content directly - result = await deps.taskRunner.run({ - content, - encoding, - path, - }); + const counter = await resolvedDeps.getTokenCounter(encoding); + let result = counter.countTokens(content, path); + if (result === 0 && content.length > 0) { + result = counter.countTokensPlainText(content, path); } const endTime = process.hrtime.bigint(); diff --git a/src/core/metrics/calculateSelectiveFileMetrics.ts b/src/core/metrics/calculateSelectiveFileMetrics.ts index 02f52726a..f8bf7f8da 100644 --- a/src/core/metrics/calculateSelectiveFileMetrics.ts +++ b/src/core/metrics/calculateSelectiveFileMetrics.ts @@ -1,19 +1,23 @@ import pc from 'picocolors'; -import type { TiktokenEncoding } from 'tiktoken'; import { logger } from '../../shared/logger.js'; -import type { TaskRunner } from '../../shared/processConcurrency.js'; import type { RepomixProgressCallback } from '../../shared/types.js'; import type { ProcessedFile } from '../file/fileTypes.js'; -import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; +import type { TokenEncoding } from './TokenCounter.js'; +import { getTokenCounter } from './tokenCounterFactory.js'; import type { FileMetrics } from './workers/types.js'; +const defaultDeps = { + getTokenCounter, +}; + export const calculateSelectiveFileMetrics = async ( processedFiles: ProcessedFile[], targetFilePaths: string[], - tokenCounterEncoding: TiktokenEncoding, + tokenCounterEncoding: TokenEncoding, progressCallback: RepomixProgressCallback, - deps: { taskRunner: TaskRunner }, + deps: Partial<typeof defaultDeps> = {}, ): Promise<FileMetrics[]> => { + const resolvedDeps = { ...defaultDeps, ...deps }; const targetFileSet = new Set(targetFilePaths); const filesToProcess = processedFiles.filter((file) => targetFileSet.has(file.path)); @@ -23,29 +27,27 @@ export const calculateSelectiveFileMetrics = async 
( try { const startTime = process.hrtime.bigint(); - logger.trace(`Starting selective metrics calculation for ${filesToProcess.length} files using worker pool`); - - let completedTasks = 0; - const results = await Promise.all( - filesToProcess.map(async (file) => { - const tokenCount = await deps.taskRunner.run({ - content: file.content, - encoding: tokenCounterEncoding, - path: file.path, - }); - - const result: FileMetrics = { - path: file.path, - charCount: file.content.length, - tokenCount, - }; - - completedTasks++; - progressCallback(`Calculating metrics... (${completedTasks}/${filesToProcess.length}) ${pc.dim(file.path)}`); - logger.trace(`Calculating metrics... (${completedTasks}/${filesToProcess.length}) ${file.path}`); - return result; - }), - ); + logger.trace(`Starting selective metrics calculation for ${filesToProcess.length} files on main thread`); + + const counter = await resolvedDeps.getTokenCounter(tokenCounterEncoding); + + const results: FileMetrics[] = []; + for (let i = 0; i < filesToProcess.length; i++) { + const file = filesToProcess[i]; + let tokenCount = counter.countTokens(file.content, file.path); + if (tokenCount === 0 && file.content.length > 0) { + tokenCount = counter.countTokensPlainText(file.content, file.path); + } + + results.push({ + path: file.path, + charCount: file.content.length, + tokenCount, + }); + + progressCallback(`Calculating metrics... (${i + 1}/${filesToProcess.length}) ${pc.dim(file.path)}`); + logger.trace(`Calculating metrics... 
(${i + 1}/${filesToProcess.length}) ${file.path}`); + } const endTime = process.hrtime.bigint(); const duration = Number(endTime - startTime) / 1e6; diff --git a/src/core/metrics/tokenCounterFactory.ts b/src/core/metrics/tokenCounterFactory.ts index 8f51f0ba5..5230c37b6 100644 --- a/src/core/metrics/tokenCounterFactory.ts +++ b/src/core/metrics/tokenCounterFactory.ts @@ -1,18 +1,19 @@ -import type { TiktokenEncoding } from 'tiktoken'; import { logger } from '../../shared/logger.js'; -import { TokenCounter } from './TokenCounter.js'; +import { TokenCounter, type TokenEncoding } from './TokenCounter.js'; -// Worker-level cache for TokenCounter instances by encoding -const tokenCounters = new Map<TiktokenEncoding, TokenCounter>(); +// Cache for TokenCounter instances by encoding +const tokenCounters = new Map<TokenEncoding, TokenCounter>(); /** * Get or create a TokenCounter instance for the given encoding. - * This ensures only one TokenCounter exists per encoding per worker thread to optimize memory usage. + * This ensures only one TokenCounter exists per encoding to optimize memory usage. + * The counter must be initialized with init() before use. */ -export const getTokenCounter = (encoding: TiktokenEncoding): TokenCounter => { +export const getTokenCounter = async (encoding: TokenEncoding): Promise<TokenCounter> => { let tokenCounter = tokenCounters.get(encoding); if (!tokenCounter) { tokenCounter = new TokenCounter(encoding); + await tokenCounter.init(); tokenCounters.set(encoding, tokenCounter); } return tokenCounter; @@ -20,7 +21,7 @@ export const getTokenCounter = (encoding: TiktokenEncoding): TokenCounter => { /** * Free all TokenCounter resources and clear the cache. - * This should be called when the worker is terminating. + * No-op for gpt-tokenizer (pure JS), but kept for API compatibility. 
*/ export const freeTokenCounters = (): void => { for (const [encoding, tokenCounter] of tokenCounters.entries()) { diff --git a/src/core/metrics/workers/calculateMetricsWorker.ts b/src/core/metrics/workers/calculateMetricsWorker.ts deleted file mode 100644 index 241af02e0..000000000 --- a/src/core/metrics/workers/calculateMetricsWorker.ts +++ /dev/null @@ -1,50 +0,0 @@ -import type { TiktokenEncoding } from 'tiktoken'; -import { logger, setLogLevelByWorkerData } from '../../../shared/logger.js'; -import { freeTokenCounters, getTokenCounter } from '../tokenCounterFactory.js'; - -/** - * Simple token counting worker for metrics calculation. - * - * This worker provides a focused interface for counting tokens from text content, - * using the Tiktoken encoding. All complex metric calculation logic is handled - * by the calling side to maintain separation of concerns. - */ - -// Initialize logger configuration from workerData at module load time -// This must be called before any logging operations in the worker -setLogLevelByWorkerData(); - -export interface TokenCountTask { - content: string; - encoding: TiktokenEncoding; - path?: string; -} - -export const countTokens = async (task: TokenCountTask): Promise => { - const processStartAt = process.hrtime.bigint(); - - try { - const counter = getTokenCounter(task.encoding); - const tokenCount = counter.countTokens(task.content, task.path); - - logger.trace(`Counted tokens. Count: ${tokenCount}. 
Took: ${getProcessDuration(processStartAt)}ms`); - return tokenCount; - } catch (error) { - logger.error('Error in token counting worker:', error); - throw error; - } -}; - -const getProcessDuration = (startTime: bigint): string => { - const endTime = process.hrtime.bigint(); - return (Number(endTime - startTime) / 1e6).toFixed(2); -}; - -export default async (task: TokenCountTask): Promise => { - return countTokens(task); -}; - -// Export cleanup function for Tinypool teardown -export const onWorkerTermination = async (): Promise => { - freeTokenCounters(); -}; diff --git a/src/core/packager.ts b/src/core/packager.ts index 661008814..565ff368e 100644 --- a/src/core/packager.ts +++ b/src/core/packager.ts @@ -10,7 +10,7 @@ import type { FilesByRoot } from './file/fileTreeGenerate.js'; import type { ProcessedFile } from './file/fileTypes.js'; import { getGitDiffs } from './git/gitDiffHandle.js'; import { getGitLogs } from './git/gitLogHandle.js'; -import { calculateMetrics, createMetricsTaskRunner } from './metrics/calculateMetrics.js'; +import { calculateMetrics } from './metrics/calculateMetrics.js'; import { produceOutput } from './packager/produceOutput.js'; import type { SuspiciousFileResult } from './security/securityCheck.js'; import { validateFileSafety } from './security/validateFileSafety.js'; @@ -40,7 +40,6 @@ const defaultDeps = { validateFileSafety, produceOutput, calculateMetrics, - createMetricsTaskRunner, sortPaths, getGitDiffs, getGitLogs, @@ -91,119 +90,102 @@ export const pack = async ( filePaths: sortedFilePaths.filter((filePath) => filePathSetByDir.get(rootDir)?.has(filePath) ?? false), })); - // Pre-initialize metrics worker pool to overlap tiktoken WASM loading with subsequent pipeline stages - // (security check, file processing, output generation). The warm-up task triggers tiktoken - // initialization in the worker thread without blocking the main pipeline. 
- const metricsTaskRunner = deps.createMetricsTaskRunner(allFilePaths.length); - const warmupPromise = metricsTaskRunner.run({ content: '', encoding: config.tokenCount.encoding }).catch(() => 0); // Suppress unhandled rejection; errors surface when awaited - - try { - // Run file collection and git operations in parallel since they are independent: - // - collectFiles reads file contents from disk - // - getGitDiffs/getGitLogs spawn git subprocesses - // Neither depends on the other's results. - progressCallback('Collecting files...'); - const [collectResults, gitDiffResult, gitLogResult] = await Promise.all([ - withMemoryLogging( - 'Collect Files', - async () => - await Promise.all( - sortedFilePathsByDir.map(({ rootDir, filePaths }) => - deps.collectFiles(filePaths, rootDir, config, progressCallback), - ), + // Run file collection and git operations in parallel since they are independent: + // - collectFiles reads file contents from disk + // - getGitDiffs/getGitLogs spawn git subprocesses + // Neither depends on the other's results. + progressCallback('Collecting files...'); + const [collectResults, gitDiffResult, gitLogResult] = await Promise.all([ + withMemoryLogging( + 'Collect Files', + async () => + await Promise.all( + sortedFilePathsByDir.map(({ rootDir, filePaths }) => + deps.collectFiles(filePaths, rootDir, config, progressCallback), ), - ), - deps.getGitDiffs(rootDirs, config), - deps.getGitLogs(rootDirs, config), - ]); - - const rawFiles = collectResults.flatMap((curr) => curr.rawFiles); - const allSkippedFiles = collectResults.flatMap((curr) => curr.skippedFiles); - - // Run security check and get filtered safe files - const { safeFilePaths, safeRawFiles, suspiciousFilesResults, suspiciousGitDiffResults, suspiciousGitLogResults } = - await withMemoryLogging('Security Check', () => - deps.validateFileSafety(rawFiles, progressCallback, config, gitDiffResult, gitLogResult), - ); - - // Process files (remove comments, etc.) 
- progressCallback('Processing files...'); - const processedFiles = await withMemoryLogging('Process Files', () => - deps.processFiles(safeRawFiles, config, progressCallback), + ), + ), + deps.getGitDiffs(rootDirs, config), + deps.getGitLogs(rootDirs, config), + ]); + + const rawFiles = collectResults.flatMap((curr) => curr.rawFiles); + const allSkippedFiles = collectResults.flatMap((curr) => curr.skippedFiles); + + // Run security check and get filtered safe files + const { safeFilePaths, safeRawFiles, suspiciousFilesResults, suspiciousGitDiffResults, suspiciousGitLogResults } = + await withMemoryLogging('Security Check', () => + deps.validateFileSafety(rawFiles, progressCallback, config, gitDiffResult, gitLogResult), ); - progressCallback('Generating output...'); - - // Check if skill generation is requested - if (config.skillGenerate !== undefined && options.skillDir) { - // Await warmup to ensure graceful worker shutdown (avoid terminating WASM-loading thread) - await warmupPromise; - - const result = await deps.packSkill({ - rootDirs, - config, - options, - processedFiles, - allFilePaths, - gitDiffResult, - gitLogResult, - suspiciousFilesResults, - suspiciousGitDiffResults, - suspiciousGitLogResults, - safeFilePaths, - skippedFiles: allSkippedFiles, - progressCallback, - }); - - logMemoryUsage('Pack - End'); - return result; - } - - // Build filePathsByRoot for multi-root tree generation - // Use directory basename as the label for each root - // Fallback to rootDir if basename is empty (e.g., filesystem root "/") - const filePathsByRoot: FilesByRoot[] = sortedFilePathsByDir.map(({ rootDir, filePaths }) => ({ - rootLabel: path.basename(rootDir) || rootDir, - files: filePaths, - })); - - // Generate and write output (handles both single and split output) - const { outputFiles, outputForMetrics } = await deps.produceOutput( + // Process files (remove comments, etc.) 
+ progressCallback('Processing files...'); + const processedFiles = await withMemoryLogging('Process Files', () => + deps.processFiles(safeRawFiles, config, progressCallback), + ); + + progressCallback('Generating output...'); + + // Check if skill generation is requested + if (config.skillGenerate !== undefined && options.skillDir) { + const result = await deps.packSkill({ rootDirs, config, + options, processedFiles, allFilePaths, gitDiffResult, gitLogResult, - progressCallback, - filePathsByRoot, - ); - - // Ensure warm-up task completes before metrics calculation - await warmupPromise; - - const metrics = await withMemoryLogging('Calculate Metrics', () => - deps.calculateMetrics(processedFiles, outputForMetrics, progressCallback, config, gitDiffResult, gitLogResult, { - taskRunner: metricsTaskRunner, - }), - ); - - // Create a result object that includes metrics and security results - const result = { - ...metrics, - ...(outputFiles && { outputFiles }), suspiciousFilesResults, suspiciousGitDiffResults, suspiciousGitLogResults, - processedFiles, safeFilePaths, skippedFiles: allSkippedFiles, - }; + progressCallback, + }); logMemoryUsage('Pack - End'); - return result; - } finally { - await metricsTaskRunner.cleanup(); } + + // Build filePathsByRoot for multi-root tree generation + // Use directory basename as the label for each root + // Fallback to rootDir if basename is empty (e.g., filesystem root "/") + const filePathsByRoot: FilesByRoot[] = sortedFilePathsByDir.map(({ rootDir, filePaths }) => ({ + rootLabel: path.basename(rootDir) || rootDir, + files: filePaths, + })); + + // Generate and write output (handles both single and split output) + const { outputFiles, outputForMetrics } = await deps.produceOutput( + rootDirs, + config, + processedFiles, + allFilePaths, + gitDiffResult, + gitLogResult, + progressCallback, + filePathsByRoot, + ); + + // Token counting runs on main thread with gpt-tokenizer (pure JS) — no worker pool needed + const metrics = await 
withMemoryLogging('Calculate Metrics', () => + deps.calculateMetrics(processedFiles, outputForMetrics, progressCallback, config, gitDiffResult, gitLogResult), + ); + + // Create a result object that includes metrics and security results + const result = { + ...metrics, + ...(outputFiles && { outputFiles }), + suspiciousFilesResults, + suspiciousGitDiffResults, + suspiciousGitLogResults, + processedFiles, + safeFilePaths, + skippedFiles: allSkippedFiles, + }; + + logMemoryUsage('Pack - End'); + + return result; }; diff --git a/src/shared/processConcurrency.ts b/src/shared/processConcurrency.ts index 0507131a2..fb76ae826 100644 --- a/src/shared/processConcurrency.ts +++ b/src/shared/processConcurrency.ts @@ -31,8 +31,6 @@ const getWorkerPath = (workerType: WorkerType): string => { return new URL('../core/file/workers/fileProcessWorker.js', import.meta.url).href; case 'securityCheck': return new URL('../core/security/workers/securityCheckWorker.js', import.meta.url).href; - case 'calculateMetrics': - return new URL('../core/metrics/workers/calculateMetricsWorker.js', import.meta.url).href; case 'defaultAction': return new URL('../cli/actions/workers/defaultActionWorker.js', import.meta.url).href; default: diff --git a/src/shared/unifiedWorker.ts b/src/shared/unifiedWorker.ts index 32594f4c6..c279149c8 100644 --- a/src/shared/unifiedWorker.ts +++ b/src/shared/unifiedWorker.ts @@ -12,7 +12,7 @@ import { workerData } from 'node:worker_threads'; // Worker type definitions -export type WorkerType = 'fileProcess' | 'securityCheck' | 'calculateMetrics' | 'defaultAction'; +export type WorkerType = 'fileProcess' | 'securityCheck' | 'defaultAction'; // Worker handler type - uses 'any' to accommodate different worker signatures // biome-ignore lint/suspicious/noExplicitAny: Worker handlers have varying signatures @@ -49,11 +49,6 @@ const loadWorkerHandler = async ( result = { handler: module.default as WorkerHandler, cleanup: module.onWorkerTermination }; break; } - case 
'calculateMetrics': { - const module = await import('../core/metrics/workers/calculateMetricsWorker.js'); - result = { handler: module.default as WorkerHandler, cleanup: module.onWorkerTermination }; - break; - } case 'defaultAction': { const module = await import('../cli/actions/workers/defaultActionWorker.js'); result = { handler: module.default as WorkerHandler, cleanup: module.onWorkerTermination }; @@ -95,11 +90,6 @@ const inferWorkerTypeFromTask = (task: unknown): WorkerType | null => { return 'fileProcess'; } - // calculateMetrics: has content, encoding (must check before securityCheck) - if ('content' in taskObj && 'encoding' in taskObj) { - return 'calculateMetrics'; - } - // securityCheck: has filePath, content, type if ('filePath' in taskObj && 'content' in taskObj && 'type' in taskObj) { return 'securityCheck'; diff --git a/tests/core/metrics/TokenCounter.test.ts b/tests/core/metrics/TokenCounter.test.ts index dedc8dbcf..a5160eeb8 100644 --- a/tests/core/metrics/TokenCounter.test.ts +++ b/tests/core/metrics/TokenCounter.test.ts @@ -1,33 +1,14 @@ -import { get_encoding, type Tiktoken } from 'tiktoken'; -import { afterEach, beforeEach, describe, expect, type Mock, test, vi } from 'vitest'; +import { afterEach, beforeEach, describe, expect, test, vi } from 'vitest'; import { TokenCounter } from '../../../src/core/metrics/TokenCounter.js'; -import { logger } from '../../../src/shared/logger.js'; - -vi.mock('tiktoken', () => ({ - get_encoding: vi.fn(), -})); vi.mock('../../../src/shared/logger'); describe('TokenCounter', () => { let tokenCounter: TokenCounter; - let mockEncoder: { - encode: Mock; - free: Mock; - }; - - beforeEach(() => { - // Initialize mock encoder - mockEncoder = { - encode: vi.fn(), - free: vi.fn(), - }; - - // Setup mock encoder behavior - vi.mocked(get_encoding).mockReturnValue(mockEncoder as unknown as Tiktoken); - // Create new TokenCounter instance + beforeEach(async () => { tokenCounter = new TokenCounter('o200k_base'); + await 
tokenCounter.init(); }); afterEach(() => { @@ -35,61 +16,29 @@ describe('TokenCounter', () => { vi.resetAllMocks(); }); - test('should initialize with o200k_base encoding', () => { - expect(get_encoding).toHaveBeenCalledWith('o200k_base'); - }); - test('should correctly count tokens for simple text', () => { - const text = 'Hello, world!'; - const mockTokens = [123, 456, 789]; // Example token IDs - mockEncoder.encode.mockReturnValue(mockTokens); - - const count = tokenCounter.countTokens(text); - - expect(count).toBe(3); // Length of mockTokens - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + const count = tokenCounter.countTokens('Hello, world!'); + expect(count).toBe(4); }); test('should handle empty string', () => { - mockEncoder.encode.mockReturnValue([]); - const count = tokenCounter.countTokens(''); - expect(count).toBe(0); - expect(mockEncoder.encode).toHaveBeenCalledWith('', [], []); }); test('should handle multi-line text', () => { - const text = 'Line 1\nLine 2\nLine 3'; - const mockTokens = [1, 2, 3, 4, 5, 6]; - mockEncoder.encode.mockReturnValue(mockTokens); - - const count = tokenCounter.countTokens(text); - - expect(count).toBe(6); - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + const count = tokenCounter.countTokens('Line 1\nLine 2\nLine 3'); + expect(count).toBe(11); }); test('should handle special characters', () => { - const text = '!@#$%^&*()_+'; - const mockTokens = [1, 2, 3]; - mockEncoder.encode.mockReturnValue(mockTokens); - - const count = tokenCounter.countTokens(text); - - expect(count).toBe(3); - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + const count = tokenCounter.countTokens('!@#$%^&*()_+'); + expect(count).toBe(9); }); test('should handle unicode characters', () => { - const text = '你好,世界!🌍'; - const mockTokens = [1, 2, 3, 4]; - mockEncoder.encode.mockReturnValue(mockTokens); - - const count = tokenCounter.countTokens(text); - - expect(count).toBe(4); - 
expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + const count = tokenCounter.countTokens('你好,世界!🌍'); + expect(count).toBe(6); }); test('should handle code snippets', () => { @@ -98,13 +47,8 @@ describe('TokenCounter', () => { console.log("Hello, world!"); } `; - const mockTokens = Array(10).fill(1); // 10 tokens - mockEncoder.encode.mockReturnValue(mockTokens); - const count = tokenCounter.countTokens(text); - - expect(count).toBe(10); - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + expect(count).toBe(17); }); test('should handle markdown text', () => { @@ -116,52 +60,23 @@ describe('TokenCounter', () => { **Bold text** and _italic text_ `; - const mockTokens = Array(15).fill(1); // 15 tokens - mockEncoder.encode.mockReturnValue(mockTokens); - const count = tokenCounter.countTokens(text); - - expect(count).toBe(15); - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + expect(count).toBe(35); }); test('should handle very long text', () => { const text = 'a'.repeat(10000); - const mockTokens = Array(100).fill(1); // 100 tokens - mockEncoder.encode.mockReturnValue(mockTokens); - const count = tokenCounter.countTokens(text); - - expect(count).toBe(100); - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + expect(count).toBe(1250); }); - test('should properly handle encoding errors without file path', () => { - const error = new Error('Encoding error'); - mockEncoder.encode.mockImplementation(() => { - throw error; - }); - - const count = tokenCounter.countTokens('test content'); - - expect(count).toBe(0); - expect(logger.warn).toHaveBeenCalledWith('Failed to count tokens. 
error: Encoding error'); + test('should return 0 for errors when not initialized', () => { + const uninitCounter = new TokenCounter('o200k_base'); + // Not calling init() - should throw + expect(() => uninitCounter.countTokens('test')).toThrow('TokenCounter not initialized'); }); - test('should properly handle encoding errors with file path', () => { - const error = new Error('Encoding error'); - mockEncoder.encode.mockImplementation(() => { - throw error; - }); - - const count = tokenCounter.countTokens('test content', 'test.txt'); - - expect(count).toBe(0); - expect(logger.warn).toHaveBeenCalledWith('Failed to count tokens. path: test.txt, error: Encoding error'); - }); - - test('should free encoder resources on cleanup', () => { - tokenCounter.free(); - expect(mockEncoder.free).toHaveBeenCalled(); + test('should free without error (no-op for gpt-tokenizer)', () => { + expect(() => tokenCounter.free()).not.toThrow(); }); }); diff --git a/tests/core/metrics/calculateGitDiffMetrics.test.ts b/tests/core/metrics/calculateGitDiffMetrics.test.ts index adcdf5d75..ae77806c2 100644 --- a/tests/core/metrics/calculateGitDiffMetrics.test.ts +++ b/tests/core/metrics/calculateGitDiffMetrics.test.ts @@ -2,28 +2,20 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; import type { RepomixConfigMerged } from '../../../src/config/configSchema.js'; import type { GitDiffResult } from '../../../src/core/git/gitDiffHandle.js'; import { calculateGitDiffMetrics } from '../../../src/core/metrics/calculateGitDiffMetrics.js'; -import { countTokens, type TokenCountTask } from '../../../src/core/metrics/workers/calculateMetricsWorker.js'; +import { TokenCounter } from '../../../src/core/metrics/TokenCounter.js'; import { logger } from '../../../src/shared/logger.js'; -import type { TaskRunner, WorkerOptions } from '../../../src/shared/processConcurrency.js'; vi.mock('../../../src/shared/logger'); -const mockInitTaskRunner = (_options: WorkerOptions): TaskRunner => { - return { - 
run: async (task: TokenCountTask) => { - return await countTokens(task); - }, - cleanup: async () => { - // Mock cleanup - no-op for tests - }, +describe('calculateGitDiffMetrics', () => { + const mockGetTokenCounter = async () => { + const counter = new TokenCounter('o200k_base'); + await counter.init(); + return counter; }; -}; -describe('calculateGitDiffMetrics', () => { const mockConfig: RepomixConfigMerged = { - input: { - maxFileSize: 50 * 1024 * 1024, - }, + input: { maxFileSize: 50 * 1024 * 1024 }, output: { filePath: 'test-output.txt', style: 'xml', @@ -58,21 +50,11 @@ describe('calculateGitDiffMetrics', () => { useDefaultPatterns: true, customPatterns: [], }, - security: { - enableSecurityCheck: true, - }, - tokenCount: { - encoding: 'o200k_base' as const, - }, + security: { enableSecurityCheck: true }, + tokenCount: { encoding: 'o200k_base' as const }, cwd: '/test/project', }; - const mockTaskRunner = mockInitTaskRunner({ - numOfTasks: 1, - workerType: 'calculateMetrics', - runtime: 'worker_threads', - }); - beforeEach(() => { vi.clearAllMocks(); }); @@ -81,13 +63,7 @@ describe('calculateGitDiffMetrics', () => { it('should return 0 when includeDiffs is false', async () => { const configWithDisabledDiffs = { ...mockConfig, - output: { - ...mockConfig.output, - git: { - ...mockConfig.output.git, - includeDiffs: false, - }, - }, + output: { ...mockConfig.output, git: { ...mockConfig.output.git, includeDiffs: false } }, }; const gitDiffResult: GitDiffResult = { @@ -95,210 +71,104 @@ describe('calculateGitDiffMetrics', () => { stagedDiffContent: 'some staged content', }; - const result = await calculateGitDiffMetrics(configWithDisabledDiffs, gitDiffResult, { - taskRunner: mockTaskRunner, - }); - + const result = await calculateGitDiffMetrics(configWithDisabledDiffs, gitDiffResult); expect(result).toBe(0); }); it('should return 0 when git config is undefined', async () => { const configWithoutGit = { ...mockConfig, - output: { - ...mockConfig.output, - git: 
undefined, - }, + output: { ...mockConfig.output, git: undefined }, } as RepomixConfigMerged; - const gitDiffResult: GitDiffResult = { + const result = await calculateGitDiffMetrics(configWithoutGit, { workTreeDiffContent: 'some diff content', stagedDiffContent: 'some staged content', - }; - - const result = await calculateGitDiffMetrics(configWithoutGit, gitDiffResult, { - taskRunner: mockTaskRunner, }); - expect(result).toBe(0); }); }); describe('when git diff result is unavailable', () => { it('should return 0 when gitDiffResult is undefined', async () => { - const result = await calculateGitDiffMetrics(mockConfig, undefined, { - taskRunner: mockTaskRunner, - }); - + const result = await calculateGitDiffMetrics(mockConfig, undefined); expect(result).toBe(0); }); it('should return 0 when both diff contents are empty', async () => { - const gitDiffResult: GitDiffResult = { + const result = await calculateGitDiffMetrics(mockConfig, { workTreeDiffContent: '', stagedDiffContent: '', - }; - - const result = await calculateGitDiffMetrics(mockConfig, gitDiffResult, { - taskRunner: mockTaskRunner, }); - - expect(result).toBe(0); - }); - - it('should return 0 when both diff contents are undefined', async () => { - const gitDiffResult = { - workTreeDiffContent: undefined as unknown as string, - stagedDiffContent: undefined as unknown as string, - }; - - const result = await calculateGitDiffMetrics(mockConfig, gitDiffResult, { - taskRunner: mockTaskRunner, - }); - expect(result).toBe(0); }); }); describe('when processing git diffs', () => { it('should calculate tokens for both workTree and staged diffs', async () => { - const gitDiffResult: GitDiffResult = { - workTreeDiffContent: 'work tree changes', - stagedDiffContent: 'staged changes', - }; - - const mockTaskRunnerSpy = vi - .fn() - .mockResolvedValueOnce(5) // workTree tokens - .mockResolvedValueOnce(3); // staged tokens - - const customTaskRunner: TaskRunner = { - run: mockTaskRunnerSpy, - cleanup: async () => {}, - 
}; - - const result = await calculateGitDiffMetrics(mockConfig, gitDiffResult, { - taskRunner: customTaskRunner, - }); - - expect(mockTaskRunnerSpy).toHaveBeenCalledTimes(2); - expect(mockTaskRunnerSpy).toHaveBeenCalledWith({ - content: 'work tree changes', - encoding: 'o200k_base', - }); - expect(mockTaskRunnerSpy).toHaveBeenCalledWith({ - content: 'staged changes', - encoding: 'o200k_base', - }); - expect(result).toBe(8); // 5 + 3 + const result = await calculateGitDiffMetrics( + mockConfig, + { + workTreeDiffContent: 'work tree changes', + stagedDiffContent: 'staged changes', + }, + { getTokenCounter: mockGetTokenCounter }, + ); + // 'work tree changes' = 3 tokens, 'staged changes' = 3 tokens + expect(result).toBe(6); }); it('should calculate tokens for workTree diff only', async () => { - const gitDiffResult: GitDiffResult = { - workTreeDiffContent: 'work tree changes only', - stagedDiffContent: '', - }; - - const mockTaskRunnerSpy = vi.fn().mockResolvedValueOnce(7); - - const customTaskRunner: TaskRunner = { - run: mockTaskRunnerSpy, - cleanup: async () => {}, - }; - - const result = await calculateGitDiffMetrics(mockConfig, gitDiffResult, { - taskRunner: customTaskRunner, - }); - - expect(mockTaskRunnerSpy).toHaveBeenCalledTimes(1); - expect(mockTaskRunnerSpy).toHaveBeenCalledWith({ - content: 'work tree changes only', - encoding: 'o200k_base', - }); - expect(result).toBe(7); + const result = await calculateGitDiffMetrics( + mockConfig, + { + workTreeDiffContent: 'work tree changes only', + stagedDiffContent: '', + }, + { getTokenCounter: mockGetTokenCounter }, + ); + expect(result).toBe(4); }); it('should calculate tokens for staged diff only', async () => { - const gitDiffResult: GitDiffResult = { - workTreeDiffContent: '', - stagedDiffContent: 'staged changes only', - }; - - const mockTaskRunnerSpy = vi.fn().mockResolvedValueOnce(4); - - const customTaskRunner: TaskRunner = { - run: mockTaskRunnerSpy, - cleanup: async () => {}, - }; - - const result = 
await calculateGitDiffMetrics(mockConfig, gitDiffResult, { - taskRunner: customTaskRunner, - }); - - expect(mockTaskRunnerSpy).toHaveBeenCalledTimes(1); - expect(mockTaskRunnerSpy).toHaveBeenCalledWith({ - content: 'staged changes only', - encoding: 'o200k_base', - }); + const result = await calculateGitDiffMetrics( + mockConfig, + { + workTreeDiffContent: '', + stagedDiffContent: 'staged changes only', + }, + { getTokenCounter: mockGetTokenCounter }, + ); expect(result).toBe(4); }); it('should handle large diff content correctly', async () => { const largeDiffContent = 'a'.repeat(10000); - const gitDiffResult: GitDiffResult = { - workTreeDiffContent: largeDiffContent, - stagedDiffContent: largeDiffContent, - }; - - const result = await calculateGitDiffMetrics(mockConfig, gitDiffResult, { - taskRunner: mockTaskRunner, - }); - - expect(result).toBeGreaterThan(0); - expect(typeof result).toBe('number'); + const result = await calculateGitDiffMetrics( + mockConfig, + { + workTreeDiffContent: largeDiffContent, + stagedDiffContent: largeDiffContent, + }, + { getTokenCounter: mockGetTokenCounter }, + ); + expect(result).toBe(2500); }); }); describe('error handling', () => { - it('should throw error when task runner fails', async () => { - const gitDiffResult: GitDiffResult = { - workTreeDiffContent: 'some content', - stagedDiffContent: 'some staged content', - }; - - const errorTaskRunner: TaskRunner = { - run: vi.fn().mockRejectedValue(new Error('Task runner failed')), - cleanup: async () => {}, + it('should throw error when getTokenCounter fails', async () => { + const mockErrorGetTokenCounter = async () => { + throw new Error('Token counter failed'); }; await expect( - calculateGitDiffMetrics(mockConfig, gitDiffResult, { - taskRunner: errorTaskRunner, - }), - ).rejects.toThrow('Task runner failed'); - - expect(logger.error).toHaveBeenCalledWith('Error during git diff token calculation:', expect.any(Error)); - }); - - it('should handle partial task runner failures', 
async () => { - const gitDiffResult: GitDiffResult = { - workTreeDiffContent: 'work tree content', - stagedDiffContent: 'staged content', - }; - - const errorTaskRunner: TaskRunner = { - run: vi - .fn() - .mockResolvedValueOnce(5) // First call succeeds - .mockRejectedValueOnce(new Error('Second call fails')), // Second call fails - cleanup: async () => {}, - }; - - await expect( - calculateGitDiffMetrics(mockConfig, gitDiffResult, { - taskRunner: errorTaskRunner, - }), - ).rejects.toThrow('Second call fails'); + calculateGitDiffMetrics( + mockConfig, + { workTreeDiffContent: 'some content', stagedDiffContent: '' }, + { getTokenCounter: mockErrorGetTokenCounter }, + ), + ).rejects.toThrow('Token counter failed'); expect(logger.error).toHaveBeenCalledWith('Error during git diff token calculation:', expect.any(Error)); }); @@ -306,51 +176,16 @@ describe('calculateGitDiffMetrics', () => { describe('logging', () => { it('should log trace messages for successful calculation', async () => { - const gitDiffResult: GitDiffResult = { - workTreeDiffContent: 'test content', - stagedDiffContent: 'staged content', - }; - - await calculateGitDiffMetrics(mockConfig, gitDiffResult, { - taskRunner: mockTaskRunner, - }); + await calculateGitDiffMetrics( + mockConfig, + { workTreeDiffContent: 'test content', stagedDiffContent: '' }, + { getTokenCounter: mockGetTokenCounter }, + ); - expect(logger.trace).toHaveBeenCalledWith('Starting git diff token calculation using worker'); + expect(logger.trace).toHaveBeenCalledWith('Starting git diff token calculation on main thread'); expect(logger.trace).toHaveBeenCalledWith( expect.stringMatching(/Git diff token calculation completed in \d+\.\d+ms/), ); }); }); - - describe('encoding configuration', () => { - it('should use correct encoding from config', async () => { - const configWithDifferentEncoding = { - ...mockConfig, - tokenCount: { - encoding: 'cl100k_base' as const, - }, - }; - - const gitDiffResult: GitDiffResult = { - 
workTreeDiffContent: 'test content', - stagedDiffContent: '', - }; - - const mockTaskRunnerSpy = vi.fn().mockResolvedValueOnce(10); - - const customTaskRunner: TaskRunner = { - run: mockTaskRunnerSpy, - cleanup: async () => {}, - }; - - await calculateGitDiffMetrics(configWithDifferentEncoding, gitDiffResult, { - taskRunner: customTaskRunner, - }); - - expect(mockTaskRunnerSpy).toHaveBeenCalledWith({ - content: 'test content', - encoding: 'cl100k_base', - }); - }); - }); }); diff --git a/tests/core/metrics/calculateGitLogMetrics.test.ts b/tests/core/metrics/calculateGitLogMetrics.test.ts index 1c53b90b7..b31335c3d 100644 --- a/tests/core/metrics/calculateGitLogMetrics.test.ts +++ b/tests/core/metrics/calculateGitLogMetrics.test.ts @@ -1,29 +1,20 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; import type { RepomixConfigMerged } from '../../../src/config/configSchema.js'; -import type { GitLogResult } from '../../../src/core/git/gitLogHandle.js'; import { calculateGitLogMetrics } from '../../../src/core/metrics/calculateGitLogMetrics.js'; -import { countTokens, type TokenCountTask } from '../../../src/core/metrics/workers/calculateMetricsWorker.js'; +import { TokenCounter } from '../../../src/core/metrics/TokenCounter.js'; import { logger } from '../../../src/shared/logger.js'; -import type { TaskRunner, WorkerOptions } from '../../../src/shared/processConcurrency.js'; vi.mock('../../../src/shared/logger'); -const mockInitTaskRunner = (_options: WorkerOptions): TaskRunner => { - return { - run: async (task: TokenCountTask) => { - return await countTokens(task); - }, - cleanup: async () => { - // Mock cleanup - no-op for tests - }, +describe('calculateGitLogMetrics', () => { + const mockGetTokenCounter = async () => { + const counter = new TokenCounter('o200k_base'); + await counter.init(); + return counter; }; -}; -describe('calculateGitLogMetrics', () => { const mockConfig: RepomixConfigMerged = { - input: { - maxFileSize: 50 * 1024 * 1024, - }, + 
input: { maxFileSize: 50 * 1024 * 1024 }, output: { filePath: 'test-output.txt', style: 'xml', @@ -58,21 +49,11 @@ describe('calculateGitLogMetrics', () => { useDefaultPatterns: true, customPatterns: [], }, - security: { - enableSecurityCheck: true, - }, - tokenCount: { - encoding: 'o200k_base' as const, - }, + security: { enableSecurityCheck: true }, + tokenCount: { encoding: 'o200k_base' as const }, cwd: '/test/project', }; - const mockTaskRunner = mockInitTaskRunner({ - numOfTasks: 1, - workerType: 'calculateMetrics', - runtime: 'worker_threads', - }); - beforeEach(() => { vi.clearAllMocks(); }); @@ -81,345 +62,128 @@ describe('calculateGitLogMetrics', () => { it('should return 0 when includeLogs is false', async () => { const configWithDisabledLogs = { ...mockConfig, - output: { - ...mockConfig.output, - git: { - ...mockConfig.output.git, - includeLogs: false, - }, - }, + output: { ...mockConfig.output, git: { ...mockConfig.output.git, includeLogs: false } }, }; - - const gitLogResult: GitLogResult = { + const result = await calculateGitLogMetrics(configWithDisabledLogs, { logContent: 'some log content', commits: [], - }; - - const result = await calculateGitLogMetrics(configWithDisabledLogs, gitLogResult, { - taskRunner: mockTaskRunner, }); - expect(result).toEqual({ gitLogTokenCount: 0 }); }); it('should return 0 when git config is undefined', async () => { const configWithoutGit = { ...mockConfig, - output: { - ...mockConfig.output, - git: undefined, - }, + output: { ...mockConfig.output, git: undefined }, } as RepomixConfigMerged; - - const gitLogResult: GitLogResult = { + const result = await calculateGitLogMetrics(configWithoutGit, { logContent: 'some log content', commits: [], - }; - - const result = await calculateGitLogMetrics(configWithoutGit, gitLogResult, { - taskRunner: mockTaskRunner, }); - expect(result).toEqual({ gitLogTokenCount: 0 }); }); }); describe('when git log result is unavailable', () => { it('should return 0 when gitLogResult is 
undefined', async () => { - const result = await calculateGitLogMetrics(mockConfig, undefined, { - taskRunner: mockTaskRunner, - }); - + const result = await calculateGitLogMetrics(mockConfig, undefined); expect(result).toEqual({ gitLogTokenCount: 0 }); }); it('should return 0 when logContent is empty', async () => { - const gitLogResult: GitLogResult = { - logContent: '', - commits: [], - }; - - const result = await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: mockTaskRunner, - }); - - expect(result).toEqual({ gitLogTokenCount: 0 }); - }); - - it('should return 0 when logContent is undefined', async () => { - const gitLogResult = { - logContent: undefined as unknown as string, - commits: [], - }; - - const result = await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: mockTaskRunner, - }); - + const result = await calculateGitLogMetrics(mockConfig, { logContent: '', commits: [] }); expect(result).toEqual({ gitLogTokenCount: 0 }); }); }); describe('when processing git logs', () => { it('should calculate tokens for git log content', async () => { - const gitLogResult: GitLogResult = { - logContent: 'commit abc123\nAuthor: Test User\nDate: 2023-01-01\n\nTest commit message', - commits: [], - }; - - const mockTaskRunnerSpy = vi.fn().mockResolvedValueOnce(15); - - const customTaskRunner: TaskRunner = { - run: mockTaskRunnerSpy, - cleanup: async () => {}, - }; - - const result = await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: customTaskRunner, - }); - - expect(mockTaskRunnerSpy).toHaveBeenCalledTimes(1); - expect(mockTaskRunnerSpy).toHaveBeenCalledWith({ - content: 'commit abc123\nAuthor: Test User\nDate: 2023-01-01\n\nTest commit message', - encoding: 'o200k_base', - }); - expect(result).toEqual({ gitLogTokenCount: 15 }); + const result = await calculateGitLogMetrics( + mockConfig, + { + logContent: 'commit abc123\nAuthor: Test User\nDate: 2023-01-01\n\nTest commit message', + commits: [], + }, + { 
getTokenCounter: mockGetTokenCounter }, + ); + expect(result.gitLogTokenCount).toBe(22); }); it('should handle large log content correctly', async () => { - const largeLogContent = `${'commit '.repeat(1000)}large commit log`; - const gitLogResult: GitLogResult = { - logContent: largeLogContent, - commits: [], - }; - - const result = await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: mockTaskRunner, - }); - - expect(result.gitLogTokenCount).toBeGreaterThan(0); - expect(typeof result.gitLogTokenCount).toBe('number'); - }); - - it('should handle complex git log with multiple commits', async () => { - const complexLogContent = `commit abc123def456 -Author: John Doe -Date: Mon Jan 1 12:00:00 2023 +0000 - - Add new feature for user authentication - - - Implemented OAuth2 integration - - Added user session management - - Updated security middleware - -commit def456ghi789 -Author: Jane Smith -Date: Sun Dec 31 18:30:00 2022 +0000 - - Fix critical bug in payment processing - - - Resolved transaction timeout issue - - Added proper error handling - - Improved logging for debugging`; - - const gitLogResult: GitLogResult = { - logContent: complexLogContent, - commits: [], - }; - - const result = await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: mockTaskRunner, - }); - - expect(result.gitLogTokenCount).toBeGreaterThan(0); - expect(typeof result.gitLogTokenCount).toBe('number'); + const result = await calculateGitLogMetrics( + mockConfig, + { + logContent: `${'commit '.repeat(1000)}large commit log`, + commits: [], + }, + { getTokenCounter: mockGetTokenCounter }, + ); + expect(result.gitLogTokenCount).toBe(1003); }); }); describe('error handling', () => { - it('should return 0 when task runner fails', async () => { - const gitLogResult: GitLogResult = { - logContent: 'some log content', - commits: [], - }; - - const errorTaskRunner: TaskRunner = { - run: vi.fn().mockRejectedValue(new Error('Task runner failed')), - cleanup: async () => {}, 
+ it('should return 0 when getTokenCounter fails', async () => { + const mockErrorGetTokenCounter = async () => { + throw new Error('Token counter failed'); }; - const result = await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: errorTaskRunner, - }); - + const result = await calculateGitLogMetrics( + mockConfig, + { logContent: 'some log content', commits: [] }, + { getTokenCounter: mockErrorGetTokenCounter }, + ); expect(result).toEqual({ gitLogTokenCount: 0 }); expect(logger.error).toHaveBeenCalledWith('Failed to calculate git log metrics:', expect.any(Error)); }); - - it('should handle network timeout errors gracefully', async () => { - const gitLogResult: GitLogResult = { - logContent: 'test log content', - commits: [], - }; - - const timeoutError = new Error('Request timeout'); - const errorTaskRunner = { - run: vi.fn().mockRejectedValue(timeoutError), - cleanup: async () => {}, - }; - - const result = await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: errorTaskRunner, - }); - - expect(result).toEqual({ gitLogTokenCount: 0 }); - expect(logger.error).toHaveBeenCalledWith('Failed to calculate git log metrics:', timeoutError); - }); }); describe('logging', () => { it('should log trace messages for successful calculation', async () => { - const gitLogResult: GitLogResult = { - logContent: 'test log content', - commits: [], - }; - - await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: mockTaskRunner, - }); - - expect(logger.trace).toHaveBeenCalledWith('Starting git log token calculation using worker'); + await calculateGitLogMetrics( + mockConfig, + { logContent: 'test log content', commits: [] }, + { getTokenCounter: mockGetTokenCounter }, + ); + expect(logger.trace).toHaveBeenCalledWith('Starting git log token calculation on main thread'); expect(logger.trace).toHaveBeenCalledWith( expect.stringMatching(/Git log token calculation completed in \d+\.\d+ms/), ); }); - - it('should not log completion message 
on error', async () => { - const gitLogResult: GitLogResult = { - logContent: 'test content', - commits: [], - }; - - const errorTaskRunner = { - run: vi.fn().mockRejectedValue(new Error('Test error')), - cleanup: async () => {}, - }; - - await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: errorTaskRunner, - }); - - expect(logger.trace).toHaveBeenCalledWith('Starting git log token calculation using worker'); - expect(logger.trace).not.toHaveBeenCalledWith(expect.stringMatching(/Git log token calculation completed/)); - }); - }); - - describe('encoding configuration', () => { - it('should use correct encoding from config', async () => { - const configWithDifferentEncoding = { - ...mockConfig, - tokenCount: { - encoding: 'cl100k_base' as const, - }, - }; - - const gitLogResult: GitLogResult = { - logContent: 'test log content', - commits: [], - }; - - const mockTaskRunnerSpy = vi.fn().mockResolvedValueOnce(10); - - const customTaskRunner: TaskRunner = { - run: mockTaskRunnerSpy, - cleanup: async () => {}, - }; - - await calculateGitLogMetrics(configWithDifferentEncoding, gitLogResult, { - taskRunner: customTaskRunner, - }); - - expect(mockTaskRunnerSpy).toHaveBeenCalledWith({ - content: 'test log content', - encoding: 'cl100k_base', - }); - }); - }); - - describe('return value structure', () => { - it('should always return an object with gitLogTokenCount property', async () => { - const gitLogResult: GitLogResult = { - logContent: 'test content', - commits: [], - }; - - const result = await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: mockTaskRunner, - }); - - expect(result).toHaveProperty('gitLogTokenCount'); - expect(typeof result.gitLogTokenCount).toBe('number'); - }); - - it('should return consistent structure on error', async () => { - const gitLogResult: GitLogResult = { - logContent: 'test content', - commits: [], - }; - - const errorTaskRunner = { - run: vi.fn().mockRejectedValue(new Error('Test error')), - cleanup: 
async () => {}, - }; - - const result = await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: errorTaskRunner, - }); - - expect(result).toEqual({ gitLogTokenCount: 0 }); - expect(Object.keys(result)).toEqual(['gitLogTokenCount']); - }); }); describe('edge cases', () => { it('should handle very short log content', async () => { - const gitLogResult: GitLogResult = { - logContent: 'a', - commits: [], - }; - - const result = await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: mockTaskRunner, - }); - + const result = await calculateGitLogMetrics( + mockConfig, + { logContent: 'a', commits: [] }, + { getTokenCounter: mockGetTokenCounter }, + ); expect(result.gitLogTokenCount).toBeGreaterThanOrEqual(0); }); it('should handle log content with special characters', async () => { - const gitLogResult: GitLogResult = { - logContent: 'commit 🚀 emoji test\n\n日本語のコミットメッセージ\n\nSpecial chars: ñáéíóú', - commits: [], - }; - - const result = await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: mockTaskRunner, - }); - + const result = await calculateGitLogMetrics( + mockConfig, + { + logContent: 'commit 🚀 emoji test\n\n日本語のコミットメッセージ\n\nSpecial chars: ñáéíóú', + commits: [], + }, + { getTokenCounter: mockGetTokenCounter }, + ); expect(result.gitLogTokenCount).toBeGreaterThan(0); - expect(typeof result.gitLogTokenCount).toBe('number'); }); + }); - it('should handle log content with only whitespace', async () => { - const gitLogResult: GitLogResult = { - logContent: ' \n\t \r\n ', - commits: [], - }; - - const result = await calculateGitLogMetrics(mockConfig, gitLogResult, { - taskRunner: mockTaskRunner, - }); - - expect(result.gitLogTokenCount).toBeGreaterThanOrEqual(0); + describe('return value structure', () => { + it('should always return an object with gitLogTokenCount property', async () => { + const result = await calculateGitLogMetrics( + mockConfig, + { logContent: 'test content', commits: [] }, + { getTokenCounter: 
mockGetTokenCounter }, + ); + expect(result).toHaveProperty('gitLogTokenCount'); + expect(typeof result.gitLogTokenCount).toBe('number'); }); }); }); diff --git a/tests/core/metrics/calculateMetrics.test.ts b/tests/core/metrics/calculateMetrics.test.ts index 672cd8ee1..ff418b45a 100644 --- a/tests/core/metrics/calculateMetrics.test.ts +++ b/tests/core/metrics/calculateMetrics.test.ts @@ -8,6 +8,7 @@ import { createMockConfig } from '../../testing/testUtils.js'; vi.mock('../../../src/core/metrics/TokenCounter.js', () => { return { + TOKEN_ENCODINGS: ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'], TokenCounter: vi.fn().mockImplementation(() => ({ countTokens: vi.fn().mockReturnValue(10), free: vi.fn(), @@ -54,17 +55,11 @@ describe('calculateMetrics', () => { const gitDiffResult: GitDiffResult | undefined = undefined; - const mockTaskRunner = { - run: vi.fn(), - cleanup: vi.fn(), - }; - const result = await calculateMetrics(processedFiles, output, progressCallback, config, gitDiffResult, undefined, { calculateSelectiveFileMetrics, calculateOutputMetrics: async () => 30, calculateGitDiffMetrics: () => Promise.resolve(0), calculateGitLogMetrics: () => Promise.resolve({ gitLogTokenCount: 0 }), - taskRunner: mockTaskRunner, }); expect(progressCallback).toHaveBeenCalledWith('Calculating metrics...'); @@ -73,9 +68,6 @@ describe('calculateMetrics', () => { ['file2.txt', 'file1.txt'], // sorted by character count desc 'o200k_base', progressCallback, - expect.objectContaining({ - taskRunner: expect.any(Object), - }), ); expect(result).toEqual(aggregatedResult); }); diff --git a/tests/core/metrics/calculateOutputMetrics.test.ts b/tests/core/metrics/calculateOutputMetrics.test.ts index 104914ff7..efb6627b6 100644 --- a/tests/core/metrics/calculateOutputMetrics.test.ts +++ b/tests/core/metrics/calculateOutputMetrics.test.ts @@ -1,30 +1,24 @@ import { describe, expect, it, vi } from 'vitest'; import { calculateOutputMetrics } from 
'../../../src/core/metrics/calculateOutputMetrics.js'; -import { countTokens, type TokenCountTask } from '../../../src/core/metrics/workers/calculateMetricsWorker.js'; +import { TokenCounter } from '../../../src/core/metrics/TokenCounter.js'; import { logger } from '../../../src/shared/logger.js'; -import type { WorkerOptions } from '../../../src/shared/processConcurrency.js'; vi.mock('../../../src/shared/logger'); -const mockInitTaskRunner = (_options: WorkerOptions) => { - return { - run: async (task: T) => { - return (await countTokens(task as TokenCountTask)) as R; - }, - cleanup: async () => { - // Mock cleanup - no-op for tests - }, +describe('calculateOutputMetrics', () => { + const mockGetTokenCounter = async () => { + const counter = new TokenCounter('o200k_base'); + await counter.init(); + return counter; }; -}; -describe('calculateOutputMetrics', () => { it('should calculate metrics for output content', async () => { const content = 'test content'; - const encoding = 'o200k_base'; + const encoding = 'o200k_base' as const; const path = 'test.txt'; const result = await calculateOutputMetrics(content, encoding, path, { - taskRunner: mockInitTaskRunner({ numOfTasks: 1, workerType: 'calculateMetrics', runtime: 'worker_threads' }), + getTokenCounter: mockGetTokenCounter, }); expect(result).toBe(2); // 'test content' should be counted as 2 tokens @@ -32,46 +26,39 @@ describe('calculateOutputMetrics', () => { it('should work without a specified path', async () => { const content = 'test content'; - const encoding = 'o200k_base'; + const encoding = 'o200k_base' as const; const result = await calculateOutputMetrics(content, encoding, undefined, { - taskRunner: mockInitTaskRunner({ numOfTasks: 1, workerType: 'calculateMetrics', runtime: 'worker_threads' }), + getTokenCounter: mockGetTokenCounter, }); expect(result).toBe(2); }); - it('should handle errors from worker', async () => { + it('should handle errors from token counter', async () => { const content = 'test 
content'; - const encoding = 'o200k_base'; - const mockError = new Error('Worker error'); - - const mockErrorTaskRunner = (_options: WorkerOptions) => { - return { - run: async (_task: T) => { - throw mockError; - }, - cleanup: async () => { - // Mock cleanup - no-op for tests - }, - }; + const encoding = 'o200k_base' as const; + const mockError = new Error('Token counter error'); + + const mockErrorGetTokenCounter = async () => { + throw mockError; }; await expect( calculateOutputMetrics(content, encoding, undefined, { - taskRunner: mockErrorTaskRunner({ numOfTasks: 1, workerType: 'calculateMetrics', runtime: 'worker_threads' }), + getTokenCounter: mockErrorGetTokenCounter, }), - ).rejects.toThrow('Worker error'); + ).rejects.toThrow('Token counter error'); expect(logger.error).toHaveBeenCalledWith('Error during token count:', mockError); }); it('should handle empty content', async () => { const content = ''; - const encoding = 'o200k_base'; + const encoding = 'o200k_base' as const; const result = await calculateOutputMetrics(content, encoding, undefined, { - taskRunner: mockInitTaskRunner({ numOfTasks: 1, workerType: 'calculateMetrics', runtime: 'worker_threads' }), + getTokenCounter: mockGetTokenCounter, }); expect(result).toBe(0); @@ -79,101 +66,12 @@ describe('calculateOutputMetrics', () => { it('should work with longer complex content', async () => { const content = 'This is a longer test content with multiple sentences. 
It should work correctly.'; - const encoding = 'o200k_base'; + const encoding = 'o200k_base' as const; const result = await calculateOutputMetrics(content, encoding, undefined, { - taskRunner: mockInitTaskRunner({ numOfTasks: 1, workerType: 'calculateMetrics', runtime: 'worker_threads' }), - }); - - expect(result).toBeGreaterThan(0); - expect(typeof result).toBe('number'); - }); - - it('should process large content in parallel', async () => { - // Generate a large content that exceeds MIN_CONTENT_LENGTH_FOR_PARALLEL - const content = 'a'.repeat(1_100_000); // 1.1MB of content - const encoding = 'o200k_base'; - const path = 'large-file.txt'; - - let chunksProcessed = 0; - const mockParallelTaskRunner = (_options: WorkerOptions) => { - return { - run: async (_task: T) => { - chunksProcessed++; - // Return a fixed token count for each chunk - return 100 as R; - }, - cleanup: async () => { - // Mock cleanup - no-op for tests - }, - }; - }; - - const result = await calculateOutputMetrics(content, encoding, path, { - taskRunner: mockParallelTaskRunner({ numOfTasks: 1, workerType: 'calculateMetrics', runtime: 'worker_threads' }), + getTokenCounter: mockGetTokenCounter, }); - expect(chunksProcessed).toBeGreaterThan(1); // Should have processed multiple chunks - expect(result).toBe(100_000); // 1000 chunks * 100 tokens per chunk - }); - - it('should handle errors in parallel processing', async () => { - const content = 'a'.repeat(1_100_000); // 1.1MB of content - const encoding = 'o200k_base'; - const mockError = new Error('Parallel processing error'); - - const mockErrorTaskRunner = (_options: WorkerOptions) => { - return { - run: async (_task: T) => { - throw mockError; - }, - cleanup: async () => { - // Mock cleanup - no-op for tests - }, - }; - }; - - await expect( - calculateOutputMetrics(content, encoding, undefined, { - taskRunner: mockErrorTaskRunner({ numOfTasks: 1, workerType: 'calculateMetrics', runtime: 'worker_threads' }), - }), - ).rejects.toThrow('Parallel 
processing error'); - - expect(logger.error).toHaveBeenCalledWith('Error during token count:', mockError); - }); - - it('should correctly split content into chunks for parallel processing', async () => { - const content = 'a'.repeat(1_100_000); // 1.1MB of content - const encoding = 'o200k_base'; - const processedChunks: string[] = []; - - const mockChunkTrackingTaskRunner = (_options: WorkerOptions) => { - return { - run: async (task: T) => { - const outputTask = task as TokenCountTask; - processedChunks.push(outputTask.content); - return outputTask.content.length as R; - }, - cleanup: async () => { - // Mock cleanup - no-op for tests - }, - }; - }; - - await calculateOutputMetrics(content, encoding, undefined, { - taskRunner: mockChunkTrackingTaskRunner({ - numOfTasks: 1, - workerType: 'calculateMetrics', - runtime: 'worker_threads', - }), - }); - - // Check that chunks are roughly equal in size - const _expectedChunkSize = Math.ceil(content.length / 1000); // CHUNK_SIZE is 1000 - const chunkSizes = processedChunks.map((chunk) => chunk.length); - - expect(processedChunks.length).toBe(1000); // Should have 1000 chunks - expect(Math.max(...chunkSizes) - Math.min(...chunkSizes)).toBeLessThanOrEqual(1); // Chunks should be almost equal in size - expect(processedChunks.join('')).toBe(content); // All content should be processed + expect(result).toBe(15); }); }); diff --git a/tests/core/metrics/calculateSelectiveFileMetrics.test.ts b/tests/core/metrics/calculateSelectiveFileMetrics.test.ts index 2e89b4161..347ffbab9 100644 --- a/tests/core/metrics/calculateSelectiveFileMetrics.test.ts +++ b/tests/core/metrics/calculateSelectiveFileMetrics.test.ts @@ -1,26 +1,16 @@ import { describe, expect, it, vi } from 'vitest'; import type { ProcessedFile } from '../../../src/core/file/fileTypes.js'; import { calculateSelectiveFileMetrics } from '../../../src/core/metrics/calculateSelectiveFileMetrics.js'; -import { countTokens, type TokenCountTask } from 
'../../../src/core/metrics/workers/calculateMetricsWorker.js'; -import type { WorkerOptions } from '../../../src/shared/processConcurrency.js'; +import { TokenCounter } from '../../../src/core/metrics/TokenCounter.js'; import type { RepomixProgressCallback } from '../../../src/shared/types.js'; -vi.mock('../../shared/processConcurrency', () => ({ - getProcessConcurrency: () => 1, -})); - -const mockInitTaskRunner = (_options: WorkerOptions) => { - return { - run: async (task: T) => { - return (await countTokens(task as TokenCountTask)) as R; - }, - cleanup: async () => { - // Mock cleanup - no-op for tests - }, +describe('calculateSelectiveFileMetrics', () => { + const mockGetTokenCounter = async () => { + const counter = new TokenCounter('o200k_base'); + await counter.init(); + return counter; }; -}; -describe('calculateSelectiveFileMetrics', () => { it('should calculate metrics for selective files only', async () => { const processedFiles: ProcessedFile[] = [ { path: 'file1.txt', content: 'a'.repeat(100) }, @@ -36,14 +26,17 @@ describe('calculateSelectiveFileMetrics', () => { 'o200k_base', progressCallback, { - taskRunner: mockInitTaskRunner({ numOfTasks: 1, workerType: 'calculateMetrics', runtime: 'worker_threads' }), + getTokenCounter: mockGetTokenCounter, }, ); - expect(result).toEqual([ - { path: 'file1.txt', charCount: 100, tokenCount: 13 }, - { path: 'file3.txt', charCount: 300, tokenCount: 75 }, - ]); + expect(result.length).toBe(2); + expect(result[0].path).toBe('file1.txt'); + expect(result[0].charCount).toBe(100); + expect(result[0].tokenCount).toBe(13); + expect(result[1].path).toBe('file3.txt'); + expect(result[1].charCount).toBe(300); + expect(result[1].tokenCount).toBe(75); }); it('should return empty array when no target files match', async () => { @@ -57,7 +50,7 @@ describe('calculateSelectiveFileMetrics', () => { 'o200k_base', progressCallback, { - taskRunner: mockInitTaskRunner({ numOfTasks: 1, workerType: 'calculateMetrics', runtime: 
'worker_threads' }), + getTokenCounter: mockGetTokenCounter, }, ); diff --git a/tests/core/metrics/diffTokenCount.test.ts b/tests/core/metrics/diffTokenCount.test.ts index dd5612841..29f9c0a97 100644 --- a/tests/core/metrics/diffTokenCount.test.ts +++ b/tests/core/metrics/diffTokenCount.test.ts @@ -7,6 +7,7 @@ import { createMockConfig } from '../../testing/testUtils.js'; // Mock the TokenCounter vi.mock('../../../src/core/metrics/TokenCounter.js', () => ({ + TOKEN_ENCODINGS: ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'], TokenCounter: vi.fn(), })); @@ -87,11 +88,6 @@ index 123..456 100644 }); // Mock dependency functions - const mockTaskRunner = { - run: vi.fn(), - cleanup: vi.fn(), - }; - const mockCalculateOutputMetrics = vi.fn().mockResolvedValue(15); const result = await calculateMetrics( @@ -109,7 +105,6 @@ index 123..456 100644 calculateOutputMetrics: mockCalculateOutputMetrics, calculateGitDiffMetrics: vi.fn().mockResolvedValue(25), calculateGitLogMetrics: vi.fn().mockResolvedValue({ gitLogTokenCount: 0 }), - taskRunner: mockTaskRunner, }, ); @@ -170,11 +165,6 @@ index 123..456 100644 }); // Mock dependency functions - const mockTaskRunner = { - run: vi.fn(), - cleanup: vi.fn(), - }; - const mockCalculateOutputMetrics = vi.fn().mockResolvedValue(15); const result = await calculateMetrics( @@ -189,7 +179,6 @@ index 123..456 100644 calculateOutputMetrics: mockCalculateOutputMetrics, calculateGitDiffMetrics: vi.fn().mockResolvedValue(0), calculateGitLogMetrics: vi.fn().mockResolvedValue({ gitLogTokenCount: 0 }), - taskRunner: mockTaskRunner, }, ); @@ -248,11 +237,6 @@ index 123..456 100644 }); // Mock dependency functions - const mockTaskRunner = { - run: vi.fn(), - cleanup: vi.fn(), - }; - const mockCalculateOutputMetrics = vi.fn().mockResolvedValue(15); const result = await calculateMetrics( @@ -267,7 +251,6 @@ index 123..456 100644 calculateOutputMetrics: mockCalculateOutputMetrics, calculateGitDiffMetrics: 
vi.fn().mockResolvedValue(0), calculateGitLogMetrics: vi.fn().mockResolvedValue({ gitLogTokenCount: 0 }), - taskRunner: mockTaskRunner, }, ); diff --git a/tests/core/packager.test.ts b/tests/core/packager.test.ts index f54bc94a3..95136f343 100644 --- a/tests/core/packager.test.ts +++ b/tests/core/packager.test.ts @@ -7,6 +7,7 @@ vi.mock('node:fs/promises'); vi.mock('fs/promises'); vi.mock('../../src/core/metrics/TokenCounter.js', () => { return { + TOKEN_ENCODINGS: ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'], TokenCounter: vi.fn().mockImplementation(() => ({ countTokens: vi.fn().mockReturnValue(10), free: vi.fn(), @@ -54,10 +55,6 @@ describe('packager', () => { produceOutput: vi.fn().mockResolvedValue({ outputForMetrics: mockOutput, }), - createMetricsTaskRunner: vi.fn().mockReturnValue({ - run: vi.fn().mockResolvedValue(0), - cleanup: vi.fn().mockResolvedValue(undefined), - }), calculateMetrics: vi.fn().mockResolvedValue({ totalFiles: 2, totalCharacters: 11, @@ -111,7 +108,6 @@ describe('packager', () => { mockConfig, undefined, undefined, - expect.objectContaining({ taskRunner: expect.anything() }), ); // Check the result of pack function diff --git a/tests/core/packager/diffsFunctionality.test.ts b/tests/core/packager/diffsFunctionality.test.ts index 1a598c6fb..c58f6f210 100644 --- a/tests/core/packager/diffsFunctionality.test.ts +++ b/tests/core/packager/diffsFunctionality.test.ts @@ -71,10 +71,6 @@ index 123..456 100644 fileTokenCounts: {}, }); const mockSortPaths = vi.fn().mockImplementation((paths) => paths); - const mockCreateMetricsTaskRunner = vi.fn().mockReturnValue({ - run: vi.fn().mockResolvedValue(0), - cleanup: vi.fn().mockResolvedValue(undefined), - }); // Config with diffs disabled if (mockConfig.output.git) { @@ -88,7 +84,6 @@ index 123..456 100644 validateFileSafety: mockValidateFileSafety, produceOutput: mockProduceOutput, calculateMetrics: mockCalculateMetrics, - createMetricsTaskRunner: mockCreateMetricsTaskRunner, 
sortPaths: mockSortPaths, }); @@ -126,10 +121,6 @@ index 123..456 100644 gitDiffTokenCount: 15, // Mock diff token count }); const mockSortPaths = vi.fn().mockImplementation((paths) => paths); - const mockCreateMetricsTaskRunner = vi.fn().mockReturnValue({ - run: vi.fn().mockResolvedValue(0), - cleanup: vi.fn().mockResolvedValue(undefined), - }); // Config with diffs enabled if (mockConfig.output.git) { @@ -143,7 +134,6 @@ index 123..456 100644 validateFileSafety: mockValidateFileSafety, produceOutput: mockProduceOutput, calculateMetrics: mockCalculateMetrics, - createMetricsTaskRunner: mockCreateMetricsTaskRunner, sortPaths: mockSortPaths, }); diff --git a/tests/core/packager/splitOutput.test.ts b/tests/core/packager/splitOutput.test.ts index ade19fa30..c1e92ba93 100644 --- a/tests/core/packager/splitOutput.test.ts +++ b/tests/core/packager/splitOutput.test.ts @@ -55,10 +55,6 @@ describe('packager split output', () => { getGitLogs: vi.fn().mockResolvedValue(undefined), produceOutput, calculateMetrics, - createMetricsTaskRunner: vi.fn().mockReturnValue({ - run: vi.fn().mockResolvedValue(0), - cleanup: vi.fn().mockResolvedValue(undefined), - }), }); expect(produceOutput).toHaveBeenCalledWith( @@ -79,7 +75,6 @@ describe('packager split output', () => { mockConfig, undefined, undefined, - expect.objectContaining({ taskRunner: expect.anything() }), ); expect(result.outputFiles).toEqual(['repomix-output.1.xml', 'repomix-output.2.xml']); diff --git a/tests/integration-tests/packager.test.ts b/tests/integration-tests/packager.test.ts index 81ee7cbbe..214de2fe0 100644 --- a/tests/integration-tests/packager.test.ts +++ b/tests/integration-tests/packager.test.ts @@ -115,10 +115,6 @@ describe.runIf(!isWindows)('packager integration', () => { }); }, produceOutput, - createMetricsTaskRunner: () => ({ - run: async () => 0, - cleanup: async () => {}, - }), calculateMetrics: async ( processedFiles, _output, diff --git a/tests/shared/processConcurrency.test.ts 
b/tests/shared/processConcurrency.test.ts index 289406ff6..c109332d8 100644 --- a/tests/shared/processConcurrency.test.ts +++ b/tests/shared/processConcurrency.test.ts @@ -153,13 +153,13 @@ describe('processConcurrency', () => { }); it('should pass runtime parameter to createWorkerPool', () => { - const taskRunner = initTaskRunner({ numOfTasks: 100, workerType: 'calculateMetrics', runtime: 'worker_threads' }); + const taskRunner = initTaskRunner({ numOfTasks: 100, workerType: 'securityCheck', runtime: 'worker_threads' }); expect(Tinypool).toHaveBeenCalledWith( expect.objectContaining({ runtime: 'worker_threads', workerData: expect.objectContaining({ - workerType: 'calculateMetrics', + workerType: 'securityCheck', }), }), ); diff --git a/tests/shared/unifiedWorker.test.ts b/tests/shared/unifiedWorker.test.ts index 7f3247b35..42f17d024 100644 --- a/tests/shared/unifiedWorker.test.ts +++ b/tests/shared/unifiedWorker.test.ts @@ -10,10 +10,6 @@ vi.mock('../../src/core/security/workers/securityCheckWorker.js', () => ({ default: vi.fn().mockResolvedValue(null), onWorkerTermination: vi.fn(), })); -vi.mock('../../src/core/metrics/workers/calculateMetricsWorker.js', () => ({ - default: vi.fn().mockResolvedValue(100), - onWorkerTermination: vi.fn(), -})); vi.mock('../../src/cli/actions/workers/defaultActionWorker.js', () => ({ default: vi.fn().mockResolvedValue({ packResult: {}, config: {} }), onWorkerTermination: vi.fn(), @@ -70,19 +66,6 @@ describe('unifiedWorker', () => { expect(fileProcessWorker.default).toHaveBeenCalledWith(task); }); - it('should infer calculateMetrics from task with content and encoding', async () => { - const { default: handler } = await import('../../src/shared/unifiedWorker.js'); - const task = { - content: 'test content', - encoding: 'cl100k_base', - }; - - await handler(task); - - const calculateMetricsWorker = await import('../../src/core/metrics/workers/calculateMetricsWorker.js'); - 
expect(calculateMetricsWorker.default).toHaveBeenCalledWith(task); - }); - it('should infer securityCheck from task with filePath, content, type', async () => { const { default: handler } = await import('../../src/shared/unifiedWorker.js'); const task = {