diff --git a/package-lock.json b/package-lock.json index 0013cc8c6..6a230f4dd 100644 --- a/package-lock.json +++ b/package-lock.json @@ -19,6 +19,7 @@ "fast-xml-builder": "^1.1.4", "git-url-parse": "^16.1.0", "globby": "^16.1.1", + "gpt-tokenizer": "^3.4.0", "handlebars": "^4.7.8", "iconv-lite": "^0.7.0", "is-binary-path": "^3.0.0", @@ -30,7 +31,6 @@ "picocolors": "^1.1.1", "picospinner": "^3.0.0", "tar": "^7.5.12", - "tiktoken": "^1.0.22", "tinyclip": "^0.1.12", "tinypool": "^2.1.0", "web-tree-sitter": "^0.26.7", @@ -3121,6 +3121,12 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/gpt-tokenizer": { + "version": "3.4.0", + "resolved": "https://registry.npmjs.org/gpt-tokenizer/-/gpt-tokenizer-3.4.0.tgz", + "integrity": "sha512-wxFLnhIXTDjYebd9A9pGl3e31ZpSypbpIJSOswbgop5jLte/AsZVDvjlbEuVFlsqZixVKqbcoNmRlFDf6pz/UQ==", + "license": "MIT" + }, "node_modules/handlebars": { "version": "4.7.9", "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.9.tgz", @@ -4872,12 +4878,6 @@ "url": "https://bevry.me/fund" } }, - "node_modules/tiktoken": { - "version": "1.0.22", - "resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.22.tgz", - "integrity": "sha512-PKvy1rVF1RibfF3JlXBSP0Jrcw2uq3yXdgcEXtKTYn3QJ/cBRBHDnrJ5jHky+MENZ6DIPwNUGWpkVx+7joCpNA==", - "license": "MIT" - }, "node_modules/tinybench": { "version": "2.9.0", "resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz", diff --git a/package.json b/package.json index ebe04e7e0..15611ad6f 100644 --- a/package.json +++ b/package.json @@ -85,6 +85,7 @@ "fast-xml-builder": "^1.1.4", "git-url-parse": "^16.1.0", "globby": "^16.1.1", + "gpt-tokenizer": "^3.4.0", "handlebars": "^4.7.8", "iconv-lite": "^0.7.0", "is-binary-path": "^3.0.0", @@ -96,7 +97,6 @@ "picocolors": "^1.1.1", "picospinner": "^3.0.0", "tar": "^7.5.12", - "tiktoken": "^1.0.22", "tinyclip": "^0.1.12", "tinypool": "^2.1.0", "web-tree-sitter": "^0.26.7", diff --git a/src/config/configSchema.ts b/src/config/configSchema.ts index dbc713d63..74c1ef432 100644 --- a/src/config/configSchema.ts +++ b/src/config/configSchema.ts @@ -1,5 +1,5 @@ -import type { TiktokenEncoding } from 'tiktoken'; import { z } from 'zod'; +import { TOKEN_ENCODINGS } from '../core/metrics/TokenCounter.js'; // Output style enum export const repomixOutputStyleSchema = z.enum(['xml', 'markdown', 'json', 'plain']); @@ -122,10 +122,7 @@ export const repomixConfigDefaultSchema = z.object({ enableSecurityCheck: z.boolean().default(true), }), tokenCount: z.object({ - encoding: z - .string() - .default('o200k_base') - .transform((val) => val as TiktokenEncoding), + encoding: z.enum(TOKEN_ENCODINGS).default('o200k_base'), }), }); diff --git a/src/core/metrics/TokenCounter.ts b/src/core/metrics/TokenCounter.ts index 7ae1dcb46..0ff759087 100644 --- a/src/core/metrics/TokenCounter.ts +++ b/src/core/metrics/TokenCounter.ts @@ -1,29 +1,65 @@ -import { get_encoding, type Tiktoken, type TiktokenEncoding } from 'tiktoken'; import { logger } from '../../shared/logger.js'; -export class TokenCounter { - private encoding: Tiktoken; +// Supported token encoding types (compatible with tiktoken encoding names) +export const TOKEN_ENCODINGS = ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'] as const; +export type TokenEncoding = (typeof TOKEN_ENCODINGS)[number]; + +interface CountTokensOptions { + disallowedSpecial?: Set; +} + +type CountTokensFn = (text: string, options?: CountTokensOptions) => number; + +// Treat all text as regular content by disallowing nothing. +// This matches the old tiktoken behavior: encode(content, [], []).length +// where special tokens like <|endoftext|> are tokenized as ordinary text. +const PLAIN_TEXT_OPTIONS: CountTokensOptions = { disallowedSpecial: new Set() }; + +// Lazy-loaded countTokens functions keyed by encoding +const encodingModules = new Map(); + +const loadEncoding = async (encodingName: TokenEncoding): Promise => { + const cached = encodingModules.get(encodingName); + if (cached) { + return cached; + } - constructor(encodingName: TiktokenEncoding) { - const startTime = process.hrtime.bigint(); + const startTime = process.hrtime.bigint(); - // Setup encoding with the specified model - this.encoding = get_encoding(encodingName); + // Dynamic import of the specific encoding module from gpt-tokenizer + const mod = await import(`gpt-tokenizer/encoding/${encodingName}`); + const countFn = mod.countTokens as CountTokensFn; + encodingModules.set(encodingName, countFn); - const endTime = process.hrtime.bigint(); - const initTime = Number(endTime - startTime) / 1e6; // Convert to milliseconds + const endTime = process.hrtime.bigint(); + const initTime = Number(endTime - startTime) / 1e6; + logger.debug(`TokenCounter initialization for ${encodingName} took ${initTime.toFixed(2)}ms`); - logger.debug(`TokenCounter initialization took ${initTime.toFixed(2)}ms`); + return countFn; +}; + +export class TokenCounter { + private countFn: CountTokensFn | null = null; + private readonly encodingName: TokenEncoding; + + constructor(encodingName: TokenEncoding) { + this.encodingName = encodingName; + } + + async init(): Promise { + this.countFn = await loadEncoding(this.encodingName); } public countTokens(content: string, filePath?: string): number { + if (!this.countFn) { + throw new Error('TokenCounter not initialized. Call init() first.'); + } + try { - // Disable special token validation to handle files that may contain - // special token sequences (e.g., tokenizer configs with <|endoftext|>). - // This treats special tokens as ordinary text rather than control tokens, - // which is appropriate for general code/text analysis where we're not - // actually sending the content to an LLM API. - return this.encoding.encode(content, [], []).length; + // Use PLAIN_TEXT_OPTIONS to treat all content as ordinary text, + // matching the old tiktoken behavior: encode(content, [], []).length + // This also skips gpt-tokenizer's default regex scan for special tokens. + return this.countFn(content, PLAIN_TEXT_OPTIONS); } catch (error) { let message = ''; if (error instanceof Error) { @@ -42,7 +78,6 @@ export class TokenCounter { } } - public free(): void { - this.encoding.free(); - } + // No-op: gpt-tokenizer is pure JS, no WASM resources to free + public free(): void {} } diff --git a/src/core/metrics/calculateOutputMetrics.ts b/src/core/metrics/calculateOutputMetrics.ts index ad41ae918..1be4f57d5 100644 --- a/src/core/metrics/calculateOutputMetrics.ts +++ b/src/core/metrics/calculateOutputMetrics.ts @@ -1,6 +1,6 @@ -import type { TiktokenEncoding } from 'tiktoken'; import { logger } from '../../shared/logger.js'; import type { TaskRunner } from '../../shared/processConcurrency.js'; +import type { TokenEncoding } from './TokenCounter.js'; import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; const CHUNK_SIZE = 1000; @@ -8,7 +8,7 @@ const MIN_CONTENT_LENGTH_FOR_PARALLEL = 1_000_000; // 1000KB export const calculateOutputMetrics = async ( content: string, - encoding: TiktokenEncoding, + encoding: TokenEncoding, path: string | undefined, deps: { taskRunner: TaskRunner }, ): Promise => { diff --git a/src/core/metrics/calculateSelectiveFileMetrics.ts b/src/core/metrics/calculateSelectiveFileMetrics.ts index 02f52726a..36d0cb980 100644 --- a/src/core/metrics/calculateSelectiveFileMetrics.ts +++ b/src/core/metrics/calculateSelectiveFileMetrics.ts @@ -1,16 +1,16 @@ import pc from 'picocolors'; -import type { TiktokenEncoding } from 'tiktoken'; import { logger } from '../../shared/logger.js'; import type { TaskRunner } from '../../shared/processConcurrency.js'; import type { RepomixProgressCallback } from '../../shared/types.js'; import type { ProcessedFile } from '../file/fileTypes.js'; +import type { TokenEncoding } from './TokenCounter.js'; import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; import type { FileMetrics } from './workers/types.js'; export const calculateSelectiveFileMetrics = async ( processedFiles: ProcessedFile[], targetFilePaths: string[], - tokenCounterEncoding: TiktokenEncoding, + tokenCounterEncoding: TokenEncoding, progressCallback: RepomixProgressCallback, deps: { taskRunner: TaskRunner }, ): Promise => { diff --git a/src/core/metrics/tokenCounterFactory.ts b/src/core/metrics/tokenCounterFactory.ts index 8f51f0ba5..de4ebd8d9 100644 --- a/src/core/metrics/tokenCounterFactory.ts +++ b/src/core/metrics/tokenCounterFactory.ts @@ -1,18 +1,18 @@ -import type { TiktokenEncoding } from 'tiktoken'; import { logger } from '../../shared/logger.js'; -import { TokenCounter } from './TokenCounter.js'; +import { TokenCounter, type TokenEncoding } from './TokenCounter.js'; // Worker-level cache for TokenCounter instances by encoding -const tokenCounters = new Map(); +const tokenCounters = new Map(); /** * Get or create a TokenCounter instance for the given encoding. * This ensures only one TokenCounter exists per encoding per worker thread to optimize memory usage. */ -export const getTokenCounter = (encoding: TiktokenEncoding): TokenCounter => { +export const getTokenCounter = async (encoding: TokenEncoding): Promise => { let tokenCounter = tokenCounters.get(encoding); if (!tokenCounter) { tokenCounter = new TokenCounter(encoding); + await tokenCounter.init(); tokenCounters.set(encoding, tokenCounter); } return tokenCounter; @@ -20,7 +20,7 @@ export const getTokenCounter = (encoding: TiktokenEncoding): TokenCounter => { /** * Free all TokenCounter resources and clear the cache. - * This should be called when the worker is terminating. + * No-op for gpt-tokenizer (pure JS), but kept for API compatibility. */ export const freeTokenCounters = (): void => { for (const [encoding, tokenCounter] of tokenCounters.entries()) { diff --git a/src/core/metrics/workers/calculateMetricsWorker.ts b/src/core/metrics/workers/calculateMetricsWorker.ts index 241af02e0..99729f474 100644 --- a/src/core/metrics/workers/calculateMetricsWorker.ts +++ b/src/core/metrics/workers/calculateMetricsWorker.ts @@ -1,12 +1,12 @@ -import type { TiktokenEncoding } from 'tiktoken'; import { logger, setLogLevelByWorkerData } from '../../../shared/logger.js'; +import type { TokenEncoding } from '../TokenCounter.js'; import { freeTokenCounters, getTokenCounter } from '../tokenCounterFactory.js'; /** * Simple token counting worker for metrics calculation. * * This worker provides a focused interface for counting tokens from text content, - * using the Tiktoken encoding. All complex metric calculation logic is handled + * using gpt-tokenizer. All complex metric calculation logic is handled * by the calling side to maintain separation of concerns. */ @@ -16,7 +16,7 @@ setLogLevelByWorkerData(); export interface TokenCountTask { content: string; - encoding: TiktokenEncoding; + encoding: TokenEncoding; path?: string; } @@ -24,7 +24,7 @@ export const countTokens = async (task: TokenCountTask): Promise => { const processStartAt = process.hrtime.bigint(); try { - const counter = getTokenCounter(task.encoding); + const counter = await getTokenCounter(task.encoding); const tokenCount = counter.countTokens(task.content, task.path); logger.trace(`Counted tokens. Count: ${tokenCount}. Took: ${getProcessDuration(processStartAt)}ms`); diff --git a/tests/core/metrics/TokenCounter.test.ts b/tests/core/metrics/TokenCounter.test.ts index dedc8dbcf..b320036f2 100644 --- a/tests/core/metrics/TokenCounter.test.ts +++ b/tests/core/metrics/TokenCounter.test.ts @@ -1,33 +1,14 @@ -import { get_encoding, type Tiktoken } from 'tiktoken'; -import { afterEach, beforeEach, describe, expect, type Mock, test, vi } from 'vitest'; +import { afterEach, beforeEach, describe, expect, test, vi } from 'vitest'; import { TokenCounter } from '../../../src/core/metrics/TokenCounter.js'; -import { logger } from '../../../src/shared/logger.js'; - -vi.mock('tiktoken', () => ({ - get_encoding: vi.fn(), -})); vi.mock('../../../src/shared/logger'); describe('TokenCounter', () => { let tokenCounter: TokenCounter; - let mockEncoder: { - encode: Mock; - free: Mock; - }; - - beforeEach(() => { - // Initialize mock encoder - mockEncoder = { - encode: vi.fn(), - free: vi.fn(), - }; - - // Setup mock encoder behavior - vi.mocked(get_encoding).mockReturnValue(mockEncoder as unknown as Tiktoken); - - // Create new TokenCounter instance + + beforeEach(async () => { tokenCounter = new TokenCounter('o200k_base'); + await tokenCounter.init(); }); afterEach(() => { @@ -35,61 +16,29 @@ describe('TokenCounter', () => { vi.resetAllMocks(); }); - test('should initialize with o200k_base encoding', () => { - expect(get_encoding).toHaveBeenCalledWith('o200k_base'); - }); - test('should correctly count tokens for simple text', () => { - const text = 'Hello, world!'; - const mockTokens = [123, 456, 789]; // Example token IDs - mockEncoder.encode.mockReturnValue(mockTokens); - - const count = tokenCounter.countTokens(text); - - expect(count).toBe(3); // Length of mockTokens - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + const count = tokenCounter.countTokens('Hello, world!'); + expect(count).toBe(4); }); test('should handle empty string', () => { - mockEncoder.encode.mockReturnValue([]); - const count = tokenCounter.countTokens(''); - expect(count).toBe(0); - expect(mockEncoder.encode).toHaveBeenCalledWith('', [], []); }); test('should handle multi-line text', () => { - const text = 'Line 1\nLine 2\nLine 3'; - const mockTokens = [1, 2, 3, 4, 5, 6]; - mockEncoder.encode.mockReturnValue(mockTokens); - - const count = tokenCounter.countTokens(text); - - expect(count).toBe(6); - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + const count = tokenCounter.countTokens('Line 1\nLine 2\nLine 3'); + expect(count).toBe(11); }); test('should handle special characters', () => { - const text = '!@#$%^&*()_+'; - const mockTokens = [1, 2, 3]; - mockEncoder.encode.mockReturnValue(mockTokens); - - const count = tokenCounter.countTokens(text); - - expect(count).toBe(3); - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + const count = tokenCounter.countTokens('!@#$%^&*()_+'); + expect(count).toBe(9); }); test('should handle unicode characters', () => { - const text = '你好,世界!🌍'; - const mockTokens = [1, 2, 3, 4]; - mockEncoder.encode.mockReturnValue(mockTokens); - - const count = tokenCounter.countTokens(text); - - expect(count).toBe(4); - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + const count = tokenCounter.countTokens('你好,世界!🌍'); + expect(count).toBe(6); }); test('should handle code snippets', () => { @@ -98,13 +47,8 @@ describe('TokenCounter', () => { console.log("Hello, world!"); } `; - const mockTokens = Array(10).fill(1); // 10 tokens - mockEncoder.encode.mockReturnValue(mockTokens); - const count = tokenCounter.countTokens(text); - - expect(count).toBe(10); - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + expect(count).toBe(17); }); test('should handle markdown text', () => { @@ -116,52 +60,38 @@ describe('TokenCounter', () => { **Bold text** and _italic text_ `; - const mockTokens = Array(15).fill(1); // 15 tokens - mockEncoder.encode.mockReturnValue(mockTokens); - const count = tokenCounter.countTokens(text); - - expect(count).toBe(15); - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + expect(count).toBe(35); }); test('should handle very long text', () => { const text = 'a'.repeat(10000); - const mockTokens = Array(100).fill(1); // 100 tokens - mockEncoder.encode.mockReturnValue(mockTokens); - const count = tokenCounter.countTokens(text); + expect(count).toBe(1250); + }); - expect(count).toBe(100); - expect(mockEncoder.encode).toHaveBeenCalledWith(text, [], []); + test('should handle special token sequences as plain text', () => { + // gpt-tokenizer should treat <|endoftext|> as ordinary text, not a control token + const count = tokenCounter.countTokens('Hello <|endoftext|> world'); + expect(count).toBeGreaterThan(0); }); - test('should properly handle encoding errors without file path', () => { - const error = new Error('Encoding error'); - mockEncoder.encode.mockImplementation(() => { - throw error; - }); + test('should work with cl100k_base encoding', async () => { + const cl100kCounter = new TokenCounter('cl100k_base'); + await cl100kCounter.init(); - const count = tokenCounter.countTokens('test content'); + const count = cl100kCounter.countTokens('Hello, world!'); + expect(count).toBe(4); - expect(count).toBe(0); - expect(logger.warn).toHaveBeenCalledWith('Failed to count tokens. error: Encoding error'); + cl100kCounter.free(); }); - test('should properly handle encoding errors with file path', () => { - const error = new Error('Encoding error'); - mockEncoder.encode.mockImplementation(() => { - throw error; - }); - - const count = tokenCounter.countTokens('test content', 'test.txt'); - - expect(count).toBe(0); - expect(logger.warn).toHaveBeenCalledWith('Failed to count tokens. path: test.txt, error: Encoding error'); + test('should throw when countTokens is called before init', () => { + const uninitCounter = new TokenCounter('o200k_base'); + expect(() => uninitCounter.countTokens('test')).toThrow('TokenCounter not initialized'); }); - test('should free encoder resources on cleanup', () => { - tokenCounter.free(); - expect(mockEncoder.free).toHaveBeenCalled(); + test('should free without error (no-op for gpt-tokenizer)', () => { + expect(() => tokenCounter.free()).not.toThrow(); }); }); diff --git a/tests/core/metrics/calculateMetrics.test.ts b/tests/core/metrics/calculateMetrics.test.ts index 672cd8ee1..b0ea0b3ad 100644 --- a/tests/core/metrics/calculateMetrics.test.ts +++ b/tests/core/metrics/calculateMetrics.test.ts @@ -8,6 +8,7 @@ import { createMockConfig } from '../../testing/testUtils.js'; vi.mock('../../../src/core/metrics/TokenCounter.js', () => { return { + TOKEN_ENCODINGS: ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'], TokenCounter: vi.fn().mockImplementation(() => ({ countTokens: vi.fn().mockReturnValue(10), free: vi.fn(), diff --git a/tests/core/metrics/diffTokenCount.test.ts b/tests/core/metrics/diffTokenCount.test.ts index dd5612841..68f77a5d8 100644 --- a/tests/core/metrics/diffTokenCount.test.ts +++ b/tests/core/metrics/diffTokenCount.test.ts @@ -7,6 +7,7 @@ import { createMockConfig } from '../../testing/testUtils.js'; // Mock the TokenCounter vi.mock('../../../src/core/metrics/TokenCounter.js', () => ({ + TOKEN_ENCODINGS: ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'], TokenCounter: vi.fn(), })); diff --git a/tests/core/packager.test.ts b/tests/core/packager.test.ts index f54bc94a3..b09582e2d 100644 --- a/tests/core/packager.test.ts +++ b/tests/core/packager.test.ts @@ -7,6 +7,7 @@ vi.mock('node:fs/promises'); vi.mock('fs/promises'); vi.mock('../../src/core/metrics/TokenCounter.js', () => { return { + TOKEN_ENCODINGS: ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'], TokenCounter: vi.fn().mockImplementation(() => ({ countTokens: vi.fn().mockReturnValue(10), free: vi.fn(),