Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
"fast-xml-builder": "^1.1.4",
"git-url-parse": "^16.1.0",
"globby": "^16.1.1",
"gpt-tokenizer": "^3.4.0",
"handlebars": "^4.7.8",
"iconv-lite": "^0.7.0",
"is-binary-path": "^3.0.0",
Expand All @@ -96,7 +97,6 @@
"picocolors": "^1.1.1",
"picospinner": "^3.0.0",
"tar": "^7.5.12",
"tiktoken": "^1.0.22",
"tinyclip": "^0.1.12",
"tinypool": "^2.1.0",
"web-tree-sitter": "^0.26.7",
Expand Down
7 changes: 2 additions & 5 deletions src/config/configSchema.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { TiktokenEncoding } from 'tiktoken';
import { z } from 'zod';
import { TOKEN_ENCODINGS } from '../core/metrics/TokenCounter.js';

// Output style enum
export const repomixOutputStyleSchema = z.enum(['xml', 'markdown', 'json', 'plain']);
Expand Down Expand Up @@ -122,10 +122,7 @@ export const repomixConfigDefaultSchema = z.object({
enableSecurityCheck: z.boolean().default(true),
}),
tokenCount: z.object({
encoding: z
.string()
.default('o200k_base')
.transform((val) => val as TiktokenEncoding),
encoding: z.enum(TOKEN_ENCODINGS).default('o200k_base'),
}),
});

Expand Down
73 changes: 54 additions & 19 deletions src/core/metrics/TokenCounter.ts
Original file line number Diff line number Diff line change
@@ -1,29 +1,65 @@
import { get_encoding, type Tiktoken, type TiktokenEncoding } from 'tiktoken';
import { logger } from '../../shared/logger.js';

export class TokenCounter {
private encoding: Tiktoken;
// Supported token encoding types (compatible with tiktoken encoding names)
export const TOKEN_ENCODINGS = ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'] as const;
export type TokenEncoding = (typeof TOKEN_ENCODINGS)[number];

interface CountTokensOptions {
disallowedSpecial?: Set<string>;
}

type CountTokensFn = (text: string, options?: CountTokensOptions) => number;

// Treat all text as regular content by disallowing nothing.
// This matches the old tiktoken behavior: encode(content, [], []).length
// where special tokens like <|endoftext|> are tokenized as ordinary text.
const PLAIN_TEXT_OPTIONS: CountTokensOptions = { disallowedSpecial: new Set() };

// Lazy-loaded countTokens functions keyed by encoding
const encodingModules = new Map<string, CountTokensFn>();

const loadEncoding = async (encodingName: TokenEncoding): Promise<CountTokensFn> => {
const cached = encodingModules.get(encodingName);
if (cached) {
return cached;
}

constructor(encodingName: TiktokenEncoding) {
const startTime = process.hrtime.bigint();
const startTime = process.hrtime.bigint();

// Setup encoding with the specified model
this.encoding = get_encoding(encodingName);
// Dynamic import of the specific encoding module from gpt-tokenizer
const mod = await import(`gpt-tokenizer/encoding/${encodingName}`);
const countFn = mod.countTokens as CountTokensFn;
encodingModules.set(encodingName, countFn);

const endTime = process.hrtime.bigint();
const initTime = Number(endTime - startTime) / 1e6; // Convert to milliseconds
const endTime = process.hrtime.bigint();
const initTime = Number(endTime - startTime) / 1e6;
logger.debug(`TokenCounter initialization for ${encodingName} took ${initTime.toFixed(2)}ms`);

logger.debug(`TokenCounter initialization took ${initTime.toFixed(2)}ms`);
return countFn;
};

export class TokenCounter {
private countFn: CountTokensFn | null = null;
private readonly encodingName: TokenEncoding;

constructor(encodingName: TokenEncoding) {
this.encodingName = encodingName;
}

async init(): Promise<void> {
this.countFn = await loadEncoding(this.encodingName);
}

public countTokens(content: string, filePath?: string): number {
if (!this.countFn) {
throw new Error('TokenCounter not initialized. Call init() first.');
}

try {
// Disable special token validation to handle files that may contain
// special token sequences (e.g., tokenizer configs with <|endoftext|>).
// This treats special tokens as ordinary text rather than control tokens,
// which is appropriate for general code/text analysis where we're not
// actually sending the content to an LLM API.
return this.encoding.encode(content, [], []).length;
// Use PLAIN_TEXT_OPTIONS to treat all content as ordinary text,
// matching the old tiktoken behavior: encode(content, [], []).length
// This also skips gpt-tokenizer's default regex scan for special tokens.
return this.countFn(content, PLAIN_TEXT_OPTIONS);
} catch (error) {
let message = '';
if (error instanceof Error) {
Expand All @@ -42,7 +78,6 @@ export class TokenCounter {
}
}

public free(): void {
this.encoding.free();
}
// No-op: gpt-tokenizer is pure JS, no WASM resources to free
public free(): void {}
}
4 changes: 2 additions & 2 deletions src/core/metrics/calculateOutputMetrics.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import type { TiktokenEncoding } from 'tiktoken';
import { logger } from '../../shared/logger.js';
import type { TaskRunner } from '../../shared/processConcurrency.js';
import type { TokenEncoding } from './TokenCounter.js';
import type { TokenCountTask } from './workers/calculateMetricsWorker.js';

const CHUNK_SIZE = 1000;
const MIN_CONTENT_LENGTH_FOR_PARALLEL = 1_000_000; // 1000KB

export const calculateOutputMetrics = async (
content: string,
encoding: TiktokenEncoding,
encoding: TokenEncoding,
path: string | undefined,
deps: { taskRunner: TaskRunner<TokenCountTask, number> },
): Promise<number> => {
Expand Down
4 changes: 2 additions & 2 deletions src/core/metrics/calculateSelectiveFileMetrics.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import pc from 'picocolors';
import type { TiktokenEncoding } from 'tiktoken';
import { logger } from '../../shared/logger.js';
import type { TaskRunner } from '../../shared/processConcurrency.js';
import type { RepomixProgressCallback } from '../../shared/types.js';
import type { ProcessedFile } from '../file/fileTypes.js';
import type { TokenEncoding } from './TokenCounter.js';
import type { TokenCountTask } from './workers/calculateMetricsWorker.js';
import type { FileMetrics } from './workers/types.js';

export const calculateSelectiveFileMetrics = async (
processedFiles: ProcessedFile[],
targetFilePaths: string[],
tokenCounterEncoding: TiktokenEncoding,
tokenCounterEncoding: TokenEncoding,
progressCallback: RepomixProgressCallback,
deps: { taskRunner: TaskRunner<TokenCountTask, number> },
): Promise<FileMetrics[]> => {
Expand Down
10 changes: 5 additions & 5 deletions src/core/metrics/tokenCounterFactory.ts
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
import type { TiktokenEncoding } from 'tiktoken';
import { logger } from '../../shared/logger.js';
import { TokenCounter } from './TokenCounter.js';
import { TokenCounter, type TokenEncoding } from './TokenCounter.js';

// Worker-level cache for TokenCounter instances by encoding
const tokenCounters = new Map<TiktokenEncoding, TokenCounter>();
const tokenCounters = new Map<TokenEncoding, TokenCounter>();

/**
* Get or create a TokenCounter instance for the given encoding.
* This ensures only one TokenCounter exists per encoding per worker thread to optimize memory usage.
*/
export const getTokenCounter = (encoding: TiktokenEncoding): TokenCounter => {
export const getTokenCounter = async (encoding: TokenEncoding): Promise<TokenCounter> => {
let tokenCounter = tokenCounters.get(encoding);
if (!tokenCounter) {
tokenCounter = new TokenCounter(encoding);
await tokenCounter.init();
tokenCounters.set(encoding, tokenCounter);
Comment thread
yamadashy marked this conversation as resolved.
}
return tokenCounter;
};

/**
* Free all TokenCounter resources and clear the cache.
* This should be called when the worker is terminating.
* No-op for gpt-tokenizer (pure JS), but kept for API compatibility.
*/
export const freeTokenCounters = (): void => {
for (const [encoding, tokenCounter] of tokenCounters.entries()) {
Expand Down
8 changes: 4 additions & 4 deletions src/core/metrics/workers/calculateMetricsWorker.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import type { TiktokenEncoding } from 'tiktoken';
import { logger, setLogLevelByWorkerData } from '../../../shared/logger.js';
import type { TokenEncoding } from '../TokenCounter.js';
import { freeTokenCounters, getTokenCounter } from '../tokenCounterFactory.js';

/**
* Simple token counting worker for metrics calculation.
*
* This worker provides a focused interface for counting tokens from text content,
* using the Tiktoken encoding. All complex metric calculation logic is handled
* using gpt-tokenizer. All complex metric calculation logic is handled
* by the calling side to maintain separation of concerns.
*/

Expand All @@ -16,15 +16,15 @@ setLogLevelByWorkerData();

export interface TokenCountTask {
content: string;
encoding: TiktokenEncoding;
encoding: TokenEncoding;
path?: string;
}

export const countTokens = async (task: TokenCountTask): Promise<number> => {
const processStartAt = process.hrtime.bigint();

try {
const counter = getTokenCounter(task.encoding);
const counter = await getTokenCounter(task.encoding);
const tokenCount = counter.countTokens(task.content, task.path);

logger.trace(`Counted tokens. Count: ${tokenCount}. Took: ${getProcessDuration(processStartAt)}ms`);
Expand Down
Loading
Loading