Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
"fast-xml-builder": "^1.1.4",
"git-url-parse": "^16.1.0",
"globby": "^16.1.1",
"gpt-tokenizer": "^3.4.0",
"handlebars": "^4.7.8",
"iconv-lite": "^0.7.0",
"is-binary-path": "^3.0.0",
Expand All @@ -96,7 +97,6 @@
"picocolors": "^1.1.1",
"picospinner": "^3.0.0",
"tar": "^7.5.12",
"tiktoken": "^1.0.22",
"tinyclip": "^0.1.12",
"tinypool": "^2.1.0",
"web-tree-sitter": "^0.26.7",
Expand Down
7 changes: 2 additions & 5 deletions src/config/configSchema.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { TiktokenEncoding } from 'tiktoken';
import { z } from 'zod';
import { TOKEN_ENCODINGS } from '../core/metrics/TokenCounter.js';

// Output style enum
export const repomixOutputStyleSchema = z.enum(['xml', 'markdown', 'json', 'plain']);
Expand Down Expand Up @@ -122,10 +122,7 @@ export const repomixConfigDefaultSchema = z.object({
enableSecurityCheck: z.boolean().default(true),
}),
tokenCount: z.object({
encoding: z
.string()
.default('o200k_base')
.transform((val) => val as TiktokenEncoding),
encoding: z.enum(TOKEN_ENCODINGS).default('o200k_base'),
}),
});

Expand Down
107 changes: 81 additions & 26 deletions src/core/metrics/TokenCounter.ts
Original file line number Diff line number Diff line change
@@ -1,48 +1,103 @@
import { get_encoding, type Tiktoken, type TiktokenEncoding } from 'tiktoken';
import { logger } from '../../shared/logger.js';

export class TokenCounter {
private encoding: Tiktoken;
/**
 * Supported token encoding names (compatible with tiktoken encoding names).
 * Declared `as const` so the tuple can drive both runtime validation and the
 * `TokenEncoding` union type.
 */
export const TOKEN_ENCODINGS = ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'] as const;

/** Union of the encoding names listed in `TOKEN_ENCODINGS`. */
export type TokenEncoding = (typeof TOKEN_ENCODINGS)[number];

/** Options passed through to an encoding module's `countTokens` function. */
interface CountTokensOptions {
  /** Special-token strings to disallow; an empty set disallows nothing. */
  disallowedSpecial?: Set<string>;
}

/** Signature of the token-counting function taken from an encoding module. */
type CountTokensFn = (text: string, options?: CountTokensOptions) => number;

// Treat all text as regular content by disallowing nothing.
// This matches the old tiktoken behavior: encode(content, [], []).length
// where special tokens like <|endoftext|> are tokenized as ordinary text.
// Also faster than the default (disallowedSpecial='all') because it skips
// the regex scan for special token patterns entirely.
const PLAIN_TEXT_OPTIONS: CountTokensOptions = { disallowedSpecial: new Set<string>() };

// Lazy-loaded countTokens functions keyed by encoding.
// The key type is TokenEncoding rather than string so that only supported
// encoding names can ever be used to read or populate the cache.
const encodingModules = new Map<TokenEncoding, CountTokensFn>();

constructor(encodingName: TiktokenEncoding) {
const startTime = process.hrtime.bigint();
const loadEncoding = async (encodingName: TokenEncoding): Promise<CountTokensFn> => {
const cached = encodingModules.get(encodingName);
if (cached) {
return cached;
}

const startTime = process.hrtime.bigint();

// Dynamic import of the specific encoding module from gpt-tokenizer
const mod = await import(`gpt-tokenizer/encoding/${encodingName}`);
const countFn = mod.countTokens as CountTokensFn;
encodingModules.set(encodingName, countFn);
Comment thread
yamadashy marked this conversation as resolved.

// Setup encoding with the specified model
this.encoding = get_encoding(encodingName);
const endTime = process.hrtime.bigint();
const initTime = Number(endTime - startTime) / 1e6;
logger.debug(`TokenCounter initialization for ${encodingName} took ${initTime.toFixed(2)}ms`);

return countFn;
};

export class TokenCounter {
private countFn: CountTokensFn | null = null;
private readonly encodingName: TokenEncoding;

const endTime = process.hrtime.bigint();
const initTime = Number(endTime - startTime) / 1e6; // Convert to milliseconds
constructor(encodingName: TokenEncoding) {
this.encodingName = encodingName;
}

logger.debug(`TokenCounter initialization took ${initTime.toFixed(2)}ms`);
async init(): Promise<void> {
this.countFn = await loadEncoding(this.encodingName);
}

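/**
* Count tokens using gpt-tokenizer's default config (fast path).
* Content containing special token sequences like <|endoftext|> cannot be
* counted on this path: the failure is caught, a warning is logged, and 0 is
* returned. Callers that get 0 for non-empty content should retry with
* countTokensPlainText(), which handles such content correctly.
*/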
public countTokens(content: string, filePath?: string): number {
if (!this.countFn) {
throw new Error('TokenCounter not initialized. Call init() first.');
}

try {
// Disable special token validation to handle files that may contain
// special token sequences (e.g., tokenizer configs with <|endoftext|>).
// This treats special tokens as ordinary text rather than control tokens,
// which is appropriate for general code/text analysis where we're not
// actually sending the content to an LLM API.
return this.encoding.encode(content, [], []).length;
} catch (error) {
let message = '';
if (error instanceof Error) {
message = error.message;
return this.countFn(content);
Comment thread
yamadashy marked this conversation as resolved.
} catch {
if (filePath) {
logger.warn(`Failed to count tokens. path: ${filePath}`);
} else {
message = String(error);
logger.warn('Failed to count tokens.');
}

return 0;
}
}

/**
* Count tokens treating all content as plain text (no special token checking).
* Matches tiktoken's encode(content, [], []) behavior where special tokens
* like <|endoftext|> are tokenized as ordinary text.
*/
public countTokensPlainText(content: string, filePath?: string): number {
if (!this.countFn) {
throw new Error('TokenCounter not initialized. Call init() first.');
}

try {
return this.countFn(content, PLAIN_TEXT_OPTIONS);
} catch {
if (filePath) {
logger.warn(`Failed to count tokens. path: ${filePath}, error: ${message}`);
logger.warn(`Failed to count tokens. path: ${filePath}`);
} else {
logger.warn(`Failed to count tokens. error: ${message}`);
logger.warn('Failed to count tokens.');
}

return 0;
}
Comment thread
devin-ai-integration[bot] marked this conversation as resolved.
}
Comment thread
yamadashy marked this conversation as resolved.

public free(): void {
this.encoding.free();
}
// No-op: gpt-tokenizer is pure JS, no WASM resources to free
public free(): void {}
}
42 changes: 21 additions & 21 deletions src/core/metrics/calculateGitDiffMetrics.ts
Original file line number Diff line number Diff line change
@@ -1,52 +1,52 @@
import type { RepomixConfigMerged } from '../../config/configSchema.js';
import { logger } from '../../shared/logger.js';
import type { TaskRunner } from '../../shared/processConcurrency.js';
import type { GitDiffResult } from '../git/gitDiffHandle.js';
import type { TokenCountTask } from './workers/calculateMetricsWorker.js';
import { getTokenCounter } from './tokenCounterFactory.js';

// Default dependencies for calculateGitDiffMetrics. Callers may override
// individual entries through the function's optional `deps` argument; any
// entry not supplied falls back to the value defined here.
const defaultDeps = {
getTokenCounter,
};

/**
* Calculate token count for git diffs if included
*/
export const calculateGitDiffMetrics = async (
config: RepomixConfigMerged,
gitDiffResult: GitDiffResult | undefined,
deps: { taskRunner: TaskRunner<TokenCountTask, number> },
deps: Partial<typeof defaultDeps> = {},
): Promise<number> => {
if (!config.output.git?.includeDiffs || !gitDiffResult) {
return 0;
}

// Check if we have any diff content to process
if (!gitDiffResult.workTreeDiffContent && !gitDiffResult.stagedDiffContent) {
return 0;
}

const resolvedDeps = { ...defaultDeps, ...deps };

try {
const startTime = process.hrtime.bigint();
logger.trace('Starting git diff token calculation using worker');
logger.trace('Starting git diff token calculation on main thread');

const countPromises: Promise<number>[] = [];
const counter = await resolvedDeps.getTokenCounter(config.tokenCount.encoding);
let totalTokens = 0;

if (gitDiffResult.workTreeDiffContent) {
countPromises.push(
deps.taskRunner.run({
content: gitDiffResult.workTreeDiffContent,
encoding: config.tokenCount.encoding,
}),
);
let count = counter.countTokens(gitDiffResult.workTreeDiffContent);
if (count === 0 && gitDiffResult.workTreeDiffContent.length > 0) {
count = counter.countTokensPlainText(gitDiffResult.workTreeDiffContent);
}
totalTokens += count;
}
if (gitDiffResult.stagedDiffContent) {
countPromises.push(
deps.taskRunner.run({
content: gitDiffResult.stagedDiffContent,
encoding: config.tokenCount.encoding,
}),
);
let count = counter.countTokens(gitDiffResult.stagedDiffContent);
if (count === 0 && gitDiffResult.stagedDiffContent.length > 0) {
count = counter.countTokensPlainText(gitDiffResult.stagedDiffContent);
}
totalTokens += count;
}

const results = await Promise.all(countPromises);
const totalTokens = results.reduce((sum, count) => sum + count, 0);

const endTime = process.hrtime.bigint();
const duration = Number(endTime - startTime) / 1e6;
logger.trace(`Git diff token calculation completed in ${duration.toFixed(2)}ms`);
Expand Down
40 changes: 18 additions & 22 deletions src/core/metrics/calculateGitLogMetrics.ts
Original file line number Diff line number Diff line change
@@ -1,51 +1,47 @@
import type { RepomixConfigMerged } from '../../config/configSchema.js';
import { logger } from '../../shared/logger.js';
import type { TaskRunner } from '../../shared/processConcurrency.js';
import type { GitLogResult } from '../git/gitLogHandle.js';
import type { TokenCountTask } from './workers/calculateMetricsWorker.js';
import { getTokenCounter } from './tokenCounterFactory.js';

// Default dependencies for calculateGitLogMetrics. Callers may override
// individual entries through the function's optional `deps` argument; any
// entry not supplied falls back to the value defined here.
const defaultDeps = {
getTokenCounter,
};

/**
* Calculate token count for git logs if included
*/
export const calculateGitLogMetrics = async (
config: RepomixConfigMerged,
gitLogResult: GitLogResult | undefined,
deps: { taskRunner: TaskRunner<TokenCountTask, number> },
deps: Partial<typeof defaultDeps> = {},
): Promise<{ gitLogTokenCount: number }> => {
// Return zero token count if git logs are disabled or no result
if (!config.output.git?.includeLogs || !gitLogResult) {
return {
gitLogTokenCount: 0,
};
return { gitLogTokenCount: 0 };
}

// Return zero token count if no git log content
if (!gitLogResult.logContent) {
return {
gitLogTokenCount: 0,
};
return { gitLogTokenCount: 0 };
}

const resolvedDeps = { ...defaultDeps, ...deps };

try {
const startTime = process.hrtime.bigint();
logger.trace('Starting git log token calculation using worker');
logger.trace('Starting git log token calculation on main thread');

const result = await deps.taskRunner.run({
content: gitLogResult.logContent,
encoding: config.tokenCount.encoding,
});
const counter = await resolvedDeps.getTokenCounter(config.tokenCount.encoding);
let result = counter.countTokens(gitLogResult.logContent);
if (result === 0 && gitLogResult.logContent.length > 0) {
result = counter.countTokensPlainText(gitLogResult.logContent);
}

const endTime = process.hrtime.bigint();
const duration = Number(endTime - startTime) / 1e6;
logger.trace(`Git log token calculation completed in ${duration.toFixed(2)}ms`);

return {
gitLogTokenCount: result,
};
return { gitLogTokenCount: result };
} catch (error) {
logger.error('Failed to calculate git log metrics:', error);
return {
gitLogTokenCount: 0,
};
return { gitLogTokenCount: 0 };
}
};
Loading
Loading