-
-
Notifications
You must be signed in to change notification settings
Fork 1.2k
perf(core): Replace tiktoken WASM with gpt-tokenizer #1343
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
3c71199
perf(core): Replace tiktoken WASM with gpt-tokenizer and simplify met…
yamadashy 29abfe2
fix(core): Validate token encoding with Zod enum and remove ineffecti…
yamadashy 514b863
fix(core): Add special token fallback and fix tokenCounterFactory rac…
yamadashy 9b0ecbc
perf(core): Use disallowedSpecial instead of two-phase try/catch for …
yamadashy 450e1ae
perf(core): Use fast path without options for token counting
yamadashy 0f1a634
perf(core): Avoid V8 deoptimization from complex catch blocks in coun…
yamadashy fffc7c8
fix(core): Add p50k_edit encoding and special token fallback for file…
yamadashy 925eddb
fix(core): Use countTokensPlainText for git metrics and strengthen te…
yamadashy ee29cb6
fix(core): Use countTokensPlainText for output and worker token counting
yamadashy 4b096a6
refactor(core): Remove dead calculateMetricsWorker code
yamadashy 00e2c9f
perf(core): Use fast-path-with-retry for output and git token counting
yamadashy
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,48 +1,103 @@ | ||
| import { get_encoding, type Tiktoken, type TiktokenEncoding } from 'tiktoken'; | ||
| import { logger } from '../../shared/logger.js'; | ||
|
|
||
| export class TokenCounter { | ||
| private encoding: Tiktoken; | ||
| // Supported token encoding types (compatible with tiktoken encoding names) | ||
| export const TOKEN_ENCODINGS = ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'] as const; | ||
| export type TokenEncoding = (typeof TOKEN_ENCODINGS)[number]; | ||
|
|
||
| interface CountTokensOptions { | ||
| disallowedSpecial?: Set<string>; | ||
| } | ||
|
|
||
| type CountTokensFn = (text: string, options?: CountTokensOptions) => number; | ||
|
|
||
| // Treat all text as regular content by disallowing nothing. | ||
| // This matches the old tiktoken behavior: encode(content, [], []).length | ||
| // where special tokens like <|endoftext|> are tokenized as ordinary text. | ||
| // Also faster than the default (disallowedSpecial='all') because it skips | ||
| // the regex scan for special token patterns entirely. | ||
| const PLAIN_TEXT_OPTIONS: CountTokensOptions = { disallowedSpecial: new Set() }; | ||
|
|
||
| // Lazy-loaded countTokens functions keyed by encoding | ||
| const encodingModules = new Map<string, CountTokensFn>(); | ||
|
|
||
| constructor(encodingName: TiktokenEncoding) { | ||
| const startTime = process.hrtime.bigint(); | ||
| const loadEncoding = async (encodingName: TokenEncoding): Promise<CountTokensFn> => { | ||
| const cached = encodingModules.get(encodingName); | ||
| if (cached) { | ||
| return cached; | ||
| } | ||
|
|
||
| const startTime = process.hrtime.bigint(); | ||
|
|
||
| // Dynamic import of the specific encoding module from gpt-tokenizer | ||
| const mod = await import(`gpt-tokenizer/encoding/${encodingName}`); | ||
| const countFn = mod.countTokens as CountTokensFn; | ||
| encodingModules.set(encodingName, countFn); | ||
|
|
||
| // Setup encoding with the specified model | ||
| this.encoding = get_encoding(encodingName); | ||
| const endTime = process.hrtime.bigint(); | ||
| const initTime = Number(endTime - startTime) / 1e6; | ||
| logger.debug(`TokenCounter initialization for ${encodingName} took ${initTime.toFixed(2)}ms`); | ||
|
|
||
| return countFn; | ||
| }; | ||
|
|
||
| export class TokenCounter { | ||
| private countFn: CountTokensFn | null = null; | ||
| private readonly encodingName: TokenEncoding; | ||
|
|
||
| const endTime = process.hrtime.bigint(); | ||
| const initTime = Number(endTime - startTime) / 1e6; // Convert to milliseconds | ||
| constructor(encodingName: TokenEncoding) { | ||
| this.encodingName = encodingName; | ||
| } | ||
|
|
||
| logger.debug(`TokenCounter initialization took ${initTime.toFixed(2)}ms`); | ||
| async init(): Promise<void> { | ||
| this.countFn = await loadEncoding(this.encodingName); | ||
| } | ||
|
|
||
| /** | ||
| * Count tokens using gpt-tokenizer's default config (fast path). | ||
| * Files containing special token sequences like <|endoftext|> will return 0. | ||
| * Use countTokensPlainText() to handle such files correctly. | ||
| */ | ||
| public countTokens(content: string, filePath?: string): number { | ||
| if (!this.countFn) { | ||
| throw new Error('TokenCounter not initialized. Call init() first.'); | ||
| } | ||
|
|
||
| try { | ||
| // Disable special token validation to handle files that may contain | ||
| // special token sequences (e.g., tokenizer configs with <|endoftext|>). | ||
| // This treats special tokens as ordinary text rather than control tokens, | ||
| // which is appropriate for general code/text analysis where we're not | ||
| // actually sending the content to an LLM API. | ||
| return this.encoding.encode(content, [], []).length; | ||
| } catch (error) { | ||
| let message = ''; | ||
| if (error instanceof Error) { | ||
| message = error.message; | ||
| return this.countFn(content); | ||
|
yamadashy marked this conversation as resolved.
|
||
| } catch { | ||
| if (filePath) { | ||
| logger.warn(`Failed to count tokens. path: ${filePath}`); | ||
| } else { | ||
| message = String(error); | ||
| logger.warn('Failed to count tokens.'); | ||
| } | ||
|
|
||
| return 0; | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Count tokens treating all content as plain text (no special token checking). | ||
| * Matches tiktoken's encode(content, [], []) behavior where special tokens | ||
| * like <|endoftext|> are tokenized as ordinary text. | ||
| */ | ||
| public countTokensPlainText(content: string, filePath?: string): number { | ||
| if (!this.countFn) { | ||
| throw new Error('TokenCounter not initialized. Call init() first.'); | ||
| } | ||
|
|
||
| try { | ||
| return this.countFn(content, PLAIN_TEXT_OPTIONS); | ||
| } catch { | ||
| if (filePath) { | ||
| logger.warn(`Failed to count tokens. path: ${filePath}, error: ${message}`); | ||
| logger.warn(`Failed to count tokens. path: ${filePath}`); | ||
| } else { | ||
| logger.warn(`Failed to count tokens. error: ${message}`); | ||
| logger.warn('Failed to count tokens.'); | ||
| } | ||
|
|
||
| return 0; | ||
| } | ||
|
devin-ai-integration[bot] marked this conversation as resolved.
|
||
| } | ||
|
yamadashy marked this conversation as resolved.
|
||
|
|
||
| public free(): void { | ||
| this.encoding.free(); | ||
| } | ||
| // No-op: gpt-tokenizer is pure JS, no WASM resources to free | ||
| public free(): void {} | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,51 +1,47 @@ | ||
| import type { RepomixConfigMerged } from '../../config/configSchema.js'; | ||
| import { logger } from '../../shared/logger.js'; | ||
| import type { TaskRunner } from '../../shared/processConcurrency.js'; | ||
| import type { GitLogResult } from '../git/gitLogHandle.js'; | ||
| import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; | ||
| import { getTokenCounter } from './tokenCounterFactory.js'; | ||
|
|
||
| const defaultDeps = { | ||
| getTokenCounter, | ||
| }; | ||
|
|
||
| /** | ||
| * Calculate token count for git logs if included | ||
| */ | ||
| export const calculateGitLogMetrics = async ( | ||
| config: RepomixConfigMerged, | ||
| gitLogResult: GitLogResult | undefined, | ||
| deps: { taskRunner: TaskRunner<TokenCountTask, number> }, | ||
| deps: Partial<typeof defaultDeps> = {}, | ||
| ): Promise<{ gitLogTokenCount: number }> => { | ||
| // Return zero token count if git logs are disabled or no result | ||
| if (!config.output.git?.includeLogs || !gitLogResult) { | ||
| return { | ||
| gitLogTokenCount: 0, | ||
| }; | ||
| return { gitLogTokenCount: 0 }; | ||
| } | ||
|
|
||
| // Return zero token count if no git log content | ||
| if (!gitLogResult.logContent) { | ||
| return { | ||
| gitLogTokenCount: 0, | ||
| }; | ||
| return { gitLogTokenCount: 0 }; | ||
| } | ||
|
|
||
| const resolvedDeps = { ...defaultDeps, ...deps }; | ||
|
|
||
| try { | ||
| const startTime = process.hrtime.bigint(); | ||
| logger.trace('Starting git log token calculation using worker'); | ||
| logger.trace('Starting git log token calculation on main thread'); | ||
|
|
||
| const result = await deps.taskRunner.run({ | ||
| content: gitLogResult.logContent, | ||
| encoding: config.tokenCount.encoding, | ||
| }); | ||
| const counter = await resolvedDeps.getTokenCounter(config.tokenCount.encoding); | ||
| let result = counter.countTokens(gitLogResult.logContent); | ||
| if (result === 0 && gitLogResult.logContent.length > 0) { | ||
| result = counter.countTokensPlainText(gitLogResult.logContent); | ||
| } | ||
|
|
||
| const endTime = process.hrtime.bigint(); | ||
| const duration = Number(endTime - startTime) / 1e6; | ||
| logger.trace(`Git log token calculation completed in ${duration.toFixed(2)}ms`); | ||
|
|
||
| return { | ||
| gitLogTokenCount: result, | ||
| }; | ||
| return { gitLogTokenCount: result }; | ||
| } catch (error) { | ||
| logger.error('Failed to calculate git log metrics:', error); | ||
| return { | ||
| gitLogTokenCount: 0, | ||
| }; | ||
| return { gitLogTokenCount: 0 }; | ||
| } | ||
| }; |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.