-
-
Notifications
You must be signed in to change notification settings
Fork 1.2k
perf(core): Replace tiktoken WASM with gpt-tokenizer #1343
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
3c71199
perf(core): Replace tiktoken WASM with gpt-tokenizer and simplify met…
yamadashy 29abfe2
fix(core): Validate token encoding with Zod enum and remove ineffecti…
yamadashy 514b863
fix(core): Add special token fallback and fix tokenCounterFactory rac…
yamadashy 9b0ecbc
perf(core): Use disallowedSpecial instead of two-phase try/catch for …
yamadashy 450e1ae
perf(core): Use fast path without options for token counting
yamadashy 0f1a634
perf(core): Avoid V8 deoptimization from complex catch blocks in coun…
yamadashy fffc7c8
fix(core): Add p50k_edit encoding and special token fallback for file…
yamadashy 925eddb
fix(core): Use countTokensPlainText for git metrics and strengthen te…
yamadashy ee29cb6
fix(core): Use countTokensPlainText for output and worker token counting
yamadashy 4b096a6
refactor(core): Remove dead calculateMetricsWorker code
yamadashy 00e2c9f
perf(core): Use fast-path-with-retry for output and git token counting
yamadashy
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,48 +1,103 @@ | ||
| import { get_encoding, type Tiktoken, type TiktokenEncoding } from 'tiktoken'; | ||
| import { logger } from '../../shared/logger.js'; | ||
|
|
||
| export class TokenCounter { | ||
| private encoding: Tiktoken; | ||
| // Supported token encoding types (compatible with tiktoken encoding names) | ||
| export const TOKEN_ENCODINGS = ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'] as const; | ||
| export type TokenEncoding = (typeof TOKEN_ENCODINGS)[number]; | ||
|
|
||
| interface CountTokensOptions { | ||
| disallowedSpecial?: Set<string>; | ||
| } | ||
|
|
||
| type CountTokensFn = (text: string, options?: CountTokensOptions) => number; | ||
|
|
||
| // Treat all text as regular content by disallowing nothing. | ||
| // This matches the old tiktoken behavior: encode(content, [], []).length | ||
| // where special tokens like <|endoftext|> are tokenized as ordinary text. | ||
| // Also faster than the default (disallowedSpecial='all') because it skips | ||
| // the regex scan for special token patterns entirely. | ||
| const PLAIN_TEXT_OPTIONS: CountTokensOptions = { disallowedSpecial: new Set() }; | ||
|
|
||
| // Lazy-loaded countTokens functions keyed by encoding | ||
| const encodingModules = new Map<string, CountTokensFn>(); | ||
|
|
||
| constructor(encodingName: TiktokenEncoding) { | ||
| const startTime = process.hrtime.bigint(); | ||
| const loadEncoding = async (encodingName: TokenEncoding): Promise<CountTokensFn> => { | ||
| const cached = encodingModules.get(encodingName); | ||
| if (cached) { | ||
| return cached; | ||
| } | ||
|
|
||
| const startTime = process.hrtime.bigint(); | ||
|
|
||
| // Dynamic import of the specific encoding module from gpt-tokenizer | ||
| const mod = await import(`gpt-tokenizer/encoding/${encodingName}`); | ||
| const countFn = mod.countTokens as CountTokensFn; | ||
| encodingModules.set(encodingName, countFn); | ||
|
|
||
| // Setup encoding with the specified model | ||
| this.encoding = get_encoding(encodingName); | ||
| const endTime = process.hrtime.bigint(); | ||
| const initTime = Number(endTime - startTime) / 1e6; | ||
| logger.debug(`TokenCounter initialization for ${encodingName} took ${initTime.toFixed(2)}ms`); | ||
|
|
||
| return countFn; | ||
| }; | ||
|
|
||
| export class TokenCounter { | ||
| private countFn: CountTokensFn | null = null; | ||
| private readonly encodingName: TokenEncoding; | ||
|
|
||
| const endTime = process.hrtime.bigint(); | ||
| const initTime = Number(endTime - startTime) / 1e6; // Convert to milliseconds | ||
| constructor(encodingName: TokenEncoding) { | ||
| this.encodingName = encodingName; | ||
| } | ||
|
|
||
| logger.debug(`TokenCounter initialization took ${initTime.toFixed(2)}ms`); | ||
| async init(): Promise<void> { | ||
| this.countFn = await loadEncoding(this.encodingName); | ||
| } | ||
|
|
||
| /** | ||
| * Count tokens using gpt-tokenizer's default config (fast path). | ||
| * Files containing special token sequences like <|endoftext|> will return 0. | ||
| * Use countTokensPlainText() to handle such files correctly. | ||
| */ | ||
| public countTokens(content: string, filePath?: string): number { | ||
| if (!this.countFn) { | ||
| throw new Error('TokenCounter not initialized. Call init() first.'); | ||
| } | ||
|
|
||
| try { | ||
| // Disable special token validation to handle files that may contain | ||
| // special token sequences (e.g., tokenizer configs with <|endoftext|>). | ||
| // This treats special tokens as ordinary text rather than control tokens, | ||
| // which is appropriate for general code/text analysis where we're not | ||
| // actually sending the content to an LLM API. | ||
| return this.encoding.encode(content, [], []).length; | ||
| } catch (error) { | ||
| let message = ''; | ||
| if (error instanceof Error) { | ||
| message = error.message; | ||
| return this.countFn(content); | ||
|
yamadashy marked this conversation as resolved.
|
||
| } catch { | ||
| if (filePath) { | ||
| logger.warn(`Failed to count tokens. path: ${filePath}`); | ||
| } else { | ||
| message = String(error); | ||
| logger.warn('Failed to count tokens.'); | ||
| } | ||
|
|
||
| return 0; | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Count tokens treating all content as plain text (no special token checking). | ||
| * Matches tiktoken's encode(content, [], []) behavior where special tokens | ||
| * like <|endoftext|> are tokenized as ordinary text. | ||
| */ | ||
| public countTokensPlainText(content: string, filePath?: string): number { | ||
| if (!this.countFn) { | ||
| throw new Error('TokenCounter not initialized. Call init() first.'); | ||
| } | ||
|
|
||
| try { | ||
| return this.countFn(content, PLAIN_TEXT_OPTIONS); | ||
| } catch { | ||
| if (filePath) { | ||
| logger.warn(`Failed to count tokens. path: ${filePath}, error: ${message}`); | ||
| logger.warn(`Failed to count tokens. path: ${filePath}`); | ||
| } else { | ||
| logger.warn(`Failed to count tokens. error: ${message}`); | ||
| logger.warn('Failed to count tokens.'); | ||
| } | ||
|
|
||
| return 0; | ||
| } | ||
|
devin-ai-integration[bot] marked this conversation as resolved.
|
||
| } | ||
|
yamadashy marked this conversation as resolved.
|
||
|
|
||
| public free(): void { | ||
| this.encoding.free(); | ||
| } | ||
| // No-op: gpt-tokenizer is pure JS, no WASM resources to free | ||
| public free(): void {} | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,51 +1,47 @@ | ||
| import type { RepomixConfigMerged } from '../../config/configSchema.js'; | ||
| import { logger } from '../../shared/logger.js'; | ||
| import type { TaskRunner } from '../../shared/processConcurrency.js'; | ||
| import type { GitLogResult } from '../git/gitLogHandle.js'; | ||
| import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; | ||
| import { getTokenCounter } from './tokenCounterFactory.js'; | ||
|
|
||
| const defaultDeps = { | ||
| getTokenCounter, | ||
| }; | ||
|
|
||
| /** | ||
| * Calculate token count for git logs if included | ||
| */ | ||
| export const calculateGitLogMetrics = async ( | ||
| config: RepomixConfigMerged, | ||
| gitLogResult: GitLogResult | undefined, | ||
| deps: { taskRunner: TaskRunner<TokenCountTask, number> }, | ||
| deps: Partial<typeof defaultDeps> = {}, | ||
| ): Promise<{ gitLogTokenCount: number }> => { | ||
| // Return zero token count if git logs are disabled or no result | ||
| if (!config.output.git?.includeLogs || !gitLogResult) { | ||
| return { | ||
| gitLogTokenCount: 0, | ||
| }; | ||
| return { gitLogTokenCount: 0 }; | ||
| } | ||
|
|
||
| // Return zero token count if no git log content | ||
| if (!gitLogResult.logContent) { | ||
| return { | ||
| gitLogTokenCount: 0, | ||
| }; | ||
| return { gitLogTokenCount: 0 }; | ||
| } | ||
|
|
||
| const resolvedDeps = { ...defaultDeps, ...deps }; | ||
|
|
||
| try { | ||
| const startTime = process.hrtime.bigint(); | ||
| logger.trace('Starting git log token calculation using worker'); | ||
| logger.trace('Starting git log token calculation on main thread'); | ||
|
|
||
| const result = await deps.taskRunner.run({ | ||
| content: gitLogResult.logContent, | ||
| encoding: config.tokenCount.encoding, | ||
| }); | ||
| const counter = await resolvedDeps.getTokenCounter(config.tokenCount.encoding); | ||
| let result = counter.countTokens(gitLogResult.logContent); | ||
| if (result === 0 && gitLogResult.logContent.length > 0) { | ||
| result = counter.countTokensPlainText(gitLogResult.logContent); | ||
| } | ||
|
|
||
| const endTime = process.hrtime.bigint(); | ||
| const duration = Number(endTime - startTime) / 1e6; | ||
| logger.trace(`Git log token calculation completed in ${duration.toFixed(2)}ms`); | ||
|
|
||
| return { | ||
| gitLogTokenCount: result, | ||
| }; | ||
| return { gitLogTokenCount: result }; | ||
| } catch (error) { | ||
| logger.error('Failed to calculate git log metrics:', error); | ||
| return { | ||
| gitLogTokenCount: 0, | ||
| }; | ||
| return { gitLogTokenCount: 0 }; | ||
| } | ||
| }; |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.