yamadashy · yamadashy · Mar 29, 2026 · Mar 28, 2026 · Mar 29, 2026
@@ -85,6 +85,7 @@
     "fast-xml-builder": "^1.1.4",
     "git-url-parse": "^16.1.0",
     "globby": "^16.1.1",
+    "gpt-tokenizer": "^3.4.0",
     "handlebars": "^4.7.8",
     "iconv-lite": "^0.7.0",
     "is-binary-path": "^3.0.0",
@@ -96,7 +97,6 @@
     "picocolors": "^1.1.1",
     "picospinner": "^3.0.0",
     "tar": "^7.5.12",
-    "tiktoken": "^1.0.22",
     "tinyclip": "^0.1.12",
     "tinypool": "^2.1.0",
     "web-tree-sitter": "^0.26.7",

@@ -1,5 +1,5 @@
-import type { TiktokenEncoding } from 'tiktoken';
 import { z } from 'zod';
+import { TOKEN_ENCODINGS } from '../core/metrics/TokenCounter.js';
 
 // Output style enum
 export const repomixOutputStyleSchema = z.enum(['xml', 'markdown', 'json', 'plain']);
@@ -122,10 +122,7 @@ export const repomixConfigDefaultSchema = z.object({
     enableSecurityCheck: z.boolean().default(true),
   }),
   tokenCount: z.object({
-    encoding: z
-      .string()
-      .default('o200k_base')
-      .transform((val) => val as TiktokenEncoding),
+    encoding: z.enum(TOKEN_ENCODINGS).default('o200k_base'),
   }),
 });
 

@@ -1,29 +1,65 @@
-import { get_encoding, type Tiktoken, type TiktokenEncoding } from 'tiktoken';
 import { logger } from '../../shared/logger.js';
 
-export class TokenCounter {
-  private encoding: Tiktoken;
+// Supported token encoding types (compatible with tiktoken encoding names)
+export const TOKEN_ENCODINGS = ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'] as const;
+// Union of the string literals listed in TOKEN_ENCODINGS.
+export type TokenEncoding = (typeof TOKEN_ENCODINGS)[number];
+
+// Options handed to a loaded countTokens function.
+// NOTE(review): presumably mirrors part of gpt-tokenizer's own option shape — verify against the library.
+interface CountTokensOptions {
+  disallowedSpecial?: Set<string>;
+}
+
+// Shape of a token-counting function: text plus optional options in, a number out.
+type CountTokensFn = (text: string, options?: CountTokensOptions) => number;
+
+// Treat all text as regular content by disallowing nothing.
+// This matches the old tiktoken behavior: encode(content, [], []).length
+// where special tokens like <|endoftext|> are tokenized as ordinary text.
+const PLAIN_TEXT_OPTIONS: CountTokensOptions = { disallowedSpecial: new Set() };
+
+// Lazy-loaded countTokens functions keyed by encoding
+// NOTE(review): the key type is `string`, but loadEncoding only stores TokenEncoding values —
+// consider narrowing the key type.
+const encodingModules = new Map<string, CountTokensFn>();
+
+/**
+ * Resolve the `countTokens` function for an encoding.
+ *
+ * Returns the function cached in `encodingModules` when present; otherwise imports
+ * `gpt-tokenizer/encoding/<encodingName>`, caches its `countTokens` export, and logs
+ * how long the load took at debug level.
+ *
+ * @param encodingName - Encoding whose module should be loaded.
+ * @returns The module's `countTokens` export, typed as `CountTokensFn`.
+ */
+const loadEncoding = async (encodingName: TokenEncoding): Promise<CountTokensFn> => {
+  const cached = encodingModules.get(encodingName);
+  if (cached) {
+    return cached;
+  }
 
-  constructor(encodingName: TiktokenEncoding) {
-    const startTime = process.hrtime.bigint();
+  // Only a cache miss reaches this point, so the timing below covers the first load per encoding.
+  const startTime = process.hrtime.bigint();
 
-    // Setup encoding with the specified model
-    this.encoding = get_encoding(encodingName);
+  // Dynamic import of the specific encoding module from gpt-tokenizer
+  // NOTE(review): the specifier is a template literal — confirm any bundler in use can resolve it.
+  const mod = await import(`gpt-tokenizer/encoding/${encodingName}`);
+  // NOTE(review): the export is cast without a runtime check — confirm every encoding module exposes countTokens.
+  const countFn = mod.countTokens as CountTokensFn;
+  encodingModules.set(encodingName, countFn);
 
-    const endTime = process.hrtime.bigint();
-    const initTime = Number(endTime - startTime) / 1e6; // Convert to milliseconds
+  const endTime = process.hrtime.bigint();
+  const initTime = Number(endTime - startTime) / 1e6; // Convert to milliseconds
+  logger.debug(`TokenCounter initialization for ${encodingName} took ${initTime.toFixed(2)}ms`);
 
-    logger.debug(`TokenCounter initialization took ${initTime.toFixed(2)}ms`);
+  return countFn;
+};
+
+export class TokenCounter {
+  private countFn: CountTokensFn | null = null;
+  private readonly encodingName: TokenEncoding;
+
+  constructor(encodingName: TokenEncoding) {
+    this.encodingName = encodingName;
+  }
+
+  async init(): Promise<void> {
+    this.countFn = await loadEncoding(this.encodingName);
   }
 
   public countTokens(content: string, filePath?: string): number {
+    if (!this.countFn) {
+      throw new Error('TokenCounter not initialized. Call init() first.');
+    }
+
     try {
-      // Disable special token validation to handle files that may contain
-      // special token sequences (e.g., tokenizer configs with <|endoftext|>).
-      // This treats special tokens as ordinary text rather than control tokens,
-      // which is appropriate for general code/text analysis where we're not
-      // actually sending the content to an LLM API.
-      return this.encoding.encode(content, [], []).length;
+      // Use PLAIN_TEXT_OPTIONS to treat all content as ordinary text,
+      // matching the old tiktoken behavior: encode(content, [], []).length
+      // This also skips gpt-tokenizer's default regex scan for special tokens.
+      return this.countFn(content, PLAIN_TEXT_OPTIONS);
     } catch (error) {
       let message = '';
       if (error instanceof Error) {
@@ -42,7 +78,6 @@ export class TokenCounter {
     }
   }
 
-  public free(): void {
-    this.encoding.free();
-  }
+  // No-op: gpt-tokenizer is pure JS, no WASM resources to free
+  public free(): void {}
 }
@@ -1,14 +1,14 @@
-import type { TiktokenEncoding } from 'tiktoken';
 import { logger } from '../../shared/logger.js';
 import type { TaskRunner } from '../../shared/processConcurrency.js';
+import type { TokenEncoding } from './TokenCounter.js';
 import type { TokenCountTask } from './workers/calculateMetricsWorker.js';
 
 const CHUNK_SIZE = 1000;
 const MIN_CONTENT_LENGTH_FOR_PARALLEL = 1_000_000; // 1000KB
 
 export const calculateOutputMetrics = async (
   content: string,
-  encoding: TiktokenEncoding,
+  encoding: TokenEncoding,
   path: string | undefined,
   deps: { taskRunner: TaskRunner<TokenCountTask, number> },
 ): Promise<number> => {

@@ -1,16 +1,16 @@
 import pc from 'picocolors';
-import type { TiktokenEncoding } from 'tiktoken';
 import { logger } from '../../shared/logger.js';
 import type { TaskRunner } from '../../shared/processConcurrency.js';
 import type { RepomixProgressCallback } from '../../shared/types.js';
 import type { ProcessedFile } from '../file/fileTypes.js';
+import type { TokenEncoding } from './TokenCounter.js';
 import type { TokenCountTask } from './workers/calculateMetricsWorker.js';
 import type { FileMetrics } from './workers/types.js';
 
 export const calculateSelectiveFileMetrics = async (
   processedFiles: ProcessedFile[],
   targetFilePaths: string[],
-  tokenCounterEncoding: TiktokenEncoding,
+  tokenCounterEncoding: TokenEncoding,
   progressCallback: RepomixProgressCallback,
   deps: { taskRunner: TaskRunner<TokenCountTask, number> },
 ): Promise<FileMetrics[]> => {

@@ -1,26 +1,26 @@
-import type { TiktokenEncoding } from 'tiktoken';
 import { logger } from '../../shared/logger.js';
-import { TokenCounter } from './TokenCounter.js';
+import { TokenCounter, type TokenEncoding } from './TokenCounter.js';
 
 // Worker-level cache for TokenCounter instances by encoding
-const tokenCounters = new Map<TiktokenEncoding, TokenCounter>();
+const tokenCounters = new Map<TokenEncoding, TokenCounter>();
 
 /**
  * Get or create a TokenCounter instance for the given encoding.
  * This ensures only one TokenCounter exists per encoding per worker thread to optimize memory usage.
+ *
+ * A newly created counter is initialized with `init()` before it is stored in the
+ * cache, so the returned instance is ready for `countTokens`.
+ *
+ * NOTE(review): the cache entry is written only after `await init()`; overlapping
+ * calls for the same encoding could each construct a counter — confirm whether
+ * tasks can interleave within a worker.
+ *
+ * @param encoding - Token encoding name, used as the cache key.
+ * @returns The cached or newly initialized TokenCounter for that encoding.
  */
-export const getTokenCounter = (encoding: TiktokenEncoding): TokenCounter => {
+export const getTokenCounter = async (encoding: TokenEncoding): Promise<TokenCounter> => {
   let tokenCounter = tokenCounters.get(encoding);
   if (!tokenCounter) {
     tokenCounter = new TokenCounter(encoding);
+    await tokenCounter.init();
     tokenCounters.set(encoding, tokenCounter);
   }
   return tokenCounter;
 };
 
 /**
  * Free all TokenCounter resources and clear the cache.
- * This should be called when the worker is terminating.
+ * Freeing is a no-op with gpt-tokenizer (pure JS); the function is kept for API compatibility.
  */
 export const freeTokenCounters = (): void => {
   for (const [encoding, tokenCounter] of tokenCounters.entries()) {

@@ -1,12 +1,12 @@
-import type { TiktokenEncoding } from 'tiktoken';
 import { logger, setLogLevelByWorkerData } from '../../../shared/logger.js';
+import type { TokenEncoding } from '../TokenCounter.js';
 import { freeTokenCounters, getTokenCounter } from '../tokenCounterFactory.js';
 
 /**
  * Simple token counting worker for metrics calculation.
  *
  * This worker provides a focused interface for counting tokens from text content,
- * using the Tiktoken encoding. All complex metric calculation logic is handled
+ * using gpt-tokenizer. All complex metric calculation logic is handled
  * by the calling side to maintain separation of concerns.
  */
 
@@ -16,15 +16,15 @@ setLogLevelByWorkerData();
 
+/** Input for one token-counting job processed by `countTokens`. */
 export interface TokenCountTask {
+  // Text whose tokens are counted.
   content: string;
-  encoding: TiktokenEncoding;
+  // Encoding used to look up the TokenCounter for this task.
+  encoding: TokenEncoding;
+  // Optional path forwarded to TokenCounter.countTokens as its filePath argument.
   path?: string;
 }
 
 export const countTokens = async (task: TokenCountTask): Promise<number> => {
   const processStartAt = process.hrtime.bigint();
 
   try {
-    const counter = getTokenCounter(task.encoding);
+    const counter = await getTokenCounter(task.encoding);
     const tokenCount = counter.countTokens(task.content, task.path);
 
     logger.trace(`Counted tokens. Count: ${tokenCount}. Took: ${getProcessDuration(processStartAt)}ms`);