Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 117 additions & 28 deletions src/core/metrics/calculateOutputMetrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,48 +9,42 @@ import type { TokenCountTask } from './workers/calculateMetricsWorker.js';
const TARGET_CHARS_PER_CHUNK = 100_000;
const MIN_CONTENT_LENGTH_FOR_PARALLEL = 1_000_000; // 1MB

// Sampling constants for token count estimation on large outputs.
// Instead of full BPE tokenization, we sample evenly spaced portions and extrapolate.
// Threshold must be well above MIN_CONTENT_LENGTH_FOR_PARALLEL (1MB) so that sampling
// (10 worker calls) is significantly fewer than full parallel chunking (30+ chunks).
const OUTPUT_SAMPLING_THRESHOLD = 3_000_000; // 3MB - outputs below this are fully tokenized
const OUTPUT_SAMPLE_SIZE = 100_000; // 100KB per sample
const OUTPUT_SAMPLE_COUNT = 10; // Number of evenly spaced samples
// Maximum coefficient of variation allowed for sampling estimation.
// If per-sample chars/token ratios vary more than this (e.g. mixed CJK/ASCII content,
// or periodic structure resonating with the stride), fall back to full tokenization.
const SAMPLING_CV_THRESHOLD = 0.15;

export const calculateOutputMetrics = async (
content: string,
encoding: TokenEncoding,
path: string | undefined,
deps: { taskRunner: TaskRunner<TokenCountTask, number> },
): Promise<number> => {
const shouldRunInParallel = content.length > MIN_CONTENT_LENGTH_FOR_PARALLEL;

try {
logger.trace(`Starting output token count for ${path || 'output'}`);
const startTime = process.hrtime.bigint();

let result: number;

if (shouldRunInParallel) {
// Split content into chunks for parallel processing
const chunks: string[] = [];

for (let i = 0; i < content.length; i += TARGET_CHARS_PER_CHUNK) {
chunks.push(content.slice(i, i + TARGET_CHARS_PER_CHUNK));
if (content.length > OUTPUT_SAMPLING_THRESHOLD) {
// For very large outputs, try sampling estimation first
const estimated = await tryEstimateBySampling(content, encoding, path, deps);
if (estimated !== null) {
result = estimated;
} else {
// Sampling variance too high, fall back to full tokenization
result = await fullTokenize(content, encoding, path, deps);
}
Comment on lines +36 to 44
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Surface when this count is estimated.

This branch now returns an extrapolated value through the same Promise<number> contract, but src/core/metrics/calculateMetrics.ts still reduces these numbers into totalTokens as if they were exact. That silently changes the semantics of large-output metrics for every downstream caller. Please propagate estimate metadata or keep the exact path on the public result.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/core/metrics/calculateOutputMetrics.ts` around lines 34 - 42, The branch
in calculateOutputMetrics.ts that returns an extrapolated numeric value from
tryEstimateBySampling hides that the count is estimated; change the public
contract returned by calculateOutputMetrics (and values produced by
tryEstimateBySampling and fullTokenize) from a bare number to a small result
object such as { count: number, estimated: boolean } (or add an explicit
isEstimated flag) so callers can distinguish exact vs extrapolated counts,
update calculateMetrics.ts to aggregate using .count and to track/propagate
estimation metadata, and ensure fullTokenize returns estimated=false while
tryEstimateBySampling returns estimated=true when extrapolating.


// Process chunks in parallel
const chunkResults = await Promise.all(
chunks.map(async (chunk, index) => {
return deps.taskRunner.run({
content: chunk,
encoding,
path: path ? `${path}-chunk-${index}` : undefined,
});
}),
);

// Sum up the results
result = chunkResults.reduce((sum, count) => sum + count, 0);
} else {
// Process small content directly
result = await deps.taskRunner.run({
content,
encoding,
path,
});
// Standard path: full tokenization (parallel for > 1MB, direct for smaller)
result = await fullTokenize(content, encoding, path, deps);
}

const endTime = process.hrtime.bigint();
Expand All @@ -63,3 +57,98 @@ export const calculateOutputMetrics = async (
throw error;
}
};

/**
 * Tokenize the entire content exactly (no sampling).
 *
 * Content above MIN_CONTENT_LENGTH_FOR_PARALLEL is split into
 * TARGET_CHARS_PER_CHUNK-sized slices that are counted concurrently and
 * summed; smaller content is counted in a single task-runner call.
 */
const fullTokenize = async (
  content: string,
  encoding: TokenEncoding,
  path: string | undefined,
  deps: { taskRunner: TaskRunner<TokenCountTask, number> },
): Promise<number> => {
  // Small enough to tokenize in one shot.
  if (content.length <= MIN_CONTENT_LENGTH_FOR_PARALLEL) {
    return deps.taskRunner.run({ content, encoding, path });
  }

  // Slice into fixed-size chunks (the final chunk may be shorter) and count
  // each one in parallel on the task runner.
  const chunkCount = Math.ceil(content.length / TARGET_CHARS_PER_CHUNK);
  const counts = await Promise.all(
    Array.from({ length: chunkCount }, (_, chunkIndex) => {
      const offset = chunkIndex * TARGET_CHARS_PER_CHUNK;
      return deps.taskRunner.run({
        content: content.slice(offset, offset + TARGET_CHARS_PER_CHUNK),
        encoding,
        path: path ? `${path}-chunk-${chunkIndex}` : undefined,
      });
    }),
  );

  return counts.reduce((total, count) => total + count, 0);
};

/**
* Try to estimate token count by sampling evenly spaced portions of the content.
* Returns the estimated count, or null if the sample variance is too high
* (indicating heterogeneous content where sampling would be inaccurate).
*/
const tryEstimateBySampling = async (
content: string,
encoding: TokenEncoding,
path: string | undefined,
deps: { taskRunner: TaskRunner<TokenCountTask, number> },
): Promise<number | null> => {
const sampleCount = Math.min(OUTPUT_SAMPLE_COUNT, Math.ceil(content.length / OUTPUT_SAMPLE_SIZE));
if (sampleCount < 2) {
return null;
}

const stride = Math.floor(content.length / sampleCount);

const sampleResults = await Promise.all(
Array.from({ length: sampleCount }, (_, i) => {
const start = i * stride;
const sampleContent = content.slice(start, start + OUTPUT_SAMPLE_SIZE);
return deps.taskRunner
.run({
content: sampleContent,
encoding,
path: path ? `${path}-sample-${i}` : undefined,
})
.then((tokens) => ({ chars: sampleContent.length, tokens }));
}),
);

const validSamples = sampleResults.filter((s) => s.tokens > 0 && s.chars > 0);
if (validSamples.length < 2) {
return null;
}

// Compute per-sample chars/token ratios and check coefficient of variation (CV = stddev / mean).
// High CV indicates the content is heterogeneous (e.g. mixed CJK/ASCII, or periodic structure
// resonating with the sample stride), making the extrapolation unreliable.
const ratios = validSamples.map((s) => s.chars / s.tokens);
const mean = ratios.reduce((sum, r) => sum + r, 0) / ratios.length;
const variance = ratios.reduce((sum, r) => sum + (r - mean) ** 2, 0) / ratios.length;
const cv = Math.sqrt(variance) / mean;

if (cv > SAMPLING_CV_THRESHOLD) {
logger.trace(
`Sampling CV ${cv.toFixed(3)} exceeds threshold ${SAMPLING_CV_THRESHOLD}, falling back to full tokenization`,
);
return null;
}

// Extrapolate total token count from the overall sample ratio
const totalSampleTokens = validSamples.reduce((sum, s) => sum + s.tokens, 0);
const totalSampleChars = validSamples.reduce((sum, s) => sum + s.chars, 0);
const estimated = Math.round((content.length / totalSampleChars) * totalSampleTokens);

logger.trace(
`Estimated output tokens from ${validSamples.length} samples: ${estimated} (CV=${cv.toFixed(3)}, ${(totalSampleChars / totalSampleTokens).toFixed(2)} chars/token)`,
);
Comment on lines +108 to +151
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

The current sample placement can miss the exact resonance case the CV gate is supposed to catch.

start = i * stride always samples the leading edge of each band and can leave the tail unsampled. On alternating 100KB ASCII/CJK blocks, every sample can land on the same phase, CV stays near zero, and the estimate is still badly wrong. Please distribute starts across the full [0, content.length - OUTPUT_SAMPLE_SIZE] range—ideally centered or jittered per bucket—before relying on CV as the safety check.

🩹 One way to de-bias the sample starts
-  const stride = Math.floor(content.length / sampleCount);
+  const lastStart = Math.max(0, content.length - OUTPUT_SAMPLE_SIZE);
+  const bucketSize = content.length / sampleCount;

   const sampleResults = await Promise.all(
     Array.from({ length: sampleCount }, (_, i) => {
-      const start = i * stride;
+      const rawStart =
+        i === 0
+          ? 0
+          : i === sampleCount - 1
+            ? lastStart
+            : (i + 0.5) * bucketSize - OUTPUT_SAMPLE_SIZE / 2;
+      const start = Math.max(0, Math.min(lastStart, Math.round(rawStart)));
       const sampleContent = content.slice(start, start + OUTPUT_SAMPLE_SIZE);
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/core/metrics/calculateOutputMetrics.ts` around lines 106 - 149, The
sampling start calculation using start = i * stride biases samples to each band
edge and can miss phase/resonance patterns; change the sampling in the
Array.from mapping (where stride, start, and sampleContent are computed inside
the sampleResults creation) to distribute starts across the full [0,
content.length - OUTPUT_SAMPLE_SIZE] range instead of using i * stride — e.g.,
compute maxStart = Math.max(0, content.length - OUTPUT_SAMPLE_SIZE) and set
start = Math.floor(((i + 0.5) / sampleCount) * maxStart) (or add a small
per-bucket jitter) to center/jitter each bucket, keeping the rest of the logic
(validSamples, ratios, cv) unchanged and ensuring start is clamped to valid
bounds.


return estimated;
};
86 changes: 84 additions & 2 deletions tests/core/metrics/calculateOutputMetrics.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ describe('calculateOutputMetrics', () => {
});

it('should process large content in parallel', async () => {
// Generate a large content that exceeds MIN_CONTENT_LENGTH_FOR_PARALLEL
// Generate a large content that exceeds MIN_CONTENT_LENGTH_FOR_PARALLEL but below sampling threshold
const content = 'a'.repeat(1_100_000); // 1.1MB of content
const encoding = 'o200k_base';
const path = 'large-file.txt';
Expand Down Expand Up @@ -143,7 +143,7 @@ describe('calculateOutputMetrics', () => {
});

it('should correctly split content into chunks for parallel processing', async () => {
const content = 'a'.repeat(1_100_000); // 1.1MB of content
const content = 'a'.repeat(1_100_000); // 1.1MB of content (below sampling threshold)
const encoding = 'o200k_base';
const processedChunks: string[] = [];

Expand Down Expand Up @@ -178,4 +178,86 @@ describe('calculateOutputMetrics', () => {
}
expect(processedChunks.join('')).toBe(content); // All content should be processed
});

describe('sampling estimation', () => {
  it('should use sampling estimation for large content with uniform token density', async () => {
    // 3.5MB of uniform content (above 3MB sampling threshold)
    const content = 'hello world '.repeat(291_667); // ~3.5MB
    const encoding = 'o200k_base';
    let totalRunCalls = 0;

    // Mock with a stable ~4 chars/token ratio so per-sample CV stays near zero.
    const mockTaskRunner = {
      run: async (task: TokenCountTask) => {
        totalRunCalls += 1;
        return Math.round(task.content.length / 4);
      },
      cleanup: async () => {},
    };

    const result = await calculateOutputMetrics(content, encoding, 'test.txt', {
      taskRunner: mockTaskRunner,
    });

    // Sampling issues at most 10 worker calls; full chunking would need ~35.
    expect(totalRunCalls).toBeLessThanOrEqual(10);
    expect(result).toBeGreaterThan(0);
    // The extrapolated count should land within 5% of the true ratio.
    const expectedTokens = content.length / 4;
    expect(Math.abs(result - expectedTokens)).toBeLessThan(expectedTokens * 0.05);
  });

  it('should fall back to full tokenization when sampling CV is too high', async () => {
    // 3.5MB of content (above sampling threshold)
    const content = 'a'.repeat(3_500_000);
    const encoding = 'o200k_base';
    let runCallCount = 0;

    // Sample requests alternate between 2 and 10 chars/token so the CV check
    // trips; chunk requests (no '-sample-' marker) get a uniform ratio.
    const mockTaskRunner = {
      run: async (task: TokenCountTask) => {
        runCallCount += 1;
        if (task.path?.includes('-sample-')) {
          const sampleIndex = Number.parseInt(task.path?.split('-sample-')[1] || '0', 10);
          const divisor = sampleIndex % 2 === 0 ? 2 : 10;
          return task.content.length / divisor;
        }
        // Full tokenization chunks
        return Math.round(task.content.length / 4);
      },
      cleanup: async () => {},
    };

    const result = await calculateOutputMetrics(content, encoding, 'test.txt', {
      taskRunner: mockTaskRunner,
    });

    // Sampling (10 calls) plus the full parallel fallback (35 chunks).
    expect(runCallCount).toBeGreaterThan(10);
    expect(result).toBeGreaterThan(0);
  });

  it('should not use sampling for content below threshold', async () => {
    // 2MB (below 3MB sampling threshold, but above parallel threshold)
    const content = 'a'.repeat(2_000_000);
    const encoding = 'o200k_base';
    let runCallCount = 0;

    const mockTaskRunner = {
      run: async (task: TokenCountTask) => {
        runCallCount += 1;
        return Math.round(task.content.length / 4);
      },
      cleanup: async () => {},
    };

    await calculateOutputMetrics(content, encoding, 'test.txt', {
      taskRunner: mockTaskRunner,
    });

    // 2MB / 100KB chunks = 20 exact-tokenization calls, zero sampling calls.
    expect(runCallCount).toBe(20);
  });
});
});
Loading