diff --git a/src/core/metrics/calculateOutputMetrics.ts b/src/core/metrics/calculateOutputMetrics.ts index c809ab2a6..c0ce530bd 100644 --- a/src/core/metrics/calculateOutputMetrics.ts +++ b/src/core/metrics/calculateOutputMetrics.ts @@ -9,48 +9,42 @@ import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; const TARGET_CHARS_PER_CHUNK = 100_000; const MIN_CONTENT_LENGTH_FOR_PARALLEL = 1_000_000; // 1MB +// Sampling constants for token count estimation on large outputs. +// Instead of full BPE tokenization, we sample evenly spaced portions and extrapolate. +// Threshold must be well above MIN_CONTENT_LENGTH_FOR_PARALLEL (1MB) so that sampling +// (10 worker calls) is significantly fewer than full parallel chunking (30+ chunks). +const OUTPUT_SAMPLING_THRESHOLD = 3_000_000; // 3MB - outputs below this are fully tokenized +const OUTPUT_SAMPLE_SIZE = 100_000; // 100KB per sample +const OUTPUT_SAMPLE_COUNT = 10; // Number of evenly spaced samples +// Maximum coefficient of variation allowed for sampling estimation. +// If per-sample chars/token ratios vary more than this (e.g. mixed CJK/ASCII content, +// or periodic structure resonating with the stride), fall back to full tokenization. 
+const SAMPLING_CV_THRESHOLD = 0.15; + export const calculateOutputMetrics = async ( content: string, encoding: TokenEncoding, path: string | undefined, deps: { taskRunner: TaskRunner }, ): Promise<number> => { - const shouldRunInParallel = content.length > MIN_CONTENT_LENGTH_FOR_PARALLEL; - try { logger.trace(`Starting output token count for ${path || 'output'}`); const startTime = process.hrtime.bigint(); let result: number; - if (shouldRunInParallel) { - // Split content into chunks for parallel processing - const chunks: string[] = []; - - for (let i = 0; i < content.length; i += TARGET_CHARS_PER_CHUNK) { - chunks.push(content.slice(i, i + TARGET_CHARS_PER_CHUNK)); + if (content.length > OUTPUT_SAMPLING_THRESHOLD) { + // For very large outputs, try sampling estimation first + const estimated = await tryEstimateBySampling(content, encoding, path, deps); + if (estimated !== null) { + result = estimated; + } else { + // Sampling variance too high, fall back to full tokenization + result = await fullTokenize(content, encoding, path, deps); } - - // Process chunks in parallel - const chunkResults = await Promise.all( - chunks.map(async (chunk, index) => { - return deps.taskRunner.run({ - content: chunk, - encoding, - path: path ? `${path}-chunk-${index}` : undefined, - }); - }), - ); - - // Sum up the results - result = chunkResults.reduce((sum, count) => sum + count, 0); } else { - // Process small content directly - result = await deps.taskRunner.run({ - content, - encoding, - path, - }); + // Standard path: full tokenization (parallel for > 1MB, direct for smaller) + result = await fullTokenize(content, encoding, path, deps); } const endTime = process.hrtime.bigint(); @@ -63,3 +57,98 @@ export const calculateOutputMetrics = async ( throw error; } }; + +/** + * Full tokenization: split content into chunks and process in parallel, or directly for smaller content. 
+ */ +const fullTokenize = async ( + content: string, + encoding: TokenEncoding, + path: string | undefined, + deps: { taskRunner: TaskRunner }, +): Promise<number> => { + if (content.length > MIN_CONTENT_LENGTH_FOR_PARALLEL) { + const chunks: string[] = []; + for (let i = 0; i < content.length; i += TARGET_CHARS_PER_CHUNK) { + chunks.push(content.slice(i, i + TARGET_CHARS_PER_CHUNK)); + } + + const chunkResults = await Promise.all( + chunks.map((chunk, index) => + deps.taskRunner.run({ + content: chunk, + encoding, + path: path ? `${path}-chunk-${index}` : undefined, + }), + ), + ); + + return chunkResults.reduce((sum, count) => sum + count, 0); + } + + return deps.taskRunner.run({ content, encoding, path }); +}; + +/** + * Try to estimate token count by sampling evenly spaced portions of the content. + * Returns the estimated count, or null if the sample variance is too high + * (indicating heterogeneous content where sampling would be inaccurate). + */ +const tryEstimateBySampling = async ( + content: string, + encoding: TokenEncoding, + path: string | undefined, + deps: { taskRunner: TaskRunner }, +): Promise<number | null> => { + const sampleCount = Math.min(OUTPUT_SAMPLE_COUNT, Math.ceil(content.length / OUTPUT_SAMPLE_SIZE)); + if (sampleCount < 2) { + return null; + } + + const stride = Math.floor(content.length / sampleCount); + + const sampleResults = await Promise.all( + Array.from({ length: sampleCount }, (_, i) => { + const start = i * stride; + const sampleContent = content.slice(start, start + OUTPUT_SAMPLE_SIZE); + return deps.taskRunner + .run({ + content: sampleContent, + encoding, + path: path ? `${path}-sample-${i}` : undefined, + }) + .then((tokens) => ({ chars: sampleContent.length, tokens })); + }), + ); + + const validSamples = sampleResults.filter((s) => s.tokens > 0 && s.chars > 0); + if (validSamples.length < 2) { + return null; + } + + // Compute per-sample chars/token ratios and check coefficient of variation (CV = stddev / mean). 
+ // High CV indicates the content is heterogeneous (e.g. mixed CJK/ASCII, or periodic structure + // resonating with the sample stride), making the extrapolation unreliable. + const ratios = validSamples.map((s) => s.chars / s.tokens); + const mean = ratios.reduce((sum, r) => sum + r, 0) / ratios.length; + const variance = ratios.reduce((sum, r) => sum + (r - mean) ** 2, 0) / ratios.length; + const cv = Math.sqrt(variance) / mean; + + if (cv > SAMPLING_CV_THRESHOLD) { + logger.trace( + `Sampling CV ${cv.toFixed(3)} exceeds threshold ${SAMPLING_CV_THRESHOLD}, falling back to full tokenization`, + ); + return null; + } + + // Extrapolate total token count from the overall sample ratio + const totalSampleTokens = validSamples.reduce((sum, s) => sum + s.tokens, 0); + const totalSampleChars = validSamples.reduce((sum, s) => sum + s.chars, 0); + const estimated = Math.round((content.length / totalSampleChars) * totalSampleTokens); + + logger.trace( + `Estimated output tokens from ${validSamples.length} samples: ${estimated} (CV=${cv.toFixed(3)}, ${(totalSampleChars / totalSampleTokens).toFixed(2)} chars/token)`, + ); + + return estimated; +}; diff --git a/tests/core/metrics/calculateOutputMetrics.test.ts b/tests/core/metrics/calculateOutputMetrics.test.ts index 2b42d0731..ee1f2a405 100644 --- a/tests/core/metrics/calculateOutputMetrics.test.ts +++ b/tests/core/metrics/calculateOutputMetrics.test.ts @@ -90,7 +90,7 @@ describe('calculateOutputMetrics', () => { }); it('should process large content in parallel', async () => { - // Generate a large content that exceeds MIN_CONTENT_LENGTH_FOR_PARALLEL + // Generate a large content that exceeds MIN_CONTENT_LENGTH_FOR_PARALLEL but below sampling threshold const content = 'a'.repeat(1_100_000); // 1.1MB of content const encoding = 'o200k_base'; const path = 'large-file.txt'; @@ -143,7 +143,7 @@ describe('calculateOutputMetrics', () => { }); it('should correctly split content into chunks for parallel processing', async () => { - 
const content = 'a'.repeat(1_100_000); // 1.1MB of content + const content = 'a'.repeat(1_100_000); // 1.1MB of content (below sampling threshold) const encoding = 'o200k_base'; const processedChunks: string[] = []; @@ -178,4 +178,86 @@ describe('calculateOutputMetrics', () => { } expect(processedChunks.join('')).toBe(content); // All content should be processed }); + + describe('sampling estimation', () => { + it('should use sampling estimation for large content with uniform token density', async () => { + // 3.5MB of uniform content (above 3MB sampling threshold) + const content = 'hello world '.repeat(291_667); // ~3.5MB + const encoding = 'o200k_base'; + let totalRunCalls = 0; + + // Mock that returns consistent tokens-per-char ratio + const mockTaskRunner = { + run: async (task: TokenCountTask) => { + totalRunCalls++; + // ~4 chars per token, consistent ratio + return Math.round(task.content.length / 4); + }, + cleanup: async () => {}, + }; + + const result = await calculateOutputMetrics(content, encoding, 'test.txt', { + taskRunner: mockTaskRunner, + }); + + // Should have used sampling (10 samples), not full tokenization (35 chunks) + expect(totalRunCalls).toBeLessThanOrEqual(10); + // Estimated tokens should be approximately content.length / 4 + expect(result).toBeGreaterThan(0); + expect(Math.abs(result - content.length / 4)).toBeLessThan((content.length / 4) * 0.05); + }); + + it('should fall back to full tokenization when sampling CV is too high', async () => { + // 3.5MB of content (above sampling threshold) + const content = 'a'.repeat(3_500_000); + const encoding = 'o200k_base'; + let runCallCount = 0; + + // Mock that returns wildly different ratios per sample to trigger high CV + const mockTaskRunner = { + run: async (task: TokenCountTask) => { + runCallCount++; + const isSample = task.path?.includes('-sample-'); + if (isSample) { + // Alternate between very different ratios to produce high CV + const sampleIndex = 
Number.parseInt(task.path?.split('-sample-')[1] || '0', 10); + return sampleIndex % 2 === 0 ? task.content.length / 2 : task.content.length / 10; + } + // Full tokenization chunks + return Math.round(task.content.length / 4); + }, + cleanup: async () => {}, + }; + + const result = await calculateOutputMetrics(content, encoding, 'test.txt', { + taskRunner: mockTaskRunner, + }); + + // Should have fallen back to full parallel tokenization (10 samples + 35 chunks) + expect(runCallCount).toBeGreaterThan(10); + expect(result).toBeGreaterThan(0); + }); + + it('should not use sampling for content below threshold', async () => { + // 2MB (below 3MB sampling threshold, but above parallel threshold) + const content = 'a'.repeat(2_000_000); + const encoding = 'o200k_base'; + let runCallCount = 0; + + const mockTaskRunner = { + run: async (task: TokenCountTask) => { + runCallCount++; + return Math.round(task.content.length / 4); + }, + cleanup: async () => {}, + }; + + await calculateOutputMetrics(content, encoding, 'test.txt', { + taskRunner: mockTaskRunner, + }); + + // Should use full parallel tokenization (20 chunks), not sampling + expect(runCallCount).toBe(20); + }); + }); });