-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
perf(core): Estimate output tokens via sampling with CV-based fallback #1397
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,48 +9,42 @@ import type { TokenCountTask } from './workers/calculateMetricsWorker.js'; | |
| const TARGET_CHARS_PER_CHUNK = 100_000; | ||
| const MIN_CONTENT_LENGTH_FOR_PARALLEL = 1_000_000; // 1MB | ||
|
|
||
| // Sampling constants for token count estimation on large outputs. | ||
| // Instead of full BPE tokenization, we sample evenly spaced portions and extrapolate. | ||
| // Threshold must be well above MIN_CONTENT_LENGTH_FOR_PARALLEL (1MB) so that sampling | ||
| // (10 worker calls) is significantly fewer than full parallel chunking (30+ chunks). | ||
| const OUTPUT_SAMPLING_THRESHOLD = 3_000_000; // 3MB - outputs below this are fully tokenized | ||
| const OUTPUT_SAMPLE_SIZE = 100_000; // 100KB per sample | ||
| const OUTPUT_SAMPLE_COUNT = 10; // Number of evenly spaced samples | ||
| // Maximum coefficient of variation allowed for sampling estimation. | ||
| // If per-sample chars/token ratios vary more than this (e.g. mixed CJK/ASCII content, | ||
| // or periodic structure resonating with the stride), fall back to full tokenization. | ||
| const SAMPLING_CV_THRESHOLD = 0.15; | ||
|
|
||
| export const calculateOutputMetrics = async ( | ||
| content: string, | ||
| encoding: TokenEncoding, | ||
| path: string | undefined, | ||
| deps: { taskRunner: TaskRunner<TokenCountTask, number> }, | ||
| ): Promise<number> => { | ||
| const shouldRunInParallel = content.length > MIN_CONTENT_LENGTH_FOR_PARALLEL; | ||
|
|
||
| try { | ||
| logger.trace(`Starting output token count for ${path || 'output'}`); | ||
| const startTime = process.hrtime.bigint(); | ||
|
|
||
| let result: number; | ||
|
|
||
| if (shouldRunInParallel) { | ||
| // Split content into chunks for parallel processing | ||
| const chunks: string[] = []; | ||
|
|
||
| for (let i = 0; i < content.length; i += TARGET_CHARS_PER_CHUNK) { | ||
| chunks.push(content.slice(i, i + TARGET_CHARS_PER_CHUNK)); | ||
| if (content.length > OUTPUT_SAMPLING_THRESHOLD) { | ||
| // For very large outputs, try sampling estimation first | ||
| const estimated = await tryEstimateBySampling(content, encoding, path, deps); | ||
| if (estimated !== null) { | ||
| result = estimated; | ||
| } else { | ||
| // Sampling variance too high, fall back to full tokenization | ||
| result = await fullTokenize(content, encoding, path, deps); | ||
| } | ||
|
|
||
| // Process chunks in parallel | ||
| const chunkResults = await Promise.all( | ||
| chunks.map(async (chunk, index) => { | ||
| return deps.taskRunner.run({ | ||
| content: chunk, | ||
| encoding, | ||
| path: path ? `${path}-chunk-${index}` : undefined, | ||
| }); | ||
| }), | ||
| ); | ||
|
|
||
| // Sum up the results | ||
| result = chunkResults.reduce((sum, count) => sum + count, 0); | ||
| } else { | ||
| // Process small content directly | ||
| result = await deps.taskRunner.run({ | ||
| content, | ||
| encoding, | ||
| path, | ||
| }); | ||
| // Standard path: full tokenization (parallel for > 1MB, direct for smaller) | ||
| result = await fullTokenize(content, encoding, path, deps); | ||
| } | ||
|
|
||
| const endTime = process.hrtime.bigint(); | ||
|
|
@@ -63,3 +57,98 @@ export const calculateOutputMetrics = async ( | |
| throw error; | ||
| } | ||
| }; | ||
|
|
||
| /** | ||
| * Full tokenization: split content into chunks and process in parallel, or directly for smaller content. | ||
| */ | ||
| const fullTokenize = async ( | ||
| content: string, | ||
| encoding: TokenEncoding, | ||
| path: string | undefined, | ||
| deps: { taskRunner: TaskRunner<TokenCountTask, number> }, | ||
| ): Promise<number> => { | ||
| if (content.length > MIN_CONTENT_LENGTH_FOR_PARALLEL) { | ||
| const chunks: string[] = []; | ||
| for (let i = 0; i < content.length; i += TARGET_CHARS_PER_CHUNK) { | ||
| chunks.push(content.slice(i, i + TARGET_CHARS_PER_CHUNK)); | ||
| } | ||
|
|
||
| const chunkResults = await Promise.all( | ||
| chunks.map((chunk, index) => | ||
| deps.taskRunner.run({ | ||
| content: chunk, | ||
| encoding, | ||
| path: path ? `${path}-chunk-${index}` : undefined, | ||
| }), | ||
| ), | ||
| ); | ||
|
|
||
| return chunkResults.reduce((sum, count) => sum + count, 0); | ||
| } | ||
|
|
||
| return deps.taskRunner.run({ content, encoding, path }); | ||
| }; | ||
|
|
||
| /** | ||
| * Try to estimate token count by sampling evenly spaced portions of the content. | ||
| * Returns the estimated count, or null if the sample variance is too high | ||
| * (indicating heterogeneous content where sampling would be inaccurate). | ||
| */ | ||
| const tryEstimateBySampling = async ( | ||
| content: string, | ||
| encoding: TokenEncoding, | ||
| path: string | undefined, | ||
| deps: { taskRunner: TaskRunner<TokenCountTask, number> }, | ||
| ): Promise<number | null> => { | ||
| const sampleCount = Math.min(OUTPUT_SAMPLE_COUNT, Math.ceil(content.length / OUTPUT_SAMPLE_SIZE)); | ||
| if (sampleCount < 2) { | ||
| return null; | ||
| } | ||
|
|
||
| const stride = Math.floor(content.length / sampleCount); | ||
|
|
||
| const sampleResults = await Promise.all( | ||
| Array.from({ length: sampleCount }, (_, i) => { | ||
| const start = i * stride; | ||
| const sampleContent = content.slice(start, start + OUTPUT_SAMPLE_SIZE); | ||
| return deps.taskRunner | ||
| .run({ | ||
| content: sampleContent, | ||
| encoding, | ||
| path: path ? `${path}-sample-${i}` : undefined, | ||
| }) | ||
| .then((tokens) => ({ chars: sampleContent.length, tokens })); | ||
| }), | ||
| ); | ||
|
|
||
| const validSamples = sampleResults.filter((s) => s.tokens > 0 && s.chars > 0); | ||
| if (validSamples.length < 2) { | ||
| return null; | ||
| } | ||
|
|
||
| // Compute per-sample chars/token ratios and check coefficient of variation (CV = stddev / mean). | ||
| // High CV indicates the content is heterogeneous (e.g. mixed CJK/ASCII, or periodic structure | ||
| // resonating with the sample stride), making the extrapolation unreliable. | ||
| const ratios = validSamples.map((s) => s.chars / s.tokens); | ||
| const mean = ratios.reduce((sum, r) => sum + r, 0) / ratios.length; | ||
| const variance = ratios.reduce((sum, r) => sum + (r - mean) ** 2, 0) / ratios.length; | ||
| const cv = Math.sqrt(variance) / mean; | ||
|
|
||
| if (cv > SAMPLING_CV_THRESHOLD) { | ||
| logger.trace( | ||
| `Sampling CV ${cv.toFixed(3)} exceeds threshold ${SAMPLING_CV_THRESHOLD}, falling back to full tokenization`, | ||
| ); | ||
| return null; | ||
| } | ||
|
|
||
| // Extrapolate total token count from the overall sample ratio | ||
| const totalSampleTokens = validSamples.reduce((sum, s) => sum + s.tokens, 0); | ||
| const totalSampleChars = validSamples.reduce((sum, s) => sum + s.chars, 0); | ||
| const estimated = Math.round((content.length / totalSampleChars) * totalSampleTokens); | ||
|
|
||
| logger.trace( | ||
| `Estimated output tokens from ${validSamples.length} samples: ${estimated} (CV=${cv.toFixed(3)}, ${(totalSampleChars / totalSampleTokens).toFixed(2)} chars/token)`, | ||
| ); | ||
|
Comment on lines
+108
to
+151
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The current sample placement can miss the exact resonance case the CV gate is supposed to catch.
🩹 One way to de-bias the sample starts- const stride = Math.floor(content.length / sampleCount);
+ const lastStart = Math.max(0, content.length - OUTPUT_SAMPLE_SIZE);
+ const bucketSize = content.length / sampleCount;
const sampleResults = await Promise.all(
Array.from({ length: sampleCount }, (_, i) => {
- const start = i * stride;
+ const rawStart =
+ i === 0
+ ? 0
+ : i === sampleCount - 1
+ ? lastStart
+ : (i + 0.5) * bucketSize - OUTPUT_SAMPLE_SIZE / 2;
+ const start = Math.max(0, Math.min(lastStart, Math.round(rawStart)));
const sampleContent = content.slice(start, start + OUTPUT_SAMPLE_SIZE);🤖 Prompt for AI Agents |
||
|
|
||
| return estimated; | ||
| }; | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Surface when this count is estimated.
This branch now returns an extrapolated value through the same
`Promise<number>` contract, but `src/core/metrics/calculateMetrics.ts` still reduces these numbers into `totalTokens` as if they were exact. That silently changes the semantics of large-output metrics for every downstream caller. Please propagate estimate metadata or keep the exact path on the public result.

🤖 Prompt for AI Agents