Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 117 additions & 28 deletions src/core/metrics/calculateOutputMetrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,48 +9,42 @@ import type { TokenCountTask } from './workers/calculateMetricsWorker.js';
const TARGET_CHARS_PER_CHUNK = 100_000;
const MIN_CONTENT_LENGTH_FOR_PARALLEL = 1_000_000; // 1MB

// Sampling constants for token count estimation on large outputs.
// Instead of full BPE tokenization, we sample evenly spaced portions and extrapolate.
// Threshold must be well above MIN_CONTENT_LENGTH_FOR_PARALLEL (1MB) so that sampling
// (10 worker calls) is significantly fewer than full parallel chunking (30+ chunks).
const OUTPUT_SAMPLING_THRESHOLD = 3_000_000; // 3MB - outputs below this are fully tokenized
const OUTPUT_SAMPLE_SIZE = 100_000; // 100KB per sample
const OUTPUT_SAMPLE_COUNT = 10; // Number of evenly spaced samples
// Maximum coefficient of variation allowed for sampling estimation.
// If per-sample chars/token ratios vary more than this (e.g. mixed CJK/ASCII content,
// or periodic structure resonating with the stride), fall back to full tokenization.
const SAMPLING_CV_THRESHOLD = 0.15;

export const calculateOutputMetrics = async (
content: string,
encoding: TokenEncoding,
path: string | undefined,
deps: { taskRunner: TaskRunner<TokenCountTask, number> },
): Promise<number> => {
const shouldRunInParallel = content.length > MIN_CONTENT_LENGTH_FOR_PARALLEL;

try {
logger.trace(`Starting output token count for ${path || 'output'}`);
const startTime = process.hrtime.bigint();

let result: number;

if (shouldRunInParallel) {
// Split content into chunks for parallel processing
const chunks: string[] = [];

for (let i = 0; i < content.length; i += TARGET_CHARS_PER_CHUNK) {
chunks.push(content.slice(i, i + TARGET_CHARS_PER_CHUNK));
if (content.length > OUTPUT_SAMPLING_THRESHOLD) {
// For very large outputs, try sampling estimation first
const estimated = await tryEstimateBySampling(content, encoding, path, deps);
if (estimated !== null) {
result = estimated;
} else {
// Sampling variance too high, fall back to full tokenization
result = await fullTokenize(content, encoding, path, deps);
}
Comment on lines +36 to 44
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Surface when this count is estimated.

This branch now returns an extrapolated value through the same Promise<number> contract, but src/core/metrics/calculateMetrics.ts still reduces these numbers into totalTokens as if they were exact. That silently changes the semantics of large-output metrics for every downstream caller. Please propagate estimate metadata or keep the exact path on the public result.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/core/metrics/calculateOutputMetrics.ts` around lines 34 - 42, The branch
in calculateOutputMetrics.ts that returns an extrapolated numeric value from
tryEstimateBySampling hides that the count is estimated; change the public
contract returned by calculateOutputMetrics (and values produced by
tryEstimateBySampling and fullTokenize) from a bare number to a small result
object such as { count: number, estimated: boolean } (or add an explicit
isEstimated flag) so callers can distinguish exact vs extrapolated counts,
update calculateMetrics.ts to aggregate using .count and to track/propagate
estimation metadata, and ensure fullTokenize returns estimated=false while
tryEstimateBySampling returns estimated=true when extrapolating.


// Process chunks in parallel
const chunkResults = await Promise.all(
chunks.map(async (chunk, index) => {
return deps.taskRunner.run({
content: chunk,
encoding,
path: path ? `${path}-chunk-${index}` : undefined,
});
}),
);

// Sum up the results
result = chunkResults.reduce((sum, count) => sum + count, 0);
} else {
// Process small content directly
result = await deps.taskRunner.run({
content,
encoding,
path,
});
// Standard path: full tokenization (parallel for > 1MB, direct for smaller)
result = await fullTokenize(content, encoding, path, deps);
}

const endTime = process.hrtime.bigint();
Expand All @@ -63,3 +57,98 @@ export const calculateOutputMetrics = async (
throw error;
}
};

/**
 * Tokenize the entire content exactly (no sampling).
 *
 * Content above MIN_CONTENT_LENGTH_FOR_PARALLEL is split into
 * TARGET_CHARS_PER_CHUNK-sized slices that are counted concurrently and
 * summed; smaller content is counted in a single task-runner call.
 */
const fullTokenize = async (
  content: string,
  encoding: TokenEncoding,
  path: string | undefined,
  deps: { taskRunner: TaskRunner<TokenCountTask, number> },
): Promise<number> => {
  // Small enough to tokenize in one shot.
  if (content.length <= MIN_CONTENT_LENGTH_FOR_PARALLEL) {
    return deps.taskRunner.run({ content, encoding, path });
  }

  // Slice into fixed-size chunks (the final chunk may be shorter) and count
  // each one in parallel on the task runner.
  const chunkCount = Math.ceil(content.length / TARGET_CHARS_PER_CHUNK);
  const counts = await Promise.all(
    Array.from({ length: chunkCount }, (_, chunkIndex) => {
      const offset = chunkIndex * TARGET_CHARS_PER_CHUNK;
      return deps.taskRunner.run({
        content: content.slice(offset, offset + TARGET_CHARS_PER_CHUNK),
        encoding,
        path: path ? `${path}-chunk-${chunkIndex}` : undefined,
      });
    }),
  );

  return counts.reduce((total, count) => total + count, 0);
};

/**
* Try to estimate token count by sampling evenly spaced portions of the content.
* Returns the estimated count, or null if the sample variance is too high
* (indicating heterogeneous content where sampling would be inaccurate).
*/
const tryEstimateBySampling = async (
content: string,
encoding: TokenEncoding,
path: string | undefined,
deps: { taskRunner: TaskRunner<TokenCountTask, number> },
): Promise<number | null> => {
const sampleCount = Math.min(OUTPUT_SAMPLE_COUNT, Math.ceil(content.length / OUTPUT_SAMPLE_SIZE));
if (sampleCount < 2) {
return null;
}

const stride = Math.floor(content.length / sampleCount);

const sampleResults = await Promise.all(
Array.from({ length: sampleCount }, (_, i) => {
const start = i * stride;
const sampleContent = content.slice(start, start + OUTPUT_SAMPLE_SIZE);
return deps.taskRunner
.run({
content: sampleContent,
encoding,
path: path ? `${path}-sample-${i}` : undefined,
})
.then((tokens) => ({ chars: sampleContent.length, tokens }));
}),
);

const validSamples = sampleResults.filter((s) => s.tokens > 0 && s.chars > 0);
if (validSamples.length < 2) {
return null;
}

// Compute per-sample chars/token ratios and check coefficient of variation (CV = stddev / mean).
// High CV indicates the content is heterogeneous (e.g. mixed CJK/ASCII, or periodic structure
// resonating with the sample stride), making the extrapolation unreliable.
const ratios = validSamples.map((s) => s.chars / s.tokens);
const mean = ratios.reduce((sum, r) => sum + r, 0) / ratios.length;
const variance = ratios.reduce((sum, r) => sum + (r - mean) ** 2, 0) / ratios.length;
const cv = Math.sqrt(variance) / mean;

if (cv > SAMPLING_CV_THRESHOLD) {
logger.trace(
`Sampling CV ${cv.toFixed(3)} exceeds threshold ${SAMPLING_CV_THRESHOLD}, falling back to full tokenization`,
);
return null;
}

// Extrapolate total token count from the overall sample ratio
const totalSampleTokens = validSamples.reduce((sum, s) => sum + s.tokens, 0);
const totalSampleChars = validSamples.reduce((sum, s) => sum + s.chars, 0);
const estimated = Math.round((content.length / totalSampleChars) * totalSampleTokens);

logger.trace(
`Estimated output tokens from ${validSamples.length} samples: ${estimated} (CV=${cv.toFixed(3)}, ${(totalSampleChars / totalSampleTokens).toFixed(2)} chars/token)`,
);
Comment on lines +108 to +151
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

The current sample placement can miss the exact resonance case the CV gate is supposed to catch.

start = i * stride always samples the leading edge of each band and can leave the tail unsampled. On alternating 100KB ASCII/CJK blocks, every sample can land on the same phase, CV stays near zero, and the estimate is still badly wrong. Please distribute starts across the full [0, content.length - OUTPUT_SAMPLE_SIZE] range—ideally centered or jittered per bucket—before relying on CV as the safety check.

🩹 One way to de-bias the sample starts
-  const stride = Math.floor(content.length / sampleCount);
+  const lastStart = Math.max(0, content.length - OUTPUT_SAMPLE_SIZE);
+  const bucketSize = content.length / sampleCount;

   const sampleResults = await Promise.all(
     Array.from({ length: sampleCount }, (_, i) => {
-      const start = i * stride;
+      const rawStart =
+        i === 0
+          ? 0
+          : i === sampleCount - 1
+            ? lastStart
+            : (i + 0.5) * bucketSize - OUTPUT_SAMPLE_SIZE / 2;
+      const start = Math.max(0, Math.min(lastStart, Math.round(rawStart)));
       const sampleContent = content.slice(start, start + OUTPUT_SAMPLE_SIZE);
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/core/metrics/calculateOutputMetrics.ts` around lines 106 - 149, The
sampling start calculation using start = i * stride biases samples to each band
edge and can miss phase/resonance patterns; change the sampling in the
Array.from mapping (where stride, start, and sampleContent are computed inside
the sampleResults creation) to distribute starts across the full [0,
content.length - OUTPUT_SAMPLE_SIZE] range instead of using i * stride — e.g.,
compute maxStart = Math.max(0, content.length - OUTPUT_SAMPLE_SIZE) and set
start = Math.floor(((i + 0.5) / sampleCount) * maxStart) (or add a small
per-bucket jitter) to center/jitter each bucket, keeping the rest of the logic
(validSamples, ratios, cv) unchanged and ensuring start is clamped to valid
bounds.


return estimated;
};
86 changes: 84 additions & 2 deletions tests/core/metrics/calculateOutputMetrics.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ describe('calculateOutputMetrics', () => {
});

it('should process large content in parallel', async () => {
// Generate a large content that exceeds MIN_CONTENT_LENGTH_FOR_PARALLEL
// Generate a large content that exceeds MIN_CONTENT_LENGTH_FOR_PARALLEL but below sampling threshold
const content = 'a'.repeat(1_100_000); // 1.1MB of content
const encoding = 'o200k_base';
const path = 'large-file.txt';
Expand Down Expand Up @@ -143,7 +143,7 @@ describe('calculateOutputMetrics', () => {
});

it('should correctly split content into chunks for parallel processing', async () => {
const content = 'a'.repeat(1_100_000); // 1.1MB of content
const content = 'a'.repeat(1_100_000); // 1.1MB of content (below sampling threshold)
const encoding = 'o200k_base';
const processedChunks: string[] = [];

Expand Down Expand Up @@ -178,4 +178,86 @@ describe('calculateOutputMetrics', () => {
}
expect(processedChunks.join('')).toBe(content); // All content should be processed
});

describe('sampling estimation', () => {
  it('should use sampling estimation for large content with uniform token density', async () => {
    // 3.5MB of uniform content (above 3MB sampling threshold)
    const content = 'hello world '.repeat(291_667); // ~3.5MB
    const encoding = 'o200k_base';
    let totalRunCalls = 0;

    // Mock with a stable ~4 chars/token ratio so per-sample CV stays near zero.
    const mockTaskRunner = {
      run: async (task: TokenCountTask) => {
        totalRunCalls += 1;
        return Math.round(task.content.length / 4);
      },
      cleanup: async () => {},
    };

    const result = await calculateOutputMetrics(content, encoding, 'test.txt', {
      taskRunner: mockTaskRunner,
    });

    // Sampling issues at most 10 worker calls; full chunking would need ~35.
    expect(totalRunCalls).toBeLessThanOrEqual(10);
    expect(result).toBeGreaterThan(0);
    // The extrapolated count should land within 5% of the true ratio.
    const expectedTokens = content.length / 4;
    expect(Math.abs(result - expectedTokens)).toBeLessThan(expectedTokens * 0.05);
  });

  it('should fall back to full tokenization when sampling CV is too high', async () => {
    // 3.5MB of content (above sampling threshold)
    const content = 'a'.repeat(3_500_000);
    const encoding = 'o200k_base';
    let runCallCount = 0;

    // Sample requests alternate between 2 and 10 chars/token so the CV check
    // trips; chunk requests (no '-sample-' marker) get a uniform ratio.
    const mockTaskRunner = {
      run: async (task: TokenCountTask) => {
        runCallCount += 1;
        if (task.path?.includes('-sample-')) {
          const sampleIndex = Number.parseInt(task.path?.split('-sample-')[1] || '0', 10);
          const divisor = sampleIndex % 2 === 0 ? 2 : 10;
          return task.content.length / divisor;
        }
        // Full tokenization chunks
        return Math.round(task.content.length / 4);
      },
      cleanup: async () => {},
    };

    const result = await calculateOutputMetrics(content, encoding, 'test.txt', {
      taskRunner: mockTaskRunner,
    });

    // Sampling (10 calls) plus the full parallel fallback (35 chunks).
    expect(runCallCount).toBeGreaterThan(10);
    expect(result).toBeGreaterThan(0);
  });

  it('should not use sampling for content below threshold', async () => {
    // 2MB (below 3MB sampling threshold, but above parallel threshold)
    const content = 'a'.repeat(2_000_000);
    const encoding = 'o200k_base';
    let runCallCount = 0;

    const mockTaskRunner = {
      run: async (task: TokenCountTask) => {
        runCallCount += 1;
        return Math.round(task.content.length / 4);
      },
      cleanup: async () => {},
    };

    await calculateOutputMetrics(content, encoding, 'test.txt', {
      taskRunner: mockTaskRunner,
    });

    // 2MB / 100KB chunks = 20 exact-tokenization calls, zero sampling calls.
    expect(runCallCount).toBe(20);
  });
});
});
Loading