Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions src/core/metrics/calculateOutputMetrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@ import type { TaskRunner } from '../../shared/processConcurrency.js';
import type { TokenEncoding } from './TokenCounter.js';
import type { TokenCountTask } from './workers/calculateMetricsWorker.js';

const CHUNK_SIZE = 1000;
const MIN_CONTENT_LENGTH_FOR_PARALLEL = 1_000_000; // 1000KB
// Target ~100K characters per chunk (~100KB for ASCII text) so that each worker task does meaningful tokenization work.
// Previously the content was always split into a fixed 1000 chunks, which created ~1KB chunks for 1MB output,
// causing ~1000 worker round-trips with ~0.5ms overhead each (~500ms of wasted time in total).
const TARGET_CHARS_PER_CHUNK = 100_000;
const MIN_CONTENT_LENGTH_FOR_PARALLEL = 1_000_000; // 1MB

export const calculateOutputMetrics = async (
content: string,
Expand All @@ -22,11 +25,10 @@ export const calculateOutputMetrics = async (

if (shouldRunInParallel) {
// Split content into chunks for parallel processing
const chunkSize = Math.ceil(content.length / CHUNK_SIZE);
const chunks: string[] = [];

for (let i = 0; i < content.length; i += chunkSize) {
chunks.push(content.slice(i, i + chunkSize));
for (let i = 0; i < content.length; i += TARGET_CHARS_PER_CHUNK) {
chunks.push(content.slice(i, i + TARGET_CHARS_PER_CHUNK));
}
Comment on lines +30 to 32
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Splitting strings using a fixed character count can break Unicode surrogate pairs (e.g., emojis or certain mathematical symbols) if the split occurs between the high and low surrogates. This results in invalid UTF-16 strings being sent to the worker, which may lead to slightly inaccurate token counts or errors depending on how the tokenizer handles malformed input. While the impact is likely small given the 100KB chunk size, it is safer to ensure splits occur at valid character boundaries.

      // Split `content` into chunks of at most TARGET_CHARS_PER_CHUNK UTF-16 code units,
      // avoiding a cut between the two halves of a surrogate pair.
      for (let start = 0; start < content.length; ) {
        let end = Math.min(start + TARGET_CHARS_PER_CHUNK, content.length);
        // Only back off when the chunk would still hold at least one code unit afterwards;
        // otherwise `start` would never advance and the loop would not terminate.
        if (end < content.length && end - start > 1) {
          const lastUnit = content.charCodeAt(end - 1);
          // 0xD800-0xDBFF is the high-surrogate range: the chunk would end on the first half
          // of a pair, so move the boundary back one unit and keep the pair together.
          if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            end--;
          }
        }
        chunks.push(content.slice(start, end));
        start = end;
      }


// Process chunks in parallel
Expand Down
12 changes: 7 additions & 5 deletions tests/core/metrics/calculateOutputMetrics.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ describe('calculateOutputMetrics', () => {
});

expect(chunksProcessed).toBeGreaterThan(1); // Should have processed multiple chunks
expect(result).toBe(100_000); // 1000 chunks * 100 tokens per chunk
expect(result).toBe(chunksProcessed * 100); // chunks * 100 tokens per chunk
});

it('should handle errors in parallel processing', async () => {
Expand Down Expand Up @@ -168,12 +168,14 @@ describe('calculateOutputMetrics', () => {
}),
});

// Check that chunks are roughly equal in size
const _expectedChunkSize = Math.ceil(content.length / 1000); // CHUNK_SIZE is 1000
// With TARGET_CHARS_PER_CHUNK=100_000, 1.1MB content should produce 11 chunks
const chunkSizes = processedChunks.map((chunk) => chunk.length);

expect(processedChunks.length).toBe(1000); // Should have 1000 chunks
expect(Math.max(...chunkSizes) - Math.min(...chunkSizes)).toBeLessThanOrEqual(1); // Chunks should be almost equal in size
expect(processedChunks.length).toBe(11);
// All chunks except the last should be exactly TARGET_CHARS_PER_CHUNK
for (let i = 0; i < chunkSizes.length - 1; i++) {
expect(chunkSizes[i]).toBe(100_000);
}
expect(processedChunks.join('')).toBe(content); // All content should be processed
});
});
Loading