Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
5ab8da0
baseline: PR #20834 + firstNonAsciiPos (threshold 9)
joshuaisaact Mar 30, 2026
e1360e2
exp1: fromCharCode + inline textDecoder.decode
joshuaisaact Mar 30, 2026
cfedd0b
exp4: batch fromCharCode with switch on len
joshuaisaact Mar 30, 2026
695f269
exp6: extend batch fromCharCode to threshold 12
joshuaisaact Mar 30, 2026
e4b31fb
exp7: extend batch fromCharCode to threshold 16
joshuaisaact Mar 30, 2026
842dce4
exp8: extend batch fromCharCode to threshold 24
joshuaisaact Mar 30, 2026
3ef3cd0
exp10: extend batch fromCharCode to threshold 32
joshuaisaact Mar 30, 2026
e08bc8d
exp13: extend batch fromCharCode to threshold 48
joshuaisaact Mar 30, 2026
04fdac8
exp15: pre-decode buffer as latin1, substr for ASCII strings
joshuaisaact Mar 30, 2026
5628183
exp19: strDataIsAscii flag + streamlined branching
joshuaisaact Mar 30, 2026
ebfbf99
exp24: minimize branching with firstNonAsciiBufPos
joshuaisaact Mar 30, 2026
a07cdd4
exp24b: fix source boundary + separate source/non-source paths
joshuaisaact Mar 30, 2026
69d3b2d
exp26: add lastNonAsciiSrcEnd for tail-ASCII fast path
joshuaisaact Mar 30, 2026
86720dc
exp29: cumulative non-ASCII count for O(1) ASCII range check
joshuaisaact Mar 30, 2026
e274000
exp30: simplified - only cumulative count, no first/lastNonAscii
joshuaisaact Mar 30, 2026
bc3c7bf
exp31: maximum simplification - no sourceText, pure cumulative + buff…
joshuaisaact Mar 30, 2026
b94307c
exp31b: restore sourceText path for ASCII source boundary strings
joshuaisaact Mar 30, 2026
c668176
exp32: micro-opts - remove len===0 check, inline pos read
joshuaisaact Mar 30, 2026
2a109a1
add sparse-table version: O(log k) binary search over non-ASCII drift…
joshuaisaact Apr 1, 2026
1e03e5a
fix: check entire string is within source region before using substr
joshuaisaact Apr 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions versions/experiment.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/**
* Experiment 32: Micro-optimizations - remove len===0 check,
* use local references for hot variables.
*/

// oxlint-disable prefer-const

// Shared UTF-8 decoder, used by deserializeStr() for byte ranges that contain
// at least one non-ASCII byte. `ignoreBOM: true` means a leading U+FEFF is
// kept in the decoded output rather than stripped.
const textDecoder = new TextDecoder("utf-8", { ignoreBOM: true });

// The whole `uint8` buffer decoded once with the "latin1" label; assigned in
// setup(). deserializeStr() only slices it for ranges it has checked to be
// free of bytes >= 128.
let bufferAsAscii;
// Prefix counts of non-ASCII bytes; assigned in setup(). nonAsciiCum[i] is the
// number of bytes >= 128 among uint8[0..i), so two equal entries mean the
// bytes between those offsets are all ASCII.
let nonAsciiCum;

/**
 * Prepares the lookup state used by deserializeStr().
 *
 * - `bufferAsAscii` becomes the whole `uint8` buffer decoded with the
 *   "latin1" label.
 * - `nonAsciiCum` becomes a Uint32Array of `uint8.length + 1` prefix counts:
 *   entry `i` is how many bytes >= 128 occur before offset `i`.
 *
 * Must run after `uint8` is populated and before any deserializeStr() call.
 */
export function setup() {
  const byteCount = uint8.length;

  bufferAsAscii = new TextDecoder("latin1").decode(uint8);

  nonAsciiCum = new Uint32Array(byteCount + 1);
  const prefix = nonAsciiCum;

  let seen = 0;
  let offset = 0;
  while (offset < byteCount) {
    // Record the count BEFORE looking at this byte, so prefix[i] covers [0, i).
    prefix[offset] = seen;
    if (uint8[offset] > 127) seen += 1;
    offset += 1;
  }
  // Final slot holds the total, so ranges ending at the buffer end also work.
  prefix[byteCount] = seen;
}

/**
 * Reads the string described by the record at byte offset `pos`.
 *
 * The record is read through `uint32`: word `pos >> 2` holds the byte offset
 * of the string data in the buffer, and word `(pos >> 2) + 2` holds its byte
 * length.
 *
 * Paths, cheapest first:
 *  1. The string lies ENTIRELY inside the source region and the source is
 *     ASCII: slice `sourceText` directly. (This indexes `sourceText` by byte
 *     offset, i.e. it assumes the source bytes start at buffer offset 0.)
 *  2. The byte range contains no byte >= 128 (the prefix counts at both ends
 *     are equal): slice the pre-decoded `bufferAsAscii`.
 *  3. Otherwise decode the byte range as UTF-8.
 *
 * There is deliberately no `len === 0` special case: every path yields "" for
 * an empty range.
 *
 * @param {number} pos - Byte offset of the string record.
 * @returns {string} The decoded string.
 */
export function deserializeStr(pos) {
  let pos32 = pos >> 2,
    len = uint32[pos32 + 2],
    p = uint32[pos32],
    end = p + len;
  // Test the END of the string, not just its start: a string that begins in
  // the source region but runs past it would otherwise be truncated by
  // `sourceText.substr`. Such strings fall through to the buffer-based paths.
  if (end <= sourceEndPos && sourceIsAscii) return sourceText.substr(p, len);
  if (nonAsciiCum[end] === nonAsciiCum[p]) return bufferAsAscii.substr(p, len);
  return textDecoder.decode(uint8.subarray(p, end));
}
127 changes: 127 additions & 0 deletions versions/sparse-table.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/**
* Sparse UTF-8/UTF-16 translation table: extends sourceText.substr() to ALL
* source strings, not just those in the ASCII prefix.
*
* Builds on pr20834-fnap.mjs. Instead of finding the first non-ASCII byte and
* giving up after it, we scan the source region once in setup() and record the
* cumulative byte-vs-codeunit drift at each multi-byte character. A binary
* search at read time converts byte offsets to UTF-16 offsets in O(log k),
* where k is the number of non-ASCII characters (typically tiny for source).
*/

// oxlint-disable prefer-const

// Shared UTF-8 decoder (`ignoreBOM: true` keeps a leading U+FEFF in the
// output) and a pre-bound `decode`, so call sites can write `decodeStr(bytes)`.
const textDecoder = new TextDecoder("utf-8", { ignoreBOM: true }),
decodeStr = textDecoder.decode.bind(textDecoder);

// Local alias, used by deserializeStr() to append single ASCII characters.
const { fromCodePoint } = String;

// Sparse table: parallel arrays for cache-friendly binary search.
// All three are assigned in setup(); entries are in ascending byte-offset order
// because setup() appends them while scanning the source region front to back.
// tableOffsets[i] = byte offset just past the i-th multi-byte character.
// tableDrifts[i] = cumulative drift (utf8 bytes - utf16 code units) at that point.
let tableOffsets;
let tableDrifts;
// Number of entries in the two arrays above (0 when the scanned region is ASCII).
let tableLen;

/**
 * Builds the sparse byte-offset -> drift table for the source region
 * `uint8[0..sourceEndPos)`.
 *
 * Each byte at a scan position is classified as a lead byte by value:
 * >= 0xf0 starts a 4-byte sequence, >= 0xe0 a 3-byte one, >= 0xc0 a 2-byte
 * one, anything else advances by one byte. The scan assumes it starts on a
 * character boundary and that the region is well-formed UTF-8.
 *
 * For every multi-byte sequence one entry is appended:
 *  - `tableOffsets[k]`: byte offset just past the sequence;
 *  - `tableDrifts[k]`: running total of (UTF-8 bytes - UTF-16 code units).
 *    A 2-byte sequence adds 1; 3-byte (3 -> 1 unit) and 4-byte (4 -> 2 units,
 *    a surrogate pair) sequences add 2.
 * `tableLen` is set to the number of entries.
 */
export function setup() {
  // Byte width of the sequence introduced by `lead` (1 for everything < 0xc0).
  const widthOf = (lead) => (lead >= 0xf0 ? 4 : lead >= 0xe0 ? 3 : lead >= 0xc0 ? 2 : 1);

  // Pass 1: count multi-byte sequences so the typed arrays can be sized exactly.
  let multiByteCount = 0;
  for (let offset = 0; offset < sourceEndPos; ) {
    let width = widthOf(uint8[offset]);
    if (width !== 1) multiByteCount += 1;
    offset += width;
  }

  tableOffsets = new Uint32Array(multiByteCount);
  tableDrifts = new Uint32Array(multiByteCount);
  tableLen = multiByteCount;

  // Pass 2: fill the table with end offsets and cumulative drift.
  let cumulativeDrift = 0;
  let slot = 0;
  for (let offset = 0; offset < sourceEndPos; ) {
    let width = widthOf(uint8[offset]);
    offset += width;
    if (width === 1) continue;
    cumulativeDrift += width === 2 ? 1 : 2;
    tableOffsets[slot] = offset;
    tableDrifts[slot] = cumulativeDrift;
    slot += 1;
  }
}

/**
 * Binary search over `tableOffsets[0..tableLen)`.
 *
 * @param {number} target - Byte offset to look up.
 * @returns {number} The largest index whose offset is <= `target`, or -1 when
 *   every entry lies after `target` (or the table is empty).
 */
function findDrift(target) {
  // Half-open window [low, high): on exit `low` is the number of entries
  // whose offset is <= target.
  let low = 0,
    high = tableLen;
  while (low < high) {
    let probe = low + ((high - low) >>> 1);
    if (tableOffsets[probe] <= target) low = probe + 1;
    else high = probe;
  }
  return low - 1;
}

/**
 * Cumulative drift (UTF-8 bytes minus UTF-16 code units) recorded at or
 * before a byte offset.
 *
 * @param {number} bytePos - Byte offset to look up.
 * @returns {number} `tableDrifts` entry of the last table row whose offset is
 *   <= `bytePos`, or 0 when there is no such row.
 */
function driftAt(bytePos) {
  // Skip the search entirely when the table is empty.
  if (tableLen !== 0) {
    let row = findDrift(bytePos);
    if (row >= 0) return tableDrifts[row];
  }
  return 0;
}

/**
 * Reads the string described by the record at byte offset `pos`.
 *
 * The record is read through `uint32`: word `pos >> 2` holds the byte offset
 * of the string data, and word `(pos >> 2) + 2` holds its byte length.
 *
 * - Empty strings return "" immediately.
 * - Strings lying entirely inside the source region are sliced out of
 *   `sourceText`: directly when the source is ASCII, otherwise after
 *   converting the byte offset and byte length to UTF-16 units via driftAt().
 * - Anything else is decoded from the buffer: strings longer than 9 bytes go
 *   straight to the UTF-8 decoder; shorter ones are built character by
 *   character while the bytes are ASCII, handing the remainder to the decoder
 *   at the first non-ASCII byte.
 *
 * @param {number} pos - Byte offset of the string record.
 * @returns {string} The decoded string.
 */
export function deserializeStr(pos) {
  let recordWord = pos >> 2,
    byteLen = uint32[recordWord + 2];
  if (byteLen === 0) return "";

  let start = uint32[recordWord],
    end = start + byteLen;

  if (end <= sourceEndPos) {
    if (sourceIsAscii) return sourceText.substr(start, byteLen);

    // Subtract the drift accumulated before the string from its offset, and
    // the drift accumulated inside the string from its length.
    let driftBefore = driftAt(start),
      driftInside = driftAt(end) - driftBefore;
    return sourceText.substr(start - driftBefore, byteLen - driftInside);
  }

  // Outside the source region: decode from the raw bytes.
  if (byteLen > 9) return decodeStr(uint8.subarray(start, end));

  let out = "";
  for (let cursor = start; cursor < end; cursor++) {
    let byte = uint8[cursor];
    if (byte < 128) {
      out += fromCodePoint(byte);
      continue;
    }
    // First non-ASCII byte: let the decoder handle everything from here on.
    return out + decodeStr(uint8.subarray(cursor, end));
  }
  return out;
}