From 077b3ba026c233af4c3a882ecab2fa40c8bca9c3 Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Tue, 16 Jul 2024 18:24:49 +0200 Subject: [PATCH] refactor: char index (#5926) --- .../src/lib/TrieBlob/CharIndex.test.ts | 8 +- .../src/lib/TrieBlob/CharIndex.ts | 86 +++------ .../src/lib/TrieBlob/FastTrieBlob.ts | 2 +- .../src/lib/TrieBlob/FastTrieBlobBuilder.ts | 33 ---- .../src/lib/TrieBlob/FastTrieBlobInternals.ts | 7 +- .../src/lib/TrieBlob/TrieBlob.ts | 4 +- .../src/lib/TrieBlob/Utf8.perf.ts | 165 +++++++++++++++++- .../src/lib/TrieBlob/Utf8.test.ts | 134 ++++++++++++++ .../cspell-trie-lib/src/lib/TrieBlob/Utf8.ts | 103 +++++++++-- .../src/perf/charIndex.perf.ts | 9 +- vitest.config.mjs | 2 + 11 files changed, 431 insertions(+), 122 deletions(-) diff --git a/packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.test.ts b/packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.test.ts index 62a3ff5fc20..8b4419cf93b 100644 --- a/packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.test.ts +++ b/packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.test.ts @@ -9,15 +9,15 @@ describe('CharIndexBuilder', () => { const letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']; const indexes = letters.map((c) => charIndexBuilder.getUtf8Value(c)); expect(indexes).toEqual(letters.map((c) => c.codePointAt(0))); - const r = charIndexBuilder.wordToUtf8Seq('abcdefghij'); - expect(r).toEqual([...textEncoder.encode('abcdefghij')]); - expect(charIndexBuilder.size).toBe(11); // One extra for the empty string. + const r = charIndexBuilder.wordToUtf8Seq('abcdefghij⚁⚂⚃⚄⚀'); + expect(r).toEqual([...textEncoder.encode('abcdefghij⚁⚂⚃⚄⚀')]); + expect(charIndexBuilder.size).toBe(16); // One extra for the empty string. // Add the same letters again. expect(letters.map((c) => charIndexBuilder.getUtf8Value(c))).toEqual(letters.map((c) => c.codePointAt(0))); const charIndex = charIndexBuilder.build(); - expect(charIndex.size).toBe(11); + expect(charIndex.size).toBe(16); expect(charIndex.wordToUtf8Seq('abcdefghij')).toEqual([...textEncoder.encode('abcdefghij')]); }); }); diff --git a/packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.ts b/packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.ts index 8da6246277a..18cfe91013a 100644 --- a/packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.ts +++ b/packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.ts @@ -1,4 +1,4 @@ -import { encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js'; +import { encodeTextToUtf8, encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js'; export type Utf8Seq = Readonly; @@ -6,7 +6,7 @@ export type CharIndexMap = Record; export type RO_CharIndexMap = Readonly; -export type CharIndexSeqMap = Record; +export type CharIndexSeqMap = Record; export type RO_CharIndexSeqMap = Readonly; @@ -15,52 +15,29 @@ const emptySeq: Utf8Seq = [0]; Object.freeze(emptySeq); export class CharIndex { - readonly charToUtf8Map: RO_CharIndexMap; - readonly charToUtf8SeqMap: RO_CharIndexSeqMap; + #charToUtf8SeqMap: CharIndexSeqMap; #lastWord = ''; #lastWordSeq: Utf8Seq = []; + #multiByteChars: boolean; constructor(readonly charIndex: readonly string[]) { - this.charToUtf8Map = buildCharIndexMap(charIndex); - this.charToUtf8SeqMap = buildCharIndexSequenceMap(this.charToUtf8Map); - } - - getUtf8Value(c: string): number { - return this.charToUtf8Map[c] || 0; + this.#charToUtf8SeqMap = buildCharIndexSequenceMap(charIndex); + this.#multiByteChars = Object.values(this.#charToUtf8SeqMap).some((c) => c.length > 1); } getCharUtf8Seq(c: string): Utf8Seq { - const r = this.charToUtf8SeqMap[c] ?? emptySeq; - return typeof r === 'number' ? [r] : r; - } - - __wordToUtf8Seq(word: string): Utf8Seq { - // Note: Array.flatMap is very slow - const seq: number[] = new Array(word.length); - let i = 0; - for (const c of word) { - const cSep = this.charToUtf8SeqMap[c]; - if (typeof cSep === 'number') { - seq[i++] = cSep; - continue; - } - if (!cSep) { - seq[i++] = 0; - continue; - } - for (const cIdx of cSep) { - seq[i++] = cIdx; - } - } - if (seq.length !== i) seq.length = i; - return seq; + const found = this.#charToUtf8SeqMap[c]; + if (found) return found; + const s = encodeTextToUtf8(c); + this.#charToUtf8SeqMap[c] = s; + return s; } wordToUtf8Seq(word: string): Utf8Seq { if (this.#lastWord === word) return this.#lastWordSeq; - const seq = this.__wordToUtf8Seq(word); + const seq = encodeTextToUtf8(word); this.#lastWord = word; this.#lastWordSeq = seq; @@ -69,7 +46,7 @@ export class CharIndex { } indexContainsMultiByteChars(): boolean { - return Object.values(this.charToUtf8Map).some((v) => v >= 0x80); + return this.#multiByteChars; } get size(): number { @@ -81,22 +58,10 @@ export class CharIndex { } } -function buildCharIndexMap(charIndex: readonly string[]): CharIndexMap { - const map: CharIndexMap = Object.create(null); - for (const c of charIndex) { - const cn = c.normalize('NFC'); - const utf8 = encodeUtf8N_BE(cn.codePointAt(0) || 0); - map[c] = utf8; - map[c.normalize('NFC')] = utf8; - map[c.normalize('NFD')] = utf8; - } - return map; -} - -function buildCharIndexSequenceMap(charIndexMap: RO_CharIndexMap): CharIndexSeqMap { +function buildCharIndexSequenceMap(charIndex: readonly string[]): CharIndexSeqMap { const map: CharIndexSeqMap = Object.create(null); - for (const [key, value] of Object.entries(charIndexMap)) { - map[key] = splitUtf8IfNeeded(value); + for (const key of charIndex) { + map[key] = encodeTextToUtf8(key); } return map; } @@ -106,7 +71,7 @@ export class CharIndexBuilder { readonly charIndexMap: CharIndexMap = Object.create(null); readonly charIndexSeqMap: CharIndexSeqMap = Object.create(null); - readonly #mapIdxToSeq = new Map(); + readonly #mapIdxToSeq = new Map(); constructor() { this.getUtf8Value(''); @@ -126,24 +91,22 @@ export class CharIndexBuilder { return utf8; } - utf8ValueToUtf8Seq(idx: number): number[] | number { + utf8ValueToUtf8Seq(idx: number): number[] { const found = this.#mapIdxToSeq.get(idx); if (found !== undefined) { return found; } - const seq = splitUtf8IfNeeded(idx); + const seq = splitUtf8(idx); this.#mapIdxToSeq.set(idx, seq); return seq; } charToUtf8Seq(c: string): number[] { const idx = this.getUtf8Value(c); - const s = this.utf8ValueToUtf8Seq(idx); - return typeof s === 'number' ? [s] : s; + return this.utf8ValueToUtf8Seq(idx); } wordToUtf8Seq(word: string): number[] { - // word = word.normalize('NFC'); const seq: number[] = new Array(word.length); let i = 0; for (const c of word) { @@ -170,8 +133,9 @@ export class CharIndexBuilder { } } -function splitUtf8IfNeeded(utf8: number): number | number[] { - if (utf8 < 0x80) return utf8; - const s = [(utf8 >> 24) & 0xff, (utf8 >> 16) & 0xff, (utf8 >> 8) & 0xff, utf8 & 0xff].filter((v) => v); - return s.length ? s : s[0]; +function splitUtf8(utf8: number): number[] { + if (utf8 <= 0xff) return [utf8]; + if (utf8 <= 0xffff) return [(utf8 >> 8) & 0xff, utf8 & 0xff]; + if (utf8 <= 0xff_ffff) return [(utf8 >> 16) & 0xff, (utf8 >> 8) & 0xff, utf8 & 0xff]; + return [(utf8 >> 24) & 0xff, (utf8 >> 16) & 0xff, (utf8 >> 8) & 0xff, utf8 & 0xff].filter((v) => v); } diff --git a/packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts b/packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts index 7f834dd979a..a41170e4492 100644 --- a/packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts +++ b/packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts @@ -214,7 +214,7 @@ export class FastTrieBlob implements TrieData { static toITrieNodeRoot(trie: FastTrieBlob): ITrieNodeRoot { return new FastTrieBlobIRoot( - new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo, trie.sorted), + new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo), 0, trie.info, ); diff --git a/packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts b/packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts index 93d6a2f0338..abd9e8c6d58 100644 --- a/packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts +++ b/packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts @@ -111,7 +111,6 @@ export class FastTrieBlobBuilder implements TrieBuilder { for (let i = 0; i < utf8Seq.length; ++i) { insertCharIndexes(utf8Seq[i], pDepth); } - // dumpState({ step: 'insertChar', char }); }; /** @@ -174,8 +173,6 @@ export class FastTrieBlobBuilder implements TrieBuilder { const pos = s.pos; const node = nodes[nodeIdx]; node[pos] = (refNodeIdx << NodeChildRefShift) | (node[pos] & LetterMask); - - // dumpState({ step: 'reference', refId, refNodeIdx }); }; const backStep = (num: number) => { @@ -186,38 +183,8 @@ export class FastTrieBlobBuilder implements TrieBuilder { depth = stack[depth].pDepth; } nodeIdx = stack[depth + 1].nodeIdx; - - // dumpState({ step: 'backStep', num }); }; - // function dumpNode(node: number[]): string { - // const n = node - // .map((n, i) => { - // if (!i) return `w: ${(n & NodeMaskEOW && 1) || 0}`; - // return `{ c: ${(n & LetterMask).toString(16).padStart(2, '0')}, r: ${n >>> NodeChildRefShift} }`; - // }) - // .join(', '); - // return `[${n}]`; - // } - - // function dumpNodes(nodes: FastTrieBlobNode[]) { - // return nodes.map((n, i) => `${i}: ${dumpNode(n)}`); - // } - - // const debug = false; - - // function dumpState(extra?: Record) { - // debug && - // console.warn('%o', { - // stack: stack.slice(0, depth + 1), - // nodes: dumpNodes(nodes), - // nodeIdx, - // depth, - // refNodes, - // ...extra, - // }); - // } - const c: BuilderCursor = { insertChar, markEOW, diff --git a/packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobInternals.ts b/packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobInternals.ts index 4d0fe973b51..8768b249a19 100644 --- a/packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobInternals.ts +++ b/packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobInternals.ts @@ -12,14 +12,13 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo { readonly nodes: number[][], readonly charIndex: CharIndex, maskInfo: FastTrieBlobBitMaskInfo, - sorted = false, ) { const { NodeMaskEOW, NodeMaskChildCharIndex, NodeChildRefShift } = maskInfo; this.NodeMaskEOW = NodeMaskEOW; this.NodeMaskChildCharIndex = NodeMaskChildCharIndex; this.NodeChildRefShift = NodeChildRefShift; this.isIndexDecoderNeeded = charIndex.indexContainsMultiByteChars(); - !sorted && sortNodes(nodes, this.NodeMaskChildCharIndex); + sortNodes(nodes, this.NodeMaskChildCharIndex); } } @@ -30,6 +29,10 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo { * @returns */ export function sortNodes(nodes: number[][], mask: number): number[][] { + if (Object.isFrozen(nodes)) { + assertSorted(nodes, mask); + return nodes; + } for (let i = 0; i < nodes.length; ++i) { let node = nodes[i]; if (node.length > 2) { diff --git a/packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts b/packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts index ad383790ed3..faf911788f5 100644 --- a/packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts +++ b/packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts @@ -82,7 +82,7 @@ export class TrieBlob implements TrieData { this.#nonStrictIdx = this._lookupNode(0, this.info.stripCaseAndAccentsPrefix); } - public wordToNodeCharIndexSequence(word: string): Utf8Seq { + public wordToUtf8Seq(word: string): Utf8Seq { return this.charIndex.wordToUtf8Seq(word); } @@ -159,7 +159,7 @@ export class TrieBlob implements TrieData { const NodeChildRefShift = TrieBlob.NodeChildRefShift; const nodes = this.nodes; const nodes8 = this.#nodes8; - const wordIndexes = this.wordToNodeCharIndexSequence(word); + const wordIndexes = this.wordToUtf8Seq(word); const lookup = this.#nodeIdxLookup; const len = wordIndexes.length; let p = 0; diff --git a/packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.perf.ts b/packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.perf.ts index 5489996289f..b233f0f27cd 100644 --- a/packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.perf.ts +++ b/packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.perf.ts @@ -1,11 +1,24 @@ import { suite } from 'perf-insight'; -import { decodeUtf8ByteStream, decodeUtf8N_BE, decodeUtf8N_LE, encodeUtf8N_BE, encodeUtf8N_LE } from './Utf8.js'; +import { + decodeUtf8ByteStream, + decodeUtf8N_BE, + decodeUtf8N_LE, + encodeCodePointsToUtf8Into, + encodeTextToUtf8, + encodeTextToUtf8Into, + encodeUtf8N_BE, + encodeUtf8N_LE, + textToCodePoints, +} from './Utf8.js'; suite('Utf8 encode/decode', async (test) => { const iterations = 1000; const text = sampleText(); + const words = text.split(/\s+/).filter((a) => !!a); + const wordsCP = words.map((word) => [...word].map((char) => char.codePointAt(0) || 0)); const chars = [...text]; + const codePoints = chars.map((char) => char.codePointAt(0) || 0); const encoder = new TextEncoder(); const decoder = new TextDecoder(); const scratchBuffer = new Uint8Array(1024); @@ -29,6 +42,16 @@ suite('Utf8 encode/decode', async (test) => { } }); + test('TextEncoder.encodeInto by char', () => { + const buffer = new Uint8Array(scratchBuffer.buffer, 0, 4); + for (let i = iterations; i > 0; --i) { + for (const char of chars) { + buffer[0] = 0; + encoder.encodeInto(char, buffer); + } + } + }); + test('encodeUtf8N_BE', () => { for (let i = iterations; i > 0; --i) { for (const char of chars) { @@ -50,6 +73,146 @@ suite('Utf8 encode/decode', async (test) => { } } }); + + test('TextEncoder.encodeInto text', () => { + const buffer = scratchBuffer; + const _text = text; + for (let i = iterations; i > 0; --i) { + encoder.encodeInto(_text, buffer); + } + }); + + test('Buffer.write text', () => { + const buffer = Buffer.from(scratchBuffer.buffer); + // const _text = text; + for (let i = iterations; i > 0; --i) { + buffer.write(text, 'utf16le'); + } + }); + + test('encodeCodePointsInto', () => { + const buffer = scratchBuffer; + const points = codePoints; + for (let i = iterations; i > 0; --i) { + encodeCodePointsToUtf8Into(points, buffer); + } + }); + + test(`TextEncoder.encodeInto words (${words.length})`, () => { + const buffer = scratchBuffer; + const _words = words; + for (let i = iterations; i > 0; --i) { + for (const word of _words) { + encoder.encodeInto(word, buffer); + } + } + }); + + test(`encodeCodePointsInto wordsCP (${words.length})`, () => { + const buffer = scratchBuffer; + const words = wordsCP; + for (let i = iterations; i > 0; --i) { + for (const points of words) { + encodeCodePointsToUtf8Into(points, buffer); + } + } + }); + + test(`encodeCodePointsInto Array wordsCP (${words.length})`, () => { + const buffer = new Array(100); + const words = wordsCP; + for (let i = iterations; i > 0; --i) { + for (const points of words) { + encodeCodePointsToUtf8Into(points, buffer); + } + } + }); + + test(`encodeCodePointsInto wordsCP .codePointAt (${words.length})`, () => { + const buffer = scratchBuffer; + const _words = words; + for (let i = iterations; i > 0; --i) { + for (const word of _words) { + encodeCodePointsToUtf8Into( + [...word].map((a) => a.codePointAt(0) || 0), + buffer, + ); + } + } + }); + + test(`encodeTextToUtf8Into Uint8Array words (${words.length})`, () => { + const buffer = scratchBuffer; + const _words = words; + for (let i = iterations; i > 0; --i) { + for (const word of _words) { + encodeTextToUtf8Into(word, buffer); + } + } + }); + + test(`encodeTextToUtf8Into array words (${words.length})`, () => { + const buffer = new Array(100); + const _words = words; + for (let i = iterations; i > 0; --i) { + for (const word of _words) { + encodeTextToUtf8Into(word, buffer); + } + } + }); + + test(`encoder.encode(word) to array words (${words.length})`, () => { + const _words = words; + for (let i = iterations; i > 0; --i) { + for (const word of _words) { + [...encoder.encode(word)]; + } + } + }); + + test(`encodeTextToUtf8 array words (${words.length})`, () => { + const _words = words; + for (let i = iterations; i > 0; --i) { + for (const word of _words) { + encodeTextToUtf8(word); + } + } + }); + + const charToUtf8Map = new Map( + [...new Set([...sampleText()])].map((char) => [char, encodeTextToUtf8(char)] as const), + ); + + test(`encodeTextToUtf8 to array with lookup (${words.length})`, () => { + const _words = words; + for (let i = iterations; i > 0; --i) { + for (const word of _words) { + const a: number[] = new Array(word.length * 2); + let i = 0; + for (const c of word) { + const u8 = charToUtf8Map.get(c); + for (const u of u8 || []) { + a[i++] = u; + } + } + a.length = i; + } + } + }); + + test('textToCodePoints', () => { + const _text = text; + for (let i = iterations; i > 0; --i) { + textToCodePoints(_text); + } + }); + + test('textToCodePoints map', () => { + const _text = text; + for (let i = iterations; i > 0; --i) { + [..._text].map((a) => a.codePointAt(0) || 0); + } + }); }); suite('Utf8 decode buffer', async (test) => { diff --git a/packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.test.ts b/packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.test.ts index 9ef7d11ab7f..424989fa5c8 100644 --- a/packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.test.ts +++ b/packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.test.ts @@ -6,9 +6,12 @@ import { decodeUtf8ByteStream, decodeUtf8N_BE, decodeUtf8N_LE, + encodeCodePointsToUtf8Into, + encodeTextToUtf8, encodeUtf8N_BE, encodeUtf8N_LE, hex32, + textToCodePoints, Utf8Accumulator, } from './Utf8.js'; @@ -43,6 +46,77 @@ describe('Utf8 lib', () => { } }); + test.each` + text | expected + ${'a'} | ${[0x61]} + ${'ab'} | ${[0x61, 0x62]} + ${'é'} | ${[195, 169]} + ${'🇺🇸'} | ${[240, 159, 135, 186, 240, 159, 135, 184]} + `('encodeTextToUtf8 $text', ({ text, expected }) => { + expect(encodeTextToUtf8(text)).toEqual(expected); + expect(encodeTextToUtf8(text)).toEqual([...encoder.encode(text)]); + + const scratch: number[] = []; + const len = encodeCodePointsToUtf8Into(textToCodePoints(text), scratch); + expect(scratch.slice(0, len)).toEqual(expected); + }); + + test('encodeCodePointsToUtf8Into', () => { + const decoder = new TextDecoder(); + const text = sampleText(); + const scratch: number[] = []; + const len = encodeCodePointsToUtf8Into(textToCodePoints(text), scratch); + const buf = new Uint8Array(scratch.slice(0, len)); + expect(decoder.decode(buf)).toBe(text); + }); + + test.each` + text | expected + ${'a'} | ${[0x61]} + ${'ab'} | ${[0x61, 0x62]} + ${'é'} | ${[0xc3a9]} + ${'🇺🇸'} | ${[0xf09f_87ba, 0xf09f_87b8]} + `('encodeUtf8N_BE $text', ({ text, expected }) => { + const utf = textToCodePoints(text).map((cp) => encodeUtf8N_BE(cp)); + expect(utf).toEqual(expected); + expect( + String.fromCodePoint( + ...utf + .map((v) => v ^ ~1) // force it to be native + .map((v) => v ^ ~1) + .map((c) => decodeUtf8N_BE(c)), + ), + ).toEqual(text); + }); + + test('decodeUtf8N_BE invalid', () => { + expect(decodeUtf8N_BE(0xff)).toBe(0xfffd); + }); + + test('decodeUtf8N_LE invalid', () => { + expect(decodeUtf8N_LE(0xff)).toBe(0xfffd); + }); + + test.each` + text | expected + ${'a'} | ${[0x61]} + ${'ab'} | ${[0x61, 0x62]} + ${'é'} | ${[0xa9c3]} + ${'ë'} | ${[0xabc3]} + ${'🇺🇸'} | ${[0xba87_9ff0, 0xb887_9ff0]} + `('encodeUtf8N_LE $text', ({ text, expected }) => { + const utf = textToCodePoints(text).map((cp) => encodeUtf8N_LE(cp)); + expect(utf).toEqual(expected); + expect( + String.fromCodePoint( + ...utf + .map((v) => v ^ ~1) // force it to be native + .map((v) => v ^ ~1) + .map((c) => decodeUtf8N_LE(c)), + ), + ).toEqual(text); + }); + test.each` value | expected ${0xff} | ${'0x0000_00ff'} @@ -78,6 +152,66 @@ describe('Utf8Accumulator', () => { const data = encoder.encode(text); expect([...decodeUtf8ByteStream(data)]).toEqual([...text].map((c) => c.codePointAt(0))); + + function* gen() { + yield* data; + } + expect([...decodeUtf8ByteStream(gen())]).toEqual([...text].map((c) => c.codePointAt(0))); + }); + + test('encodeTextToUtf8', () => { + const text = sampleText(); + expect(encodeTextToUtf8(text)).toEqual([...encoder.encode(text)]); + }); + + test('decodeUtf8ByteStream', () => { + const text = sampleText(); + expect(String.fromCodePoint(...decodeUtf8ByteStream(encoder.encode(text)))).toBe(text); + }); + + test('Utf8Accumulator isMultiByte', () => { + expect(Utf8Accumulator.isMultiByte(0x7f)).toBe(false); + expect(Utf8Accumulator.isMultiByte(0xf0)).toBe(true); + expect(Utf8Accumulator.isSingleByte(0x7f)).toBe(true); + expect(Utf8Accumulator.isSingleByte(0xf0)).toBe(false); + }); + + test('Utf8Accumulator', () => { + const acc = Utf8Accumulator.create(); + + expect(acc.decode(0x61)).toBe(0x61); + expect(acc.decode(0x61)).toBe(0x61); + + // é + expect(acc.decode(0xc3)).toBe(undefined); + const cloneAcc = acc.clone(); + expect(acc.decode(0xa9)).toBe('é'.codePointAt(0)); + expect(acc.decode(0x61)).toBe(0x61); + // ë + expect(cloneAcc.decode(0xab)).toBe('ë'.codePointAt(0)); + + // out of order + expect(acc.decode(0xa9)).toBe(0xfffd); + expect(acc.decode(0xc3)).toBe(undefined); + acc.reset(); + + // two leads in a row + expect(acc.decode(0xc3)).toBe(undefined); + expect(acc.decode(0xc3)).toBe(0xfffd); + expect(acc.decode(0xa9)).toBe(0xfffd); + + // two leads in a row + expect(acc.decode(0xc3)).toBe(undefined); + acc.reset(); + expect(acc.decode(0xc3)).toBe(undefined); + expect(acc.decode(0xa9)).toBe('é'.codePointAt(0)); + }); +}); + +describe('textToCodePoints', () => { + test('textToCodePoints', () => { + const text = sampleText(); + expect(textToCodePoints(text)).toEqual([...text].map((c) => c.codePointAt(0))); }); }); diff --git a/packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.ts b/packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.ts index cbe5441cd34..43bad3e9621 100644 --- a/packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.ts +++ b/packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.ts @@ -1,3 +1,4 @@ +/* eslint-disable unicorn/prefer-code-point */ /** A utf8 value represented as big endian 32bit number */ export type Utf8BE32 = number; @@ -102,20 +103,6 @@ export function decodeUtf8N_LE(utf8: Utf8LE32): CodePoint { return 0xfffd; } -export function writeUtf8NtoBuffer(utf8: Utf8BE32, buffer: Uint8Array, offset: number): number { - const b0 = (utf8 >> 24) & 0xff; - const b1 = (utf8 >> 16) & 0xff; - const b2 = (utf8 >> 8) & 0xff; - const b3 = utf8 & 0xff; - - let i = 0; - b0 && (buffer[offset + i++] = b0); - b1 && (buffer[offset + i++] = b1); - b2 && (buffer[offset + i++] = b2); - buffer[offset + i++] = b3; - return i; -} - export class Utf8Accumulator { remaining = 0; value = 0; @@ -210,6 +197,94 @@ function* _decodeUtf8ByteStream(bytes: Iterable): Iterable { } } +export function encodeUtf8into(code: CodePoint, into: Array | Uint8Array, offset = 0): number { + if (code < 0x80) { + into[offset] = code; + return 1; + } + if (code < 0x800) { + const u = 0xc080 | ((code & 0x7c0) << 2) | (code & 0x3f); + into[offset] = u >>> 8; + into[offset + 1] = u & 0xff; + return 2; + } + if (code < 0x1_0000) { + const u = 0xe0_8080 | ((code & 0xf000) << 4) | ((code & 0x0fc0) << 2) | (code & 0x3f); + into[offset] = u >>> 16; + into[offset + 1] = (u >>> 8) & 0xff; + into[offset + 2] = u & 0xff; + return 3; + } + const u = + 0xf080_8080 | (((code & 0x1c_0000) << 6) | ((code & 0x03_f000) << 4) | ((code & 0x0fc0) << 2) | (code & 0x3f)); + into[offset] = (u >>> 24) & 0x0ff; + into[offset + 1] = (u >>> 16) & 0xff; + into[offset + 2] = (u >>> 8) & 0xff; + into[offset + 3] = u & 0xff; + return 4; +} + +export function encodeTextToUtf8Into(text: string, into: Array | Uint8Array, offset = 0): number { + let i = offset; + const len = text.length; + for (let j = 0; j < len; j++) { + let code = text.charCodeAt(j); + code = (code & 0xf800) === 0xd800 ? text.codePointAt(j++) || 0 : code; + if (code < 0x80) { + into[i++] = code; + continue; + } + if (code < 0x800) { + const u = 0xc080 | ((code & 0x7c0) << 2) | (code & 0x3f); + into[i++] = u >>> 8; + into[i++] = u & 0xff; + continue; + } + if (code < 0x1_0000) { + const u = 0xe0_8080 | ((code & 0xf000) << 4) | ((code & 0x0fc0) << 2) | (code & 0x3f); + into[i++] = u >>> 16; + into[i++] = (u >>> 8) & 0xff; + into[i++] = u & 0xff; + continue; + } + const u = + 0xf080_8080 | + (((code & 0x1c_0000) << 6) | ((code & 0x03_f000) << 4) | ((code & 0x0fc0) << 2) | (code & 0x3f)); + into[i++] = (u >>> 24) & 0x0ff; + into[i++] = (u >>> 16) & 0xff; + into[i++] = (u >>> 8) & 0xff; + into[i++] = u & 0xff; + } + return i - offset; +} + +export function encodeTextToUtf8(text: string): number[] { + const array = new Array(text.length); + const len = encodeTextToUtf8Into(text, array); + array.length !== len && (array.length = len); + return array; +} + +export function textToCodePoints(text: string): CodePoint[] { + const codePoints: CodePoint[] = new Array(text.length); + const len = text.length; + let j = 0; + for (let i = 0; i < len; i++) { + const code = text.charCodeAt(i); + codePoints[j++] = (code & 0xf800) === 0xd800 ? text.codePointAt(i++) || 0 : code; + } + codePoints.length = j; + return codePoints; +} + +export function encodeCodePointsToUtf8Into(data: CodePoint[], into: Array | Uint8Array, offset = 0): number { + let i = offset; + for (const code of data) { + i += encodeUtf8into(code, into, i); + } + return i - offset; +} + export function hex32(n: number): string { if (n < 0) n = 0x1_0000_0000 + n; const s = '0x' + n.toString(16).padStart(8, '0'); diff --git a/packages/cspell-trie-lib/src/perf/charIndex.perf.ts b/packages/cspell-trie-lib/src/perf/charIndex.perf.ts index 0d99d59dac9..c82f6644e43 100644 --- a/packages/cspell-trie-lib/src/perf/charIndex.perf.ts +++ b/packages/cspell-trie-lib/src/perf/charIndex.perf.ts @@ -1,5 +1,6 @@ import { suite } from 'perf-insight'; +import { encodeTextToUtf8 } from '../lib/TrieBlob/Utf8.js'; import { readFastTrieBlobFromConfig, readTrieFromConfig } from '../test/dictionaries.test.helper.js'; // const measureTimeout = 100; @@ -24,14 +25,14 @@ suite('encode to sequence', async (test) => { test('trieBlob.wordToNodeCharIndexSequence' + msgSuffix, () => { for (const word of words) { - trieBlob.wordToNodeCharIndexSequence(word); + trieBlob.wordToUtf8Seq(word); } }); test('trieBlob.wordToNodeCharIndexSequence x4' + msgSuffix, () => { for (const word of words) { for (let i = 0; i < 4; ++i) { - trieBlob.wordToNodeCharIndexSequence(word); + trieBlob.wordToUtf8Seq(word); } } }); @@ -42,9 +43,9 @@ suite('encode to sequence', async (test) => { } }); - test('charIndex.__wordToCharIndexSequence' + msgSuffix, () => { + test('encodeTextToUtf8' + msgSuffix, () => { for (const word of words) { - charIndex.__wordToUtf8Seq(word); + encodeTextToUtf8(word); } }); diff --git a/vitest.config.mjs b/vitest.config.mjs index f5b1ead25ce..1c7f31f06fa 100644 --- a/vitest.config.mjs +++ b/vitest.config.mjs @@ -36,8 +36,10 @@ const defaultConfig = { '**/*.d.mts', '**/*.d.ts', '**/*.test.*', + '**/*.config.*', '**/fixtures/**', '**/perf/**', + '**/*.perf.*', '**/samples/**', '**/test*/**', '**/test.*',