Skip to content

Commit

Permalink
refactor: char index (#5926)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jason3S authored Jul 16, 2024
1 parent 9986720 commit 077b3ba
Show file tree
Hide file tree
Showing 11 changed files with 431 additions and 122 deletions.
8 changes: 4 additions & 4 deletions packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@ describe('CharIndexBuilder', () => {
const letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'];
const indexes = letters.map((c) => charIndexBuilder.getUtf8Value(c));
expect(indexes).toEqual(letters.map((c) => c.codePointAt(0)));
const r = charIndexBuilder.wordToUtf8Seq('abcdefghij');
expect(r).toEqual([...textEncoder.encode('abcdefghij')]);
expect(charIndexBuilder.size).toBe(11); // One extra for the empty string.
const r = charIndexBuilder.wordToUtf8Seq('abcdefghij⚁⚂⚃⚄⚀');
expect(r).toEqual([...textEncoder.encode('abcdefghij⚁⚂⚃⚄⚀')]);
expect(charIndexBuilder.size).toBe(16); // One extra for the empty string.

// Add the same letters again.
expect(letters.map((c) => charIndexBuilder.getUtf8Value(c))).toEqual(letters.map((c) => c.codePointAt(0)));

const charIndex = charIndexBuilder.build();
expect(charIndex.size).toBe(11);
expect(charIndex.size).toBe(16);
expect(charIndex.wordToUtf8Seq('abcdefghij')).toEqual([...textEncoder.encode('abcdefghij')]);
});
});
86 changes: 25 additions & 61 deletions packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import { encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js';
import { encodeTextToUtf8, encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js';

export type Utf8Seq = Readonly<number[]>;

export type CharIndexMap = Record<string, Utf8BE32>;

export type RO_CharIndexMap = Readonly<CharIndexMap>;

export type CharIndexSeqMap = Record<string, Utf8Seq | number>;
export type CharIndexSeqMap = Record<string, Utf8Seq>;

export type RO_CharIndexSeqMap = Readonly<CharIndexSeqMap>;

Expand All @@ -15,52 +15,29 @@ const emptySeq: Utf8Seq = [0];
Object.freeze(emptySeq);

export class CharIndex {
readonly charToUtf8Map: RO_CharIndexMap;
readonly charToUtf8SeqMap: RO_CharIndexSeqMap;
#charToUtf8SeqMap: CharIndexSeqMap;

#lastWord = '';
#lastWordSeq: Utf8Seq = [];
#multiByteChars: boolean;

constructor(readonly charIndex: readonly string[]) {
this.charToUtf8Map = buildCharIndexMap(charIndex);
this.charToUtf8SeqMap = buildCharIndexSequenceMap(this.charToUtf8Map);
}

getUtf8Value(c: string): number {
return this.charToUtf8Map[c] || 0;
this.#charToUtf8SeqMap = buildCharIndexSequenceMap(charIndex);
this.#multiByteChars = Object.values(this.#charToUtf8SeqMap).some((c) => c.length > 1);
}

getCharUtf8Seq(c: string): Utf8Seq {
const r = this.charToUtf8SeqMap[c] ?? emptySeq;
return typeof r === 'number' ? [r] : r;
}

__wordToUtf8Seq(word: string): Utf8Seq {
// Note: Array.flatMap is very slow
const seq: number[] = new Array(word.length);
let i = 0;
for (const c of word) {
const cSep = this.charToUtf8SeqMap[c];
if (typeof cSep === 'number') {
seq[i++] = cSep;
continue;
}
if (!cSep) {
seq[i++] = 0;
continue;
}
for (const cIdx of cSep) {
seq[i++] = cIdx;
}
}
if (seq.length !== i) seq.length = i;
return seq;
const found = this.#charToUtf8SeqMap[c];
if (found) return found;
const s = encodeTextToUtf8(c);
this.#charToUtf8SeqMap[c] = s;
return s;
}

wordToUtf8Seq(word: string): Utf8Seq {
if (this.#lastWord === word) return this.#lastWordSeq;

const seq = this.__wordToUtf8Seq(word);
const seq = encodeTextToUtf8(word);

this.#lastWord = word;
this.#lastWordSeq = seq;
Expand All @@ -69,7 +46,7 @@ export class CharIndex {
}

indexContainsMultiByteChars(): boolean {
return Object.values(this.charToUtf8Map).some((v) => v >= 0x80);
return this.#multiByteChars;
}

get size(): number {
Expand All @@ -81,22 +58,10 @@ export class CharIndex {
}
}

/**
 * Build a lookup from each character to its packed UTF-8 value.
 *
 * For every entry in `charIndex`, the value is produced by `encodeUtf8N_BE` from the
 * first code point of the entry's NFC form (0 when the entry has no code point, e.g.
 * the empty string). The same value is registered under three keys, in this order:
 * the entry as given, its NFC form, and its NFD form.
 *
 * @param charIndex - the characters to index.
 * @returns a prototype-less map of character -> packed UTF-8 value.
 */
function buildCharIndexMap(charIndex: readonly string[]): CharIndexMap {
    // No prototype, so arbitrary characters can be used as keys safely.
    const result: CharIndexMap = Object.create(null);
    for (const ch of charIndex) {
        const composed = ch.normalize('NFC');
        const decomposed = ch.normalize('NFD');
        const packed = encodeUtf8N_BE(composed.codePointAt(0) || 0);
        // Register all spellings of the character so lookups succeed regardless of normalization.
        for (const key of [ch, composed, decomposed]) {
            result[key] = packed;
        }
    }
    return result;
}

function buildCharIndexSequenceMap(charIndexMap: RO_CharIndexMap): CharIndexSeqMap {
function buildCharIndexSequenceMap(charIndex: readonly string[]): CharIndexSeqMap {
const map: CharIndexSeqMap = Object.create(null);
for (const [key, value] of Object.entries(charIndexMap)) {
map[key] = splitUtf8IfNeeded(value);
for (const key of charIndex) {
map[key] = encodeTextToUtf8(key);
}
return map;
}
Expand All @@ -106,7 +71,7 @@ export class CharIndexBuilder {
readonly charIndexMap: CharIndexMap = Object.create(null);
readonly charIndexSeqMap: CharIndexSeqMap = Object.create(null);

readonly #mapIdxToSeq = new Map<number, number[] | number>();
readonly #mapIdxToSeq = new Map<number, number[]>();

constructor() {
this.getUtf8Value('');
Expand All @@ -126,24 +91,22 @@ export class CharIndexBuilder {
return utf8;
}

utf8ValueToUtf8Seq(idx: number): number[] | number {
utf8ValueToUtf8Seq(idx: number): number[] {
const found = this.#mapIdxToSeq.get(idx);
if (found !== undefined) {
return found;
}
const seq = splitUtf8IfNeeded(idx);
const seq = splitUtf8(idx);
this.#mapIdxToSeq.set(idx, seq);
return seq;
}

charToUtf8Seq(c: string): number[] {
const idx = this.getUtf8Value(c);
const s = this.utf8ValueToUtf8Seq(idx);
return typeof s === 'number' ? [s] : s;
return this.utf8ValueToUtf8Seq(idx);
}

wordToUtf8Seq(word: string): number[] {
// word = word.normalize('NFC');
const seq: number[] = new Array(word.length);
let i = 0;
for (const c of word) {
Expand All @@ -170,8 +133,9 @@ export class CharIndexBuilder {
}
}

function splitUtf8IfNeeded(utf8: number): number | number[] {
if (utf8 < 0x80) return utf8;
const s = [(utf8 >> 24) & 0xff, (utf8 >> 16) & 0xff, (utf8 >> 8) & 0xff, utf8 & 0xff].filter((v) => v);
return s.length ? s : s[0];
/**
 * Split a packed, big-endian UTF-8 value into its individual bytes.
 *
 * The value holds 1 to 4 bytes, most significant byte first
 * (e.g. `0xe29a81` -> `[0xe2, 0x9a, 0x81]`).
 *
 * @param utf8 - the packed value; it may be a signed 32-bit integer, as produced by
 *   JavaScript bitwise operators when the top bit is set.
 * @returns the bytes of the sequence, most significant first. Values up to 0xff
 *   (including 0) yield a single-element array.
 */
function splitUtf8(utf8: number): number[] {
    // Bitwise operators yield signed 32-bit results, so a packed 4-byte sequence can arrive
    // as a negative number. Reinterpret it as unsigned; otherwise the range checks below
    // would treat it as a single "byte" and return the negative value unchanged.
    const packed = utf8 >>> 0;
    if (packed <= 0xff) return [packed];
    if (packed <= 0xffff) return [(packed >> 8) & 0xff, packed & 0xff];
    if (packed <= 0xff_ffff) return [(packed >> 16) & 0xff, (packed >> 8) & 0xff, packed & 0xff];
    // Kept from the original: zero bytes are dropped in the 4-byte case. Well-formed multi-byte
    // UTF-8 contains no zero bytes, so this is a no-op for valid input.
    return [(packed >> 24) & 0xff, (packed >> 16) & 0xff, (packed >> 8) & 0xff, packed & 0xff].filter((v) => v);
}
2 changes: 1 addition & 1 deletion packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ export class FastTrieBlob implements TrieData {

static toITrieNodeRoot(trie: FastTrieBlob): ITrieNodeRoot {
return new FastTrieBlobIRoot(
new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo, trie.sorted),
new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo),
0,
trie.info,
);
Expand Down
33 changes: 0 additions & 33 deletions packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
for (let i = 0; i < utf8Seq.length; ++i) {
insertCharIndexes(utf8Seq[i], pDepth);
}
// dumpState({ step: 'insertChar', char });
};

/**
Expand Down Expand Up @@ -174,8 +173,6 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
const pos = s.pos;
const node = nodes[nodeIdx];
node[pos] = (refNodeIdx << NodeChildRefShift) | (node[pos] & LetterMask);

// dumpState({ step: 'reference', refId, refNodeIdx });
};

const backStep = (num: number) => {
Expand All @@ -186,38 +183,8 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
depth = stack[depth].pDepth;
}
nodeIdx = stack[depth + 1].nodeIdx;

// dumpState({ step: 'backStep', num });
};

// function dumpNode(node: number[]): string {
// const n = node
// .map((n, i) => {
// if (!i) return `w: ${(n & NodeMaskEOW && 1) || 0}`;
// return `{ c: ${(n & LetterMask).toString(16).padStart(2, '0')}, r: ${n >>> NodeChildRefShift} }`;
// })
// .join(', ');
// return `[${n}]`;
// }

// function dumpNodes(nodes: FastTrieBlobNode[]) {
// return nodes.map((n, i) => `${i}: ${dumpNode(n)}`);
// }

// const debug = false;

// function dumpState(extra?: Record<string, unknown>) {
// debug &&
// console.warn('%o', {
// stack: stack.slice(0, depth + 1),
// nodes: dumpNodes(nodes),
// nodeIdx,
// depth,
// refNodes,
// ...extra,
// });
// }

const c: BuilderCursor = {
insertChar,
markEOW,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,13 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo {
readonly nodes: number[][],
readonly charIndex: CharIndex,
maskInfo: FastTrieBlobBitMaskInfo,
sorted = false,
) {
const { NodeMaskEOW, NodeMaskChildCharIndex, NodeChildRefShift } = maskInfo;
this.NodeMaskEOW = NodeMaskEOW;
this.NodeMaskChildCharIndex = NodeMaskChildCharIndex;
this.NodeChildRefShift = NodeChildRefShift;
this.isIndexDecoderNeeded = charIndex.indexContainsMultiByteChars();
!sorted && sortNodes(nodes, this.NodeMaskChildCharIndex);
sortNodes(nodes, this.NodeMaskChildCharIndex);
}
}

Expand All @@ -30,6 +29,10 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo {
* @returns
*/
export function sortNodes(nodes: number[][], mask: number): number[][] {
if (Object.isFrozen(nodes)) {
assertSorted(nodes, mask);
return nodes;
}
for (let i = 0; i < nodes.length; ++i) {
let node = nodes[i];
if (node.length > 2) {
Expand Down
4 changes: 2 additions & 2 deletions packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ export class TrieBlob implements TrieData {
this.#nonStrictIdx = this._lookupNode(0, this.info.stripCaseAndAccentsPrefix);
}

public wordToNodeCharIndexSequence(word: string): Utf8Seq {
public wordToUtf8Seq(word: string): Utf8Seq {
return this.charIndex.wordToUtf8Seq(word);
}

Expand Down Expand Up @@ -159,7 +159,7 @@ export class TrieBlob implements TrieData {
const NodeChildRefShift = TrieBlob.NodeChildRefShift;
const nodes = this.nodes;
const nodes8 = this.#nodes8;
const wordIndexes = this.wordToNodeCharIndexSequence(word);
const wordIndexes = this.wordToUtf8Seq(word);
const lookup = this.#nodeIdxLookup;
const len = wordIndexes.length;
let p = 0;
Expand Down
Loading

0 comments on commit 077b3ba

Please sign in to comment.