From cb9f648cb587c6c0553fceaf0dc57d90ac34321c Mon Sep 17 00:00:00 2001 From: "Marc J. Schmidt" Date: Tue, 13 Feb 2024 21:11:34 +0100 Subject: [PATCH] fix(bson): fix surrogate pair decoding Also, remove various custom UTF-8 encoders/decoders and rely more on TextDecoder. --- packages/bson/index.ts | 2 +- packages/bson/src/bson-deserializer.ts | 15 ++- packages/bson/src/bson-parser.ts | 92 ++----------------- packages/bson/src/bson-serializer.ts | 33 +++++++ packages/bson/src/strings.ts | 74 +-------------- packages/bson/tests/bson-parser.spec.ts | 19 +--- packages/bson/tests/bson-serialize.spec.ts | 55 ++++++++++- .../src/stopwatch-encoding.ts | 22 ++++- 8 files changed, 131 insertions(+), 181 deletions(-) diff --git a/packages/bson/index.ts b/packages/bson/index.ts index d839ebda6..a4a839a87 100644 --- a/packages/bson/index.ts +++ b/packages/bson/index.ts @@ -10,6 +10,6 @@ export * from './src/model.js'; export * from './src/bson-parser.js'; -export { ParserV2 as Parser } from './src/bson-parser.js'; +export { BaseParser } from './src/bson-parser.js'; export * from './src/bson-deserializer.js'; export * from './src/bson-serializer.js'; diff --git a/packages/bson/src/bson-deserializer.ts b/packages/bson/src/bson-deserializer.ts index f19f13032..81dff9154 100644 --- a/packages/bson/src/bson-deserializer.ts +++ b/packages/bson/src/bson-deserializer.ts @@ -1,8 +1,17 @@ -import { executeTemplates, getTypeJitContainer, JitStack, NamingStrategy, ReceiveType, resolveReceiveType, TemplateState, Type } from '@deepkit/type'; +import { + executeTemplates, + getTypeJitContainer, + JitStack, + NamingStrategy, + ReceiveType, + resolveReceiveType, + TemplateState, + Type, +} from '@deepkit/type'; import { CompilerContext, toFastProperties } from '@deepkit/core'; import { seekElementSize } from './continuation.js'; import { BSONBinarySerializer, bsonBinarySerializer } from './bson-serializer.js'; -import { ParserV2 } from './bson-parser.js'; +import { BaseParser } from 
'./bson-parser.js'; function createBSONDeserializer(type: Type, serializer: BSONBinarySerializer, namingStrategy: NamingStrategy = new NamingStrategy(), path: string = '', jitStack: JitStack = new JitStack()) { const compiler = new CompilerContext(); @@ -32,7 +41,7 @@ export function getBSONDeserializer(serializer: BSONBinarySerializer = bsonBi const deserializer = createBSONDeserializer(type, bsonBinarySerializer); jit[serializer.deserializeId] = function (bson: Uint8Array, offset: number = 0) { - const parser = new ParserV2(bson, offset); + const parser = new BaseParser(bson, offset); return deserializer(parser); }; toFastProperties(jit); diff --git a/packages/bson/src/bson-parser.ts b/packages/bson/src/bson-parser.ts index 07afd522c..108d6bbc9 100644 --- a/packages/bson/src/bson-parser.ts +++ b/packages/bson/src/bson-parser.ts @@ -8,8 +8,14 @@ * You should have received a copy of the MIT License along with this program. */ -import { BSON_BINARY_SUBTYPE_BYTE_ARRAY, BSON_BINARY_SUBTYPE_UUID, BSONType, digitByteSize, TWO_PWR_32_DBL_N } from './utils.js'; -import { buildStringDecoder, decodeUTF8 } from './strings.js'; +import { + BSON_BINARY_SUBTYPE_BYTE_ARRAY, + BSON_BINARY_SUBTYPE_UUID, + BSONType, + digitByteSize, + TWO_PWR_32_DBL_N, +} from './utils.js'; +import { decodeUTF8 } from './strings.js'; import { nodeBufferToArrayBuffer, ReflectionKind, SerializationError, Type } from '@deepkit/type'; import { hexTable } from './model.js'; @@ -318,86 +324,6 @@ export class BaseParser { } } -const stringParser = buildStringDecoder(32); - -/** - * This is a general purpose Parser assuming ascii names as property names. - * It falls back automatically to UTF8 when a UTF8 byte was found. - * This is way faster than BaseParser when property names are mainly ascii (which is usually the case). 
- */ -export class ParserV2 extends BaseParser { - - eatObjectPropertyName() { - let end = this.offset; - let simple = true; - let string = ''; - while (this.buffer[end] !== 0) { - if (this.buffer[end] > 127) { - simple = false; - } - if (simple) { - string += String.fromCharCode(this.buffer[end]); - } - end++; - } - - if (simple) { - //do simple ascii - this.offset = end + 1; - return string; - } - - const s = stringParser(this.buffer, this.offset, end); - this.offset = end + 1; - - return s; - } - - eatString(size: number): string { - // const s = stringParser(this.buffer, this.offset, this.offset + size); - let s = ''; - if (size > 64 && 'undefined' !== typeof Buffer && 'function' === typeof Buffer.from) { - s = Buffer.from(this.buffer.buffer, this.buffer.byteOffset + this.offset, size - 1).toString('utf8'); - } else { - s = stringParser(this.buffer, this.offset, this.offset + size); - } - this.offset += size; - return s; - } -} - -const decoder = new TextDecoder('utf8'); - -export class ParserV3 extends BaseParser { - eatObjectPropertyName() { - let end = this.offset; - let simple = true; - while (this.buffer[end] !== 0) { - if (this.buffer[end] > 127) simple = false; - end++; - } - - if (simple) { - //do simple ascii - const s = String.fromCharCode.apply(String, this.buffer.slice(this.offset, end) as any); - this.offset = end + 1; - return s; - } - - const s = decoder.decode(this.buffer.slice(this.offset, end)); - this.offset = end + 1; - - return s; - } - - eatString(size: number): string { - const end = this.offset + size; - let s = decoder.decode(this.buffer.slice(this.offset, end - 1)); - this.offset = end; - return s; - } -} - export function parseObject(parser: BaseParser): any { const result: any = {}; const end = parser.eatUInt32() + parser.offset; @@ -431,6 +357,6 @@ export function parseArray(parser: BaseParser): any[] { } export function deserializeBSONWithoutOptimiser(buffer: Uint8Array, offset = 0) { - return parseObject(new ParserV2(buffer, 
offset)); + return parseObject(new BaseParser(buffer, offset)); } diff --git a/packages/bson/src/bson-serializer.ts b/packages/bson/src/bson-serializer.ts index 158c18c8e..23226c88b 100644 --- a/packages/bson/src/bson-serializer.ts +++ b/packages/bson/src/bson-serializer.ts @@ -141,6 +141,19 @@ export function stringByteLength(str: string): number { let size = 0; for (let i = 0; i < str.length; i++) { const c = str.charCodeAt(i); + + // surrogate pair + if (c >= 0xD800 && c <= 0xDBFF && i + 1 < str.length) { + const lo = str.charCodeAt(i + 1); + if (lo >= 0xDC00 && lo <= 0xDFFF) { + // surrogate pair is a 4-byte character in UTF-8 + size += 4; + // move past the low surrogate since it's part of the character + i++; + continue; + } + } + if (c < 128) size += 1; else if (c > 127 && c < 2048) size += 2; else size += 3; @@ -344,6 +357,26 @@ export class Writer { if (typeof str !== 'string') return; for (let i = 0; i < str.length; i++) { const c = str.charCodeAt(i); + + // surrogate pairs for characters outside the BMP + if (c >= 0xD800 && c <= 0xDBFF && i + 1 < str.length) { + const hi = c; + const lo = str.charCodeAt(i + 1); + if (lo >= 0xDC00 && lo <= 0xDFFF) { + // combine the surrogate pair into a single code point (adding the 0x10000 offset) for UTF-8 encoding + const codePoint = ((hi - 0xD800) * 0x400) + (lo - 0xDC00) + 0x10000; + + this.buffer[this.offset++] = (codePoint >> 18) | 240; + this.buffer[this.offset++] = ((codePoint >> 12) & 63) | 128; + this.buffer[this.offset++] = ((codePoint >> 6) & 63) | 128; + this.buffer[this.offset++] = (codePoint & 63) | 128; + + // skip the next code unit, since it's part of the surrogate pair + i++; + continue; + } + } + if (c < 128) { this.buffer[this.offset++] = c; } else if (c > 127 && c < 2048) { diff --git a/packages/bson/src/strings.ts b/packages/bson/src/strings.ts index 0f0b2489b..811e4e7fb 100644 --- a/packages/bson/src/strings.ts +++ b/packages/bson/src/strings.ts @@ -8,10 +8,10 @@ * You should have received a copy of the MIT License along with 
this program. */ -import { CompilerContext } from '@deepkit/core'; import { BSONError } from './model.js'; -const decoder = new TextDecoder("utf-8"); +const decoder = new TextDecoder('utf-8'); + export function decodeUTF8(buffer: Uint8Array, off: number = 0, end: number) { if (end - off > 512) { return decoder.decode(buffer.slice(off, end)); @@ -55,73 +55,3 @@ export function decodeUTF8Short(buffer: Uint8Array, off: number = 0, end: number } return s; } - -export function buildStringDecoder(specializations: number = 10) { - const compiler = new CompilerContext(); - // const midDecoding: string[] = []; - const endDecoding: string[] = []; - - function fromCharCode(number: number): string { - const codes: string[] = []; - for (let i = 0; i < number; i++) { - codes.push(`codes[${i}]`); - } - return `fromCharCode(${codes.join(', ')})`; - } - - const fns: Function[] = []; - for (let i = 1; i <= specializations; i++) { - const fn = new Function('fromCharCode', 'return function(codes) { return ' + fromCharCode(i) + '}')(String.fromCharCode); - compiler.context.set('decodeCodes' + i, fn); - fns.push(fn); - } - - for (let i = 0; i < specializations; i++) { - // midDecoding.push(`if (codesOffset === ${i + 1}) s += decodeCodes${i + 1}(codes);`) - endDecoding.push(`if (codesOffset === ${i + 1}) return s + decodeCodes${i + 1}(codes);`); - } - compiler.context.set('codes', new Uint16Array(specializations)); - compiler.context.set('fns', fns); - compiler.context.set('fromCharCode', String.fromCharCode); - - const functionCode = ` - let codesOffset = 0; - let s = ''; - while (off < end) { - let c = buffer[off++]; - - if (c > 127) { - if (c > 191 && c < 224) { - c = (c & 31) << 6 | buffer[off++] & 63; - } else if (c > 223 && c < 240) { - c = (c & 15) << 12 | (buffer[off++] & 63) << 6 | buffer[off++] & 63; - } else if (c > 239 && c < 248) { - c = (c & 7) << 18 | (buffer[off++] & 63) << 12 | (buffer[off++] & 63) << 6 | buffer[off++] & 63; - } - if (c <= 0xffff) { - 
codes[codesOffset++] = c; - } else if (c <= 0x10ffff) { - c -= 0x10000; - codes[codesOffset++] = c >> 10 | 0xd800; - codes[codesOffset++] = c & 0x3FF | 0xdc00; - } - } else { - if (c === 0) { - return codesOffset ? s + fns[codesOffset - 1](codes) : s; - } - - codes[codesOffset++] = c; - } - - if (codesOffset >= ${specializations}) { - s += decodeCodes${specializations}(codes); - codesOffset = 0; - } - } - - if (codesOffset === 0) return s; - return s + fns[codesOffset - 1](codes); - `; - - return compiler.build(functionCode, 'buffer', 'off', 'end'); -} diff --git a/packages/bson/tests/bson-parser.spec.ts b/packages/bson/tests/bson-parser.spec.ts index 0507cb46a..766e1168e 100644 --- a/packages/bson/tests/bson-parser.spec.ts +++ b/packages/bson/tests/bson-parser.spec.ts @@ -1,20 +1,7 @@ import { expect, test } from '@jest/globals'; import bson, { Binary } from 'bson'; import { deserializeBSON, getBSONDeserializer } from '../src/bson-deserializer.js'; -import { - BinaryBigInt, - copyAndSetParent, - MongoId, - nodeBufferToArrayBuffer, - PrimaryKey, - Reference, - ReflectionKind, - SignedBinaryBigInt, - TypeObjectLiteral, - typeOf, - uuid, - UUID -} from '@deepkit/type'; +import { BinaryBigInt, copyAndSetParent, MongoId, nodeBufferToArrayBuffer, PrimaryKey, Reference, ReflectionKind, SignedBinaryBigInt, TypeObjectLiteral, typeOf, uuid, UUID } from '@deepkit/type'; import { getClassName } from '@deepkit/core'; import { serializeBSONWithoutOptimiser } from '../src/bson-serializer.js'; @@ -730,8 +717,8 @@ test('any', () => { const bson = serializeBSONWithoutOptimiser(data); const deserializer = getBSONDeserializer(undefined, type); - const back = deserializer(bson); - console.log('back', back); + const back: any = deserializer(bson); + expect(back.value).toEqual(data.value); }); test('circular', () => { diff --git a/packages/bson/tests/bson-serialize.spec.ts b/packages/bson/tests/bson-serialize.spec.ts index fc67a6304..496e6510c 100644 --- 
a/packages/bson/tests/bson-serialize.spec.ts +++ b/packages/bson/tests/bson-serialize.spec.ts @@ -1426,7 +1426,7 @@ test('wrapValue', () => { } { const objectId = wrapValue('507f191e810c19729de860ea'); - const serialize = getBSONSerializer<{v: any }>(); + const serialize = getBSONSerializer<{ v: any }>(); const bson = serialize({ v: objectId }); const back = deserialize(bson); expect(back.v).toBeInstanceOf(OfficialObjectId); @@ -1447,3 +1447,56 @@ test('wrapValue', () => { expect(back.v.toHexString()).toBe(uuid1.value); } }); + +test('utf16 surrogate pair', () => { + const comment = 'Hehe, yes. Baby’s first collar \uD83E\uDD2D'; + + { + const bson1 = serialize({ v: comment }); + const bson2 = Buffer.from(serializeBSONWithoutOptimiser({v: comment})); + expect(bson1.toString('hex')).toBe(bson2.toString('hex')); + + const back1 = deserialize(bson1); + const back2 = deserializeBSONWithoutOptimiser(bson1); + expect(back1.v).toBe(comment); + expect(back2.v).toBe(comment); + } + + { + const bson = serialize({ comment }); + const back = deserialize(bson); + expect(back.comment).toBe(comment); + } + { + const bson = serialize({ comment }); + const back = deserializeBSONWithoutOptimiser(bson); + expect(back.comment).toBe(comment); + } + + { + const bson = serializeBSONWithoutOptimiser({ comment }); + const back = deserialize(bson); + expect(back.comment).toBe(comment); + } + { + const bson = serializeBSONWithoutOptimiser({ comment }); + const back = deserializeBSONWithoutOptimiser(bson); + expect(back.comment).toBe(comment); + } + { + const bson = getBSONSerializer<{ comment: string }>()({ comment }); + const back = getBSONDeserializer<{ comment: string }>()(bson); + expect(back.comment).toBe(comment); + } + + { + const o = { + comment: 'Hehe, yes. 
Baby’s first collar \uD83E\uDD2D' + }; + const bson = serialize(o); + const back1 = deserialize(bson); + const back2 = deserializeBSONWithoutOptimiser(bson); + expect(back1).toEqual(o); + expect(back2).toEqual(o); + } +}); diff --git a/packages/framework-debug-api/src/stopwatch-encoding.ts b/packages/framework-debug-api/src/stopwatch-encoding.ts index b2e6f43ae..e7e480fcb 100644 --- a/packages/framework-debug-api/src/stopwatch-encoding.ts +++ b/packages/framework-debug-api/src/stopwatch-encoding.ts @@ -1,4 +1,12 @@ -import { deserializeBSONWithoutOptimiser, getBSONDeserializer, getBSONSerializer, getBSONSizer, Parser, stringByteLength, Writer } from '@deepkit/bson'; +import { + BaseParser, + deserializeBSONWithoutOptimiser, + getBSONDeserializer, + getBSONSerializer, + getBSONSizer, + stringByteLength, + Writer, +} from '@deepkit/bson'; import { AnalyticData, FrameData, FrameEnd, FrameStart, FrameType, getTypeOfCategory } from '@deepkit/stopwatch'; export function encodeFrames(frames: (FrameStart | FrameEnd)[]): Uint8Array { @@ -82,7 +90,7 @@ export function encodeAnalytic(data: AnalyticData[]) { } export function decodeAnalytic(buffer: Uint8Array, callback: (data: AnalyticData) => void) { - const parser = new Parser(buffer); + const parser = new BaseParser(buffer); while (parser.offset < buffer.byteLength) { const timestamp = parser.eatUInt32(); @@ -93,8 +101,12 @@ export function decodeAnalytic(buffer: Uint8Array, callback: (data: AnalyticData } } -export function decodeFrameData(buffer: Uint8Array, callback: (data: { cid: number, category: number, data: Uint8Array }) => void) { - const parser = new Parser(buffer); +export function decodeFrameData(buffer: Uint8Array, callback: (data: { + cid: number, + category: number, + data: Uint8Array +}) => void) { + const parser = new BaseParser(buffer); while (parser.offset < buffer.byteLength) { const cid = parser.eatUInt32(); @@ -112,7 +124,7 @@ export function deserializeFrameData(data: { cid: number, category: number, 
data } export function decodeFrames(buffer: Uint8Array, callback: (frame: FrameStart | FrameEnd) => void): void { - const parser = new Parser(buffer); + const parser = new BaseParser(buffer); while (parser.offset < buffer.byteLength) { const cid = parser.eatUInt32();