Skip to content

Commit

Permalink
fix(bson): fix surrogate pair decoding
Browse files Browse the repository at this point in the history
Also, remove various custom UTF-8 encoders/decoders and rely more on TextEncoder.
  • Loading branch information
marcj committed Feb 13, 2024
1 parent 6275c37 commit cb9f648
Show file tree
Hide file tree
Showing 8 changed files with 131 additions and 181 deletions.
2 changes: 1 addition & 1 deletion packages/bson/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@

export * from './src/model.js';
export * from './src/bson-parser.js';
export { ParserV2 as Parser } from './src/bson-parser.js';
export { BaseParser } from './src/bson-parser.js';
export * from './src/bson-deserializer.js';
export * from './src/bson-serializer.js';
15 changes: 12 additions & 3 deletions packages/bson/src/bson-deserializer.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
import { executeTemplates, getTypeJitContainer, JitStack, NamingStrategy, ReceiveType, resolveReceiveType, TemplateState, Type } from '@deepkit/type';
import {
executeTemplates,
getTypeJitContainer,
JitStack,
NamingStrategy,
ReceiveType,
resolveReceiveType,
TemplateState,
Type,
} from '@deepkit/type';
import { CompilerContext, toFastProperties } from '@deepkit/core';
import { seekElementSize } from './continuation.js';
import { BSONBinarySerializer, bsonBinarySerializer } from './bson-serializer.js';
import { ParserV2 } from './bson-parser.js';
import { BaseParser } from './bson-parser.js';

function createBSONDeserializer(type: Type, serializer: BSONBinarySerializer, namingStrategy: NamingStrategy = new NamingStrategy(), path: string = '', jitStack: JitStack = new JitStack()) {
const compiler = new CompilerContext();
Expand Down Expand Up @@ -32,7 +41,7 @@ export function getBSONDeserializer<T>(serializer: BSONBinarySerializer = bsonBi

const deserializer = createBSONDeserializer(type, bsonBinarySerializer);
jit[serializer.deserializeId] = function (bson: Uint8Array, offset: number = 0) {
const parser = new ParserV2(bson, offset);
const parser = new BaseParser(bson, offset);
return deserializer(parser);
};
toFastProperties(jit);
Expand Down
92 changes: 9 additions & 83 deletions packages/bson/src/bson-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,14 @@
* You should have received a copy of the MIT License along with this program.
*/

import { BSON_BINARY_SUBTYPE_BYTE_ARRAY, BSON_BINARY_SUBTYPE_UUID, BSONType, digitByteSize, TWO_PWR_32_DBL_N } from './utils.js';
import { buildStringDecoder, decodeUTF8 } from './strings.js';
import {
BSON_BINARY_SUBTYPE_BYTE_ARRAY,
BSON_BINARY_SUBTYPE_UUID,
BSONType,
digitByteSize,
TWO_PWR_32_DBL_N,
} from './utils.js';
import { decodeUTF8 } from './strings.js';
import { nodeBufferToArrayBuffer, ReflectionKind, SerializationError, Type } from '@deepkit/type';
import { hexTable } from './model.js';

Expand Down Expand Up @@ -318,86 +324,6 @@ export class BaseParser {
}
}

// Shared generated UTF-8 decoder; 32 is the chunk size handed to buildStringDecoder.
const stringParser = buildStringDecoder(32);

/**
 * General purpose parser that optimistically treats property names as ASCII.
 *
 * While scanning a property name it builds the string byte by byte; as soon as a byte
 * above 127 is seen it stops building and decodes the whole name as UTF-8 through
 * `stringParser` instead. Intended as a faster path than BaseParser when property names
 * are mostly ASCII.
 */
export class ParserV2 extends BaseParser {

    /**
     * Reads a NUL-terminated property name starting at `this.offset` and moves
     * `this.offset` just past the terminator.
     *
     * The scan is bounded by the buffer length: on a buffer without a terminating NUL
     * the bytes up to the end of the buffer are returned instead of scanning forever
     * (an out-of-range read yields `undefined`, which never equals 0).
     */
    eatObjectPropertyName() {
        const limit = this.buffer.length;
        let end = this.offset;
        let simple = true;
        let string = '';
        while (end < limit && this.buffer[end] !== 0) {
            if (this.buffer[end] > 127) {
                // non-ASCII byte: abandon the incremental string, decode as UTF-8 below
                simple = false;
            }
            if (simple) {
                string += String.fromCharCode(this.buffer[end]);
            }
            end++;
        }

        if (simple) {
            // pure ASCII name: the incrementally built string is already complete
            this.offset = end + 1;
            return string;
        }

        const s = stringParser(this.buffer, this.offset, end);
        this.offset = end + 1;

        return s;
    }

    /**
     * Reads a string occupying `size` bytes at `this.offset` and advances the offset by `size`.
     *
     * NOTE(review): `size` presumably includes the trailing NUL byte, since the Buffer path
     * decodes `size - 1` bytes and `stringParser` stops at the first zero byte — confirm
     * against the callers.
     */
    eatString(size: number): string {
        let s = '';
        if (size > 64 && 'undefined' !== typeof Buffer && 'function' === typeof Buffer.from) {
            // larger strings: use Buffer's UTF-8 decoding when Buffer is available
            s = Buffer.from(this.buffer.buffer, this.buffer.byteOffset + this.offset, size - 1).toString('utf8');
        } else {
            s = stringParser(this.buffer, this.offset, this.offset + size);
        }
        this.offset += size;
        return s;
    }
}

const decoder = new TextDecoder('utf8');

/**
 * Parser variant that decodes strings with the platform TextDecoder.
 *
 * Property names consisting only of bytes <= 127 are converted with a single
 * String.fromCharCode call; any other name is decoded by TextDecoder.
 */
export class ParserV3 extends BaseParser {
    /**
     * Reads a NUL-terminated property name starting at `this.offset` and moves
     * `this.offset` just past the terminator.
     *
     * The scan is bounded by the buffer length: on a buffer without a terminating NUL
     * the bytes up to the end of the buffer are used instead of scanning forever
     * (an out-of-range read yields `undefined`, which never equals 0).
     */
    eatObjectPropertyName() {
        const limit = this.buffer.length;
        let end = this.offset;
        let simple = true;
        while (end < limit && this.buffer[end] !== 0) {
            if (this.buffer[end] > 127) simple = false;
            end++;
        }

        if (simple) {
            // ASCII only: every byte maps 1:1 to a UTF-16 code unit
            const s = String.fromCharCode.apply(String, this.buffer.slice(this.offset, end) as any);
            this.offset = end + 1;
            return s;
        }

        const s = decoder.decode(this.buffer.slice(this.offset, end));
        this.offset = end + 1;

        return s;
    }

    /**
     * Decodes `size - 1` bytes at `this.offset` as UTF-8 and advances the offset by `size`.
     *
     * NOTE(review): the last byte of the range is skipped, presumably because it is the
     * string's NUL terminator — confirm against the callers.
     */
    eatString(size: number): string {
        const end = this.offset + size;
        const s = decoder.decode(this.buffer.slice(this.offset, end - 1));
        this.offset = end;
        return s;
    }
}

export function parseObject(parser: BaseParser): any {
const result: any = {};
const end = parser.eatUInt32() + parser.offset;
Expand Down Expand Up @@ -431,6 +357,6 @@ export function parseArray(parser: BaseParser): any[] {
}

/**
 * Parses the BSON document in `buffer`, starting at `offset` (default 0), by handing a
 * newly constructed parser to `parseObject` and returning its result.
 */
export function deserializeBSONWithoutOptimiser(buffer: Uint8Array, offset = 0) {
// NOTE(review): two `return` statements follow back to back, so only the first one can
// execute and the `BaseParser` line is unreachable as written. This looks like the old and
// new line of a diff side by side — confirm which parser is intended.
return parseObject(new ParserV2(buffer, offset));
return parseObject(new BaseParser(buffer, offset));
}

33 changes: 33 additions & 0 deletions packages/bson/src/bson-serializer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,19 @@ export function stringByteLength(str: string): number {
let size = 0;
for (let i = 0; i < str.length; i++) {
const c = str.charCodeAt(i);

// surrogate pair
if (c >= 0xD800 && c <= 0xDBFF && i + 1 < str.length) {
const lo = str.charCodeAt(i + 1);
if (lo >= 0xDC00 && lo <= 0xDFFF) {
// surrogate pair is a 4-byte character in UTF-8
size += 4;
// move past the low surrogate since it's part of the character
i++;
continue;
}
}

if (c < 128) size += 1;
else if (c > 127 && c < 2048) size += 2;
else size += 3;
Expand Down Expand Up @@ -344,6 +357,26 @@ export class Writer {
if (typeof str !== 'string') return;
for (let i = 0; i < str.length; i++) {
const c = str.charCodeAt(i);

// surrogate pairs for characters outside the BMP
if (c >= 0xD800 && c <= 0xDBFF && i + 1 < str.length) {
const hi = c;
const lo = str.charCodeAt(i + 1);
if (lo >= 0xDC00 && lo <= 0xDFFF) {
// combine the surrogate pair into the full code point (offsets from the surrogate bases, plus 0x10000)
const codePoint = ((hi - 0xD800) * 0x400) + (lo - 0xDC00) + 0x10000;

this.buffer[this.offset++] = (codePoint >> 18) | 240;
this.buffer[this.offset++] = ((codePoint >> 12) & 63) | 128;
this.buffer[this.offset++] = ((codePoint >> 6) & 63) | 128;
this.buffer[this.offset++] = (codePoint & 63) | 128;

// skip the next code unit, since it's part of the surrogate pair
i++;
continue;
}
}

if (c < 128) {
this.buffer[this.offset++] = c;
} else if (c > 127 && c < 2048) {
Expand Down
74 changes: 2 additions & 72 deletions packages/bson/src/strings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
* You should have received a copy of the MIT License along with this program.
*/

import { CompilerContext } from '@deepkit/core';
import { BSONError } from './model.js';

const decoder = new TextDecoder("utf-8");
const decoder = new TextDecoder('utf-8');

export function decodeUTF8(buffer: Uint8Array, off: number = 0, end: number) {
if (end - off > 512) {
return decoder.decode(buffer.slice(off, end));
Expand Down Expand Up @@ -55,73 +55,3 @@ export function decodeUTF8Short(buffer: Uint8Array, off: number = 0, end: number
}
return s;
}

/**
 * Builds a UTF-8 decoder `(buffer, off, end) => string` through the CompilerContext.
 *
 * The generated function walks `buffer` from `off` up to (excluding) `end`, collecting
 * UTF-16 code units in a scratch Uint16Array of `specializations` slots. Whenever the
 * scratch array is full it is turned into a string chunk by a pre-built
 * `String.fromCharCode(codes[0], ..., codes[n - 1])` function. Decoding stops early at
 * the first zero byte.
 *
 * Lead bytes 192-223, 224-239 and 240-247 start 2-, 3- and 4-byte sequences; any other
 * byte above 127 is stored unchanged. Continuation bytes are not validated.
 *
 * Code points above 0xFFFF become a surrogate pair. The scratch array is flushed between
 * the two halves when the high surrogate fills the last slot; otherwise the low surrogate
 * would be written past the end of the array (a silently ignored write) and be lost.
 *
 * The scratch array is created once per built decoder and reused by every call to it.
 *
 * @param specializations number of scratch slots, i.e. the maximum chunk size per
 *                        `String.fromCharCode` call.
 * @returns the compiled decoder function.
 */
export function buildStringDecoder(specializations: number = 10) {
    const compiler = new CompilerContext();

    // Source text of the call `fromCharCode(codes[0], ..., codes[count - 1])`.
    function fromCharCode(count: number): string {
        const args: string[] = [];
        for (let i = 0; i < count; i++) {
            args.push(`codes[${i}]`);
        }
        return `fromCharCode(${args.join(', ')})`;
    }

    // fns[n - 1] converts the first n scratch slots into a string.
    const fns: Function[] = [];
    for (let i = 1; i <= specializations; i++) {
        const fn = new Function('fromCharCode', 'return function(codes) { return ' + fromCharCode(i) + '}')(String.fromCharCode);
        compiler.context.set('decodeCodes' + i, fn);
        fns.push(fn);
    }

    compiler.context.set('codes', new Uint16Array(specializations));
    compiler.context.set('fns', fns);
    compiler.context.set('fromCharCode', String.fromCharCode);

    // Emits the full scratch array as a chunk and starts over at slot 0.
    const flushWhenFull = `
        if (codesOffset >= ${specializations}) {
            s += decodeCodes${specializations}(codes);
            codesOffset = 0;
        }
    `;

    const functionCode = `
        let codesOffset = 0;
        let s = '';
        while (off < end) {
            let c = buffer[off++];
            if (c > 127) {
                if (c > 191 && c < 224) {
                    c = (c & 31) << 6 | buffer[off++] & 63;
                } else if (c > 223 && c < 240) {
                    c = (c & 15) << 12 | (buffer[off++] & 63) << 6 | buffer[off++] & 63;
                } else if (c > 239 && c < 248) {
                    c = (c & 7) << 18 | (buffer[off++] & 63) << 12 | (buffer[off++] & 63) << 6 | buffer[off++] & 63;
                }
                if (c <= 0xffff) {
                    codes[codesOffset++] = c;
                } else if (c <= 0x10ffff) {
                    c -= 0x10000;
                    codes[codesOffset++] = c >> 10 | 0xd800;
                    // make room for the low surrogate if the high one took the last slot
                    ${flushWhenFull}
                    codes[codesOffset++] = c & 0x3FF | 0xdc00;
                }
            } else {
                if (c === 0) {
                    return codesOffset ? s + fns[codesOffset - 1](codes) : s;
                }
                codes[codesOffset++] = c;
            }
            ${flushWhenFull}
        }
        if (codesOffset === 0) return s;
        return s + fns[codesOffset - 1](codes);
    `;

    return compiler.build(functionCode, 'buffer', 'off', 'end');
}
19 changes: 3 additions & 16 deletions packages/bson/tests/bson-parser.spec.ts
Original file line number Diff line number Diff line change
@@ -1,20 +1,7 @@
import { expect, test } from '@jest/globals';
import bson, { Binary } from 'bson';
import { deserializeBSON, getBSONDeserializer } from '../src/bson-deserializer.js';
import {
BinaryBigInt,
copyAndSetParent,
MongoId,
nodeBufferToArrayBuffer,
PrimaryKey,
Reference,
ReflectionKind,
SignedBinaryBigInt,
TypeObjectLiteral,
typeOf,
uuid,
UUID
} from '@deepkit/type';
import { BinaryBigInt, copyAndSetParent, MongoId, nodeBufferToArrayBuffer, PrimaryKey, Reference, ReflectionKind, SignedBinaryBigInt, TypeObjectLiteral, typeOf, uuid, UUID } from '@deepkit/type';
import { getClassName } from '@deepkit/core';
import { serializeBSONWithoutOptimiser } from '../src/bson-serializer.js';

Expand Down Expand Up @@ -730,8 +717,8 @@ test('any', () => {

const bson = serializeBSONWithoutOptimiser(data);
const deserializer = getBSONDeserializer(undefined, type);
const back = deserializer(bson);
console.log('back', back);
const back: any = deserializer(bson);
expect(back.value).toEqual(data.value);
});

test('circular', () => {
Expand Down
55 changes: 54 additions & 1 deletion packages/bson/tests/bson-serialize.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1426,7 +1426,7 @@ test('wrapValue', () => {
}
{
const objectId = wrapValue<MongoId>('507f191e810c19729de860ea');
const serialize = getBSONSerializer<{v: any }>();
const serialize = getBSONSerializer<{ v: any }>();
const bson = serialize({ v: objectId });
const back = deserialize(bson);
expect(back.v).toBeInstanceOf(OfficialObjectId);
Expand All @@ -1447,3 +1447,56 @@ test('wrapValue', () => {
expect(back.v.toHexString()).toBe(uuid1.value);
}
});

test('utf16 surrogate pair', () => {
    // the text ends with U+1F92D, written as a UTF-16 surrogate pair
    const comment = 'Hehe, yes. Baby’s first collar \uD83E\uDD2D';

    // both serializers must produce the same bytes, and both deserializers must restore the text
    {
        const viaSerialize = serialize({ v: comment });
        const viaGeneric = Buffer.from(serializeBSONWithoutOptimiser({ v: comment }));
        expect(viaSerialize.toString('hex')).toBe(viaGeneric.toString('hex'));

        const restored1 = deserialize(viaSerialize);
        const restored2 = deserializeBSONWithoutOptimiser(viaSerialize);
        expect(restored1.v).toBe(comment);
        expect(restored2.v).toBe(comment);
    }

    // serialize -> deserialize
    {
        const bytes = serialize({ comment });
        const restored = deserialize(bytes);
        expect(restored.comment).toBe(comment);
    }
    // serialize -> deserializeBSONWithoutOptimiser
    {
        const bytes = serialize({ comment });
        const restored = deserializeBSONWithoutOptimiser(bytes);
        expect(restored.comment).toBe(comment);
    }

    // serializeBSONWithoutOptimiser -> deserialize
    {
        const bytes = serializeBSONWithoutOptimiser({ comment });
        const restored = deserialize(bytes);
        expect(restored.comment).toBe(comment);
    }
    // serializeBSONWithoutOptimiser -> deserializeBSONWithoutOptimiser
    {
        const bytes = serializeBSONWithoutOptimiser({ comment });
        const restored = deserializeBSONWithoutOptimiser(bytes);
        expect(restored.comment).toBe(comment);
    }
    // typed serializer -> typed deserializer
    {
        const typedSerialize = getBSONSerializer<{ comment: string }>();
        const bytes = typedSerialize({ comment });
        const typedDeserialize = getBSONDeserializer<{ comment: string }>();
        const restored = typedDeserialize(bytes);
        expect(restored.comment).toBe(comment);
    }

    // whole-object equality with the literal spelled out inline
    {
        const original = {
            comment: 'Hehe, yes. Baby’s first collar \uD83E\uDD2D'
        };
        const bytes = serialize(original);
        const restored1 = deserialize(bytes);
        const restored2 = deserializeBSONWithoutOptimiser(bytes);
        expect(restored1).toEqual(original);
        expect(restored2).toEqual(original);
    }
});
Loading

0 comments on commit cb9f648

Please sign in to comment.