near · Nov 21, 2023 · Nov 21, 2023 · Nov 21, 2023 · Nov 23, 2023 · Nov 23, 2023
diff --git a/README.md b/README.md
@@ -39,8 +39,8 @@ const decoded = borsh.deserialize(schema, encoded);
 
 ## API
 The package exposes the following functions:
-- `serialize(schema: Schema, obj: any): Uint8Array` - serializes an object `obj` according to the schema `schema`.
-- `deserialize(schema: Schema, buffer: Uint8Array, class?: Class): any` - deserializes an object according to the schema `schema` from the buffer `buffer`. If the optional parameter `class` is present, the deserialized object will be an of `class`.
+- `serialize(schema: Schema, obj: any, validate: boolean = true): Uint8Array` - serializes an object `obj` according to the schema `schema`. Setting `validate` to false will skip the validation of the `schema`.
+- `deserialize(schema: Schema, buffer: Uint8Array, validate: boolean = true): any` - deserializes an object according to the schema `schema` from the buffer `buffer`. Setting `validate` to false will skip the validation of the `schema`.
 
 ## Schemas
 Schemas are used to describe the structure of the data being serialized or deserialized. They are used to
@@ -62,7 +62,7 @@ More complex objects are described by a JSON object. The following types are sup
 - `{ option: Schema }` - an optional object. The type of the object is described by the `type` field.
 - `{ map: { key: Schema, value: Schema }}` - a map. The type of the keys and values are described by the `key` and `value` fields respectively.
 - `{ set: Schema }` - a set. The type of the elements is described by the `type` field.
-- `{ enum: [{ className1: { struct: {...} } }, { className2: { struct: {...} } }, ... ] }` - an enum. The variants of the enum are described by the `className1`, `className2`, etc. fields. The variants are structs.
+- `{ enum: [ { struct: { className1: structSchema1 } }, { struct: { className2: structSchema2 } }, ... ] }` - an enum. The variants of the enum are described by the `className1`, `className2`, etc. fields. The variants are structs.
 - `{ struct: { field1: Schema1, field2: Schema2, ... } }` - a struct. The fields of the struct are described by the `field1`, `field2`, etc. fields.
 
 ### Type Mappings
@@ -119,4 +119,4 @@ When publishing to npm use [np](https://github.com/sindresorhus/np).
 This repository is distributed under the terms of both the MIT license and the Apache License (Version 2.0).
 See [LICENSE-MIT](LICENSE-MIT.txt) and [LICENSE-APACHE](LICENSE-APACHE) for details.
 
-[Borsh]:          https://borsh.io
+[Borsh]:          https://borsh.io
diff --git a/borsh-ts/deserialize.ts b/borsh-ts/deserialize.ts
@@ -54,7 +54,26 @@ export class BorshDeserializer {
     decode_string(): string {
         const len: number = this.decode_integer('u32') as number;
         const buffer = new Uint8Array(this.buffer.consume_bytes(len));
-        return String.fromCharCode.apply(null, buffer);
+
+        // decode utf-8 string without using TextDecoder
+        // first decode each UTF-8 byte sequence (1 to 4 bytes) into a Unicode code point
+        const codePoints = [];
+        for (let i = 0; i < len; ++i) {
+            const byte = buffer[i];
+            if (byte < 0x80) {
+                codePoints.push(byte);
+            } else if (byte < 0xE0) {
+                codePoints.push(((byte & 0x1F) << 6) | (buffer[++i] & 0x3F));
+            } else if (byte < 0xF0) {
+                codePoints.push(((byte & 0x0F) << 12) | ((buffer[++i] & 0x3F) << 6) | (buffer[++i] & 0x3F));
+            } else {
+                const codePoint = ((byte & 0x07) << 18) | ((buffer[++i] & 0x3F) << 12) | ((buffer[++i] & 0x3F) << 6) | (buffer[++i] & 0x3F);
+                codePoints.push(codePoint);
+            }
+        }
+
+        // then build the JavaScript string from the decoded code points
+        return String.fromCodePoint(...codePoints);
     }
 
     decode_boolean(): boolean {

diff --git a/borsh-ts/serialize.ts b/borsh-ts/serialize.ts
@@ -63,13 +63,32 @@ export class BorshSerializer {
         this.checkTypes && utils.expect_type(value, 'string', this.fieldPath);
         const _value = value as string;
 
-        // 4 bytes for length
-        this.encoded.store_value(_value.length, 'u32');
-
-        // string bytes
+        // encode to utf8 bytes without using TextEncoder
+        const utf8Bytes: number[] = [];
         for (let i = 0; i < _value.length; i++) {
-            this.encoded.store_value(_value.charCodeAt(i), 'u8');
+            let charCode = _value.charCodeAt(i);
+
+            if (charCode < 0x80) {
+                utf8Bytes.push(charCode);
+            } else if (charCode < 0x800) {
+                utf8Bytes.push(0xc0 | (charCode >> 6), 0x80 | (charCode & 0x3f));
+            } else if (charCode < 0xd800 || charCode >= 0xe000) {
+                utf8Bytes.push(0xe0 | (charCode >> 12), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f));
+            } else {
+                i++;
+                charCode = 0x10000 + (((charCode & 0x3ff) << 10) | (_value.charCodeAt(i) & 0x3ff));
+                utf8Bytes.push(
+                    0xf0 | (charCode >> 18),
+                    0x80 | ((charCode >> 12) & 0x3f),
+                    0x80 | ((charCode >> 6) & 0x3f),
+                    0x80 | (charCode & 0x3f),
+                );
+            }
         }
+
+        // 4 bytes for length + string bytes
+        this.encoded.store_value(utf8Bytes.length, 'u32');
+        this.encoded.store_bytes(new Uint8Array(utf8Bytes));
     }
 
     encode_boolean(value: unknown): void {

diff --git a/borsh-ts/test/(de)serialize.test.js b/borsh-ts/test/(de)serialize.test.js
@@ -40,6 +40,10 @@ test('serialize booleans', async () => {
 
 test('serialize strings', async () => {
     check_roundtrip('h"i', 'string', [3, 0, 0, 0, 104, 34, 105]);
+    check_roundtrip('Chévere', 'string', [8, 0, 0, 0, 67, 104, 195, 169, 118, 101, 114, 101]);
+    check_roundtrip('!ǬЇЉي࠺👍ઠ൧࿄ሒᘻᏠᬅᡝ࠻', 'string', [43, 0, 0, 0, 33, 199, 172, 208, 135, 208, 137, 217, 138, 224, 160, 186, 240, 159, 145, 141, 224, 170, 160, 224, 181, 167, 224, 191, 132, 225, 136, 146, 225, 152, 187, 225, 143, 160, 225, 172, 133, 225, 161, 157, 224, 160, 187]);
+    check_roundtrip('óñ@‡؏ث 漢࠶⭐🔒􀀀', 'string', [30, 0, 0, 0, 195, 179, 195, 177, 64, 226, 128, 161, 216, 143, 216, 171, 32, 230, 188, 162, 224, 160, 182, 226, 173, 144, 240, 159, 148, 146, 244, 128, 128, 128]);
+    check_roundtrip('f © bar 𝌆 baz ☃ qux', 'string', [25, 0, 0, 0, 102, 32, 194, 169, 32, 98, 97, 114, 32, 240, 157, 140, 134, 32, 98, 97, 122, 32, 226, 152, 131, 32, 113, 117, 120]);
 });
 
 test('serialize floats', async () => {

diff --git a/examples/cjs/package.json b/examples/cjs/package.json
@@ -1,7 +1,7 @@
 {
   "name": "cjs-example",
   "private": true,
-  "version": "1.0.0",
+  "version": "2.0.0",
   "description": "",
   "main": "index.js",
   "dependencies": {

diff --git a/examples/esm/package.json b/examples/esm/package.json
@@ -1,7 +1,7 @@
 {
   "name": "esm-example",
   "private": true,
-  "version": "1.0.0",
+  "version": "2.0.0",
   "description": "",
   "type": "module",
   "main": "index.js",

diff --git a/lib/cjs/deserialize.js b/lib/cjs/deserialize.js
@@ -55,7 +55,27 @@ var BorshDeserializer = /** @class */ (function () {
     BorshDeserializer.prototype.decode_string = function () {
         var len = this.decode_integer('u32');
         var buffer = new Uint8Array(this.buffer.consume_bytes(len));
-        return String.fromCharCode.apply(null, buffer);
+        // decode utf-8 string without using TextDecoder
+        // first decode each UTF-8 byte sequence (1 to 4 bytes) into a Unicode code point
+        var codePoints = [];
+        for (var i = 0; i < len; ++i) {
+            var byte = buffer[i];
+            if (byte < 0x80) {
+                codePoints.push(byte);
+            }
+            else if (byte < 0xE0) {
+                codePoints.push(((byte & 0x1F) << 6) | (buffer[++i] & 0x3F));
+            }
+            else if (byte < 0xF0) {
+                codePoints.push(((byte & 0x0F) << 12) | ((buffer[++i] & 0x3F) << 6) | (buffer[++i] & 0x3F));
+            }
+            else {
+                var codePoint = ((byte & 0x07) << 18) | ((buffer[++i] & 0x3F) << 12) | ((buffer[++i] & 0x3F) << 6) | (buffer[++i] & 0x3F);
+                codePoints.push(codePoint);
+            }
+        }
+        // then build the JavaScript string from the decoded code points
+        return String.fromCodePoint.apply(String, codePoints);
     };
     BorshDeserializer.prototype.decode_boolean = function () {
         return this.buffer.consume_value('u8') > 0;

diff --git a/lib/cjs/serialize.js b/lib/cjs/serialize.js
@@ -84,12 +84,28 @@ var BorshSerializer = /** @class */ (function () {
     BorshSerializer.prototype.encode_string = function (value) {
         this.checkTypes && utils.expect_type(value, 'string', this.fieldPath);
         var _value = value;
-        // 4 bytes for length
-        this.encoded.store_value(_value.length, 'u32');
-        // string bytes
+        // encode to utf8 bytes without using TextEncoder
+        var utf8Bytes = [];
         for (var i = 0; i < _value.length; i++) {
-            this.encoded.store_value(_value.charCodeAt(i), 'u8');
+            var charCode = _value.charCodeAt(i);
+            if (charCode < 0x80) {
+                utf8Bytes.push(charCode);
+            }
+            else if (charCode < 0x800) {
+                utf8Bytes.push(0xc0 | (charCode >> 6), 0x80 | (charCode & 0x3f));
+            }
+            else if (charCode < 0xd800 || charCode >= 0xe000) {
+                utf8Bytes.push(0xe0 | (charCode >> 12), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f));
+            }
+            else {
+                i++;
+                charCode = 0x10000 + (((charCode & 0x3ff) << 10) | (_value.charCodeAt(i) & 0x3ff));
+                utf8Bytes.push(0xf0 | (charCode >> 18), 0x80 | ((charCode >> 12) & 0x3f), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f));
+            }
         }
+        // 4 bytes for length + string bytes
+        this.encoded.store_value(utf8Bytes.length, 'u32');
+        this.encoded.store_bytes(new Uint8Array(utf8Bytes));
     };
     BorshSerializer.prototype.encode_boolean = function (value) {
         this.checkTypes && utils.expect_type(value, 'boolean', this.fieldPath);

diff --git a/lib/esm/deserialize.js b/lib/esm/deserialize.js
@@ -52,7 +52,27 @@ var BorshDeserializer = /** @class */ (function () {
     BorshDeserializer.prototype.decode_string = function () {
         var len = this.decode_integer('u32');
         var buffer = new Uint8Array(this.buffer.consume_bytes(len));
-        return String.fromCharCode.apply(null, buffer);
+        // decode utf-8 string without using TextDecoder
+        // first decode each UTF-8 byte sequence (1 to 4 bytes) into a Unicode code point
+        var codePoints = [];
+        for (var i = 0; i < len; ++i) {
+            var byte = buffer[i];
+            if (byte < 0x80) {
+                codePoints.push(byte);
+            }
+            else if (byte < 0xE0) {
+                codePoints.push(((byte & 0x1F) << 6) | (buffer[++i] & 0x3F));
+            }
+            else if (byte < 0xF0) {
+                codePoints.push(((byte & 0x0F) << 12) | ((buffer[++i] & 0x3F) << 6) | (buffer[++i] & 0x3F));
+            }
+            else {
+                var codePoint = ((byte & 0x07) << 18) | ((buffer[++i] & 0x3F) << 12) | ((buffer[++i] & 0x3F) << 6) | (buffer[++i] & 0x3F);
+                codePoints.push(codePoint);
+            }
+        }
+        // then build the JavaScript string from the decoded code points
+        return String.fromCodePoint.apply(String, codePoints);
     };
     BorshDeserializer.prototype.decode_boolean = function () {
         return this.buffer.consume_value('u8') > 0;

diff --git a/lib/esm/serialize.js b/lib/esm/serialize.js
@@ -58,12 +58,28 @@ var BorshSerializer = /** @class */ (function () {
     BorshSerializer.prototype.encode_string = function (value) {
         this.checkTypes && utils.expect_type(value, 'string', this.fieldPath);
         var _value = value;
-        // 4 bytes for length
-        this.encoded.store_value(_value.length, 'u32');
-        // string bytes
+        // encode to utf8 bytes without using TextEncoder
+        var utf8Bytes = [];
         for (var i = 0; i < _value.length; i++) {
-            this.encoded.store_value(_value.charCodeAt(i), 'u8');
+            var charCode = _value.charCodeAt(i);
+            if (charCode < 0x80) {
+                utf8Bytes.push(charCode);
+            }
+            else if (charCode < 0x800) {
+                utf8Bytes.push(0xc0 | (charCode >> 6), 0x80 | (charCode & 0x3f));
+            }
+            else if (charCode < 0xd800 || charCode >= 0xe000) {
+                utf8Bytes.push(0xe0 | (charCode >> 12), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f));
+            }
+            else {
+                i++;
+                charCode = 0x10000 + (((charCode & 0x3ff) << 10) | (_value.charCodeAt(i) & 0x3ff));
+                utf8Bytes.push(0xf0 | (charCode >> 18), 0x80 | ((charCode >> 12) & 0x3f), 0x80 | ((charCode >> 6) & 0x3f), 0x80 | (charCode & 0x3f));
+            }
         }
+        // 4 bytes for length + string bytes
+        this.encoded.store_value(utf8Bytes.length, 'u32');
+        this.encoded.store_bytes(new Uint8Array(utf8Bytes));
     };
     BorshSerializer.prototype.encode_boolean = function (value) {
         this.checkTypes && utils.expect_type(value, 'boolean', this.fieldPath);

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "borsh",
-  "version": "1.0.0",
+  "version": "2.0.0",
   "description": "Binary Object Representation Serializer for Hashing",
   "main": "./lib/cjs/index.js",
   "module": "./lib/esm/index.js",