diff --git a/encodings/internal.js b/encodings/internal.js
index dc1074f0..9f91a18a 100644
--- a/encodings/internal.js
+++ b/encodings/internal.js
@@ -28,6 +28,8 @@ function InternalCodec(codecOptions, iconv) {
 
     if (this.enc === "base64")
         this.encoder = InternalEncoderBase64;
+    else if (this.enc === "utf8")
+        this.encoder = InternalEncoderUtf8;
     else if (this.enc === "cesu8") {
         this.enc = "utf8"; // Use utf8 for decoding.
         this.encoder = InternalEncoderCesu8;
@@ -196,3 +198,45 @@ InternalDecoderCesu8.prototype.end = function() {
         res += this.defaultCharUnicode;
     return res;
 }
+
+//------------------------------------------------------------------------------
+// Streaming UTF-8 encoder that checks chunk boundaries for surrogate pairs.
+// A high surrogate ending one chunk is held back and joined with the next
+// chunk, so a pair split across two writes is encoded as one code point.
+
+function InternalEncoderUtf8(options, codec) {
+    // High surrogate held over from the previous chunk ('' when none).
+    this.highSurrogate = '';
+}
+
+// Encodes `str`, prepending any held-over high surrogate and holding back a
+// trailing one. Returns a Buffer (empty when everything was held back).
+InternalEncoderUtf8.prototype.write = function (str) {
+    if (this.highSurrogate) {
+        str = this.highSurrogate + str;
+        this.highSurrogate = '';
+    }
+
+    if (str.length > 0) {
+        var charCode = str.charCodeAt(str.length - 1);
+        // 0xD800..0xDBFF is the high (leading) surrogate range.
+        if (0xd800 <= charCode && charCode < 0xdc00) {
+            this.highSurrogate = str[str.length - 1];
+            str = str.slice(0, str.length - 1);
+        }
+    }
+
+    // This encoder never sets an `enc` property, so the encoding is named
+    // explicitly instead of passing `undefined` and relying on the default.
+    return Buffer.from(str, 'utf8');
+}
+
+// Flushes a high surrogate still held when the stream ends. Returns a Buffer
+// for it, or undefined when nothing is pending.
+InternalEncoderUtf8.prototype.end = function () {
+    if (this.highSurrogate) {
+        var str = this.highSurrogate;
+        this.highSurrogate = '';
+        return Buffer.from(str, 'utf8');
+    }
+}
diff --git a/test/streams-test.js b/test/streams-test.js
index d4054de2..173a1569 100644
--- a/test/streams-test.js
+++ b/test/streams-test.js
@@ -232,11 +232,6 @@ describe("Streaming mode", function() {
         output: "e4b882",
     }));
 
-    it("Encoding using internal modules: utf8 with surrogates", checkEncodeStream({
-        encoding: "utf8",
-        input: ["\uD83D\uDE3B"],
-        output: "f09f98bb",
-    }));
 
     it("Decoding of incomplete chars in DBCS (gbk)", checkDecodeStream({
         encoding: "gbk",
@@ -331,3 +326,32 @@ describe("Streaming sugar", function() {
     });
 });
 
+describe("Encoding using internal modules with surrogates in separate chunks:", function () {
+    function checkUtf8EncodeStream (input) {
+        return checkEncodeStream({
+            encoding: "utf8",
+            input: input,
+            output: Buffer.from(input.join(''), 'utf8').toString('hex')
+        })
+    }
+
+    it("a single string", checkUtf8EncodeStream(["\uD83D\uDE3B"]))
+
+    it("normal", checkUtf8EncodeStream(["\uD83D", "\uDE3B"]))
+
+    it("reverse", checkUtf8EncodeStream(["\uDE3B", "\uD83D"]))
+
+    it("multiple surrogates", checkUtf8EncodeStream(["\uD83D", "\uDE3B\uD83D", "\uDE3B"]))
+
+    it("more than one character with left", checkUtf8EncodeStream(["abc\uD83D", "\uDE3B"]))
+
+    it("more than one character with right", checkUtf8EncodeStream(["\uD83D", "\uDE3Befg"]))
+
+    it("more than one character at both ends", checkUtf8EncodeStream(["abc\uD83D", "\uDE3Befg"]))
+
+    it("surrogates pair be interrupted", checkUtf8EncodeStream(["abc\uD83D", "efg\uDE3B"]))
+
+    it("a half of surrogates pair only left", checkUtf8EncodeStream(["abc\uD83D"]))
+
+    it("a half of surrogates pair only right", checkUtf8EncodeStream(["\uDE3Befg"]))
+});