Skip to content
34 changes: 34 additions & 0 deletions encodings/internal.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ function InternalCodec(codecOptions, iconv) {

if (this.enc === "base64")
this.encoder = InternalEncoderBase64;
else if (this.enc === "utf8")
this.encoder = InternalEncoderUtf8;
else if (this.enc === "cesu8") {
this.enc = "utf8"; // Use utf8 for decoding.
this.encoder = InternalEncoderCesu8;
Expand Down Expand Up @@ -196,3 +198,35 @@ InternalDecoderCesu8.prototype.end = function() {
res += this.defaultCharUnicode;
return res;
}

//------------------------------------------------------------------------------
// UTF-8 encoder that checks chunk boundaries so a surrogate pair split across two chunks is not broken.

/**
 * Stateful UTF-8 encoder used for chunked (streaming) input.
 *
 * Neither constructor argument is read here; the only state created is the
 * slot for a high surrogate carried over between write() calls.
 *
 * @param {Object} options - Encoder options (unused by this constructor).
 * @param {Object} codec - Owning codec (unused by this constructor).
 */
function InternalEncoderUtf8(options, codec) {
    // Empty string means "no surrogate pending".
    this.highSurrogate = "";
}

// Encodes one chunk of text, holding back a trailing high surrogate so that a
// surrogate pair split across two chunks is passed to Buffer.from() in one piece.
// Takes the next string chunk and returns a Buffer built from the part of the
// chunk (plus any previously held surrogate) that can be encoded now.
InternalEncoderUtf8.prototype.write = function (str) {
// Re-attach the high surrogate held back from the previous call, if any.
if (this.highSurrogate) {
str = this.highSurrogate + str;
this.highSurrogate = '';
}

if (str.length > 0) {
var charCode = str.charCodeAt(str.length - 1);
// 0xD800..0xDBFF is the high (lead) surrogate range: its partner may arrive
// in the next chunk, so strip it from this chunk and keep it for later.
if (0xd800 <= charCode && charCode < 0xdc00) {
this.highSurrogate = str[str.length - 1];
str = str.slice(0, str.length - 1);
}
}

// NOTE(review): the constructor visible here never assigns this.enc; presumably
// Buffer.from() then falls back to its default encoding (UTF-8) - confirm.
return Buffer.from(str, this.enc);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see you're returning Buffer.from at the end; this is good.
The last-character handling became messier, though. I thought we could do something like this:

/**
 * Encodes one chunk of a string stream without splitting a surrogate pair.
 *
 * A high (lead) surrogate, U+D800..U+DBFF, left dangling at the end of a chunk
 * is held back in `this.highSurrogate` (the field the constructor initializes
 * and end() flushes) and prepended to the next chunk.
 *
 * @param {string} str - Next chunk of text; falsy input is treated as empty.
 * @returns {Buffer} Bytes that can be emitted now (possibly empty).
 */
InternalEncoderUtf8.prototype.write = function (str) {
    // Nothing new to encode: keep any pending surrogate and emit no bytes.
    if (!str) return Buffer.alloc(0);

    if (this.highSurrogate) {
        str = this.highSurrogate + str;
        this.highSurrogate = '';
    }

    var lastCharCode = str.charCodeAt(str.length - 1);
    // Both bounds are inclusive so U+D800 and U+DBFF are held back as well.
    if (0xD800 <= lastCharCode && lastCharCode <= 0xDBFF) {
        this.highSurrogate = str.slice(str.length - 1);
        str = str.slice(0, str.length - 1);
    }

    return Buffer.from(str, this.enc);
}

}

/**
 * Flushes the encoder at end of input.
 *
 * If a high surrogate is still being held back, clears it and returns the
 * Buffer produced for that lone surrogate; otherwise returns nothing.
 *
 * @returns {Buffer|undefined} Bytes for the leftover surrogate, or undefined.
 */
InternalEncoderUtf8.prototype.end = function () {
    // Nothing pending: fall through with no return value.
    if (!this.highSurrogate) {
        return;
    }

    var pending = this.highSurrogate;
    // Reset before encoding so the instance is left with no pending state.
    this.highSurrogate = '';
    return Buffer.from(pending, this.enc);
}
34 changes: 29 additions & 5 deletions test/streams-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -232,11 +232,6 @@ describe("Streaming mode", function() {
output: "e4b882",
}));

it("Encoding using internal modules: utf8 with surrogates", checkEncodeStream({
encoding: "utf8",
input: ["\uD83D\uDE3B"],
output: "f09f98bb",
}));

it("Decoding of incomplete chars in DBCS (gbk)", checkDecodeStream({
encoding: "gbk",
Expand Down Expand Up @@ -331,3 +326,32 @@ describe("Streaming sugar", function() {
});
});

describe("Encoding using internal modules with surrogates in separate chunks:", function () {
    // Builds a stream check whose expected output is the UTF-8 hex of all
    // chunks joined together, i.e. chunking must not change the encoded bytes.
    function checkUtf8EncodeStream (input) {
        var joined = input.join('');
        var expectedHex = Buffer.from(joined, 'utf8').toString('hex');
        return checkEncodeStream({
            encoding: "utf8",
            input: input,
            output: expectedHex
        });
    }

    // Each entry is [test title, input chunks]; tests are registered in this order.
    var cases = [
        ["a single string", ["\uD83D\uDE3B"]],
        ["normal", ["\uD83D", "\uDE3B"]],
        ["reverse", ["\uDE3B", "\uD83D"]],
        ["multiple surrogates", ["\uD83D", "\uDE3B\uD83D", "\uDE3B"]],
        ["more than one character with left", ["abc\uD83D", "\uDE3B"]],
        ["more than one character with right", ["\uD83D", "\uDE3Befg"]],
        ["more than one character at both ends", ["abc\uD83D", "\uDE3Befg"]],
        ["surrogates pair be interrupted", ["abc\uD83D", "efg\uDE3B"]],
        ["a half of surrogates pair only left", ["abc\uD83D"]],
        ["a half of surrogates pair only right", ["\uDE3Befg"]]
    ];

    cases.forEach(function (entry) {
        it(entry[0], checkUtf8EncodeStream(entry[1]));
    });
});