Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add utf16 method to JSI #47356

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions packages/react-native/ReactCommon/jsi/jsi/decorator.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,13 @@ class RuntimeDecorator : public Base, private jsi::Instrumentation {
return plain_.utf8(s);
}

/// Decorator pass-through for utf16(String): forwards \p str unchanged to
/// plain_.utf16() and returns its std::u16string result as-is.
std::u16string utf16(const String& str) override {
return plain_.utf16(str);
}
/// Decorator pass-through for utf16(PropNameID): forwards \p sym unchanged to
/// plain_.utf16() and returns its std::u16string result as-is.
std::u16string utf16(const PropNameID& sym) override {
return plain_.utf16(sym);
}

/// Decorator pass-through for createObject(): forwards the call to
/// plain_.createObject() and returns the Object it produces.
Object createObject() override {
  return plain_.createObject();
}
Expand Down Expand Up @@ -674,6 +681,15 @@ class WithRuntimeDecorator : public RuntimeDecorator<Plain, Base> {
return RD::utf8(s);
}

/// utf16(String) with the decorator hook applied: an Around object is built
/// from with_ before delegating to the RuntimeDecorator implementation and is
/// destroyed when this function returns.
std::u16string utf16(const String& str) override {
  Around scopeGuard{with_};
  return RD::utf16(str);
}
/// utf16(PropNameID) with the decorator hook applied: an Around object is
/// built from with_ before delegating to the RuntimeDecorator implementation
/// and is destroyed when this function returns.
std::u16string utf16(const PropNameID& sym) override {
  Around scopeGuard{with_};
  return RD::utf16(sym);
}

Value createValueFromJsonUtf8(const uint8_t* json, size_t length) override {
Around around{with_};
return RD::createValueFromJsonUtf8(json, length);
Expand Down
111 changes: 111 additions & 0 deletions packages/react-native/ReactCommon/jsi/jsi/jsi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,107 @@ Value callGlobalFunction(Runtime& runtime, const char* name, const Value& arg) {
return f.call(runtime, arg);
}

// Decode one codepoint from a UTF-8 byte sequence and move the input pointer
// past the bytes that were consumed.
//
// Malformed input yields the Unicode replacement character (U+FFFD):
//  - a lead byte that is neither ASCII nor a 2/3/4-byte lead consumes 1 byte;
//  - a bad continuation byte consumes only the bytes before it, so the
//    offending byte is decoded again on the next call;
//  - an overlong encoding, or a value above U+10FFFF, consumes the whole
//    sequence.
//
// No explicit length is taken. Continuation bytes are read one at a time and
// decoding stops at the first byte that is not of the form 0b10xxxxxx, so the
// caller must guarantee a zero terminator after the last input byte.
// \param input char pointer pointing to the current character; updated to
//   point just past the consumed bytes
// \return Unicode codepoint, or U+FFFD for an invalid sequence
uint32_t decodeUTF8(const char*& input) {
  constexpr uint32_t kReplacement = 0xFFFD;
  constexpr uint32_t kMaxCodepoint = 0x10FFFF;

  const auto byteAt = [&input](int offset) -> uint32_t {
    return static_cast<unsigned char>(input[offset]);
  };

  const uint32_t lead = byteAt(0);
  if (lead <= 0x7F) {
    ++input;
    return lead;
  }

  // Classify the lead byte: total sequence length, payload bits carried by the
  // lead byte, and the smallest codepoint that legitimately needs that length.
  int length = 0;
  uint32_t codepoint = 0;
  uint32_t smallestAllowed = 0;
  if ((lead & 0xE0) == 0xC0) {
    length = 2;
    codepoint = lead & 0x1F;
    smallestAllowed = 0x80;
  } else if ((lead & 0xF0) == 0xE0) {
    length = 3;
    codepoint = lead & 0x0F;
    smallestAllowed = 0x800;
  } else if ((lead & 0xF8) == 0xF0) {
    length = 4;
    codepoint = lead & 0x07;
    smallestAllowed = 0x10000;
  } else {
    ++input;
    return kReplacement;
  }

  // Fold in continuation bytes. Each byte is only read after the previous one
  // was validated, which keeps reads within the zero-terminated input.
  for (int i = 1; i < length; ++i) {
    const uint32_t next = byteAt(i);
    if ((next & 0xC0) != 0x80) {
      input += i;
      return kReplacement;
    }
    codepoint = (codepoint << 6) | (next & 0x3F);
  }
  input += length;

  // Reject overlong encodings and values beyond the Unicode range.
  if (codepoint < smallestAllowed || codepoint > kMaxCodepoint) {
    return kReplacement;
  }
  return codepoint;
}

// Append the UTF-16 encoding of a codepoint to `out`.
// Codepoints below 0x10000 are written as a single code unit; anything at or
// above 0x10000 is written as a high/low surrogate pair. The caller is
// expected to pass a valid Unicode codepoint; no validation happens here.
void encodeUTF16(std::u16string& out, uint32_t cp) {
  constexpr uint32_t kSupplementaryStart = 0x10000;
  constexpr uint32_t kTenBitMask = 0x3FF;

  if (cp >= kSupplementaryStart) {
    const uint32_t offset = cp - kSupplementaryStart;
    // Upper 10 bits go into the high surrogate, lower 10 into the low one.
    const uint32_t upperBits = (offset >> 10) & kTenBitMask;
    const uint32_t lowerBits = offset & kTenBitMask;
    out.push_back(static_cast<char16_t>(0xD800 + upperBits));
    out.push_back(static_cast<char16_t>(0xDC00 + lowerBits));
    return;
  }

  out.push_back(static_cast<char16_t>(cp));
}

// Transcode a UTF-8 std::string into a UTF-16 std::u16string.
// The input is walked one codepoint at a time with decodeUTF8 and each result
// is appended with encodeUTF16; malformed UTF-8 therefore shows up in the
// output as the replacement character (U+FFFD).
std::u16string convertUTF8ToUTF16(const std::string& utf8) {
  std::u16string result;
  const char* const stop = utf8.data() + utf8.length();
  // decodeUTF8 advances the cursor itself, so the loop has no increment step.
  for (const char* cursor = utf8.data(); cursor < stop;) {
    const uint32_t codepoint = decodeUTF8(cursor);
    encodeUTF16(result, codepoint);
  }
  return result;
}

} // namespace

Buffer::~Buffer() = default;
Expand Down Expand Up @@ -147,6 +248,16 @@ Value Runtime::createValueFromJsonUtf8(const uint8_t* json, size_t length) {
return parseJson.call(*this, String::createFromUtf8(*this, json, length));
}

// Default utf16(PropNameID): fetch the UTF-8 form via utf8(sym) and transcode
// it to UTF-16 with convertUTF8ToUTF16.
std::u16string Runtime::utf16(const PropNameID& sym) {
  return convertUTF8ToUTF16(utf8(sym));
}

// Default utf16(String): fetch the UTF-8 form via utf8(str) and transcode it
// to UTF-16 with convertUTF8ToUTF16.
std::u16string Runtime::utf16(const String& str) {
  return convertUTF8ToUTF16(utf8(str));
}

Pointer& Pointer::operator=(Pointer&& other) {
if (ptr_) {
ptr_->invalidate();
Expand Down
13 changes: 13 additions & 0 deletions packages/react-native/ReactCommon/jsi/jsi/jsi.h
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,9 @@ class JSI_EXPORT Runtime {
const jsi::Object& obj,
size_t amount) = 0;

/// \return the contents of \p str / \p sym as a std::u16string.
/// These are virtual but not pure: Runtime provides a definition (in jsi.cpp)
/// that converts the result of utf8() to UTF-16, and subclasses may override
/// it.
virtual std::u16string utf16(const String& str);
virtual std::u16string utf16(const PropNameID& sym);

// These exist so derived classes can access the private parts of
// Value, Symbol, String, and Object, which are all friends of Runtime.
template <typename T>
Expand Down Expand Up @@ -501,6 +504,11 @@ class JSI_EXPORT PropNameID : public Pointer {
return runtime.utf8(*this);
}

/// Copies the data in a PropNameID as utf16 into a C++ string.
/// \param runtime the Runtime whose utf16(const PropNameID&) performs the
///   conversion.
/// \return the std::u16string produced by \p runtime for this PropNameID.
std::u16string utf16(Runtime& runtime) const {
return runtime.utf16(*this);
}

static bool compare(
Runtime& runtime,
const jsi::PropNameID& a,
Expand Down Expand Up @@ -651,6 +659,11 @@ class JSI_EXPORT String : public Pointer {
return runtime.utf8(*this);
}

/// Copies the data in a JS string as utf16 into a C++ string.
/// \param runtime the Runtime whose utf16(const String&) performs the
///   conversion.
/// \return the std::u16string produced by \p runtime for this String.
std::u16string utf16(Runtime& runtime) const {
return runtime.utf16(*this);
}

friend class Runtime;
friend class Value;
};
Expand Down
59 changes: 59 additions & 0 deletions packages/react-native/ReactCommon/jsi/jsi/test/testlib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1576,6 +1576,65 @@ TEST_P(JSITest, UTF8ExceptionTest) {
}
}

// Exercises Runtime's default utf16(String) implementation (UTF-8 -> UTF-16
// conversion) on valid ASCII, BMP and supplementary-plane text, and on several
// malformed UTF-8 inputs that must decode to U+FFFD.
TEST_P(JSITest, UTF16Test) {
// This Runtime Decorator is used to test the conversion from UTF-8 to UTF-16
// in the default utf16 method for runtimes that do not provide their own
// utf16 implementation.
class UTF16RD : public RuntimeDecorator<Runtime, Runtime> {
public:
UTF16RD(Runtime& rt) : RuntimeDecorator(rt) {}

// Ignores the String argument and returns whatever bytes the test stored in
// utf8Str, so arbitrary (including invalid) UTF-8 can be injected.
std::string utf8(const String&) override {
return utf8Str;
}

// Calls Runtime::utf16 directly instead of the RuntimeDecorator override,
// so the base-class conversion is the code under test.
std::u16string utf16(const String& str) override {
return Runtime::utf16(str);
}

// Bytes returned by utf8(); reassigned before each expectation below.
std::string utf8Str;
};

UTF16RD rd = UTF16RD(rt);
// The actual contents of this String are irrelevant: utf8() above ignores it.
String str = String::createFromUtf8(rd, "placeholder");

// Plain ASCII
rd.utf8Str = "foobar";
EXPECT_EQ(str.utf16(rd), u"foobar");

// 3-byte UTF-8 sequences (single UTF-16 code unit each)
rd.utf8Str = "你好";
EXPECT_EQ(str.utf16(rd), u"你好");

// 4-byte UTF-8 sequence (UTF-16 surrogate pair)
rd.utf8Str = "👍";
EXPECT_EQ(str.utf16(rd), u"👍");

// Mixture of the above
rd.utf8Str = "foobar👍你好";
EXPECT_EQ(str.utf16(rd), u"foobar👍你好");

// String ended before second byte of the encoding
rd.utf8Str = "\xcf";
EXPECT_EQ(str.utf16(rd), u"\uFFFD");

// Third byte should follow the pattern of 0b10xxxxxx
rd.utf8Str = "\xef\x8f\x29";
EXPECT_EQ(str.utf16(rd), u"\uFFFD\u0029");

// U+2200 should be encoded in 3 bytes as 0xE2 0x88 0x80, not 4 bytes
rd.utf8Str = "\xf0\x82\x88\x80";
EXPECT_EQ(str.utf16(rd), u"\uFFFD");

// Unicode Max Value is U+10FFFF, U+11FFFF is invalid
rd.utf8Str = "\xf4\x9f\xbf\xbf";
EXPECT_EQ(str.utf16(rd), u"\uFFFD");

// Missing the third byte of the 3-byte encoding, followed by 'z'
rd.utf8Str = "\xe1\xa0\x7a";
EXPECT_EQ(str.utf16(rd), u"\uFFFD\u007A");

// 0xEA is the lead byte of a 3-byte encoding, but the second byte ('z') is
// not a continuation byte (0b10xxxxxx), so only the lead byte is replaced
rd.utf8Str = "\xea\x7a";
EXPECT_EQ(str.utf16(rd), u"\uFFFD\u007A");
}

INSTANTIATE_TEST_CASE_P(
Runtimes,
JSITest,
Expand Down
Loading