From c98b00e88bb685e75b769be67919a23a7f03b2e0 Mon Sep 17 00:00:00 2001 From: Chi Tsai Date: Fri, 1 Nov 2024 13:52:46 -0700 Subject: [PATCH] Add utf16 method to JSI Summary: X-link: https://github.com/facebook/react-native/pull/47356 Add utf16 method to JSI. This change will add the default implementation for all VMs by calling UTF8 and manually convert it to UTF16. A later change will be added for Hermes to use internal VM information to get the UTF16 string. Changelog: [Internal] Reviewed By: neildhar Differential Revision: D64918244 fbshipit-source-id: 6fc0c44fc397c2f8bb40a4262596b178ee4f1f29 --- API/jsi/jsi/decorator.h | 16 +++++ API/jsi/jsi/jsi.cpp | 111 +++++++++++++++++++++++++++++++++++ API/jsi/jsi/jsi.h | 13 ++++ API/jsi/jsi/test/testlib.cpp | 59 +++++++++++++++++++ 4 files changed, 199 insertions(+) diff --git a/API/jsi/jsi/decorator.h b/API/jsi/jsi/decorator.h index 07705161702..5e57b44a87d 100644 --- a/API/jsi/jsi/decorator.h +++ b/API/jsi/jsi/decorator.h @@ -225,6 +225,13 @@ class RuntimeDecorator : public Base, private jsi::Instrumentation { return plain_.utf8(s); } + std::u16string utf16(const String& str) override { + return plain_.utf16(str); + } + std::u16string utf16(const PropNameID& sym) override { + return plain_.utf16(sym); + } + Object createObject() override { return plain_.createObject(); }; @@ -674,6 +681,15 @@ class WithRuntimeDecorator : public RuntimeDecorator { return RD::utf8(s); } + std::u16string utf16(const String& str) override { + Around around{with_}; + return RD::utf16(str); + } + std::u16string utf16(const PropNameID& sym) override { + Around around{with_}; + return RD::utf16(sym); + } + Value createValueFromJsonUtf8(const uint8_t* json, size_t length) override { Around around{with_}; return RD::createValueFromJsonUtf8(json, length); diff --git a/API/jsi/jsi/jsi.cpp b/API/jsi/jsi/jsi.cpp index 3a54aa1751c..8de94ebc2f2 100644 --- a/API/jsi/jsi/jsi.cpp +++ b/API/jsi/jsi/jsi.cpp @@ -62,6 +62,107 @@ Value 
callGlobalFunction(Runtime& runtime, const char* name, const Value& arg) { return f.call(runtime, arg); } +// Given a sequence of UTF8 encoded bytes, advance the input past where a +// 32-bit unicode codepoint has been decoded and return the codepoint. If the +// UTF8 encoding is invalid, then return the Unicode replacement +// character (U+FFFD). This decoder also relies on zero termination at end of +// the input for bound checks. +// \param input char pointer pointing to the current character +// \return Unicode codepoint +uint32_t decodeUTF8(const char*& input) { + uint32_t ch = (unsigned char)input[0]; + if (ch <= 0x7f) { + input += 1; + return ch; + } + uint32_t ret; + constexpr uint32_t replacementCharacter = 0xFFFD; + if ((ch & 0xE0) == 0xC0) { + uint32_t ch1 = (unsigned char)input[1]; + if ((ch1 & 0xC0) != 0x80) { + input += 1; + return replacementCharacter; + } + ret = ((ch & 0x1F) << 6) | (ch1 & 0x3F); + input += 2; + if (ret <= 0x7F) { + return replacementCharacter; + } + } else if ((ch & 0xF0) == 0xE0) { + uint32_t ch1 = (unsigned char)input[1]; + if ((ch1 & 0x40) != 0 || (ch1 & 0x80) == 0) { + input += 1; + return replacementCharacter; + } + uint32_t ch2 = (unsigned char)input[2]; + if ((ch2 & 0x40) != 0 || (ch2 & 0x80) == 0) { + input += 2; + return replacementCharacter; + } + ret = ((ch & 0x0F) << 12) | ((ch1 & 0x3F) << 6) | (ch2 & 0x3F); + input += 3; + if (ret <= 0x7FF) { + return replacementCharacter; + } + } else if ((ch & 0xF8) == 0xF0) { + uint32_t ch1 = (unsigned char)input[1]; + if ((ch1 & 0x40) != 0 || (ch1 & 0x80) == 0) { + input += 1; + return replacementCharacter; + } + uint32_t ch2 = (unsigned char)input[2]; + if ((ch2 & 0x40) != 0 || (ch2 & 0x80) == 0) { + input += 2; + return replacementCharacter; + } + uint32_t ch3 = (unsigned char)input[3]; + if ((ch3 & 0x40) != 0 || (ch3 & 0x80) == 0) { + input += 3; + return replacementCharacter; + } + ret = ((ch & 0x07) << 18) | ((ch1 & 0x3F) << 12) | ((ch2 & 0x3F) << 6) | + (ch3 
& 0x3F); + input += 4; + if (ret <= 0xFFFF) { + return replacementCharacter; + } + if (ret > 0x10FFFF) { + return replacementCharacter; + } + } else { + input += 1; + return replacementCharacter; + } + return ret; +} + +// Given a valid 32-bit unicode codepoint, encode it as UTF-16 into the output. +void encodeUTF16(std::u16string& out, uint32_t cp) { + if (cp < 0x10000) { + out.push_back((uint16_t)cp); + return; + } + cp -= 0x10000; + uint16_t highSurrogate = 0xD800 + ((cp >> 10) & 0x3FF); + out.push_back(highSurrogate); + uint16_t lowSurrogate = 0xDC00 + (cp & 0x3FF); + out.push_back(lowSurrogate); +} + +// Convert the UTF8 encoded string into a UTF16 encoded string. If the +// input is not valid UTF8, the replacement character (U+FFFD) is used to +// represent the invalid sequence. +std::u16string convertUTF8ToUTF16(const std::string& utf8) { + std::u16string ret; + const char* curr = utf8.data(); + const char* end = curr + utf8.length(); + while (curr < end) { + auto cp = decodeUTF8(curr); + encodeUTF16(ret, cp); + } + return ret; +} + } // namespace Buffer::~Buffer() = default; @@ -147,6 +248,16 @@ Value Runtime::createValueFromJsonUtf8(const uint8_t* json, size_t length) { return parseJson.call(*this, String::createFromUtf8(*this, json, length)); } +std::u16string Runtime::utf16(const PropNameID& sym) { + auto utf8Str = utf8(sym); + return convertUTF8ToUTF16(utf8Str); +} + +std::u16string Runtime::utf16(const String& str) { + auto utf8Str = utf8(str); + return convertUTF8ToUTF16(utf8Str); +} + Pointer& Pointer::operator=(Pointer&& other) { if (ptr_) { ptr_->invalidate(); diff --git a/API/jsi/jsi/jsi.h b/API/jsi/jsi/jsi.h index 16e251dc5f3..8d75b06c96e 100644 --- a/API/jsi/jsi/jsi.h +++ b/API/jsi/jsi/jsi.h @@ -399,6 +399,9 @@ class JSI_EXPORT Runtime { const jsi::Object& obj, size_t amount) = 0; + virtual std::u16string utf16(const String& str); + virtual std::u16string utf16(const PropNameID& sym); + // These exist so derived classes can access the private 
parts of // Value, Symbol, String, and Object, which are all friends of Runtime. template @@ -501,6 +504,11 @@ class JSI_EXPORT PropNameID : public Pointer { return runtime.utf8(*this); } + /// Copies the data in a PropNameID as utf16 into a C++ string. + std::u16string utf16(Runtime& runtime) const { + return runtime.utf16(*this); + } + static bool compare( Runtime& runtime, const jsi::PropNameID& a, @@ -651,6 +659,11 @@ class JSI_EXPORT String : public Pointer { return runtime.utf8(*this); } + /// Copies the data in a JS string as utf16 into a C++ string. + std::u16string utf16(Runtime& runtime) const { + return runtime.utf16(*this); + } + friend class Runtime; friend class Value; }; diff --git a/API/jsi/jsi/test/testlib.cpp b/API/jsi/jsi/test/testlib.cpp index d9090bdb2aa..57d5afbcfc4 100644 --- a/API/jsi/jsi/test/testlib.cpp +++ b/API/jsi/jsi/test/testlib.cpp @@ -1576,6 +1576,65 @@ TEST_P(JSITest, UTF8ExceptionTest) { } } +TEST_P(JSITest, UTF16Test) { + // This Runtime Decorator is used to test the conversion from UTF-8 to UTF-16 + // in the default utf16 method for runtimes that do not provide their own + // utf16 implementation. 
+ class UTF16RD : public RuntimeDecorator { + public: + UTF16RD(Runtime& rt) : RuntimeDecorator(rt) {} + + std::string utf8(const String&) override { + return utf8Str; + } + + std::u16string utf16(const String& str) override { + return Runtime::utf16(str); + } + + std::string utf8Str; + }; + + UTF16RD rd = UTF16RD(rt); + String str = String::createFromUtf8(rd, "placeholder"); + + rd.utf8Str = "foobar"; + EXPECT_EQ(str.utf16(rd), u"foobar"); + + rd.utf8Str = "你好"; + EXPECT_EQ(str.utf16(rd), u"你好"); + + rd.utf8Str = "👍"; + EXPECT_EQ(str.utf16(rd), u"👍"); + + rd.utf8Str = "foobar👍你好"; + EXPECT_EQ(str.utf16(rd), u"foobar👍你好"); + + // String ended before second byte of the encoding + rd.utf8Str = "\xcf"; + EXPECT_EQ(str.utf16(rd), u"\uFFFD"); + + // Third byte should follow the pattern of 0b10xxxxxx + rd.utf8Str = "\xef\x8f\x29"; + EXPECT_EQ(str.utf16(rd), u"\uFFFD\u0029"); + + // U+2200 should be encoded in 3 bytes as 0xE2 0x88 0x80, not 4 bytes + rd.utf8Str = "\xf0\x82\x88\x80"; + EXPECT_EQ(str.utf16(rd), u"\uFFFD"); + + // Unicode Max Value is U+10FFFF, U+11FFFF is invalid + rd.utf8Str = "\xf4\x9f\xbf\xbf"; + EXPECT_EQ(str.utf16(rd), u"\uFFFD"); + + // Missing the third byte of the 3-byte encoding, followed by 'z' + rd.utf8Str = "\xe1\xa0\x7a"; + EXPECT_EQ(str.utf16(rd), u"\uFFFD\u007A"); + + // 3-byte lead byte followed directly by ASCII 'z' instead of a continuation byte + rd.utf8Str = "\xea\x7a"; + EXPECT_EQ(str.utf16(rd), u"\uFFFD\u007A"); +} + INSTANTIATE_TEST_CASE_P( Runtimes, JSITest,