Skip to content

Commit 3863a36

Browse files
dannysu authored and facebook-github-bot committed
Add TextEncoder.prototype.encodeInto()
Summary: Implement TextEncoder's `encodeInto()` function. Reviewed By: avp Differential Revision: D53216139 fbshipit-source-id: eb4f5a1461084d22c77a7c0723de624f50468785
1 parent 7f9d9d5 commit 3863a36

File tree

6 files changed

+257
-29
lines changed

6 files changed

+257
-29
lines changed

include/hermes/Support/UTF8.h

+10
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,16 @@ bool convertUTF16ToUTF8WithReplacements(
245245
llvh::ArrayRef<char16_t> input,
246246
size_t maxCharacters = 0);
247247

248+
/// Convert a UTF-16 encoded string \p input into the pre-allocated UTF-8
249+
/// buffer \p outBuffer, replacing unpaired surrogate halves with the Unicode
250+
/// replacement character. Conversion stops before the first code point whose
/// UTF-8 encoding does not fit in the space remaining in \p outBuffer.
251+
/// \return a std::pair with the first element being the number of UTF-16
252+
/// code units consumed from \p input, and the second element being the
253+
/// number of UTF-8 bytes written to \p outBuffer.
254+
std::pair<uint32_t, uint32_t> convertUTF16ToUTF8BufferWithReplacements(
255+
llvh::MutableArrayRef<uint8_t> outBuffer,
256+
llvh::ArrayRef<char16_t> input);
257+
248258
/// Convert a UTF-8 encoded string (with surrogates) \p input to a UTF-8 one
249259
/// (without surrogates), storing the conversion in \p output. Output characters
250260
/// are appended to \p output.

include/hermes/VM/NativeFunctions.def

+1
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ NATIVE_FUNCTION(symbolPrototypeValueOf)
364364
NATIVE_FUNCTION(textEncoderConstructor)
365365
NATIVE_FUNCTION(textEncoderPrototypeEncoding)
366366
NATIVE_FUNCTION(textEncoderPrototypeEncode)
367+
NATIVE_FUNCTION(textEncoderPrototypeEncodeInto)
367368
NATIVE_FUNCTION(throwTypeError)
368369
NATIVE_FUNCTION(typedArrayBaseConstructor)
369370
NATIVE_FUNCTION(typedArrayFrom)

include/hermes/VM/PredefinedStrings.def

+3
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,9 @@ STR(squareSymbolSplit, "[Symbol.split]")
486486

487487
STR(TextEncoder, "TextEncoder")
488488
STR(encode, "encode")
489+
STR(encodeInto, "encodeInto")
490+
STR(read, "read")
491+
STR(written, "written")
489492
STR(encoding, "encoding")
490493
STR(utf8, "utf-8")
491494

lib/Support/UTF8.cpp

+84-29
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,39 @@ void encodeUTF8(char *&dst, uint32_t cp) {
6363
dst = d;
6464
}
6565

66+
/// Decode the Unicode scalar value that starts at \p cur. This combines ES14
67+
/// 11.1.4 CodePointAt() with what https://infra.spec.whatwg.org/#strings says
68+
/// about lone surrogates: "To convert a string into a scalar value string,
69+
/// replace any surrogates with U+FFFD." Therefore, if we encounter any lone
70+
/// surrogate, replace the value with UNICODE_REPLACEMENT_CHARACTER (U+FFFD).
71+
/// The result of this process is that callers only ever see scalar values
72+
/// (aka code points that are not surrogates).
73+
/// \param cur Iterator pointing to the current code unit; it is dereferenced
/// unconditionally, so it must not equal \p end.
74+
/// \param end Iterator pointing to the end of the string
75+
/// \return std::pair with first element being the Unicode code point, and the
76+
/// second being how many UTF-16 code units (1 or 2) were consumed
77+
static std::pair<char32_t, size_t> convertToCodePointAt(
78+
llvh::ArrayRef<char16_t>::iterator cur,
79+
llvh::ArrayRef<char16_t>::iterator end) {
80+
char16_t c = cur[0];
81+
if (isLowSurrogate(c)) {
82+
// Unpaired low surrogate.
83+
return {UNICODE_REPLACEMENT_CHARACTER, 1};
84+
} else if (isHighSurrogate(c)) {
85+
// Leading high surrogate. See if the next code unit is a low surrogate.
86+
if (cur + 1 == end || !isLowSurrogate(cur[1])) {
87+
// Trailing or unpaired high surrogate.
88+
return {UNICODE_REPLACEMENT_CHARACTER, 1};
89+
} else {
90+
// Decode the surrogate pair and report that two code units were consumed.
91+
return {utf16SurrogatePairToCodePoint(c, cur[1]), 2};
92+
}
93+
} else {
94+
// Not a surrogate.
95+
return {c, 1};
96+
}
97+
}
98+
6699
bool convertUTF16ToUTF8WithReplacements(
67100
std::string &out,
68101
llvh::ArrayRef<char16_t> input,
@@ -85,40 +118,62 @@ bool convertUTF16ToUTF8WithReplacements(
85118
continue;
86119
}
87120

88-
// The following logic is a combination of ES14 11.1.4 CodePointAt() and
89-
// what https://infra.spec.whatwg.org/#strings says about what to do with
90-
// singular surrogates: "To convert a string into a scalar value string,
91-
// replace any surrogates with U+FFFD." Therefore, if we encounter any lone
92-
// surrogate, replace the value with UNICODE_REPLACEMENT_CHARACTER (U+FFFD).
93-
// The result of this process is that the enclosing for-loop processes only
94-
// scalar values (aka a code point that is not a surrogate).
95-
char32_t c32;
96-
if (isLowSurrogate(cur[0])) {
97-
// Unpaired low surrogate.
98-
c32 = UNICODE_REPLACEMENT_CHARACTER;
99-
} else if (isHighSurrogate(cur[0])) {
100-
// Leading high surrogate. See if the next character is a low surrogate.
101-
if (cur + 1 == end || !isLowSurrogate(cur[1])) {
102-
// Trailing or unpaired high surrogate.
103-
c32 = UNICODE_REPLACEMENT_CHARACTER;
104-
} else {
105-
// Decode surrogate pair and increment, because we consumed two chars.
106-
c32 = utf16SurrogatePairToCodePoint(cur[0], cur[1]);
107-
++cur;
121+
auto [c32, inputConsumed] = convertToCodePointAt(cur, end);
122+
cur += (inputConsumed - 1);
123+
124+
// The code point to be encoded here is guaranteed to be a valid unicode
125+
// code point and not a surrogate. Because of the convertToCodePointAt()
126+
// process.
127+
std::array<char, UTF8CodepointMaxBytes> buff;
128+
char *ptr = buff.data();
129+
encodeUTF8(ptr, c32);
130+
out.insert(out.end(), buff.data(), ptr);
131+
}
132+
return cur == end;
133+
}
134+
135+
/// Encode \p input as UTF-8 into \p outBuffer, replacing lone surrogates with
/// U+FFFD. Only whole code points are written: encoding stops at the first
/// code point whose UTF-8 sequence does not fit in the remaining space.
/// \return {UTF-16 code units consumed, UTF-8 bytes written}.
std::pair<uint32_t, uint32_t> convertUTF16ToUTF8BufferWithReplacements(
136+
llvh::MutableArrayRef<uint8_t> outBuffer,
137+
llvh::ArrayRef<char16_t> input) {
138+
uint32_t numRead = 0;
139+
uint32_t numWritten = 0;
140+
uint8_t *writtenPtr = outBuffer.begin();
141+
auto end = input.end();
142+
for (auto cur = input.begin(); cur < end; ++cur) {
143+
char16_t c = cur[0];
144+
// ASCII fast-path: one UTF-16 code unit maps to exactly one UTF-8 byte.
145+
if (LLVM_LIKELY(c <= 0x7F)) {
146+
// Output buffer is full; stop without consuming this code unit.
if (numWritten + 1 > outBuffer.size()) {
147+
break;
108148
}
109-
} else {
110-
// Not a surrogate.
111-
c32 = c;
149+
*writtenPtr = static_cast<char>(c);
150+
writtenPtr++;
151+
numWritten++;
152+
numRead++;
153+
continue;
112154
}
113155

114-
// The code point to be converted here is guaranteed to be a valid unicode
115-
// code point and not a surrogate. Because of the conversion above.
116-
char buff[UTF8CodepointMaxBytes];
117-
char *ptr = buff;
156+
auto [c32, inputConsumed] = convertToCodePointAt(cur, end);
157+
// The loop header's ++cur advances past one code unit; skip the additional
// one here when a surrogate pair was consumed.
cur += (inputConsumed - 1);
158+
159+
// The code point to be encoded here is guaranteed to be a valid Unicode
160+
// code point and not a surrogate, because convertToCodePointAt() replaces
161+
// lone surrogates with UNICODE_REPLACEMENT_CHARACTER.
162+
std::array<char, UTF8CodepointMaxBytes> buff;
163+
char *ptr = buff.data();
118164
encodeUTF8(ptr, c32);
119-
out.insert(out.end(), buff, ptr);
165+
166+
size_t convertedLength = ptr - buff.data();
167+
// Never emit a partial UTF-8 sequence: if the whole encoding does not fit,
// stop and leave this code point uncounted in numRead.
if (numWritten + convertedLength > outBuffer.size()) {
168+
break;
169+
}
170+
std::memcpy(writtenPtr, buff.data(), convertedLength);
171+
writtenPtr += convertedLength;
172+
numWritten += convertedLength;
173+
numRead += inputConsumed;
120174
}
121-
return cur == end;
175+

176+
return {numRead, numWritten};
122177
}

124179
void convertUTF16ToUTF8WithSingleSurrogates(

lib/VM/JSLib/TextEncoder.cpp

+103
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,14 @@ Handle<JSObject> createTextEncoderConstructor(Runtime &runtime) {
5151
textEncoderPrototypeEncode,
5252
1);
5353

54+
defineMethod(
55+
runtime,
56+
textEncoderPrototype,
57+
Predefined::getSymbolID(Predefined::encodeInto),
58+
nullptr,
59+
textEncoderPrototypeEncodeInto,
60+
2);
61+
5462
auto cons = defineSystemConstructor<JSObject>(
5563
runtime,
5664
Predefined::getSymbolID(Predefined::TextEncoder),
@@ -182,5 +190,100 @@ textEncoderPrototypeEncode(void *, Runtime &runtime, NativeArgs args) {
182190
}
183191
}
184192

193+
/// Native implementation of TextEncoder.prototype.encodeInto(). Converts
/// argument 0 to a string and encodes it as UTF-8 into the Uint8Array passed
/// as argument 1, writing no more bytes than the array's length.
/// \return a new object with two number properties: `read` (how many UTF-16
/// code units of the string were consumed) and `written` (how many bytes were
/// stored in the array). Raises a TypeError if `this` does not own the
/// internal TextEncoder marker property, if argument 1 is not a Uint8Array,
/// or if that array is detached.
CallResult<HermesValue>
194+
textEncoderPrototypeEncodeInto(void *, Runtime &runtime, NativeArgs args) {
195+
GCScope gcScope{runtime};
196+
auto selfHandle = args.dyncastThis<JSObject>();
197+
NamedPropertyDescriptor desc;
198+
// Receiver check: only objects that own the internal TextEncoder marker
// property are accepted.
bool exists = JSObject::getOwnNamedDescriptor(
199+
selfHandle,
200+
runtime,
201+
Predefined::getSymbolID(Predefined::InternalPropertyTextEncoderType),
202+
desc);
203+
if (LLVM_UNLIKELY(!exists)) {
204+
return runtime.raiseTypeError(
205+
"TextEncoder.prototype.encodeInto() called on non-TextEncoder object");
206+
}
207+
208+
// Coerce the first argument to a string.
auto strRes = toString_RJS(runtime, args.getArgHandle(0));
209+
if (LLVM_UNLIKELY(strRes == ExecutionStatus::EXCEPTION)) {
210+
return ExecutionStatus::EXCEPTION;
211+
}
212+
Handle<StringPrimitive> string = runtime.makeHandle(std::move(*strRes));
213+
214+
// The destination must be a Uint8Array.
Handle<Uint8Array> typedArray = args.dyncastArg<Uint8Array>(1);
215+
if (LLVM_UNLIKELY(!typedArray)) {
216+
return runtime.raiseTypeError("The second argument should be a Uint8Array");
217+
}
218+
219+
if (LLVM_UNLIKELY(!typedArray->attached(runtime))) {
220+
return runtime.raiseTypeError(
221+
"TextEncoder.prototype.encodeInto() called on a detached Uint8Array");
222+
}
223+
224+
// Result object. NOTE(review): the 2 is presumably the number of property
// slots to reserve (`read` and `written`) -- confirm against JSObject::create.
PseudoHandle<JSObject> objRes = JSObject::create(runtime, 2);
225+
Handle<JSObject> obj = runtime.makeHandle(objRes.get());
226+
227+
uint32_t numRead = 0;
228+
uint32_t numWritten = 0;
229+
230+
// Empty string: nothing to encode, both counters stay 0.
if (LLVM_UNLIKELY(string->getStringLength() == 0)) {
231+
numRead = 0;
232+
numWritten = 0;
233+
} else if (string->isASCII()) {
234+
// ASCII string can trivially be converted to UTF-8 because ASCII is a
235+
// strict subset. However, since the output array size is provided by the
236+
// caller, we will only copy as much length as provided.
237+
llvh::ArrayRef<char> strRef = string->getStringRef<char>();
238+
239+
uint32_t copiedLength =
240+
std::min(string->getStringLength(), typedArray->getLength());
241+
242+
std::memcpy(typedArray->begin(runtime), strRef.data(), copiedLength);
243+
244+
// Each ASCII character is one code unit in and one byte out.
numRead = copiedLength;
245+
numWritten = copiedLength;
246+
} else {
247+
// Non-ASCII string: transcode UTF-16 into the given Uint8Array.
248+
llvh::ArrayRef<char16_t> strRef = string->getStringRef<char16_t>();
249+
std::pair<uint32_t, uint32_t> result =
250+
convertUTF16ToUTF8BufferWithReplacements(
251+
llvh::makeMutableArrayRef<uint8_t>(
252+
typedArray->begin(runtime), typedArray->getLength()),
253+
strRef);
254+
numRead = result.first;
255+
numWritten = result.second;
256+
}
257+
258+
// Construct the result JSObject containing information about how much data
259+
// was converted.
260+
auto numReadHandle =
261+
runtime.makeHandle(HermesValue::encodeTrustedNumberValue(numRead));
262+
auto numWrittenHandle =
263+
runtime.makeHandle(HermesValue::encodeTrustedNumberValue(numWritten));
264+
265+
// Define `read` on the result object.
auto res = JSObject::defineNewOwnProperty(
266+
obj,
267+
runtime,
268+
Predefined::getSymbolID(Predefined::read),
269+
PropertyFlags::defaultNewNamedPropertyFlags(),
270+
numReadHandle);
271+
if (LLVM_UNLIKELY(res == ExecutionStatus::EXCEPTION)) {
272+
return ExecutionStatus::EXCEPTION;
273+
}
274+
275+
// Define `written` on the result object.
res = JSObject::defineNewOwnProperty(
276+
obj,
277+
runtime,
278+
Predefined::getSymbolID(Predefined::written),
279+
PropertyFlags::defaultNewNamedPropertyFlags(),
280+
numWrittenHandle);
281+
if (LLVM_UNLIKELY(res == ExecutionStatus::EXCEPTION)) {
282+
return ExecutionStatus::EXCEPTION;
283+
}
284+
285+
return obj.getHermesValue();
286+
}
287+
185288
} // namespace vm
186289
} // namespace hermes

test/hermes/text-encoder.js

+56
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,59 @@ print(result.length, result.join(' '));
6464
result = encoder.encode('\u{D83D}');
6565
print(result.length, result.join(' '));
6666
// CHECK-NEXT: 3 239 191 189
67+
68+
// encodeInto() tests. Every case below writes into this same 4-byte buffer.
result = new Uint8Array(4);
69+
70+
// encodeInto() must throw when the receiver is not a TextEncoder.
try {
71+
const b = {};
72+
TextEncoder.prototype.encodeInto.call(b, '', result);
73+
} catch (e) {
74+
print(e.message);
75+
// CHECK-NEXT: TextEncoder.prototype.encodeInto() called on non-TextEncoder object
76+
}
77+
78+
// Test the ASCII case that just fits within the provided buffer
79+
let stats = encoder.encodeInto('test', result);
80+
print(stats.read, stats.written);
81+
// CHECK-NEXT: 4 4
82+
print(result[0], result[1], result[2], result[3]);
83+
// CHECK-NEXT: 116 101 115 116
84+
85+
// Empty input: nothing is read and nothing is written
stats = encoder.encodeInto('', result);
86+
print(stats.read, stats.written);
87+
// CHECK-NEXT: 0 0
88+
89+
// ASCII case that does NOT fit within the provided buffer
90+
stats = encoder.encodeInto('testing', result);
91+
print(stats.read, stats.written);
92+
// CHECK-NEXT: 4 4
93+
print(result[0], result[1], result[2], result[3]);
94+
// CHECK-NEXT: 116 101 115 116
95+
96+
// ASCII case that is smaller than the provided buffer
97+
stats = encoder.encodeInto('abc', result);
98+
print(stats.read, stats.written);
99+
// CHECK-NEXT: 3 3
100+
print(result[0], result[1], result[2]);
101+
// CHECK-NEXT: 97 98 99
102+
103+
// UTF-16 case that fits within the provided buffer
104+
stats = encoder.encodeInto('\u{2191}', result);
105+
print(stats.read, stats.written);
106+
// CHECK-NEXT: 1 3
107+
print(result[0], result[1], result[2]);
108+
// CHECK-NEXT: 226 134 145
109+
110+
// UTF-16 case that does NOT fit within the provided buffer
// (only the first 3-byte character is encoded; the second is left unread)
111+
stats = encoder.encodeInto('\u{2191}\u{2192}', result);
112+
print(stats.read, stats.written);
113+
// CHECK-NEXT: 1 3
114+
print(result[0], result[1], result[2]);
115+
// CHECK-NEXT: 226 134 145
116+
117+
// Surrogate case that just fits within the provided buffer
// (two UTF-16 code units are read and four UTF-8 bytes are written)
118+
stats = encoder.encodeInto('\u{D83D}\u{DE03}', result);
119+
print(stats.read, stats.written);
120+
// CHECK-NEXT: 2 4
121+
print(result[0], result[1], result[2], result[3]);
122+
// CHECK-NEXT: 240 159 152 131

0 commit comments

Comments
 (0)