Skip to content

Commit

Permalink
Merge pull request #3104 from cloudflare/jsnell/buffer-transcode-use-…
Browse files Browse the repository at this point in the history
…buffersource
  • Loading branch information
jasnell authored Nov 13, 2024
2 parents 12479a1 + d87a9ff commit e0cf3b4
Show file tree
Hide file tree
Showing 8 changed files with 104 additions and 47 deletions.
2 changes: 1 addition & 1 deletion src/node/internal/buffer.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ export function transcode(
source: ArrayBufferView,
fromEncoding: Encoding,
toEncoding: Encoding
): ArrayBuffer;
): Uint8Array;

export const ASCII: Encoding;
export const LATIN1: Encoding;
Expand Down
8 changes: 6 additions & 2 deletions src/node/internal/internal_buffer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2688,9 +2688,13 @@ export function transcode(
if (normalizedToEncoding === undefined) {
throw new ERR_UNKNOWN_ENCODING(toEncoding);
}
return Buffer.from(
bufferUtil.transcode(source, normalizedFromEncoding, normalizedToEncoding)

const u8: Uint8Array = bufferUtil.transcode(
source,
normalizedFromEncoding,
normalizedToEncoding
);
return Buffer.from(u8.buffer, u8.byteOffset, u8.byteLength);
}

export function resolveObjectURL(_id: string): unknown {
Expand Down
8 changes: 5 additions & 3 deletions src/workerd/api/node/buffer.c++
Original file line number Diff line number Diff line change
Expand Up @@ -756,15 +756,17 @@ bool BufferUtil::isUtf8(kj::Array<kj::byte> buffer) {
return simdutf::validate_utf8(buffer.asChars().begin(), buffer.size());
}

// Entry point used by the Buffer API to transcode `source` between two encodings.
// The raw encoding values are cast to `Encoding`, both are checked with
// i18n::canBeTranscoded(), and the conversion itself is delegated to i18n::transcode().
// Both the previous signature (returning kj::Array<kj::byte>) and the new one (taking a
// jsg::Lock& and returning jsg::BufferSource) appear below.
kj::Array<kj::byte> BufferUtil::transcode(
kj::Array<kj::byte> source, EncodingValue rawFromEncoding, EncodingValue rawToEncoding) {
jsg::BufferSource BufferUtil::transcode(jsg::Lock& js,
kj::Array<kj::byte> source,
EncodingValue rawFromEncoding,
EncodingValue rawToEncoding) {
auto fromEncoding = static_cast<Encoding>(rawFromEncoding);
auto toEncoding = static_cast<Encoding>(rawToEncoding);

// Reject the call up front unless both encodings are supported by the transcoder.
JSG_REQUIRE(i18n::canBeTranscoded(fromEncoding) && i18n::canBeTranscoded(toEncoding), Error,
"Unable to transcode buffer due to unsupported encoding");

return i18n::transcode(source, fromEncoding, toEncoding);
return i18n::transcode(js, source, fromEncoding, toEncoding);
}

} // namespace workerd::api::node
6 changes: 4 additions & 2 deletions src/workerd/api/node/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,10 @@ class BufferUtil final: public jsg::Object {
jsg::JsString flush(jsg::Lock& js, kj::Array<kj::byte> state);
bool isAscii(kj::Array<kj::byte> bytes);
bool isUtf8(kj::Array<kj::byte> bytes);
// Transcodes `source` from `rawFromEncoding` to `rawToEncoding`. The newer declaration takes
// the jsg::Lock and returns a jsg::BufferSource instead of a kj::Array<kj::byte>.
kj::Array<kj::byte> transcode(
kj::Array<kj::byte> source, EncodingValue rawFromEncoding, EncodingValue rawToEncoding);
jsg::BufferSource transcode(jsg::Lock& js,
kj::Array<kj::byte> source,
EncodingValue rawFromEncoding,
EncodingValue rawToEncoding);

JSG_RESOURCE_TYPE(BufferUtil) {
JSG_METHOD(byteLength);
Expand Down
108 changes: 72 additions & 36 deletions src/workerd/api/node/i18n.c++
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ namespace i18n {
namespace {

// An isolate has a 128 MiB memory limit (128 * 1024 * 1024 = 134217728 bytes). The transcode
// helpers compare the sizes they compute against this value before allocating an output buffer.
const int ISOLATE_LIMIT = 134217728;
constexpr int ISOLATE_LIMIT = 134217728;

const char* getEncodingName(Encoding input) {
constexpr const char* getEncodingName(Encoding input) {
switch (input) {
case Encoding::ASCII:
return "us-ascii";
Expand All @@ -40,100 +40,130 @@ const char* getEncodingName(Encoding input) {
}
}

// Common signature of the Transcode* helpers in this file. Each helper receives the source
// bytes plus the source and destination encodings and returns the transcoded output, or
// kj::none when the conversion fails. The older form is a plain function pointer; the newer
// form is a kj::Function that additionally receives the jsg::Lock.
typedef kj::Maybe<kj::Array<kj::byte>> (*TranscodeImpl)(
kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding);
using TranscodeImpl = kj::Function<kj::Maybe<jsg::BufferSource>(
jsg::Lock& js, kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding)>;

// Generic transcoder: converts `source` from `fromEncoding` to `toEncoding` with
// ucnv_convertEx(), using one Converter per side. Returns the converted bytes on success and
// kj::none when the conversion reports a failure status.
kj::Maybe<kj::Array<kj::byte>> TranscodeDefault(
kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding) {
kj::Maybe<jsg::BufferSource> TranscodeDefault(
jsg::Lock& js, kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding) {
Converter to(toEncoding);
// The substitute sequence is '?' repeated once per byte of the target's minimum character size.
auto substitute = kj::str(kj::repeat('?', to.minCharSize()));
to.setSubstituteChars(substitute);
Converter from(fromEncoding);

// Worst-case output size: every source byte expands to the target's maximum character size.
size_t limit = source.size() * to.maxCharSize();
// Nothing to convert: hand back an empty Uint8Array-backed buffer without running the converter.
if (limit == 0) {
auto empty = jsg::BackingStore::alloc<v8::Uint8Array>(js, 0);
return jsg::BufferSource(js, kj::mv(empty));
}
// Workers are limited to 128MB so this isn't actually a realistic concern, but sanity check.
JSG_REQUIRE(limit <= ISOLATE_LIMIT, Error, "Source buffer is too large to transcode");
auto out = kj::heapArray<kj::byte>(limit);
char* target = out.asChars().begin();

auto out = jsg::BackingStore::alloc<v8::Uint8Array>(js, limit);
auto outPtr = out.asArrayPtr().asChars();
char* target = outPtr.begin();
const char* source_ = source.asChars().begin();
UErrorCode status{};
// `target` and `source_` are passed by address; the distance `target` has moved from the start
// of the output is used below as the number of bytes written.
ucnv_convertEx(to.conv(), from.conv(), &target, target + limit, &source_, source_ + source.size(),
nullptr, nullptr, nullptr, nullptr, true, true, &status);
if (U_SUCCESS(status)) {
// Shrink the worst-case allocation down to the bytes actually produced.
return out.first(target - out.asChars().begin()).attach(kj::mv(out));
out.limit(target - outPtr.begin());
return jsg::BufferSource(js, kj::mv(out));
}

return kj::none;
}

// Converts Latin-1 input to UTF-16 with simdutf::convert_latin1_to_utf16(). Returns the
// converted output, an empty buffer for empty input (new code path), or kj::none when simdutf
// reports zero code units written.
kj::Maybe<kj::Array<kj::byte>> TranscodeLatin1ToUTF16(
kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding) {
auto length_in_chars = source.size() * sizeof(UChar);
kj::Maybe<jsg::BufferSource> TranscodeLatin1ToUTF16(
jsg::Lock& js, kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding) {
// Despite the name, this is a byte count: one 16-bit code unit per source byte.
auto length_in_chars = source.size() * sizeof(char16_t);
// Workers are limited to 128MB so this isn't actually a realistic concern, but sanity check.
JSG_REQUIRE(length_in_chars <= ISOLATE_LIMIT, Error, "Source buffer is too large to transcode");

// Empty input: return an empty Uint8Array-backed buffer rather than calling simdutf.
if (length_in_chars == 0) {
auto empty = jsg::BackingStore::alloc<v8::Uint8Array>(js, 0);
return jsg::BufferSource(js, kj::mv(empty));
}

// NOTE(review): `from` is not referenced again in this function; it may be kept only for the
// side effects of constructing a Converter for `fromEncoding` — confirm whether it is needed.
Converter from(fromEncoding);
auto destbuf = kj::heapArray<UChar>(length_in_chars);
auto destBuf = jsg::BackingStore::alloc<v8::Uint8Array>(js, length_in_chars);
auto destPtr = destBuf.asArrayPtr<char16_t>();
auto actual_length =
simdutf::convert_latin1_to_utf16(source.asChars().begin(), source.size(), destbuf.begin());
simdutf::convert_latin1_to_utf16(source.asChars().begin(), source.size(), destPtr.begin());

// simdutf returns 0 for invalid value.
if (actual_length == 0) {
return kj::none;
}

// `actual_length` counts 16-bit code units, so it is scaled to bytes before trimming the buffer.
return destbuf.first(actual_length).asBytes().attach(kj::mv(destbuf));
destBuf.limit(actual_length * sizeof(char16_t));
return jsg::BufferSource(js, kj::mv(destBuf));
}

// Converts UTF-16 input to `toEncoding` with ucnv_fromUChars(). Returns the converted output
// on success, an empty buffer when the computed size is zero (new code path), or kj::none when
// ICU reports a failure status.
kj::Maybe<kj::Array<kj::byte>> TranscodeFromUTF16(
kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding) {
kj::Maybe<jsg::BufferSource> TranscodeFromUTF16(
jsg::Lock& js, kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding) {
Converter to(toEncoding);
// The substitute sequence is '?' repeated once per byte of the target's minimum character size.
auto substitute = kj::str(kj::repeat('?', to.minCharSize()));
to.setSubstituteChars(substitute);

// View the source bytes as 16-bit code units. The integer division drops a trailing odd byte;
// unlike TranscodeUTF8FromUTF16, no even-length requirement is enforced here.
auto utf16_input = kj::arrayPtr<char16_t>(
reinterpret_cast<char16_t*>(source.begin()), source.size() / sizeof(UChar));
reinterpret_cast<char16_t*>(source.begin()), source.size() / sizeof(char16_t));

// Worst-case output size: every code unit expands to the target's maximum character size.
const auto limit = utf16_input.size() * to.maxCharSize();

// Workers are limited to 128MB so this isn't actually a realistic concern, but sanity check.
JSG_REQUIRE(limit <= ISOLATE_LIMIT, Error, "Buffer is too large to transcode");

auto destbuf = kj::heapArray<UChar>(limit);
// Despite the name, this is a byte count (twice `limit`); the bound above was checked against
// `limit`, not against this doubled value.
auto length_in_chars = limit * sizeof(char16_t);
if (length_in_chars == 0) {
auto empty = jsg::BackingStore::alloc<v8::Uint8Array>(js, 0);
return jsg::BufferSource(js, kj::mv(empty));
}

auto destBuf = jsg::BackingStore::alloc<v8::Uint8Array>(js, length_in_chars);
auto destPtr = destBuf.asArrayPtr<char16_t>();
UErrorCode status{};
// The destination is passed as a char pointer, with its capacity given as the number of
// char16_t elements in `destPtr`.
auto len = ucnv_fromUChars(to.conv(), destPtr.asChars().begin(), destPtr.size(),
auto len = ucnv_fromUChars(to.conv(), destPtr.asChars().begin(), destPtr.size(),
utf16_input.begin(), utf16_input.size(), &status);

if (U_SUCCESS(status)) {
// NOTE(review): `len` is scaled by sizeof(char16_t) (and the older code takes the first `len`
// 16-bit elements). If ucnv_fromUChars() reports its output length in bytes, the returned
// buffer would be twice as long as the data written — confirm against the ICU documentation.
return destbuf.first(len).asBytes().attach(kj::mv(destbuf));
destBuf.limit(len * sizeof(char16_t));
return jsg::BufferSource(js, kj::mv(destBuf));
}

return kj::none;
}

// Converts UTF-8 input to UTF-16LE with simdutf. The output is sized from
// simdutf::utf16_length_from_utf8() and the conversion is required to produce exactly that many
// code units. Returns the converted output, or an empty buffer when the expected length is zero
// (new code path).
kj::Maybe<kj::Array<kj::byte>> TranscodeUTF16FromUTF8(
kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding) {
kj::Maybe<jsg::BufferSource> TranscodeUTF16FromUTF8(
jsg::Lock& js, kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding) {
size_t expected_utf16_length =
simdutf::utf16_length_from_utf8(source.asChars().begin(), source.size());
// Workers are limited to 128MB so this isn't actually a realistic concern, but sanity check.
// Note that this bound is applied to the code-unit count; the allocation below is twice that
// many bytes.
JSG_REQUIRE(expected_utf16_length <= ISOLATE_LIMIT, Error,
"Expected UTF-16le length is too large to transcode");
auto destbuf = kj::heapArray<UChar>(expected_utf16_length);

// Despite the name, this is a byte count: two bytes per expected UTF-16 code unit.
auto length_in_chars = expected_utf16_length * sizeof(char16_t);
if (length_in_chars == 0) {
auto empty = jsg::BackingStore::alloc<v8::Uint8Array>(js, 0);
return jsg::BufferSource(js, kj::mv(empty));
}

auto destBuf = jsg::BackingStore::alloc<v8::Uint8Array>(js, length_in_chars);
auto destPtr = destBuf.asArrayPtr<char16_t>();

size_t actual_length =
simdutf::convert_utf8_to_utf16le(source.asChars().begin(), source.size(), destbuf.begin());
simdutf::convert_utf8_to_utf16le(source.asChars().begin(), source.size(), destPtr.begin());
// NOTE(review): on the new code path a zero expected length has already returned above, so
// once this requirement holds `actual_length` cannot be zero. Assuming JSG_REQUIRE does not
// return when its condition is false, invalid UTF-8 (for which simdutf returns 0) would be
// reported as this length mismatch and the kj::none branch below would never run — confirm
// whether the zero check was meant to come first.
JSG_REQUIRE(actual_length == expected_utf16_length, Error, "Expected UTF16 length mismatch");

// simdutf returns 0 when the UTF-8 input is invalid.
if (actual_length == 0) {
return kj::none;
}

return destbuf.asBytes().attach(kj::mv(destbuf));
return jsg::BufferSource(js, kj::mv(destBuf));
}

kj::Maybe<kj::Array<kj::byte>> TranscodeUTF8FromUTF16(
kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding) {
kj::Maybe<jsg::BufferSource> TranscodeUTF8FromUTF16(
jsg::Lock& js, kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding) {
JSG_REQUIRE(source.size() % 2 == 0, Error, "UTF-16le input size should be multiple of 2");
auto utf16_input =
kj::arrayPtr<char16_t>(reinterpret_cast<char16_t*>(source.begin()), source.size() / 2);
Expand All @@ -144,18 +174,24 @@ kj::Maybe<kj::Array<kj::byte>> TranscodeUTF8FromUTF16(
JSG_REQUIRE(expected_utf8_length <= ISOLATE_LIMIT, Error,
"Expected UTF-8 length is too large to transcode");

auto destbuf = kj::heapArray<kj::byte>(expected_utf8_length);
if (expected_utf8_length == 0) {
auto empty = jsg::BackingStore::alloc<v8::Uint8Array>(js, 0);
return jsg::BufferSource(js, kj::mv(empty));
}

auto destBuf = jsg::BackingStore::alloc<v8::Uint8Array>(js, expected_utf8_length);
auto destPtr = destBuf.asArrayPtr().asChars();

size_t actual_length = simdutf::convert_utf16le_to_utf8(
utf16_input.begin(), utf16_input.size(), destbuf.asChars().begin());
size_t actual_length =
simdutf::convert_utf16le_to_utf8(utf16_input.begin(), utf16_input.size(), destPtr.begin());
JSG_REQUIRE(actual_length == expected_utf8_length, Error, "Expected UTF8 length mismatch");

// simdutf returns 0 when the UTF-16 input is invalid.
if (actual_length == 0) {
return kj::none;
}

return destbuf.asBytes().attach(kj::mv(destbuf));
return jsg::BufferSource(js, kj::mv(destBuf));
}

} // namespace
Expand Down Expand Up @@ -197,8 +233,8 @@ void Converter::setSubstituteChars(kj::StringPtr sub) {
}
}

kj::Array<kj::byte> transcode(
kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding) {
jsg::BufferSource transcode(
jsg::Lock& js, kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding) {
TranscodeImpl transcode_function = &TranscodeDefault;
switch (fromEncoding) {
case Encoding::ASCII:
Expand Down Expand Up @@ -228,8 +264,8 @@ kj::Array<kj::byte> transcode(
JSG_FAIL_REQUIRE(Error, "Invalid encoding passed to transcode");
}

return JSG_REQUIRE_NONNULL(
transcode_function(source, fromEncoding, toEncoding), Error, "Unable to transcode buffer");
return JSG_REQUIRE_NONNULL(transcode_function(js, source, fromEncoding, toEncoding), Error,
"Unable to transcode buffer");
}

} // namespace i18n
Expand Down
6 changes: 4 additions & 2 deletions src/workerd/api/node/i18n.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// https://opensource.org/licenses/Apache-2.0
#pragma once

#include <workerd/jsg/jsg.h>

#include <kj/common.h>
#include <kj/string.h>

Expand Down Expand Up @@ -54,8 +56,8 @@ class Converter final {
kj::Own<UConverter> conv_;
};

// Transcodes `source` from `fromEncoding` to `toEncoding`. The newer declaration takes the
// jsg::Lock and returns a jsg::BufferSource instead of a kj::Array<kj::byte>.
kj::Array<kj::byte> transcode(
kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding);
jsg::BufferSource transcode(
jsg::Lock& js, kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding);

} // namespace i18n

Expand Down
6 changes: 5 additions & 1 deletion src/workerd/api/node/tests/buffer-nodejs-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -5972,7 +5972,11 @@ export const transcodeTest = {

for (const test in tests) {
const dest = transcode(orig, 'utf8', test);
strictEqual(dest.length, tests[test].length, `utf8->${test} length`);
strictEqual(
dest.length,
tests[test].length,
`utf8->${test} length (${dest.length}, ${tests[test].length})`
);
for (let n = 0; n < tests[test].length; n++) {
strictEqual(dest[n], tests[test][n], `utf8->${test} char ${n}`);
}
Expand Down
7 changes: 7 additions & 0 deletions src/workerd/jsg/buffersource.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,13 @@ class BackingStore {
byteLength -= bytes;
}

// Similar to trim, except that instead of subtracting a count it sets the byte length
// directly to `bytes`. `bytes` must be less than or equal to the current byte length; this is
// enforced with KJ_ASSERT. Only byteLength is modified.
inline void limit(size_t bytes) {
KJ_ASSERT(bytes <= byteLength);
byteLength = bytes;
}

// Returns a new BackingStore constructed from this one's current fields (backing store, byte
// length, byte offset, element size, constructor and integer-type flag).
// NOTE(review): presumably the underlying memory is shared rather than copied — confirm.
inline BackingStore clone() {
return BackingStore(backingStore, byteLength, byteOffset, elementSize, ctor, integerType);
}
Expand Down

0 comments on commit e0cf3b4

Please sign in to comment.