Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

util: add fast path for Latin1 decoding #55275

Merged
merged 15 commits into from
Dec 3, 2024
2 changes: 1 addition & 1 deletion benchmark/util/text-decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
const common = require('../common.js');

const bench = common.createBenchmark(main, {
encoding: ['utf-8', 'latin1', 'iso-8859-3'],
encoding: ['utf-8', 'windows-1252', 'iso-8859-3'],
ignoreBOM: [0, 1],
fatal: [0, 1],
len: [256, 1024 * 16, 1024 * 128],
Expand Down
10 changes: 9 additions & 1 deletion lib/internal/encoding.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ const kDecoder = Symbol('decoder');
const kEncoder = Symbol('encoder');
const kFatal = Symbol('kFatal');
const kUTF8FastPath = Symbol('kUTF8FastPath');
const kLatin1FastPath = Symbol('kLatin1FastPath');
const kIgnoreBOM = Symbol('kIgnoreBOM');

const {
Expand All @@ -55,6 +56,7 @@ const {
encodeIntoResults,
encodeUtf8String,
decodeUTF8,
decodeLatin1,
} = binding;

const { Buffer } = require('buffer');
Expand Down Expand Up @@ -419,9 +421,10 @@ function makeTextDecoderICU() {
this[kFatal] = Boolean(options?.fatal);
// Fast paths are only supported for UTF-8 and Latin1 (windows-1252).
this[kUTF8FastPath] = enc === 'utf-8';
this[kLatin1FastPath] = enc === 'windows-1252';
this[kHandle] = undefined;

if (!this[kUTF8FastPath]) {
if (!this[kUTF8FastPath] && !this[kLatin1FastPath]) {
this.#prepareConverter();
anonrig marked this conversation as resolved.
Show resolved Hide resolved
}
}
Expand All @@ -438,11 +441,16 @@ function makeTextDecoderICU() {
validateDecoder(this);

this[kUTF8FastPath] &&= !(options?.stream);
this[kLatin1FastPath] &&= !(options?.stream);

if (this[kUTF8FastPath]) {
return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
}

if (this[kLatin1FastPath]) {
anonrig marked this conversation as resolved.
Show resolved Hide resolved
return decodeLatin1(input, this[kIgnoreBOM], this[kFatal]);
}

this.#prepareConverter();

validateObject(options, 'options', kValidateObjectAllowObjectsAndNull);
Expand Down
46 changes: 46 additions & 0 deletions src/encoding_binding.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "encoding_binding.h"
#include "ada.h"
#include "env-inl.h"
#include "node_buffer.h"
#include "node_errors.h"
#include "node_external_reference.h"
#include "simdutf.h"
Expand Down Expand Up @@ -226,6 +227,7 @@ void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data,
SetMethodNoSideEffect(isolate, target, "decodeUTF8", DecodeUTF8);
SetMethodNoSideEffect(isolate, target, "toASCII", ToASCII);
SetMethodNoSideEffect(isolate, target, "toUnicode", ToUnicode);
SetMethodNoSideEffect(isolate, target, "decodeLatin1", DecodeLatin1);
}

void BindingData::CreatePerContextProperties(Local<Object> target,
Expand All @@ -243,6 +245,50 @@ void BindingData::RegisterTimerExternalReferences(
registry->Register(DecodeUTF8);
registry->Register(ToASCII);
registry->Register(ToUnicode);
registry->Register(DecodeLatin1);
}

void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);

CHECK_GE(args.Length(), 1);
if (!(args[0]->IsArrayBuffer() || args[0]->IsSharedArrayBuffer() ||
args[0]->IsArrayBufferView())) {
return node::THROW_ERR_INVALID_ARG_TYPE(
env->isolate(),
"The \"input\" argument must be an instance of ArrayBuffer, "
"SharedArrayBuffer, or ArrayBufferView.");
}

bool ignore_bom = args[1]->IsTrue();
bool has_fatal = args[2]->IsTrue();

ArrayBufferViewContents<uint8_t> buffer(args[0]);
const uint8_t* data = buffer.data();
size_t length = buffer.length();

if (ignore_bom && length > 0 && data[0] == 0xFF) {
data++;
length--;
}

if (length == 0) {
return args.GetReturnValue().SetEmptyString();
}

std::string result(length * 2, '\0');

size_t written = simdutf::convert_latin1_to_utf8(
reinterpret_cast<const char*>(data), length, result.data());

if (has_fatal && written == 0) {
return node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
env->isolate(), "The encoded data was not valid for encoding latin1");
}

Local<Object> buffer_result =
node::Buffer::Copy(env, result.c_str(), written).ToLocalChecked();
args.GetReturnValue().Set(buffer_result);
}

} // namespace encoding_binding
Expand Down
1 change: 1 addition & 0 deletions src/encoding_binding.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class BindingData : public SnapshotableObject {
static void EncodeInto(const v8::FunctionCallbackInfo<v8::Value>& args);
static void EncodeUtf8String(const v8::FunctionCallbackInfo<v8::Value>& args);
static void DecodeUTF8(const v8::FunctionCallbackInfo<v8::Value>& args);
static void DecodeLatin1(const v8::FunctionCallbackInfo<v8::Value>& args);

static void ToASCII(const v8::FunctionCallbackInfo<v8::Value>& args);
static void ToUnicode(const v8::FunctionCallbackInfo<v8::Value>& args);
Expand Down
155 changes: 155 additions & 0 deletions test/cctest/test_encoding_binding.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#include "encoding_binding.h"
#include "env-inl.h"
#include "gtest/gtest.h"
#include "node_test_fixture.h"
#include "v8.h"

namespace node {
namespace encoding_binding {

// Test helper: invokes BindingData::DecodeLatin1 as a JavaScript function
// with the argument list (args[0], ignore_bom, has_fatal).
//
// Parameters:
//   env         environment whose isolate and context are used for the call.
//   args        array whose first element is the input value to decode.
//   ignore_bom  forwarded to the binding as its second argument.
//   has_fatal   forwarded to the binding as its third argument.
//   result      receives the binding's return value on success.
//
// Returns false when the call threw or produced no value; otherwise stores
// the return value in *result and returns true.
bool RunDecodeLatin1(Environment* env,
                     Local<Value> args[],
                     bool ignore_bom,
                     bool has_fatal,
                     Local<Value>* result) {
  Isolate* isolate = env->isolate();
  TryCatch try_catch(isolate);

  constexpr int kArgc = 3;
  Local<Value> call_args[kArgc] = {args[0],
                                   Boolean::New(isolate, ignore_bom),
                                   Boolean::New(isolate, has_fatal)};

  // A FunctionCallbackInfo can only be produced by V8 itself, so expose the
  // binding as a function object and call that instead of constructing the
  // callback info by hand.
  auto context = env->context();
  v8::Local<v8::Function> decode_fn;
  if (!v8::Function::New(context, BindingData::DecodeLatin1)
           .ToLocal(&decode_fn)) {
    return false;
  }

  v8::MaybeLocal<v8::Value> outcome =
      decode_fn->Call(context, v8::Undefined(isolate), kArgc, call_args);

  if (try_catch.HasCaught()) {
    return false;
  }

  // Hand back what the binding returned, not the (empty) caught exception.
  return outcome.ToLocal(result);
}

// Fixture for the DecodeLatin1 binding tests below; it adds nothing on top of
// NodeTestFixture.
class EncodingBindingTest : public NodeTestFixture {};

// 0xC1, 0xE9 and 0xF3 are 'Á', 'é' and 'ó' in Latin1; with both flags off the
// decoded text is expected to be exactly those three characters.
TEST_F(EncodingBindingTest, DecodeLatin1_ValidInput) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  const uint8_t input_bytes[] = {0xC1, 0xE9, 0xF3};
  const size_t byte_count = sizeof(input_bytes);

  // Copy the fixture bytes into a fresh ArrayBuffer and view it as Uint8Array.
  Local<ArrayBuffer> storage = ArrayBuffer::New(isolate, byte_count);
  memcpy(storage->GetBackingStore()->Data(), input_bytes, byte_count);
  Local<Uint8Array> view = Uint8Array::New(storage, 0, byte_count);

  Local<Value> args[] = {view};
  Local<Value> decoded;
  EXPECT_TRUE(RunDecodeLatin1(env, args, false, false, &decoded));

  String::Utf8Value decoded_utf8(isolate, decoded);
  EXPECT_STREQ(*decoded_utf8, "Áéó");
}

// A zero-length Uint8Array is expected to decode successfully to "".
TEST_F(EncodingBindingTest, DecodeLatin1_EmptyInput) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  Local<ArrayBuffer> empty_storage = ArrayBuffer::New(isolate, 0);
  Local<Uint8Array> empty_view = Uint8Array::New(empty_storage, 0, 0);

  Local<Value> args[] = {empty_view};
  Local<Value> decoded;
  EXPECT_TRUE(RunDecodeLatin1(env, args, false, false, &decoded));

  String::Utf8Value decoded_utf8(isolate, decoded);
  EXPECT_STREQ(*decoded_utf8, "");
}

// A JavaScript string is passed as the input value, and the helper is
// expected to report failure for it.
TEST_F(EncodingBindingTest, DecodeLatin1_InvalidInput) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  Local<Value> not_a_buffer =
      String::NewFromUtf8Literal(isolate, "Invalid input");
  Local<Value> args[] = {not_a_buffer};

  Local<Value> unused_result;
  EXPECT_FALSE(RunDecodeLatin1(env, args, false, false, &unused_result));
}

// Input that starts with 0xFE 0xFF, decoded with ignore_bom set.
//
// BindingData::DecodeLatin1 only drops a single leading 0xFF byte when
// ignore_bom is set. This input starts with 0xFE, so nothing is stripped and
// both leading bytes decode as ordinary Latin1 characters ('þ' and 'ÿ'). The
// previous expectation of "Áéó" could not be met by that implementation.
TEST_F(EncodingBindingTest, DecodeLatin1_IgnoreBOM) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  const uint8_t latin1_data[] = {0xFE, 0xFF, 0xC1, 0xE9, 0xF3};
  Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, sizeof(latin1_data));
  memcpy(ab->GetBackingStore()->Data(), latin1_data, sizeof(latin1_data));

  Local<Uint8Array> array = Uint8Array::New(ab, 0, sizeof(latin1_data));
  Local<Value> args[] = {array};

  Local<Value> result;
  EXPECT_TRUE(RunDecodeLatin1(env, args, true, false, &result));

  String::Utf8Value utf8_result(isolate, result);
  EXPECT_STREQ(*utf8_result, "þÿÁéó");
}

// Fatal mode with the bytes 0xFF 0xFF 0xFF.
//
// Every byte value is a valid Latin1 character, so there is no input that
// fatal mode can reject: 0xFF is 'ÿ'. The previous EXPECT_FALSE assumed the
// call would fail, which the decoder has no way to do for this input. The
// test therefore pins that fatal mode still decodes the bytes.
TEST_F(EncodingBindingTest, DecodeLatin1_FatalInvalidInput) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  const uint8_t invalid_data[] = {0xFF, 0xFF, 0xFF};
  Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, sizeof(invalid_data));
  memcpy(ab->GetBackingStore()->Data(), invalid_data, sizeof(invalid_data));

  Local<Uint8Array> array = Uint8Array::New(ab, 0, sizeof(invalid_data));
  Local<Value> args[] = {array};

  Local<Value> result;
  EXPECT_TRUE(RunDecodeLatin1(env, args, false, true, &result));

  String::Utf8Value utf8_result(isolate, result);
  EXPECT_STREQ(*utf8_result, "ÿÿÿ");
}

// Input that starts with 0xFE 0xFF, decoded with both ignore_bom and fatal
// set.
//
// BindingData::DecodeLatin1 only drops a single leading 0xFF byte when
// ignore_bom is set, and no Latin1 byte is invalid. The leading 0xFE 0xFF
// pair is therefore decoded as 'þ' and 'ÿ' rather than removed. The previous
// expectation of "Áéó" could not be met by that implementation.
TEST_F(EncodingBindingTest, DecodeLatin1_IgnoreBOMAndFatal) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  const uint8_t latin1_data[] = {0xFE, 0xFF, 0xC1, 0xE9, 0xF3};
  Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, sizeof(latin1_data));
  memcpy(ab->GetBackingStore()->Data(), latin1_data, sizeof(latin1_data));

  Local<Uint8Array> array = Uint8Array::New(ab, 0, sizeof(latin1_data));
  Local<Value> args[] = {array};

  Local<Value> result;
  EXPECT_TRUE(RunDecodeLatin1(env, args, true, true, &result));

  String::Utf8Value utf8_result(isolate, result);
  EXPECT_STREQ(*utf8_result, "þÿÁéó");
}

// Input with a leading 0xFF byte, decoded with ignore_bom set: the expected
// text omits that first byte and contains only 'Á', 'é' and 'ó'.
TEST_F(EncodingBindingTest, DecodeLatin1_BOMPresent) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  const uint8_t input_bytes[] = {0xFF, 0xC1, 0xE9, 0xF3};
  const size_t byte_count = sizeof(input_bytes);

  // Copy the fixture bytes into a fresh ArrayBuffer and view it as Uint8Array.
  Local<ArrayBuffer> storage = ArrayBuffer::New(isolate, byte_count);
  memcpy(storage->GetBackingStore()->Data(), input_bytes, byte_count);
  Local<Uint8Array> view = Uint8Array::New(storage, 0, byte_count);

  Local<Value> args[] = {view};
  Local<Value> decoded;
  EXPECT_TRUE(RunDecodeLatin1(env, args, true, false, &decoded));

  String::Utf8Value decoded_utf8(isolate, decoded);
  EXPECT_STREQ(*decoded_utf8, "Áéó");
}

} // namespace encoding_binding
} // namespace node
Loading