From 9eccd7dba9efaaf647fc1eb464798a2a31f9eb2d Mon Sep 17 00:00:00 2001
From: Mert Can Altin
Date: Tue, 3 Dec 2024 14:32:01 +0300
Subject: [PATCH] util: add fast path for Latin1 decoding

PR-URL: https://github.com/nodejs/node/pull/55275
Reviewed-By: Rafael Gonzaga
Reviewed-By: Yagiz Nizipli
Reviewed-By: James M Snell
Reviewed-By: Daniel Lemire
---
 benchmark/util/text-decoder.js       |   2 +-
 lib/internal/encoding.js             |  12 +-
 src/encoding_binding.cc              |  91 +++++++++++++++
 src/encoding_binding.h               |   1 +
 test/cctest/test_encoding_binding.cc | 166 +++++++++++++++++++++++++++
 5 files changed, 269 insertions(+), 3 deletions(-)
 create mode 100644 test/cctest/test_encoding_binding.cc

diff --git a/benchmark/util/text-decoder.js b/benchmark/util/text-decoder.js
index dd4f02016df077..1aa60f2dd0bcd6 100644
--- a/benchmark/util/text-decoder.js
+++ b/benchmark/util/text-decoder.js
@@ -3,7 +3,7 @@
 const common = require('../common.js');
 
 const bench = common.createBenchmark(main, {
-  encoding: ['utf-8', 'latin1', 'iso-8859-3'],
+  encoding: ['utf-8', 'windows-1252', 'iso-8859-3'],
   ignoreBOM: [0, 1],
   fatal: [0, 1],
   len: [256, 1024 * 16, 1024 * 128],
diff --git a/lib/internal/encoding.js b/lib/internal/encoding.js
index 252eaa75fac22b..b2ca3c612bf6ef 100644
--- a/lib/internal/encoding.js
+++ b/lib/internal/encoding.js
@@ -29,6 +29,7 @@ const kDecoder = Symbol('decoder');
 const kEncoder = Symbol('encoder');
 const kFatal = Symbol('kFatal');
 const kUTF8FastPath = Symbol('kUTF8FastPath');
+const kLatin1FastPath = Symbol('kLatin1FastPath');
 const kIgnoreBOM = Symbol('kIgnoreBOM');
 
 const {
@@ -55,6 +56,7 @@ const {
   encodeIntoResults,
   encodeUtf8String,
   decodeUTF8,
+  decodeLatin1,
 } = binding;
 
 const { Buffer } = require('buffer');
@@ -419,9 +421,10 @@ function makeTextDecoderICU() {
       this[kFatal] = Boolean(options?.fatal);
-      // Only support fast path for UTF-8.
+      // Fast paths exist only for UTF-8 and windows-1252 (the Latin1 labels).
       this[kUTF8FastPath] = enc === 'utf-8';
+      this[kLatin1FastPath] = enc === 'windows-1252';
       this[kHandle] = undefined;
 
-      if (!this[kUTF8FastPath]) {
+      if (!this[kUTF8FastPath] && !this[kLatin1FastPath]) {
         this.#prepareConverter();
       }
     }
@@ -438,11 +441,16 @@ function makeTextDecoderICU() {
       validateDecoder(this);
 
       this[kUTF8FastPath] &&= !(options?.stream);
+      this[kLatin1FastPath] &&= !(options?.stream);
 
       if (this[kUTF8FastPath]) {
         return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
       }
 
+      if (this[kLatin1FastPath]) {
+        return decodeLatin1(input, this[kIgnoreBOM], this[kFatal]);
+      }
+
       this.#prepareConverter();
 
       validateObject(options, 'options', kValidateObjectAllowObjectsAndNull);
diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc
index 97ddd59fb661c8..a132eeb62306c6 100644
--- a/src/encoding_binding.cc
+++ b/src/encoding_binding.cc
@@ -226,6 +226,7 @@ void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data,
   SetMethodNoSideEffect(isolate, target, "decodeUTF8", DecodeUTF8);
   SetMethodNoSideEffect(isolate, target, "toASCII", ToASCII);
   SetMethodNoSideEffect(isolate, target, "toUnicode", ToUnicode);
+  SetMethodNoSideEffect(isolate, target, "decodeLatin1", DecodeLatin1);
 }
 
 void BindingData::CreatePerContextProperties(Local<Object> target,
@@ -243,6 +244,96 @@ void BindingData::RegisterTimerExternalReferences(
   registry->Register(DecodeUTF8);
   registry->Register(ToASCII);
   registry->Register(ToUnicode);
+  registry->Register(DecodeLatin1);
+}
+
+namespace {
+
+// Code points for bytes 0x80..0x9F in the WHATWG windows-1252 index. Every
+// other byte decodes to the code point with the same numeric value, as in
+// ISO-8859-1.
+constexpr uint16_t kWindows1252C1[32] = {
+    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
+    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178};
+
+inline bool IsWindows1252C1(uint8_t byte) {
+  return byte >= 0x80 && byte <= 0x9F;
+}
+
+// Decodes |length| windows-1252 bytes starting at |data| into UTF-8.
+std::string Windows1252ToUtf8(const uint8_t* data, size_t length) {
+  bool has_c1 = false;
+  for (size_t i = 0; i < length && !has_c1; ++i) {
+    has_c1 = IsWindows1252C1(data[i]);
+  }
+
+  if (!has_c1) {
+    // Without 0x80..0x9F the input is plain ISO-8859-1, which simdutf can
+    // transcode in bulk. Each input byte yields at most two UTF-8 bytes.
+    std::string utf8(length * 2, '\0');
+    const size_t written = simdutf::convert_latin1_to_utf8(
+        reinterpret_cast<const char*>(data), length, utf8.data());
+    utf8.resize(written);
+    return utf8;
+  }
+
+  // Slow path: remap 0x80..0x9F through the table. Every resulting code point
+  // is below U+10000, so each input byte yields at most three UTF-8 bytes.
+  std::string utf8;
+  utf8.reserve(length * 3);
+  for (size_t i = 0; i < length; ++i) {
+    const uint8_t byte = data[i];
+    const uint16_t code_point =
+        IsWindows1252C1(byte) ? kWindows1252C1[byte - 0x80] : byte;
+    if (code_point < 0x80) {
+      utf8.push_back(static_cast<char>(code_point));
+    } else if (code_point < 0x800) {
+      utf8.push_back(static_cast<char>(0xC0 | (code_point >> 6)));
+      utf8.push_back(static_cast<char>(0x80 | (code_point & 0x3F)));
+    } else {
+      utf8.push_back(static_cast<char>(0xE0 | (code_point >> 12)));
+      utf8.push_back(static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)));
+      utf8.push_back(static_cast<char>(0x80 | (code_point & 0x3F)));
+    }
+  }
+  return utf8;
+}
+
+}  // namespace
+
+// decodeLatin1(input, ignoreBOM, fatal): decodes |input| as windows-1252 (the
+// encoding the WHATWG "latin1" labels resolve to) and returns a JS string.
+// ignoreBOM and fatal are accepted so the call mirrors decodeUTF8, but they
+// cannot affect the result: a single-byte encoding has no byte order mark,
+// and every byte maps to a code point, so decoding never fails.
+void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
+  Environment* env = Environment::GetCurrent(args);
+
+  CHECK_GE(args.Length(), 1);
+  if (!(args[0]->IsArrayBuffer() || args[0]->IsSharedArrayBuffer() ||
+        args[0]->IsArrayBufferView())) {
+    return node::THROW_ERR_INVALID_ARG_TYPE(
+        env->isolate(),
+        "The \"input\" argument must be an instance of ArrayBuffer, "
+        "SharedArrayBuffer, or ArrayBufferView.");
+  }
+
+  ArrayBufferViewContents<uint8_t> buffer(args[0]);
+  if (buffer.length() == 0) {
+    return args.GetReturnValue().SetEmptyString();
+  }
+
+  const std::string utf8 = Windows1252ToUtf8(buffer.data(), buffer.length());
+
+  // TextDecoder.prototype.decode() must produce a string, so hand back a JS
+  // string rather than a Buffer. If the conversion fails nothing is returned;
+  // ToV8Value is expected to have left an exception pending in that case.
+  Local<Value> decoded;
+  if (ToV8Value(env->context(), utf8, env->isolate()).ToLocal(&decoded)) {
+    args.GetReturnValue().Set(decoded);
+  }
 }
 
 }  // namespace encoding_binding
diff --git a/src/encoding_binding.h b/src/encoding_binding.h
index 2690cb74f8a05b..97f55394d27641 100644
--- a/src/encoding_binding.h
+++ b/src/encoding_binding.h
@@ -31,6 +31,7 @@ class BindingData : public SnapshotableObject {
   static void EncodeInto(const v8::FunctionCallbackInfo<v8::Value>& args);
   static void EncodeUtf8String(const v8::FunctionCallbackInfo<v8::Value>& args);
   static void DecodeUTF8(const v8::FunctionCallbackInfo<v8::Value>& args);
+  static void DecodeLatin1(const v8::FunctionCallbackInfo<v8::Value>& args);
 
   static void ToASCII(const v8::FunctionCallbackInfo<v8::Value>& args);
   static void ToUnicode(const v8::FunctionCallbackInfo<v8::Value>& args);
diff --git a/test/cctest/test_encoding_binding.cc b/test/cctest/test_encoding_binding.cc
new file mode 100644
index 00000000000000..06cc36d8f6ae34
--- /dev/null
+++ b/test/cctest/test_encoding_binding.cc
@@ -0,0 +1,166 @@
+#include "encoding_binding.h"
+#include "env-inl.h"
+#include "gtest/gtest.h"
+#include "node_test_fixture.h"
+#include "v8.h"
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <vector>
+
+namespace node {
+namespace encoding_binding {
+
+using v8::ArrayBuffer;
+using v8::Boolean;
+using v8::Context;
+using v8::Function;
+using v8::HandleScope;
+using v8::Isolate;
+using v8::Local;
+using v8::String;
+using v8::TryCatch;
+using v8::Uint8Array;
+using v8::Undefined;
+using v8::Value;
+
+// Calls BindingData::DecodeLatin1(args[0], ignore_bom, has_fatal) through a
+// V8 function so the binding receives a real FunctionCallbackInfo. Returns
+// false if the call throws; otherwise stores the return value in |*result|
+// and returns true.
+bool RunDecodeLatin1(Environment* env,
+                     Local<Value> args[],
+                     bool ignore_bom,
+                     bool has_fatal,
+                     Local<Value>* result) {
+  Isolate* isolate = env->isolate();
+  Local<Context> context = env->context();
+  TryCatch try_catch(isolate);
+
+  Local<Function> decode;
+  if (!Function::New(context, BindingData::DecodeLatin1).ToLocal(&decode)) {
+    return false;
+  }
+
+  Local<Value> call_args[] = {args[0],
+                              Boolean::New(isolate, ignore_bom),
+                              Boolean::New(isolate, has_fatal)};
+
+  Local<Value> returned;
+  if (!decode->Call(context, Undefined(isolate), 3, call_args)
+           .ToLocal(&returned)) {
+    return false;
+  }
+
+  *result = returned;
+  return true;
+}
+
+// Returns a Uint8Array holding a copy of |bytes|.
+Local<Value> MakeUint8Array(Isolate* isolate,
+                            const std::vector<uint8_t>& bytes) {
+  Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, bytes.size());
+  if (!bytes.empty()) {
+    memcpy(ab->GetBackingStore()->Data(), bytes.data(), bytes.size());
+  }
+  return Uint8Array::New(ab, 0, bytes.size());
+}
+
+// Decodes |bytes| and returns the resulting string as UTF-8. Records a test
+// failure if the call throws or does not return a string.
+std::string DecodeBytes(Environment* env,
+                        const std::vector<uint8_t>& bytes,
+                        bool ignore_bom,
+                        bool has_fatal) {
+  Local<Value> args[] = {MakeUint8Array(env->isolate(), bytes)};
+  Local<Value> result;
+  EXPECT_TRUE(RunDecodeLatin1(env, args, ignore_bom, has_fatal, &result));
+  if (result.IsEmpty() || !result->IsString()) {
+    ADD_FAILURE() << "decodeLatin1 did not return a string";
+    return std::string();
+  }
+  String::Utf8Value utf8(env->isolate(), result);
+  return std::string(*utf8, utf8.length());
+}
+
+class EncodingBindingTest : public EnvironmentTestFixture {};
+
+TEST_F(EncodingBindingTest, DecodeLatin1_ValidInput) {
+  const HandleScope handle_scope(isolate_);
+  const Argv argv;
+  Env env{handle_scope, argv};
+
+  EXPECT_EQ(DecodeBytes(*env, {0xC1, 0xE9, 0xF3}, false, false), "Áéó");
+}
+
+TEST_F(EncodingBindingTest, DecodeLatin1_EmptyInput) {
+  const HandleScope handle_scope(isolate_);
+  const Argv argv;
+  Env env{handle_scope, argv};
+
+  EXPECT_EQ(DecodeBytes(*env, {}, false, false), "");
+}
+
+TEST_F(EncodingBindingTest, DecodeLatin1_InvalidInput) {
+  const HandleScope handle_scope(isolate_);
+  const Argv argv;
+  Env env{handle_scope, argv};
+
+  // A string is not an ArrayBuffer, SharedArrayBuffer or ArrayBufferView.
+  Local<Value> args[] = {String::NewFromUtf8Literal(isolate_, "Invalid input")};
+  Local<Value> result;
+  EXPECT_FALSE(RunDecodeLatin1(*env, args, false, false, &result));
+}
+
+TEST_F(EncodingBindingTest, DecodeLatin1_IgnoreBOM) {
+  const HandleScope handle_scope(isolate_);
+  const Argv argv;
+  Env env{handle_scope, argv};
+
+  // Single-byte encodings have no byte order mark: 0xFE 0xFF are ordinary
+  // characters and must be kept regardless of ignoreBOM.
+  EXPECT_EQ(DecodeBytes(*env, {0xFE, 0xFF, 0xC1, 0xE9, 0xF3}, true, false),
+            "þÿÁéó");
+}
+
+TEST_F(EncodingBindingTest, DecodeLatin1_FatalAcceptsEveryByte) {
+  const HandleScope handle_scope(isolate_);
+  const Argv argv;
+  Env env{handle_scope, argv};
+
+  // Every byte maps to a code point, so fatal mode has nothing to reject.
+  EXPECT_EQ(DecodeBytes(*env, {0xFF, 0xFF, 0xFF}, false, true), "ÿÿÿ");
+}
+
+TEST_F(EncodingBindingTest, DecodeLatin1_IgnoreBOMAndFatal) {
+  const HandleScope handle_scope(isolate_);
+  const Argv argv;
+  Env env{handle_scope, argv};
+
+  EXPECT_EQ(DecodeBytes(*env, {0xFE, 0xFF, 0xC1, 0xE9, 0xF3}, true, true),
+            "þÿÁéó");
+}
+
+TEST_F(EncodingBindingTest, DecodeLatin1_BOMPresent) {
+  const HandleScope handle_scope(isolate_);
+  const Argv argv;
+  Env env{handle_scope, argv};
+
+  // A leading 0xFF is U+00FF, not a BOM, whichever way ignoreBOM is set.
+  EXPECT_EQ(DecodeBytes(*env, {0xFF, 0xC1, 0xE9, 0xF3}, true, false), "ÿÁéó");
+  EXPECT_EQ(DecodeBytes(*env, {0xFF, 0xC1, 0xE9, 0xF3}, false, false), "ÿÁéó");
+}
+
+TEST_F(EncodingBindingTest, DecodeLatin1_Windows1252C1Range) {
+  const HandleScope handle_scope(isolate_);
+  const Argv argv;
+  Env env{handle_scope, argv};
+
+  // 0x80 -> U+20AC, 0x99 -> U+2122, 0x81 -> U+0081 (spelled as UTF-8 bytes).
+  EXPECT_EQ(DecodeBytes(*env, {0x80, 0x99, 0x81}, false, false),
+            "\xE2\x82\xAC\xE2\x84\xA2\xC2\x81");
+}
+
+}  // namespace encoding_binding
+}  // namespace node