From 51baa2c6227e18ac1ab5dace722073fe9ec18270 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 20 Jun 2024 04:26:26 -0400 Subject: [PATCH 01/13] Implement utf8 string escape --- components/core/CMakeLists.txt | 7 +- components/core/src/clp/ffi/utils.cpp | 231 +++++++++++++++++++++++ components/core/src/clp/ffi/utils.hpp | 18 ++ components/core/tests/test-ffi_utils.cpp | 168 +++++++++++++++++ 4 files changed, 423 insertions(+), 1 deletion(-) create mode 100644 components/core/src/clp/ffi/utils.cpp create mode 100644 components/core/src/clp/ffi/utils.hpp create mode 100644 components/core/tests/test-ffi_utils.cpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 99d3c8469..e5f999aa2 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -324,6 +324,8 @@ set(SOURCE_FILES_unitTest src/clp/ffi/search/Subquery.hpp src/clp/ffi/search/WildcardToken.cpp src/clp/ffi/search/WildcardToken.hpp + src/clp/ffi/utils.cpp + src/clp/ffi/utils.hpp src/clp/FileDescriptor.cpp src/clp/FileDescriptor.hpp src/clp/FileReader.cpp @@ -471,7 +473,10 @@ set(SOURCE_FILES_unitTest tests/test-string_utils.cpp tests/test-TimestampPattern.cpp tests/test-Utils.cpp - ) + src/clp/ffi/utils.hpp + src/clp/ffi/utils.cpp + tests/test-ffi_utils.cpp +) add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest}) target_include_directories(unitTest PRIVATE diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp new file mode 100644 index 000000000..b4677b47e --- /dev/null +++ b/components/core/src/clp/ffi/utils.cpp @@ -0,0 +1,231 @@ +#include "utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using std::string; +using std::string_view; + +namespace clp::ffi { +namespace { +/* + * @param byte + * @return Whether the input byte is a valid utf8 continuation byte. A valid utf8 continuation byte + * should match 0b10xx_xxxx. 
+ */ +[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool; + +/** + * Appends a single-byte utf8 character into the given string, and escapes it if necessary. + * @param character Single-byte utf8 character. + * @parma escaped_string Input string where the character(s) are appended to. + */ +auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void; + +/** + * Validates whether the given code point is a valid UTF8 encoding with the given length. + * The valid range is defined as following: + * .---------------------------------------------. + * | Length | First Code Point | Last Code Point | + * |--------|------------------|-----------------| + * | 1 Byte | 0x00 | 0x7F | + * | 2 Byte | 0x80 | 0x7FF | + * | 3 Byte | 0x8FF | 0xFFFF | + * | 4 Byte | 0x10000 | 0x10FFFF | + * |--------|------------------|-----------------| + * @param code_point + * @param encoding_length + * @return Whether the code point is a valid encoding. + */ +[[nodiscard]] auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool; + +/** + * Updates the code point by applying the payload of the given continuation byte. + * @param code_point + * @param continuation_byte + * @return Updated code point. 
+ */ +[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t; + +auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool { + constexpr uint8_t cContinuationByteMask{0xC0}; + constexpr uint8_t cValidMaskedContinuationByte{0x80}; + return (byte & cContinuationByteMask) == cValidMaskedContinuationByte; +} + +auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void { + switch (character) { + // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + case 0x08: + escaped_string.push_back('\\'); + escaped_string.push_back('b'); + break; + case 0x09: + escaped_string.push_back('\\'); + escaped_string.push_back('t'); + break; + case 0x0A: + escaped_string.push_back('\\'); + escaped_string.push_back('n'); + break; + case 0x0C: + escaped_string.push_back('\\'); + escaped_string.push_back('f'); + break; + case 0x0D: + escaped_string.push_back('\\'); + escaped_string.push_back('r'); + break; + case 0x22: + escaped_string.push_back('\\'); + escaped_string.push_back('\"'); + break; + case 0x5C: + escaped_string.push_back('\\'); + escaped_string.push_back('\\'); + break; + // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + default: { + constexpr uint8_t cControlCharacter{0x1F}; + if (cControlCharacter >= character) { + // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the + // last byte used by `snprintf` to append '\0' + constexpr size_t cControlCharacterBufSize{7}; + std::array buf{}; + std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", character); + escaped_string.append(buf.cbegin(), buf.cend() - 1); + } else { + escaped_string.push_back(static_cast(character)); + } + break; + } + } +} + +auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool { + switch (encoding_length) { + // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + case 1: + return code_point 
<= 0x7F; + case 2: + return (0x80 <= code_point && code_point <= 0x7FF); + case 3: + return (0x800 <= code_point && code_point <= 0xFFFF); + case 4: + return (0x1'0000 <= code_point && code_point <= 0x10'FFFF); + // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + default: + return false; + } +} + +auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t { + constexpr uint32_t cContinuationBytePayloadMask{0x3F}; + constexpr uint8_t cNumContinuationBytePayloadBits{6}; + return (code_point << cNumContinuationBytePayloadBits) + + (continuation_byte & cContinuationBytePayloadMask); +} +} // namespace + +auto escape_utf8_string(string_view raw) -> std::optional { + string_view::const_iterator bookmark_it{}; + size_t encoding_length{}; + enum class State : uint8_t { + HeadByteToValidate = 0, + OneContinuationByteToValidate, + TwoContinuationBytesToValidate, + ThreeContinuationBytesToValidate + }; + State state{State::HeadByteToValidate}; + string escaped; + escaped.reserve(raw.size() + (raw.size() >> 2)); + + uint32_t code_point{}; + auto validate_encoding_length_and_set_state + = [&encoding_length, &state, &code_point](uint8_t byte) -> bool { + constexpr uint8_t cThreeByteContinuationMask{0xF8}; // 0b1111_1xxx + constexpr uint8_t cValidThreeByteContinuation{0xF0}; // 0b1111_0xxx + constexpr uint8_t cTwoByteContinuationMask{0xF0}; // 0b1111_xxxx + constexpr uint8_t cValidTwoByteContinuation{0xE0}; // 0b1110_xxxx + constexpr uint8_t cOneByteContinuationMask{0xE0}; // 0b111x_xxxx + constexpr uint8_t cValidOneByteContinuation{0xC0}; // 0b110x_xxxx + if ((byte & cThreeByteContinuationMask) == cValidThreeByteContinuation) { + encoding_length = 4; + code_point = (~cThreeByteContinuationMask & byte); + state = State::ThreeContinuationBytesToValidate; + } else if ((byte & cTwoByteContinuationMask) == cValidTwoByteContinuation) { + encoding_length = 3; + code_point = (~cTwoByteContinuationMask & byte); + state = 
State::TwoContinuationBytesToValidate; + } else if ((byte & cOneByteContinuationMask) == cValidOneByteContinuation) { + encoding_length = 2; + code_point = (~cOneByteContinuationMask & byte); + state = State::OneContinuationByteToValidate; + } else { + return false; + } + return true; + }; + + // For multibyte encoded values, we will incrementally build the code point, and validate its + // range in the end. + for (string_view::const_iterator it{raw.cbegin()}; it != raw.cend(); ++it) { + auto const byte{static_cast(*it)}; + switch (state) { + case State::HeadByteToValidate: { + if (is_valid_code_point(static_cast(byte), 1)) { + escape_and_append_single_byte_utf8_char(byte, escaped); + } else { + if (false == validate_encoding_length_and_set_state(byte)) { + return std::nullopt; + } + bookmark_it = it; + } + break; + } + case State::OneContinuationByteToValidate: + if (false == is_valid_utf8_continuation_byte(byte)) { + return std::nullopt; + } + code_point = update_code_point(code_point, byte); + + if (false == is_valid_code_point(code_point, encoding_length)) { + return std::nullopt; + } + escaped.append(bookmark_it, bookmark_it + encoding_length); + state = State::HeadByteToValidate; + break; + case State::TwoContinuationBytesToValidate: + if (false == is_valid_utf8_continuation_byte(byte)) { + return std::nullopt; + } + code_point = update_code_point(code_point, byte); + state = State::OneContinuationByteToValidate; + break; + case State::ThreeContinuationBytesToValidate: + if (false == is_valid_utf8_continuation_byte(byte)) { + return std::nullopt; + } + code_point = update_code_point(code_point, byte); + state = State::TwoContinuationBytesToValidate; + break; + default: + return std::nullopt; + } + } + + if (State::HeadByteToValidate != state) { + // Incomplete multibyte UTF8 sequence + return std::nullopt; + } + + return std::move(escaped); +} +} // namespace clp::ffi diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp 
new file mode 100644 index 000000000..cd1a60340 --- /dev/null +++ b/components/core/src/clp/ffi/utils.hpp @@ -0,0 +1,18 @@ +#ifndef CLP_FFI_UTILS_HPP +#define CLP_FFI_UTILS_HPP + +#include +#include +#include + +namespace clp::ffi { +/** + * Escapes a UTF8 encoded string. + * @param raw The raw string to escape. + * @return The escaped string on success. + * @return std::nullopt if the string contains none-UTF8 encoded byte sequence. + */ +[[nodiscard]] auto escape_utf8_string(std::string_view raw) -> std::optional; +} // namespace clp::ffi + +#endif // CLP_UTILS_HPP diff --git a/components/core/tests/test-ffi_utils.cpp b/components/core/tests/test-ffi_utils.cpp new file mode 100644 index 000000000..372200f14 --- /dev/null +++ b/components/core/tests/test-ffi_utils.cpp @@ -0,0 +1,168 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../src/clp/ffi/utils.hpp" + +using clp::ffi::escape_utf8_string; + +namespace { +/** + * Gets an expected escaped string by first convert the raw string into a json string and then dumps + * the a printable string using nlohmann::json. + * @param raw + * @return Escaped string dumped by nlohmann::json, with surrounding '"' dropped. 
+ */ +[[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string; + +auto get_expected_escaped_string(std::string_view raw) -> std::string { + nlohmann::json const json_str = raw; // Don't use '{}' initializer + auto const dumped_str{json_str.dump()}; + return {dumped_str.begin() + 1, dumped_str.end() - 1}; +} +} // namespace + +TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") { + std::string test_str; + std::optional actual; + + // Test empty string + actual = escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + // Test string that has nothing to escape + test_str = "This string has nothing to escape :)"; + actual = escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + // Test string with all single byte UTF8 characters, which include all characters we escape + test_str.clear(); + for (uint8_t i{0}; i <= static_cast(INT8_MAX); ++i) { + test_str.push_back(static_cast(i)); + } + // Shuffle characters randomly, ensure control characters are not grouped together. 
+ // NOLINTNEXTLINE(cert-msc32-c, cert-msc51-cpp) + std::shuffle(test_str.begin(), test_str.end(), std::default_random_engine{}); + actual = escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + // Test valid UTF8 chars with continuation bytes + std::vector const valid_utf8{ + "\n", + "\xF0\xA0\x80\x8F", // https://en.wiktionary.org/wiki/%F0%A0%80%8F + "a", + "\xE4\xB8\xAD", // https://en.wiktionary.org/wiki/%E4%B8%AD + "\x1F", + "\xC2\xA2", // ¢ + "\\" + }; + test_str.clear(); + for (auto const& str : valid_utf8) { + test_str.append(str); + } + actual = escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + +} + +TEST_CASE("escape_utf8_string_with_continuation", "[ffi][utils]") { + std::string test_str; + std::optional actual; + + // Test UTF8 code point range validation + auto const valid_code_point_lower_bound = GENERATE( + std::string_view{"\xC2\x80"}, + std::string_view{"\xE0\xA0\x80"}, + std::string_view{"\xF0\x90\x80\x80"} + ); + + auto const valid_code_point_upper_bound = GENERATE( + std::string_view{"\xDF\xBF"}, + std::string_view{"\xEF\xBF\xBF"}, + std::string_view{"\xF4\x8F\xBF\xBF"} + ); + + test_str = valid_code_point_lower_bound; + actual = escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + test_str = valid_code_point_upper_bound; + actual = escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + // Test invalid code point: 0x7F (only need one byte) + test_str = "\xC1\xBF"; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + test_str = "\xE0\x81\xBF"; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + test_str = "\xF0\x81\x81\xBF"; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + // Test invalid code point: 0x73 (only 
need one byte) + test_str = "\xC1\xB3"; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + test_str = "\xE0\x81\xB3"; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + test_str = "\xF0\x81\x81\xB3"; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + // Test invalid code point: 0x7FF (only need 2 bytes) + test_str = "\xE0\x9F\xBF"; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + test_str = "\xF0\x80\x9F\xBF"; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + // Test invalid code point: 0x7F3 (only need 2 bytes) + test_str = "\xE0\x9F\xB3"; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + test_str = "\xF0\x80\x9F\xB3"; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + // Test invalid code point: 0xFFFF (only need 3 bytes) + test_str = "\xF0\x8F\xBF\xBF"; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + // Test invalid code point: 0xFFF3 (only need 3 bytes) + test_str = "\xF0\x8F\xBF\xB3"; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + // Test incomplete continuation bytes + std::string_view::const_iterator const it_begin{valid_code_point_lower_bound.cbegin()}; + std::string const valid{"Valid"}; + for (std::string_view::const_iterator it_end{valid_code_point_lower_bound.cend() - 1}; + valid_code_point_lower_bound.cbegin() != it_end; + --it_end) + { + std::string const incomplete_byte_sequence{it_begin, it_end}; + REQUIRE((false == escape_utf8_string(valid + incomplete_byte_sequence).has_value())); + REQUIRE((false == escape_utf8_string(incomplete_byte_sequence + valid).has_value())); + } + + // Test invalid header byte + test_str = valid_code_point_lower_bound; + constexpr char cInvalidHeaderByte{'\xFF'}; + test_str.front() = cInvalidHeaderByte; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + + // Test invalid continuation bytes + for (size_t idx{1}; idx < 
valid_code_point_lower_bound.size(); ++idx) { + test_str = valid_code_point_lower_bound; + constexpr uint8_t cInvalidateMask{0x40}; + test_str.at(idx) |= cInvalidateMask; + REQUIRE((false == escape_utf8_string(test_str).has_value())); + } +} From 90a01632b7131f90856116c09b9924dc6817ffe9 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 20 Jun 2024 04:30:31 -0400 Subject: [PATCH 02/13] Refactoring --- components/core/src/clp/ffi/utils.cpp | 462 +++++++++++------------ components/core/src/clp/ffi/utils.hpp | 38 +- components/core/tests/test-ffi_utils.cpp | 341 ++++++++--------- 3 files changed, 424 insertions(+), 417 deletions(-) diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp index b4677b47e..43cd4269c 100644 --- a/components/core/src/clp/ffi/utils.cpp +++ b/components/core/src/clp/ffi/utils.cpp @@ -1,231 +1,231 @@ -#include "utils.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using std::string; -using std::string_view; - -namespace clp::ffi { -namespace { -/* - * @param byte - * @return Whether the input byte is a valid utf8 continuation byte. A valid utf8 continuation byte - * should match 0b10xx_xxxx. - */ -[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool; - -/** - * Appends a single-byte utf8 character into the given string, and escapes it if necessary. - * @param character Single-byte utf8 character. - * @parma escaped_string Input string where the character(s) are appended to. - */ -auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void; - -/** - * Validates whether the given code point is a valid UTF8 encoding with the given length. - * The valid range is defined as following: - * .---------------------------------------------. 
- * | Length | First Code Point | Last Code Point | - * |--------|------------------|-----------------| - * | 1 Byte | 0x00 | 0x7F | - * | 2 Byte | 0x80 | 0x7FF | - * | 3 Byte | 0x8FF | 0xFFFF | - * | 4 Byte | 0x10000 | 0x10FFFF | - * |--------|------------------|-----------------| - * @param code_point - * @param encoding_length - * @return Whether the code point is a valid encoding. - */ -[[nodiscard]] auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool; - -/** - * Updates the code point by applying the payload of the given continuation byte. - * @param code_point - * @param continuation_byte - * @return Updated code point. - */ -[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t; - -auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool { - constexpr uint8_t cContinuationByteMask{0xC0}; - constexpr uint8_t cValidMaskedContinuationByte{0x80}; - return (byte & cContinuationByteMask) == cValidMaskedContinuationByte; -} - -auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void { - switch (character) { - // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - case 0x08: - escaped_string.push_back('\\'); - escaped_string.push_back('b'); - break; - case 0x09: - escaped_string.push_back('\\'); - escaped_string.push_back('t'); - break; - case 0x0A: - escaped_string.push_back('\\'); - escaped_string.push_back('n'); - break; - case 0x0C: - escaped_string.push_back('\\'); - escaped_string.push_back('f'); - break; - case 0x0D: - escaped_string.push_back('\\'); - escaped_string.push_back('r'); - break; - case 0x22: - escaped_string.push_back('\\'); - escaped_string.push_back('\"'); - break; - case 0x5C: - escaped_string.push_back('\\'); - escaped_string.push_back('\\'); - break; - // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - default: { - constexpr uint8_t cControlCharacter{0x1F}; - if 
(cControlCharacter >= character) { - // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the - // last byte used by `snprintf` to append '\0' - constexpr size_t cControlCharacterBufSize{7}; - std::array buf{}; - std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", character); - escaped_string.append(buf.cbegin(), buf.cend() - 1); - } else { - escaped_string.push_back(static_cast(character)); - } - break; - } - } -} - -auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool { - switch (encoding_length) { - // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - case 1: - return code_point <= 0x7F; - case 2: - return (0x80 <= code_point && code_point <= 0x7FF); - case 3: - return (0x800 <= code_point && code_point <= 0xFFFF); - case 4: - return (0x1'0000 <= code_point && code_point <= 0x10'FFFF); - // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - default: - return false; - } -} - -auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t { - constexpr uint32_t cContinuationBytePayloadMask{0x3F}; - constexpr uint8_t cNumContinuationBytePayloadBits{6}; - return (code_point << cNumContinuationBytePayloadBits) - + (continuation_byte & cContinuationBytePayloadMask); -} -} // namespace - -auto escape_utf8_string(string_view raw) -> std::optional { - string_view::const_iterator bookmark_it{}; - size_t encoding_length{}; - enum class State : uint8_t { - HeadByteToValidate = 0, - OneContinuationByteToValidate, - TwoContinuationBytesToValidate, - ThreeContinuationBytesToValidate - }; - State state{State::HeadByteToValidate}; - string escaped; - escaped.reserve(raw.size() + (raw.size() >> 2)); - - uint32_t code_point{}; - auto validate_encoding_length_and_set_state - = [&encoding_length, &state, &code_point](uint8_t byte) -> bool { - constexpr uint8_t cThreeByteContinuationMask{0xF8}; // 0b1111_1xxx - constexpr uint8_t 
cValidThreeByteContinuation{0xF0}; // 0b1111_0xxx - constexpr uint8_t cTwoByteContinuationMask{0xF0}; // 0b1111_xxxx - constexpr uint8_t cValidTwoByteContinuation{0xE0}; // 0b1110_xxxx - constexpr uint8_t cOneByteContinuationMask{0xE0}; // 0b111x_xxxx - constexpr uint8_t cValidOneByteContinuation{0xC0}; // 0b110x_xxxx - if ((byte & cThreeByteContinuationMask) == cValidThreeByteContinuation) { - encoding_length = 4; - code_point = (~cThreeByteContinuationMask & byte); - state = State::ThreeContinuationBytesToValidate; - } else if ((byte & cTwoByteContinuationMask) == cValidTwoByteContinuation) { - encoding_length = 3; - code_point = (~cTwoByteContinuationMask & byte); - state = State::TwoContinuationBytesToValidate; - } else if ((byte & cOneByteContinuationMask) == cValidOneByteContinuation) { - encoding_length = 2; - code_point = (~cOneByteContinuationMask & byte); - state = State::OneContinuationByteToValidate; - } else { - return false; - } - return true; - }; - - // For multibyte encoded values, we will incrementally build the code point, and validate its - // range in the end. 
- for (string_view::const_iterator it{raw.cbegin()}; it != raw.cend(); ++it) { - auto const byte{static_cast(*it)}; - switch (state) { - case State::HeadByteToValidate: { - if (is_valid_code_point(static_cast(byte), 1)) { - escape_and_append_single_byte_utf8_char(byte, escaped); - } else { - if (false == validate_encoding_length_and_set_state(byte)) { - return std::nullopt; - } - bookmark_it = it; - } - break; - } - case State::OneContinuationByteToValidate: - if (false == is_valid_utf8_continuation_byte(byte)) { - return std::nullopt; - } - code_point = update_code_point(code_point, byte); - - if (false == is_valid_code_point(code_point, encoding_length)) { - return std::nullopt; - } - escaped.append(bookmark_it, bookmark_it + encoding_length); - state = State::HeadByteToValidate; - break; - case State::TwoContinuationBytesToValidate: - if (false == is_valid_utf8_continuation_byte(byte)) { - return std::nullopt; - } - code_point = update_code_point(code_point, byte); - state = State::OneContinuationByteToValidate; - break; - case State::ThreeContinuationBytesToValidate: - if (false == is_valid_utf8_continuation_byte(byte)) { - return std::nullopt; - } - code_point = update_code_point(code_point, byte); - state = State::TwoContinuationBytesToValidate; - break; - default: - return std::nullopt; - } - } - - if (State::HeadByteToValidate != state) { - // Incomplete multibyte UTF8 sequence - return std::nullopt; - } - - return std::move(escaped); -} -} // namespace clp::ffi +#include "utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using std::string; +using std::string_view; + +namespace clp::ffi { +namespace { +/* + * @param byte + * @return Whether the input byte is a valid utf8 continuation byte. A valid utf8 continuation byte + * should match 0b10xx_xxxx. 
+ */ +[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool; + +/** + * Appends a single-byte utf8 character into the given string, and escapes it if necessary. + * @param character Single-byte utf8 character. + * @parma escaped_string Input string where the character(s) are appended to. + */ +auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void; + +/** + * Validates whether the given code point is a valid UTF8 encoding with the given length. + * The valid range is defined as following: + * .---------------------------------------------. + * | Length | First Code Point | Last Code Point | + * |--------|------------------|-----------------| + * | 1 Byte | 0x00 | 0x7F | + * | 2 Byte | 0x80 | 0x7FF | + * | 3 Byte | 0x8FF | 0xFFFF | + * | 4 Byte | 0x10000 | 0x10FFFF | + * |--------|------------------|-----------------| + * @param code_point + * @param encoding_length + * @return Whether the code point is a valid encoding. + */ +[[nodiscard]] auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool; + +/** + * Updates the code point by applying the payload of the given continuation byte. + * @param code_point + * @param continuation_byte + * @return Updated code point. 
+ */ +[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t; + +auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool { + constexpr uint8_t cContinuationByteMask{0xC0}; + constexpr uint8_t cValidMaskedContinuationByte{0x80}; + return (byte & cContinuationByteMask) == cValidMaskedContinuationByte; +} + +auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void { + switch (character) { + // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + case 0x08: + escaped_string.push_back('\\'); + escaped_string.push_back('b'); + break; + case 0x09: + escaped_string.push_back('\\'); + escaped_string.push_back('t'); + break; + case 0x0A: + escaped_string.push_back('\\'); + escaped_string.push_back('n'); + break; + case 0x0C: + escaped_string.push_back('\\'); + escaped_string.push_back('f'); + break; + case 0x0D: + escaped_string.push_back('\\'); + escaped_string.push_back('r'); + break; + case 0x22: + escaped_string.push_back('\\'); + escaped_string.push_back('\"'); + break; + case 0x5C: + escaped_string.push_back('\\'); + escaped_string.push_back('\\'); + break; + // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + default: { + constexpr uint8_t cControlCharacter{0x1F}; + if (cControlCharacter >= character) { + // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the + // last byte used by `snprintf` to append '\0' + constexpr size_t cControlCharacterBufSize{7}; + std::array buf{}; + std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", character); + escaped_string.append(buf.cbegin(), buf.cend() - 1); + } else { + escaped_string.push_back(static_cast(character)); + } + break; + } + } +} + +auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool { + switch (encoding_length) { + // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + case 1: + return code_point 
<= 0x7F; + case 2: + return (0x80 <= code_point && code_point <= 0x7FF); + case 3: + return (0x800 <= code_point && code_point <= 0xFFFF); + case 4: + return (0x1'0000 <= code_point && code_point <= 0x10'FFFF); + // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + default: + return false; + } +} + +auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t { + constexpr uint32_t cContinuationBytePayloadMask{0x3F}; + constexpr uint8_t cNumContinuationBytePayloadBits{6}; + return (code_point << cNumContinuationBytePayloadBits) + + (continuation_byte & cContinuationBytePayloadMask); +} +} // namespace + +auto validate_and_escape_utf8_string(string_view raw) -> std::optional { + string_view::const_iterator bookmark_it{}; + size_t encoding_length{}; + enum class State : uint8_t { + HeadByteToValidate = 0, + OneContinuationByteToValidate, + TwoContinuationBytesToValidate, + ThreeContinuationBytesToValidate + }; + State state{State::HeadByteToValidate}; + string escaped; + escaped.reserve(raw.size() + (raw.size() >> 2)); + + uint32_t code_point{}; + auto validate_encoding_length_and_set_state + = [&encoding_length, &state, &code_point](uint8_t byte) -> bool { + constexpr uint8_t cThreeByteContinuationMask{0xF8}; // 0b1111_1xxx + constexpr uint8_t cValidThreeByteContinuation{0xF0}; // 0b1111_0xxx + constexpr uint8_t cTwoByteContinuationMask{0xF0}; // 0b1111_xxxx + constexpr uint8_t cValidTwoByteContinuation{0xE0}; // 0b1110_xxxx + constexpr uint8_t cOneByteContinuationMask{0xE0}; // 0b111x_xxxx + constexpr uint8_t cValidOneByteContinuation{0xC0}; // 0b110x_xxxx + if ((byte & cThreeByteContinuationMask) == cValidThreeByteContinuation) { + encoding_length = 4; + code_point = (~cThreeByteContinuationMask & byte); + state = State::ThreeContinuationBytesToValidate; + } else if ((byte & cTwoByteContinuationMask) == cValidTwoByteContinuation) { + encoding_length = 3; + code_point = (~cTwoByteContinuationMask & byte); + state = 
State::TwoContinuationBytesToValidate; + } else if ((byte & cOneByteContinuationMask) == cValidOneByteContinuation) { + encoding_length = 2; + code_point = (~cOneByteContinuationMask & byte); + state = State::OneContinuationByteToValidate; + } else { + return false; + } + return true; + }; + + // For multibyte encoded values, we will incrementally build the code point, and validate its + // range in the end. + for (string_view::const_iterator it{raw.cbegin()}; it != raw.cend(); ++it) { + auto const byte{static_cast(*it)}; + switch (state) { + case State::HeadByteToValidate: { + if (is_valid_code_point(static_cast(byte), 1)) { + escape_and_append_single_byte_utf8_char(byte, escaped); + } else { + if (false == validate_encoding_length_and_set_state(byte)) { + return std::nullopt; + } + bookmark_it = it; + } + break; + } + case State::OneContinuationByteToValidate: + if (false == is_valid_utf8_continuation_byte(byte)) { + return std::nullopt; + } + code_point = update_code_point(code_point, byte); + + if (false == is_valid_code_point(code_point, encoding_length)) { + return std::nullopt; + } + escaped.append(bookmark_it, bookmark_it + encoding_length); + state = State::HeadByteToValidate; + break; + case State::TwoContinuationBytesToValidate: + if (false == is_valid_utf8_continuation_byte(byte)) { + return std::nullopt; + } + code_point = update_code_point(code_point, byte); + state = State::OneContinuationByteToValidate; + break; + case State::ThreeContinuationBytesToValidate: + if (false == is_valid_utf8_continuation_byte(byte)) { + return std::nullopt; + } + code_point = update_code_point(code_point, byte); + state = State::TwoContinuationBytesToValidate; + break; + default: + return std::nullopt; + } + } + + if (State::HeadByteToValidate != state) { + // Incomplete multibyte UTF8 sequence + return std::nullopt; + } + + return std::move(escaped); +} +} // namespace clp::ffi diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp 
index cd1a60340..acd977b80 100644 --- a/components/core/src/clp/ffi/utils.hpp +++ b/components/core/src/clp/ffi/utils.hpp @@ -1,18 +1,20 @@ -#ifndef CLP_FFI_UTILS_HPP -#define CLP_FFI_UTILS_HPP - -#include -#include -#include - -namespace clp::ffi { -/** - * Escapes a UTF8 encoded string. - * @param raw The raw string to escape. - * @return The escaped string on success. - * @return std::nullopt if the string contains none-UTF8 encoded byte sequence. - */ -[[nodiscard]] auto escape_utf8_string(std::string_view raw) -> std::optional; -} // namespace clp::ffi - -#endif // CLP_UTILS_HPP +#ifndef CLP_FFI_UTILS_HPP +#define CLP_FFI_UTILS_HPP + +#include +#include +#include + +namespace clp::ffi { +/** + * Validates whether the given string is UTF8 encoded, and escapes any characters to generate to + * make the string human readable. + * @param raw The raw string to escape. + * @return The escaped string on success. + * @return std::nullopt if the string contains none-UTF8 encoded byte sequence. + */ +[[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw +) -> std::optional; +} // namespace clp::ffi + +#endif // CLP_UTILS_HPP diff --git a/components/core/tests/test-ffi_utils.cpp b/components/core/tests/test-ffi_utils.cpp index 372200f14..4deb16865 100644 --- a/components/core/tests/test-ffi_utils.cpp +++ b/components/core/tests/test-ffi_utils.cpp @@ -1,168 +1,173 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "../src/clp/ffi/utils.hpp" - -using clp::ffi::escape_utf8_string; - -namespace { -/** - * Gets an expected escaped string by first convert the raw string into a json string and then dumps - * the a printable string using nlohmann::json. - * @param raw - * @return Escaped string dumped by nlohmann::json, with surrounding '"' dropped. 
- */ -[[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string; - -auto get_expected_escaped_string(std::string_view raw) -> std::string { - nlohmann::json const json_str = raw; // Don't use '{}' initializer - auto const dumped_str{json_str.dump()}; - return {dumped_str.begin() + 1, dumped_str.end() - 1}; -} -} // namespace - -TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") { - std::string test_str; - std::optional actual; - - // Test empty string - actual = escape_utf8_string(test_str); - REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); - - // Test string that has nothing to escape - test_str = "This string has nothing to escape :)"; - actual = escape_utf8_string(test_str); - REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); - - // Test string with all single byte UTF8 characters, which include all characters we escape - test_str.clear(); - for (uint8_t i{0}; i <= static_cast(INT8_MAX); ++i) { - test_str.push_back(static_cast(i)); - } - // Shuffle characters randomly, ensure control characters are not grouped together. 
- // NOLINTNEXTLINE(cert-msc32-c, cert-msc51-cpp) - std::shuffle(test_str.begin(), test_str.end(), std::default_random_engine{}); - actual = escape_utf8_string(test_str); - REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); - - // Test valid UTF8 chars with continuation bytes - std::vector const valid_utf8{ - "\n", - "\xF0\xA0\x80\x8F", // https://en.wiktionary.org/wiki/%F0%A0%80%8F - "a", - "\xE4\xB8\xAD", // https://en.wiktionary.org/wiki/%E4%B8%AD - "\x1F", - "\xC2\xA2", // ¢ - "\\" - }; - test_str.clear(); - for (auto const& str : valid_utf8) { - test_str.append(str); - } - actual = escape_utf8_string(test_str); - REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); - -} - -TEST_CASE("escape_utf8_string_with_continuation", "[ffi][utils]") { - std::string test_str; - std::optional actual; - - // Test UTF8 code point range validation - auto const valid_code_point_lower_bound = GENERATE( - std::string_view{"\xC2\x80"}, - std::string_view{"\xE0\xA0\x80"}, - std::string_view{"\xF0\x90\x80\x80"} - ); - - auto const valid_code_point_upper_bound = GENERATE( - std::string_view{"\xDF\xBF"}, - std::string_view{"\xEF\xBF\xBF"}, - std::string_view{"\xF4\x8F\xBF\xBF"} - ); - - test_str = valid_code_point_lower_bound; - actual = escape_utf8_string(test_str); - REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); - - test_str = valid_code_point_upper_bound; - actual = escape_utf8_string(test_str); - REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); - - // Test invalid code point: 0x7F (only need one byte) - test_str = "\xC1\xBF"; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - test_str = "\xE0\x81\xBF"; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - test_str = "\xF0\x81\x81\xBF"; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - // Test invalid code point: 0x73 (only 
need one byte) - test_str = "\xC1\xB3"; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - test_str = "\xE0\x81\xB3"; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - test_str = "\xF0\x81\x81\xB3"; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - // Test invalid code point: 0x7FF (only need 2 bytes) - test_str = "\xE0\x9F\xBF"; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - test_str = "\xF0\x80\x9F\xBF"; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - // Test invalid code point: 0x7F3 (only need 2 bytes) - test_str = "\xE0\x9F\xB3"; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - test_str = "\xF0\x80\x9F\xB3"; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - // Test invalid code point: 0xFFFF (only need 3 bytes) - test_str = "\xF0\x8F\xBF\xBF"; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - // Test invalid code point: 0xFFF3 (only need 3 bytes) - test_str = "\xF0\x8F\xBF\xB3"; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - // Test incomplete continuation bytes - std::string_view::const_iterator const it_begin{valid_code_point_lower_bound.cbegin()}; - std::string const valid{"Valid"}; - for (std::string_view::const_iterator it_end{valid_code_point_lower_bound.cend() - 1}; - valid_code_point_lower_bound.cbegin() != it_end; - --it_end) - { - std::string const incomplete_byte_sequence{it_begin, it_end}; - REQUIRE((false == escape_utf8_string(valid + incomplete_byte_sequence).has_value())); - REQUIRE((false == escape_utf8_string(incomplete_byte_sequence + valid).has_value())); - } - - // Test invalid header byte - test_str = valid_code_point_lower_bound; - constexpr char cInvalidHeaderByte{'\xFF'}; - test_str.front() = cInvalidHeaderByte; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - - // Test invalid continuation bytes - for (size_t idx{1}; idx < 
valid_code_point_lower_bound.size(); ++idx) { - test_str = valid_code_point_lower_bound; - constexpr uint8_t cInvalidateMask{0x40}; - test_str.at(idx) |= cInvalidateMask; - REQUIRE((false == escape_utf8_string(test_str).has_value())); - } -} +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../src/clp/ffi/utils.hpp" + +using clp::ffi::validate_and_escape_utf8_string; + +namespace { +/** + * Gets an expected escaped string by first convert the raw string into a json string and then dumps + * the a printable string using nlohmann::json. + * @param raw + * @return Escaped string dumped by nlohmann::json, with surrounding '"' dropped. + */ +[[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string; + +auto get_expected_escaped_string(std::string_view raw) -> std::string { + nlohmann::json const json_str = raw; // Don't use '{}' initializer + auto const dumped_str{json_str.dump()}; + return {dumped_str.begin() + 1, dumped_str.end() - 1}; +} +} // namespace + +TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") { + std::string test_str; + std::optional actual; + + // Test empty string + actual = validate_and_escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + // Test string that has nothing to escape + test_str = "This string has nothing to escape :)"; + actual = validate_and_escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + // Test string with all single byte UTF8 characters, which include all characters we escape + test_str.clear(); + for (uint8_t i{0}; i <= static_cast(INT8_MAX); ++i) { + test_str.push_back(static_cast(i)); + } + // Shuffle characters randomly, ensure control characters are not grouped together. 
+ // NOLINTNEXTLINE(cert-msc32-c, cert-msc51-cpp) + std::shuffle(test_str.begin(), test_str.end(), std::default_random_engine{}); + actual = validate_and_escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + // Test valid UTF8 chars with continuation bytes + std::vector const valid_utf8{ + "\n", + "\xF0\xA0\x80\x8F", // https://en.wiktionary.org/wiki/%F0%A0%80%8F + "a", + "\xE4\xB8\xAD", // https://en.wiktionary.org/wiki/%E4%B8%AD + "\x1F", + "\xC2\xA2", // ¢ + "\\" + }; + test_str.clear(); + for (auto const& str : valid_utf8) { + test_str.append(str); + } + actual = validate_and_escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); +} + +TEST_CASE("escape_utf8_string_with_continuation", "[ffi][utils]") { + std::string test_str; + std::optional actual; + + // Test UTF8 code point range validation + auto const valid_code_point_lower_bound = GENERATE( + std::string_view{"\xC2\x80"}, + std::string_view{"\xE0\xA0\x80"}, + std::string_view{"\xF0\x90\x80\x80"} + ); + + auto const valid_code_point_upper_bound = GENERATE( + std::string_view{"\xDF\xBF"}, + std::string_view{"\xEF\xBF\xBF"}, + std::string_view{"\xF4\x8F\xBF\xBF"} + ); + + test_str = valid_code_point_lower_bound; + actual = validate_and_escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + test_str = valid_code_point_upper_bound; + actual = validate_and_escape_utf8_string(test_str); + REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); + + // Test invalid code point: 0x7F (only need one byte) + test_str = "\xC1\xBF"; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + test_str = "\xE0\x81\xBF"; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + test_str = "\xF0\x81\x81\xBF"; + REQUIRE((false == 
validate_and_escape_utf8_string(test_str).has_value())); + + // Test invalid code point: 0x73 (only need one byte) + test_str = "\xC1\xB3"; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + test_str = "\xE0\x81\xB3"; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + test_str = "\xF0\x81\x81\xB3"; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + // Test invalid code point: 0x7FF (only need 2 bytes) + test_str = "\xE0\x9F\xBF"; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + test_str = "\xF0\x80\x9F\xBF"; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + // Test invalid code point: 0x7F3 (only need 2 bytes) + test_str = "\xE0\x9F\xB3"; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + test_str = "\xF0\x80\x9F\xB3"; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + // Test invalid code point: 0xFFFF (only need 3 bytes) + test_str = "\xF0\x8F\xBF\xBF"; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + // Test invalid code point: 0xFFF3 (only need 3 bytes) + test_str = "\xF0\x8F\xBF\xB3"; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + // Test incomplete continuation bytes + std::string_view::const_iterator const it_begin{valid_code_point_lower_bound.cbegin()}; + std::string const valid{"Valid"}; + for (std::string_view::const_iterator it_end{valid_code_point_lower_bound.cend() - 1}; + valid_code_point_lower_bound.cbegin() != it_end; + --it_end) + { + std::string const incomplete_byte_sequence{it_begin, it_end}; + REQUIRE( + (false + == validate_and_escape_utf8_string(valid + incomplete_byte_sequence).has_value()) + ); + REQUIRE( + (false + == validate_and_escape_utf8_string(incomplete_byte_sequence + valid).has_value()) + ); + } + + // Test invalid header byte + test_str = 
valid_code_point_lower_bound; + constexpr char cInvalidHeaderByte{'\xFF'}; + test_str.front() = cInvalidHeaderByte; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + // Test invalid continuation bytes + for (size_t idx{1}; idx < valid_code_point_lower_bound.size(); ++idx) { + test_str = valid_code_point_lower_bound; + constexpr uint8_t cInvalidateMask{0x40}; + test_str.at(idx) |= cInvalidateMask; + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + } +} From 690d1b4a231abd5d6ff435e18a28d9530749522c Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Thu, 20 Jun 2024 04:50:01 -0400 Subject: [PATCH 03/13] Update cmake --- components/core/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index e5f999aa2..c8ad15f79 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -473,8 +473,6 @@ set(SOURCE_FILES_unitTest tests/test-string_utils.cpp tests/test-TimestampPattern.cpp tests/test-Utils.cpp - src/clp/ffi/utils.hpp - src/clp/ffi/utils.cpp tests/test-ffi_utils.cpp ) add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest}) From fd3b91cf46c8b800479afd13d856b3a3e886d16c Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Sat, 22 Jun 2024 17:18:25 -0400 Subject: [PATCH 04/13] Update cmake format --- components/core/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index c8ad15f79..50abbc295 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -474,7 +474,7 @@ set(SOURCE_FILES_unitTest tests/test-TimestampPattern.cpp tests/test-Utils.cpp tests/test-ffi_utils.cpp -) + ) add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest}) target_include_directories(unitTest PRIVATE From ea9a924dcbef98f2e301b7171206b9ff0900cdc4 Mon Sep 17 00:00:00 2001 From: 
LinZhihao-723 Date: Sat, 22 Jun 2024 17:20:58 -0400 Subject: [PATCH 05/13] Let compiler figure out which variable to capture --- components/core/src/clp/ffi/utils.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp index 43cd4269c..56fd7eb7c 100644 --- a/components/core/src/clp/ffi/utils.cpp +++ b/components/core/src/clp/ffi/utils.cpp @@ -148,8 +148,7 @@ auto validate_and_escape_utf8_string(string_view raw) -> std::optional { escaped.reserve(raw.size() + (raw.size() >> 2)); uint32_t code_point{}; - auto validate_encoding_length_and_set_state - = [&encoding_length, &state, &code_point](uint8_t byte) -> bool { + auto validate_encoding_length_and_set_state = [&](uint8_t byte) -> bool { constexpr uint8_t cThreeByteContinuationMask{0xF8}; // 0b1111_1xxx constexpr uint8_t cValidThreeByteContinuation{0xF0}; // 0b1111_0xxx constexpr uint8_t cTwoByteContinuationMask{0xF0}; // 0b1111_xxxx From 191c4ff2bb2967361d37e9f0b40e75bd78cae4c5 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Mon, 24 Jun 2024 19:01:47 -0400 Subject: [PATCH 06/13] Refactoring according to the code review comments --- components/core/src/clp/ffi/utils.cpp | 304 ++++++++--------------- components/core/src/clp/ffi/utils.hpp | 123 ++++++++- components/core/tests/test-ffi_utils.cpp | 182 ++++++++------ 3 files changed, 335 insertions(+), 274 deletions(-) diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp index 56fd7eb7c..e074311b4 100644 --- a/components/core/src/clp/ffi/utils.cpp +++ b/components/core/src/clp/ffi/utils.cpp @@ -8,122 +8,131 @@ #include #include #include -#include using std::string; using std::string_view; namespace clp::ffi { -namespace { -/* - * @param byte - * @return Whether the input byte is a valid utf8 continuation byte. A valid utf8 continuation byte - * should match 0b10xx_xxxx. 
- */ -[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool; +auto validate_and_escape_utf8_string(string_view raw) -> std::optional { + string_view::const_iterator bookmark{raw.cbegin()}; + string escaped; + escaped.reserve(raw.size() + (raw.size() / 2)); + + auto escape_handler = [&](string_view::const_iterator it) -> void { + // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the last byte + // used by `snprintf` to append '\0' + constexpr size_t cControlCharacterBufSize{7}; + std::array buf{}; + std::string_view escaped_content; + bool escape_required{true}; + switch (*it) { + case '\b': + escaped_content = "\\b"; + break; + case '\t': + escaped_content = "\\t"; + break; + case '\n': + escaped_content = "\\n"; + break; + case '\f': + escaped_content = "\\f"; + break; + case '\r': + escaped_content = "\\r"; + break; + case '\\': + escaped_content = "\\\\"; + break; + case '"': + escaped_content = "\\\""; + break; + default: { + constexpr uint8_t cLargestControlCharacter{0x1F}; + auto const byte{static_cast(*it)}; + if (cLargestControlCharacter >= byte) { + std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", byte); + escaped_content = {buf.data(), buf.size() - 1}; + } else { + escape_required = false; + } + break; + } + } + if (escape_required) { + escaped.append(bookmark, it); + escaped.append(escaped_content.cbegin(), escaped_content.cend()); + bookmark = it + 1; + } + }; -/** - * Appends a single-byte utf8 character into the given string, and escapes it if necessary. - * @param character Single-byte utf8 character. - * @parma escaped_string Input string where the character(s) are appended to. - */ -auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void; + if (false == generic_validate_utf8_string(raw, escape_handler)) { + return std::nullopt; + } -/** - * Validates whether the given code point is a valid UTF8 encoding with the given length. 
- * The valid range is defined as following: - * .---------------------------------------------. - * | Length | First Code Point | Last Code Point | - * |--------|------------------|-----------------| - * | 1 Byte | 0x00 | 0x7F | - * | 2 Byte | 0x80 | 0x7FF | - * | 3 Byte | 0x8FF | 0xFFFF | - * | 4 Byte | 0x10000 | 0x10FFFF | - * |--------|------------------|-----------------| - * @param code_point - * @param encoding_length - * @return Whether the code point is a valid encoding. - */ -[[nodiscard]] auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool; + if (raw.cend() != bookmark) { + escaped.append(bookmark, raw.cend()); + } -/** - * Updates the code point by applying the payload of the given continuation byte. - * @param code_point - * @param continuation_byte - * @return Updated code point. - */ -[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t; + return escaped; +} -auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool { - constexpr uint8_t cContinuationByteMask{0xC0}; - constexpr uint8_t cValidMaskedContinuationByte{0x80}; - return (byte & cContinuationByteMask) == cValidMaskedContinuationByte; +auto is_utf8_encoded(string_view str) -> bool { + auto escape_handler = []([[maybe_unused]] string_view::const_iterator it) -> void {}; + return generic_validate_utf8_string(str, escape_handler); } -auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void { - switch (character) { +namespace utils_hpp { +auto validate_header_byte_and_set_code_point( + uint8_t header, + size_t& num_continuation_bytes, + uint32_t& code_point, + uint32_t& code_point_lower_bound, + uint32_t& code_point_upper_bound +) -> bool { + constexpr uint8_t cThreeByteContinuationMask{0xF8}; // 0b1111_1xxx + constexpr uint8_t cValidThreeByteContinuation{0xF0}; // 0b1111_0xxx + constexpr uint8_t cTwoByteContinuationMask{0xF0}; // 0b1111_xxxx + constexpr uint8_t 
cValidTwoByteContinuation{0xE0}; // 0b1110_xxxx + constexpr uint8_t cOneByteContinuationMask{0xE0}; // 0b111x_xxxx + constexpr uint8_t cValidOneByteContinuation{0xC0}; // 0b110x_xxxx + + if ((header & cThreeByteContinuationMask) == cValidThreeByteContinuation) { + num_continuation_bytes = 3; + code_point = (~cThreeByteContinuationMask & header); // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - case 0x08: - escaped_string.push_back('\\'); - escaped_string.push_back('b'); - break; - case 0x09: - escaped_string.push_back('\\'); - escaped_string.push_back('t'); - break; - case 0x0A: - escaped_string.push_back('\\'); - escaped_string.push_back('n'); - break; - case 0x0C: - escaped_string.push_back('\\'); - escaped_string.push_back('f'); - break; - case 0x0D: - escaped_string.push_back('\\'); - escaped_string.push_back('r'); - break; - case 0x22: - escaped_string.push_back('\\'); - escaped_string.push_back('\"'); - break; - case 0x5C: - escaped_string.push_back('\\'); - escaped_string.push_back('\\'); - break; + code_point_lower_bound = 0x1'0000; + code_point_upper_bound = 0x10'FFFF; // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - default: { - constexpr uint8_t cControlCharacter{0x1F}; - if (cControlCharacter >= character) { - // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the - // last byte used by `snprintf` to append '\0' - constexpr size_t cControlCharacterBufSize{7}; - std::array buf{}; - std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", character); - escaped_string.append(buf.cbegin(), buf.cend() - 1); - } else { - escaped_string.push_back(static_cast(character)); - } - break; - } - } -} - -auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool { - switch (encoding_length) { + } else if ((header & cTwoByteContinuationMask) == cValidTwoByteContinuation) { + num_continuation_bytes = 2; + code_point = (~cTwoByteContinuationMask & header); // 
NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - case 1: - return code_point <= 0x7F; - case 2: - return (0x80 <= code_point && code_point <= 0x7FF); - case 3: - return (0x800 <= code_point && code_point <= 0xFFFF); - case 4: - return (0x1'0000 <= code_point && code_point <= 0x10'FFFF); + code_point_lower_bound = 0x800; + code_point_upper_bound = 0xFFFF; // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - default: - return false; + } else if ((header & cOneByteContinuationMask) == cValidOneByteContinuation) { + num_continuation_bytes = 1; + code_point = (~cOneByteContinuationMask & header); + // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + code_point_lower_bound = 0x80; + code_point_upper_bound = 0x7FF; + // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + } else { + return false; } + return true; +} + +auto is_ascii_char(uint8_t byte) -> bool { + constexpr uint8_t cLargestValidASCIIChar{0x7F}; + return cLargestValidASCIIChar >= byte; +} + +auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool { + constexpr uint8_t cContinuationByteMask{0xC0}; + constexpr uint8_t cValidMaskedContinuationByte{0x80}; + return (byte & cContinuationByteMask) == cValidMaskedContinuationByte; } auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t { @@ -132,99 +141,6 @@ auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32 return (code_point << cNumContinuationBytePayloadBits) + (continuation_byte & cContinuationBytePayloadMask); } -} // namespace - -auto validate_and_escape_utf8_string(string_view raw) -> std::optional { - string_view::const_iterator bookmark_it{}; - size_t encoding_length{}; - enum class State : uint8_t { - HeadByteToValidate = 0, - OneContinuationByteToValidate, - TwoContinuationBytesToValidate, - ThreeContinuationBytesToValidate - }; - State state{State::HeadByteToValidate}; - string 
escaped; - escaped.reserve(raw.size() + (raw.size() >> 2)); - - uint32_t code_point{}; - auto validate_encoding_length_and_set_state = [&](uint8_t byte) -> bool { - constexpr uint8_t cThreeByteContinuationMask{0xF8}; // 0b1111_1xxx - constexpr uint8_t cValidThreeByteContinuation{0xF0}; // 0b1111_0xxx - constexpr uint8_t cTwoByteContinuationMask{0xF0}; // 0b1111_xxxx - constexpr uint8_t cValidTwoByteContinuation{0xE0}; // 0b1110_xxxx - constexpr uint8_t cOneByteContinuationMask{0xE0}; // 0b111x_xxxx - constexpr uint8_t cValidOneByteContinuation{0xC0}; // 0b110x_xxxx - if ((byte & cThreeByteContinuationMask) == cValidThreeByteContinuation) { - encoding_length = 4; - code_point = (~cThreeByteContinuationMask & byte); - state = State::ThreeContinuationBytesToValidate; - } else if ((byte & cTwoByteContinuationMask) == cValidTwoByteContinuation) { - encoding_length = 3; - code_point = (~cTwoByteContinuationMask & byte); - state = State::TwoContinuationBytesToValidate; - } else if ((byte & cOneByteContinuationMask) == cValidOneByteContinuation) { - encoding_length = 2; - code_point = (~cOneByteContinuationMask & byte); - state = State::OneContinuationByteToValidate; - } else { - return false; - } - return true; - }; - - // For multibyte encoded values, we will incrementally build the code point, and validate its - // range in the end. 
- for (string_view::const_iterator it{raw.cbegin()}; it != raw.cend(); ++it) { - auto const byte{static_cast(*it)}; - switch (state) { - case State::HeadByteToValidate: { - if (is_valid_code_point(static_cast(byte), 1)) { - escape_and_append_single_byte_utf8_char(byte, escaped); - } else { - if (false == validate_encoding_length_and_set_state(byte)) { - return std::nullopt; - } - bookmark_it = it; - } - break; - } - case State::OneContinuationByteToValidate: - if (false == is_valid_utf8_continuation_byte(byte)) { - return std::nullopt; - } - code_point = update_code_point(code_point, byte); - - if (false == is_valid_code_point(code_point, encoding_length)) { - return std::nullopt; - } - escaped.append(bookmark_it, bookmark_it + encoding_length); - state = State::HeadByteToValidate; - break; - case State::TwoContinuationBytesToValidate: - if (false == is_valid_utf8_continuation_byte(byte)) { - return std::nullopt; - } - code_point = update_code_point(code_point, byte); - state = State::OneContinuationByteToValidate; - break; - case State::ThreeContinuationBytesToValidate: - if (false == is_valid_utf8_continuation_byte(byte)) { - return std::nullopt; - } - code_point = update_code_point(code_point, byte); - state = State::TwoContinuationBytesToValidate; - break; - default: - return std::nullopt; - } - } +} // namespace utils_hpp - if (State::HeadByteToValidate != state) { - // Incomplete multibyte UTF8 sequence - return std::nullopt; - } - - return std::move(escaped); -} } // namespace clp::ffi diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp index acd977b80..81059d587 100644 --- a/components/core/src/clp/ffi/utils.hpp +++ b/components/core/src/clp/ffi/utils.hpp @@ -1,13 +1,15 @@ #ifndef CLP_FFI_UTILS_HPP #define CLP_FFI_UTILS_HPP +#include +#include #include #include #include namespace clp::ffi { /** - * Validates whether the given string is UTF8 encoded, and escapes any characters to generate to + * Validates whether the 
given string is UTF-8 encoded, and escapes any characters to generate to * make the string human readable. * @param raw The raw string to escape. * @return The escaped string on success. @@ -15,6 +17,125 @@ namespace clp::ffi { */ [[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw ) -> std::optional; + +/** + * @param str + * @return Whether the input is a valid UTF-8 encoded string. + */ +[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool; + +/** + * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using + * the given handler. + * @tparam EscapeHandler Method to optionally escape any ASCII character in the string. Signature: + * (std::string_view::const_iterator it_ascii_char) -> void + * @param src + * @param escape_handler + * @return Whether the input is a valid UTF-8 encoded string. + */ +template +[[nodiscard]] auto +generic_validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool; + +namespace utils_hpp { +/** + * Validates whether the given byte is a valid UTF-8 header with continuation bytes, and set code + * point and code point range accordingly. + * The valid code point range is defined as following: + * .----------------------------------------------------------. + * | Continuation Length | First Code Point | Last Code Point | + * |---------------------|------------------|-----------------| + * | 1 Byte | 0x80 | 0x7FF | + * | 2 Byte | 0x800 | 0xFFFF | + * | 3 Byte | 0x10000 | 0x10FFFF | + * |---------------------|------------------|-----------------| + * @param header Input byte to validate + * @param num_continuation_bytes Outputs the number of continuation bytes corresponded to the header + * byte, if the header is valid. + * @param code_point Outputs the code extracted from the header byte, if the header is valid. 
+ * @param code_point_lower_bound Outputs the lower bound of the valid code point range corresponded + * with the header byte, if the header if valid. + * @param code_point_upper_bound Outputs the upper bound of the valid code point range corresponded + * with the header byte, if the header if valid. + * @return Whether the input byte is a valid header byte. + */ +[[nodiscard]] auto validate_header_byte_and_set_code_point( + uint8_t header, + size_t& num_continuation_bytes, + uint32_t& code_point, + uint32_t& code_point_lower_bound, + uint32_t& code_point_upper_bound +) -> bool; + +/** + * @param byte + * @return Whether the given byte is a valid ASCII character. + */ +[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool; + +/* + * @param byte + * @return Whether the input byte is a valid UTF-8 continuation byte. A valid UTF-8 continuation + * byte should match 0b10xx_xxxx. + */ +[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool; + +/** + * Updates the code point by applying the payload of the given continuation byte. + * @param code_point + * @param continuation_byte + * @return Updated code point. 
+ */ +[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t; +} // namespace utils_hpp + +template +auto generic_validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool { + size_t num_continuation_bytes_to_validate{0}; + uint32_t code_point{}; + uint32_t code_point_lower_bound{}; + uint32_t code_point_upper_bound{}; + + for (std::string_view::const_iterator it{src.cbegin()}; it != src.cend(); ++it) { + auto const byte{static_cast(*it)}; + if (0 == num_continuation_bytes_to_validate) { + if (utils_hpp::is_ascii_char(byte)) { + escape_handler(it); + } else { + if (false + == utils_hpp::validate_header_byte_and_set_code_point( + byte, + num_continuation_bytes_to_validate, + code_point, + code_point_lower_bound, + code_point_upper_bound + )) + { + return false; + } + } + } else { + if (false == utils_hpp::is_valid_utf8_continuation_byte(byte)) { + return false; + } + code_point = utils_hpp::update_code_point(code_point, byte); + --num_continuation_bytes_to_validate; + if (0 != num_continuation_bytes_to_validate) { + continue; + } + if (code_point < code_point_lower_bound || code_point_upper_bound < code_point) { + return false; + } + } + } + + if (0 != num_continuation_bytes_to_validate) { + // Incomplete continuation byte sequence + return false; + } + + return true; +} } // namespace clp::ffi #endif // CLP_UTILS_HPP diff --git a/components/core/tests/test-ffi_utils.cpp b/components/core/tests/test-ffi_utils.cpp index 4deb16865..d2eb0d11f 100644 --- a/components/core/tests/test-ffi_utils.cpp +++ b/components/core/tests/test-ffi_utils.cpp @@ -12,6 +12,7 @@ #include "../src/clp/ffi/utils.hpp" +using clp::ffi::is_utf8_encoded; using clp::ffi::validate_and_escape_utf8_string; namespace { @@ -23,11 +24,54 @@ namespace { */ [[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string; +/** + * Generates a UTF-8 encoded byte sequence of a given code point with the given number of + * 
continuation bytes. The range of the code point is not validated, which means the generated byte + * sequence can be overlong. + * @param code_point + * @param num_continuation_bytes + * @return The encoded UTF-8 byte sequence. + */ +[[nodiscard]] auto +generate_utf8_byte_sequence(uint32_t code_point, size_t num_continuation_bytes) -> std::string; + auto get_expected_escaped_string(std::string_view raw) -> std::string { nlohmann::json const json_str = raw; // Don't use '{}' initializer auto const dumped_str{json_str.dump()}; + // Strip the quotes that nlohmann::json adds return {dumped_str.begin() + 1, dumped_str.end() - 1}; } + +auto generate_utf8_byte_sequence(uint32_t code_point, size_t num_continuation_bytes) + -> std::string { + REQUIRE((1 <= num_continuation_bytes && num_continuation_bytes <= 3)); + std::vector encoded_bytes; + while (true) { + auto const least_significant_byte{static_cast(code_point)}; + if (encoded_bytes.size() < num_continuation_bytes) { + constexpr uint8_t cContinuationPayloadMask{0x3F}; // 0b0011_1111 + constexpr uint8_t cContinuationSignature{0x80}; // 0b1000_0000 + constexpr uint8_t cNumContinuationBytePayloadBits{6}; + encoded_bytes.push_back(static_cast( + (least_significant_byte & cContinuationPayloadMask) | cContinuationSignature + )); + code_point >>= cNumContinuationBytePayloadBits; + } else { + constexpr uint8_t cHeaderPayloadMask{0x1F}; // 0b0001_1111 + constexpr int8_t cHeaderSignature{static_cast(0xC0)}; // 0b1100_0000 + auto const num_bits_shift{num_continuation_bytes - 1}; + auto const header_payload_mask{ + static_cast(cHeaderPayloadMask >> num_bits_shift) + }; + auto const header_signature{static_cast(cHeaderSignature >> num_bits_shift)}; + encoded_bytes.push_back(static_cast( + (least_significant_byte & header_payload_mask) | header_signature + )); + break; + } + } + return {encoded_bytes.rbegin(), encoded_bytes.rend()}; +} } // namespace TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") { @@ -43,12 +87,12 @@ 
TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") { actual = validate_and_escape_utf8_string(test_str); REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); - // Test string with all single byte UTF8 characters, which include all characters we escape + // Test string with all single byte UTF-8 characters, including those we escape. test_str.clear(); for (uint8_t i{0}; i <= static_cast(INT8_MAX); ++i) { test_str.push_back(static_cast(i)); } - // Shuffle characters randomly, ensure control characters are not grouped together. + // Shuffle characters randomly // NOLINTNEXTLINE(cert-msc32-c, cert-msc51-cpp) std::shuffle(test_str.begin(), test_str.end(), std::default_random_engine{}); actual = validate_and_escape_utf8_string(test_str); @@ -72,102 +116,82 @@ TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") { REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); } -TEST_CASE("escape_utf8_string_with_continuation", "[ffi][utils]") { +TEST_CASE("escape_utf8_string_with_invalid_continuation", "[ffi][utils]") { std::string test_str; - std::optional actual; - // Test UTF8 code point range validation - auto const valid_code_point_lower_bound = GENERATE( - std::string_view{"\xC2\x80"}, - std::string_view{"\xE0\xA0\x80"}, - std::string_view{"\xF0\x90\x80\x80"} + auto const valid_utf8_byte_sequence = GENERATE( + generate_utf8_byte_sequence(0x80, 1), + generate_utf8_byte_sequence(0x800, 2), + generate_utf8_byte_sequence(0x1'0000, 3) ); - auto const valid_code_point_upper_bound = GENERATE( - std::string_view{"\xDF\xBF"}, - std::string_view{"\xEF\xBF\xBF"}, - std::string_view{"\xF4\x8F\xBF\xBF"} - ); - - test_str = valid_code_point_lower_bound; - actual = validate_and_escape_utf8_string(test_str); - REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); - - test_str = valid_code_point_upper_bound; - actual = validate_and_escape_utf8_string(test_str); - 
REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); - - // Test invalid code point: 0x7F (only need one byte) - test_str = "\xC1\xBF"; - REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); - - test_str = "\xE0\x81\xBF"; - REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); - - test_str = "\xF0\x81\x81\xBF"; - REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); - - // Test invalid code point: 0x73 (only need one byte) - test_str = "\xC1\xB3"; - REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); - - test_str = "\xE0\x81\xB3"; - REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); - - test_str = "\xF0\x81\x81\xB3"; - REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); - - // Test invalid code point: 0x7FF (only need 2 bytes) - test_str = "\xE0\x9F\xBF"; - REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); - - test_str = "\xF0\x80\x9F\xBF"; - REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); - - // Test invalid code point: 0x7F3 (only need 2 bytes) - test_str = "\xE0\x9F\xB3"; - REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); - - test_str = "\xF0\x80\x9F\xB3"; - REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); - - // Test invalid code point: 0xFFFF (only need 3 bytes) - test_str = "\xF0\x8F\xBF\xBF"; - REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); - - // Test invalid code point: 0xFFF3 (only need 3 bytes) - test_str = "\xF0\x8F\xBF\xB3"; - REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); - // Test incomplete continuation bytes - std::string_view::const_iterator const it_begin{valid_code_point_lower_bound.cbegin()}; + auto const it_begin{valid_utf8_byte_sequence.cbegin()}; std::string const valid{"Valid"}; - for 
(std::string_view::const_iterator it_end{valid_code_point_lower_bound.cend() - 1}; - valid_code_point_lower_bound.cbegin() != it_end; + for (auto it_end{valid_utf8_byte_sequence.cend() - 1}; + valid_utf8_byte_sequence.cbegin() != it_end; --it_end) { std::string const incomplete_byte_sequence{it_begin, it_end}; - REQUIRE( - (false - == validate_and_escape_utf8_string(valid + incomplete_byte_sequence).has_value()) - ); - REQUIRE( - (false - == validate_and_escape_utf8_string(incomplete_byte_sequence + valid).has_value()) - ); + + test_str = valid + incomplete_byte_sequence; + REQUIRE((false == is_utf8_encoded(test_str))); + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); + + test_str = incomplete_byte_sequence + valid; + REQUIRE((false == is_utf8_encoded(test_str))); + REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); } // Test invalid header byte - test_str = valid_code_point_lower_bound; + test_str = valid_utf8_byte_sequence; constexpr char cInvalidHeaderByte{'\xFF'}; test_str.front() = cInvalidHeaderByte; REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); // Test invalid continuation bytes - for (size_t idx{1}; idx < valid_code_point_lower_bound.size(); ++idx) { - test_str = valid_code_point_lower_bound; + for (size_t idx{1}; idx < valid_utf8_byte_sequence.size(); ++idx) { + test_str = valid_utf8_byte_sequence; constexpr uint8_t cInvalidateMask{0x40}; test_str.at(idx) |= cInvalidateMask; + REQUIRE((false == is_utf8_encoded(test_str))); REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); } } + +TEST_CASE("validate_utf8_code_point_ranges", "[ffi][utils]") { + // Test 1 byte encoding code point range + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + for (uint32_t code_point{0}; code_point <= 0x7F; ++code_point) { + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 1)))); + REQUIRE((false == 
is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2)))); + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)))); + } + + // Test 2 byte encoding code point range + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + for (uint32_t code_point{0x80}; code_point <= 0x7FF; ++code_point) { + REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 1))); + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2)))); + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)))); + } + + // Test 3 byte encoding code point range + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + for (uint32_t code_point{0x800}; code_point <= 0xFFFF; ++code_point) { + REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2))); + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)))); + } + + // Test 4 byte encoding code point range + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + for (uint32_t code_point{0x1'0000}; code_point <= 0x10'FFFF; ++code_point) { + REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))); + } + + // Test 4 byte encoding code point out of range + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + for (uint32_t code_point{0x10'FFFF + 1}; code_point <= 0x1F'FFFF; ++code_point) { + REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)))); + } +} From db6b089abea85b14e9cd5a87429ca63e67692efb Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Mon, 24 Jun 2024 19:04:46 -0400 Subject: [PATCH 07/13] Update comments --- components/core/src/clp/ffi/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp index 81059d587..c29f21826 100644 --- a/components/core/src/clp/ffi/utils.hpp +++ 
b/components/core/src/clp/ffi/utils.hpp @@ -10,7 +10,7 @@ namespace clp::ffi { /** * Validates whether the given string is UTF-8 encoded, and escapes any characters to generate to - * make the string human readable. + * make the string compatible with JSON specification. * @param raw The raw string to escape. * @return The escaped string on success. * @return std::nullopt if the string contains none-UTF8 encoded byte sequence. From 6696553114c8af231353ad9fe6e185a4b930cadc Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Wed, 26 Jun 2024 01:16:01 -0400 Subject: [PATCH 08/13] Apply suggestions from code review Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- components/core/src/clp/ffi/utils.hpp | 48 ++++++++++-------------- components/core/tests/test-ffi_utils.cpp | 11 +++--- 2 files changed, 25 insertions(+), 34 deletions(-) diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp index c29f21826..af65a4c84 100644 --- a/components/core/src/clp/ffi/utils.hpp +++ b/components/core/src/clp/ffi/utils.hpp @@ -9,11 +9,11 @@ namespace clp::ffi { /** - * Validates whether the given string is UTF-8 encoded, and escapes any characters to generate to - * make the string compatible with JSON specification. + * Validates whether the given string is UTF-8 encoded, and escapes any characters to make the + * string compatible with the JSON specification. * @param raw The raw string to escape. * @return The escaped string on success. - * @return std::nullopt if the string contains none-UTF8 encoded byte sequence. + * @return std::nullopt if the string contains any non-UTF-8-encoded byte sequences. */ [[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw ) -> std::optional; @@ -27,8 +27,7 @@ namespace clp::ffi { /** * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using * the given handler. 
- * @tparam EscapeHandler Method to optionally escape any ASCII character in the string. Signature: - * (std::string_view::const_iterator it_ascii_char) -> void + * @tparam EscapeHandler Method to optionally escape any ASCII character in the string. * @param src * @param escape_handler * @return Whether the input is a valid UTF-8 encoded string. @@ -39,25 +38,16 @@ generic_validate_utf8_string(std::string_view src, EscapeHandler escape_handler) namespace utils_hpp { /** - * Validates whether the given byte is a valid UTF-8 header with continuation bytes, and set code - * point and code point range accordingly. - * The valid code point range is defined as following: - * .----------------------------------------------------------. - * | Continuation Length | First Code Point | Last Code Point | - * |---------------------|------------------|-----------------| - * | 1 Byte | 0x80 | 0x7FF | - * | 2 Byte | 0x800 | 0xFFFF | - * | 3 Byte | 0x10000 | 0x10FFFF | - * |---------------------|------------------|-----------------| - * @param header Input byte to validate - * @param num_continuation_bytes Outputs the number of continuation bytes corresponded to the header - * byte, if the header is valid. - * @param code_point Outputs the code extracted from the header byte, if the header is valid. - * @param code_point_lower_bound Outputs the lower bound of the valid code point range corresponded - * with the header byte, if the header if valid. - * @param code_point_upper_bound Outputs the upper bound of the valid code point range corresponded - * with the header byte, if the header if valid. - * @return Whether the input byte is a valid header byte. + * Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses + * the byte, and returns the parsed properties as well as associated properties. + * @param header Byte to validate. + * @param num_continuation_bytes Returns the number of continuation bytes expected. 
+ * @param code_point Returns the code point bits parsed from the lead byte. + * @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8 + * character. + * @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8 + * character. + * @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character. */ [[nodiscard]] auto validate_header_byte_and_set_code_point( uint8_t header, @@ -75,16 +65,16 @@ namespace utils_hpp { /* * @param byte - * @return Whether the input byte is a valid UTF-8 continuation byte. A valid UTF-8 continuation - * byte should match 0b10xx_xxxx. + * @return Whether the input byte is a valid UTF-8 continuation byte. */ [[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool; /** - * Updates the code point by applying the payload of the given continuation byte. + * Parses the code-point bits from the given continuation byte and combines them with the given + * code point. * @param code_point * @param continuation_byte - * @return Updated code point. + * @return The updated code point. */ [[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t; } // namespace utils_hpp @@ -130,7 +120,7 @@ auto generic_validate_utf8_string(std::string_view src, EscapeHandler escape_han } if (0 != num_continuation_bytes_to_validate) { - // Incomplete continuation byte sequence + // Incomplete UTF-8 character return false; } diff --git a/components/core/tests/test-ffi_utils.cpp b/components/core/tests/test-ffi_utils.cpp index d2eb0d11f..536f329c8 100644 --- a/components/core/tests/test-ffi_utils.cpp +++ b/components/core/tests/test-ffi_utils.cpp @@ -20,14 +20,14 @@ namespace { * Gets an expected escaped string by first convert the raw string into a json string and then dumps * the a printable string using nlohmann::json. * @param raw - * @return Escaped string dumped by nlohmann::json, with surrounding '"' dropped. 
+ * @return The input string after escaping any characters that are invalid in JSON strings. */ [[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string; /** - * Generates a UTF-8 encoded byte sequence of a given code point with the given number of - * continuation bytes. The range of the code point is not validated, which means the generated byte - * sequence can be overlong. + * Generates a UTF-8 encoded byte sequence with the given code point and number of continuation + * bytes. The range of the code point is not validated, which means the generated byte sequence can + * be invalid (overlong or exceeding the valid range of UTF-8 code points). * @param code_point * @param num_continuation_bytes * @return The encoded UTF-8 byte sequence. @@ -98,7 +98,7 @@ TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") { actual = validate_and_escape_utf8_string(test_str); REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); - // Test valid UTF8 chars with continuation bytes + // Test valid UTF-8 chars with continuation bytes std::vector const valid_utf8{ "\n", "\xF0\xA0\x80\x8F", // https://en.wiktionary.org/wiki/%F0%A0%80%8F @@ -147,6 +147,7 @@ TEST_CASE("escape_utf8_string_with_invalid_continuation", "[ffi][utils]") { test_str = valid_utf8_byte_sequence; constexpr char cInvalidHeaderByte{'\xFF'}; test_str.front() = cInvalidHeaderByte; + REQUIRE((false == is_utf8_encoded(test_str))); REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); // Test invalid continuation bytes From 9aefc250f128d14a30f70a06eae5d426a8b0a906 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Wed, 26 Jun 2024 01:43:52 -0400 Subject: [PATCH 09/13] Apply suggestions from code review Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- components/core/tests/test-ffi_utils.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/components/core/tests/test-ffi_utils.cpp b/components/core/tests/test-ffi_utils.cpp index 536f329c8..ff190bcf6 100644 --- a/components/core/tests/test-ffi_utils.cpp +++ b/components/core/tests/test-ffi_utils.cpp @@ -17,8 +17,6 @@ using clp::ffi::validate_and_escape_utf8_string; namespace { /** - * Gets an expected escaped string by first convert the raw string into a json string and then dumps - * the a printable string using nlohmann::json. * @param raw * @return The input string after escaping any characters that are invalid in JSON strings. */ From d47fc77434884b766a076484fd75fbcbcc4a12fc Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Wed, 26 Jun 2024 05:10:53 -0400 Subject: [PATCH 10/13] Apply code review comments --- components/core/CMakeLists.txt | 4 +- components/core/src/clp/ffi/utils.cpp | 106 +++---------- components/core/src/clp/ffi/utils.hpp | 113 +------------- components/core/src/clp/utf8_utils.cpp | 55 +++++++ components/core/src/clp/utf8_utils.hpp | 143 ++++++++++++++++++ ...test-ffi_utils.cpp => test-utf8_utils.cpp} | 102 +++++++------ 6 files changed, 280 insertions(+), 243 deletions(-) create mode 100644 components/core/src/clp/utf8_utils.cpp create mode 100644 components/core/src/clp/utf8_utils.hpp rename components/core/tests/{test-ffi_utils.cpp => test-utf8_utils.cpp} (68%) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 50abbc295..ed15ef132 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -438,6 +438,8 @@ set(SOURCE_FILES_unitTest src/clp/TraceableException.hpp src/clp/time_types.hpp src/clp/type_utils.hpp + src/clp/utf8_utils.cpp + src/clp/utf8_utils.hpp src/clp/Utils.cpp src/clp/Utils.hpp src/clp/VariableDictionaryEntry.cpp @@ -472,8 +474,8 @@ set(SOURCE_FILES_unitTest tests/test-StreamingCompression.cpp tests/test-string_utils.cpp tests/test-TimestampPattern.cpp + tests/test-utf8_utils.cpp tests/test-Utils.cpp - tests/test-ffi_utils.cpp ) add_executable(unitTest 
${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest}) target_include_directories(unitTest diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp index e074311b4..3f77564d2 100644 --- a/components/core/src/clp/ffi/utils.cpp +++ b/components/core/src/clp/ffi/utils.cpp @@ -9,13 +9,16 @@ #include #include +#include "../utf8_utils.hpp" + using std::string; using std::string_view; namespace clp::ffi { auto validate_and_escape_utf8_string(string_view raw) -> std::optional { - string_view::const_iterator bookmark{raw.cbegin()}; - string escaped; + string_view::const_iterator next_char_to_copy_it{raw.cbegin()}; + std::optional ret_val; + auto& escaped{ret_val.emplace()}; escaped.reserve(raw.size() + (raw.size() / 2)); auto escape_handler = [&](string_view::const_iterator it) -> void { @@ -23,36 +26,36 @@ auto validate_and_escape_utf8_string(string_view raw) -> std::optional { // used by `snprintf` to append '\0' constexpr size_t cControlCharacterBufSize{7}; std::array buf{}; - std::string_view escaped_content; + std::string_view escaped_char; bool escape_required{true}; switch (*it) { case '\b': - escaped_content = "\\b"; + escaped_char = "\\b"; break; case '\t': - escaped_content = "\\t"; + escaped_char = "\\t"; break; case '\n': - escaped_content = "\\n"; + escaped_char = "\\n"; break; case '\f': - escaped_content = "\\f"; + escaped_char = "\\f"; break; case '\r': - escaped_content = "\\r"; + escaped_char = "\\r"; break; case '\\': - escaped_content = "\\\\"; + escaped_char = "\\\\"; break; case '"': - escaped_content = "\\\""; + escaped_char = "\\\""; break; default: { constexpr uint8_t cLargestControlCharacter{0x1F}; auto const byte{static_cast(*it)}; if (cLargestControlCharacter >= byte) { std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", byte); - escaped_content = {buf.data(), buf.size() - 1}; + escaped_char = {buf.data(), buf.size() - 1}; } else { escape_required = false; } @@ -60,87 +63,20 @@ auto 
validate_and_escape_utf8_string(string_view raw) -> std::optional { } } if (escape_required) { - escaped.append(bookmark, it); - escaped.append(escaped_content.cbegin(), escaped_content.cend()); - bookmark = it + 1; + escaped.append(next_char_to_copy_it, it); + escaped += escaped_char; + next_char_to_copy_it = it + 1; } }; - if (false == generic_validate_utf8_string(raw, escape_handler)) { + if (false == validate_utf8_string(raw, escape_handler)) { return std::nullopt; } - if (raw.cend() != bookmark) { - escaped.append(bookmark, raw.cend()); - } - - return escaped; -} - -auto is_utf8_encoded(string_view str) -> bool { - auto escape_handler = []([[maybe_unused]] string_view::const_iterator it) -> void {}; - return generic_validate_utf8_string(str, escape_handler); -} - -namespace utils_hpp { -auto validate_header_byte_and_set_code_point( - uint8_t header, - size_t& num_continuation_bytes, - uint32_t& code_point, - uint32_t& code_point_lower_bound, - uint32_t& code_point_upper_bound -) -> bool { - constexpr uint8_t cThreeByteContinuationMask{0xF8}; // 0b1111_1xxx - constexpr uint8_t cValidThreeByteContinuation{0xF0}; // 0b1111_0xxx - constexpr uint8_t cTwoByteContinuationMask{0xF0}; // 0b1111_xxxx - constexpr uint8_t cValidTwoByteContinuation{0xE0}; // 0b1110_xxxx - constexpr uint8_t cOneByteContinuationMask{0xE0}; // 0b111x_xxxx - constexpr uint8_t cValidOneByteContinuation{0xC0}; // 0b110x_xxxx - - if ((header & cThreeByteContinuationMask) == cValidThreeByteContinuation) { - num_continuation_bytes = 3; - code_point = (~cThreeByteContinuationMask & header); - // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - code_point_lower_bound = 0x1'0000; - code_point_upper_bound = 0x10'FFFF; - // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - } else if ((header & cTwoByteContinuationMask) == cValidTwoByteContinuation) { - num_continuation_bytes = 2; - code_point = (~cTwoByteContinuationMask & header); - // 
NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - code_point_lower_bound = 0x800; - code_point_upper_bound = 0xFFFF; - // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - } else if ((header & cOneByteContinuationMask) == cValidOneByteContinuation) { - num_continuation_bytes = 1; - code_point = (~cOneByteContinuationMask & header); - // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - code_point_lower_bound = 0x80; - code_point_upper_bound = 0x7FF; - // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - } else { - return false; + if (raw.cend() != next_char_to_copy_it) { + escaped.append(next_char_to_copy_it, raw.cend()); } - return true; -} - -auto is_ascii_char(uint8_t byte) -> bool { - constexpr uint8_t cLargestValidASCIIChar{0x7F}; - return cLargestValidASCIIChar >= byte; -} -auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool { - constexpr uint8_t cContinuationByteMask{0xC0}; - constexpr uint8_t cValidMaskedContinuationByte{0x80}; - return (byte & cContinuationByteMask) == cValidMaskedContinuationByte; + return ret_val; } - -auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t { - constexpr uint32_t cContinuationBytePayloadMask{0x3F}; - constexpr uint8_t cNumContinuationBytePayloadBits{6}; - return (code_point << cNumContinuationBytePayloadBits) - + (continuation_byte & cContinuationBytePayloadMask); -} -} // namespace utils_hpp - } // namespace clp::ffi diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp index af65a4c84..160ed687b 100644 --- a/components/core/src/clp/ffi/utils.hpp +++ b/components/core/src/clp/ffi/utils.hpp @@ -1,8 +1,6 @@ #ifndef CLP_FFI_UTILS_HPP #define CLP_FFI_UTILS_HPP -#include -#include #include #include #include @@ -17,115 +15,6 @@ namespace clp::ffi { */ [[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw ) -> std::optional; - 
-/** - * @param str - * @return Whether the input is a valid UTF-8 encoded string. - */ -[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool; - -/** - * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using - * the given handler. - * @tparam EscapeHandler Method to optionally escape any ASCII character in the string. - * @param src - * @param escape_handler - * @return Whether the input is a valid UTF-8 encoded string. - */ -template -[[nodiscard]] auto -generic_validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool; - -namespace utils_hpp { -/** - * Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses - * the byte, and returns the parsed properties as well as associated properties. - * @param header Byte to validate. - * @param num_continuation_bytes Returns the number of continuation bytes expected. - * @param code_point Returns the code point bits parsed from the lead byte. - * @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8 - * character. - * @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8 - * character. - * @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character. - */ -[[nodiscard]] auto validate_header_byte_and_set_code_point( - uint8_t header, - size_t& num_continuation_bytes, - uint32_t& code_point, - uint32_t& code_point_lower_bound, - uint32_t& code_point_upper_bound -) -> bool; - -/** - * @param byte - * @return Whether the given byte is a valid ASCII character. - */ -[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool; - -/* - * @param byte - * @return Whether the input byte is a valid UTF-8 continuation byte. - */ -[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool; - -/** - * Parses the code-point bits from the given continuation byte and combines them with the given - * code point. 
- * @param code_point - * @param continuation_byte - * @return The updated code point. - */ -[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t; -} // namespace utils_hpp - -template -auto generic_validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool { - size_t num_continuation_bytes_to_validate{0}; - uint32_t code_point{}; - uint32_t code_point_lower_bound{}; - uint32_t code_point_upper_bound{}; - - for (std::string_view::const_iterator it{src.cbegin()}; it != src.cend(); ++it) { - auto const byte{static_cast(*it)}; - if (0 == num_continuation_bytes_to_validate) { - if (utils_hpp::is_ascii_char(byte)) { - escape_handler(it); - } else { - if (false - == utils_hpp::validate_header_byte_and_set_code_point( - byte, - num_continuation_bytes_to_validate, - code_point, - code_point_lower_bound, - code_point_upper_bound - )) - { - return false; - } - } - } else { - if (false == utils_hpp::is_valid_utf8_continuation_byte(byte)) { - return false; - } - code_point = utils_hpp::update_code_point(code_point, byte); - --num_continuation_bytes_to_validate; - if (0 != num_continuation_bytes_to_validate) { - continue; - } - if (code_point < code_point_lower_bound || code_point_upper_bound < code_point) { - return false; - } - } - } - - if (0 != num_continuation_bytes_to_validate) { - // Incomplete UTF-8 character - return false; - } - - return true; -} } // namespace clp::ffi -#endif // CLP_UTILS_HPP +#endif // CLP_FFI_UTILS_HPP diff --git a/components/core/src/clp/utf8_utils.cpp b/components/core/src/clp/utf8_utils.cpp new file mode 100644 index 000000000..08a03f608 --- /dev/null +++ b/components/core/src/clp/utf8_utils.cpp @@ -0,0 +1,55 @@ +#include "utf8_utils.hpp" + +#include +#include +#include + +namespace clp { +auto is_utf8_encoded(std::string_view str) -> bool { + auto escape_handler = []([[maybe_unused]] std::string_view::const_iterator it) -> void {}; + return validate_utf8_string(str, 
escape_handler); +} + +namespace utf8_utils_internal { +auto parse_and_validate_lead_byte( + uint8_t byte, + size_t& num_continuation_bytes, + uint32_t& code_point, + uint32_t& code_point_lower_bound, + uint32_t& code_point_upper_bound +) -> bool { + if ((byte & cFourByteUtf8CharHeaderMask) == cFourByteUtf8CharHeader) { + num_continuation_bytes = 3; + code_point = (~cFourByteUtf8CharHeaderMask & byte); + code_point_lower_bound = cFourByteUtf8CharCodePointLowerBound; + code_point_upper_bound = cFourByteUtf8CharCodePointUpperBound; + } else if ((byte & cThreeByteUtf8CharHeaderMask) == cThreeByteUtf8CharHeader) { + num_continuation_bytes = 2; + code_point = (~cThreeByteUtf8CharHeaderMask & byte); + code_point_lower_bound = cThreeByteUtf8CharCodePointLowerBound; + code_point_upper_bound = cThreeByteUtf8CharCodePointUpperBound; + } else if ((byte & cTwoByteUtf8CharHeaderMask) == cTwoByteUtf8CharHeader) { + num_continuation_bytes = 1; + code_point = (~cTwoByteUtf8CharHeaderMask & byte); + code_point_lower_bound = cTwoByteUtf8CharCodePointLowerBound; + code_point_upper_bound = cTwoByteUtf8CharCodePointUpperBound; + } else { + return false; + } + return true; +} + +auto is_ascii_char(uint8_t byte) -> bool { + return cOneByteUtf8CharCodePointUpperBound >= byte; +} + +auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool { + return (byte & cContinuationByteMask) == cContinuationByte; +} + +auto parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t { + return (code_point << cNumContinuationByteCodePointBits) + + (continuation_byte & cContinuationByteCodePointMask); +} +} // namespace utf8_utils_internal +} // namespace clp diff --git a/components/core/src/clp/utf8_utils.hpp b/components/core/src/clp/utf8_utils.hpp new file mode 100644 index 000000000..c3dc8177a --- /dev/null +++ b/components/core/src/clp/utf8_utils.hpp @@ -0,0 +1,143 @@ +#ifndef CLP_UTF8_UTILS_HPP +#define CLP_UTF8_UTILS_HPP + +#include +#include +#include + +namespace clp { 
+// Constants +// Lead byte signature +constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8}; // 0b1111_1xxx +constexpr uint8_t cFourByteUtf8CharHeader{0xF0}; // 0b1111_0xxx +constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0}; // 0b1111_xxxx +constexpr uint8_t cThreeByteUtf8CharHeader{0xE0}; // 0b1110_xxxx +constexpr uint8_t cTwoByteUtf8CharHeaderMask{0xE0}; // 0b111x_xxxx +constexpr uint8_t cTwoByteUtf8CharHeader{0xC0}; // 0b110x_xxxx + +// Code point ranges (inclusive) +constexpr uint32_t cOneByteUtf8CharCodePointLowerBound{0}; +constexpr uint32_t cOneByteUtf8CharCodePointUpperBound{0x7F}; +constexpr uint32_t cTwoByteUtf8CharCodePointLowerBound{0x80}; +constexpr uint32_t cTwoByteUtf8CharCodePointUpperBound{0x7FF}; +constexpr uint32_t cThreeByteUtf8CharCodePointLowerBound{0x800}; +constexpr uint32_t cThreeByteUtf8CharCodePointUpperBound{0xFFFF}; +constexpr uint32_t cFourByteUtf8CharCodePointLowerBound{0x1'0000}; +constexpr uint32_t cFourByteUtf8CharCodePointUpperBound{0x10'FFFF}; + +// Continuation byte +constexpr uint32_t cContinuationByteMask{0xC0}; +constexpr uint32_t cContinuationByte{0x80}; +constexpr uint32_t cContinuationByteCodePointMask{0x3F}; +constexpr uint8_t cNumContinuationByteCodePointBits{6}; + +/** + * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using + * the given handler. + * @tparam EscapeHandler Method to optionally escape any ASCII character in the string. + * @param src + * @param escape_handler + * @return Whether the input is a valid UTF-8 encoded string. + */ +template <typename EscapeHandler> +requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator> +[[nodiscard]] auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool; + +/** + * @param str + * @return Whether the input is a valid UTF-8 encoded string.
+ */ +[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool; + +namespace utf8_utils_internal { +/** + * Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses + * the byte, and returns the parsed properties as well as associated properties. + * @param byte Byte to validate. + * @param num_continuation_bytes Returns the number of continuation bytes expected. + * @param code_point Returns the code point bits parsed from the lead byte. + * @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8 + * character. + * @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8 + * character. + * @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character. + */ +[[nodiscard]] auto parse_and_validate_lead_byte( + uint8_t byte, + size_t& num_continuation_bytes, + uint32_t& code_point, + uint32_t& code_point_lower_bound, + uint32_t& code_point_upper_bound +) -> bool; + +/** + * @param byte + * @return Whether the given byte is a valid ASCII character. + */ +[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool; + +/* + * @param byte + * @return Whether the input byte is a valid UTF-8 continuation byte. + */ +[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool; + +/** + * Parses the code-point bits from the given continuation byte and combines them with the given + * code point. + * @param code_point + * @param continuation_byte + * @return The updated code point. 
+ */ +[[nodiscard]] auto +parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t; +} // namespace utf8_utils_internal + +template +requires std::is_invocable_v +auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool { + size_t num_continuation_bytes_to_validate{0}; + uint32_t code_point{}; + uint32_t code_point_lower_bound{}; + uint32_t code_point_upper_bound{}; + + for (std::string_view::const_iterator it{src.cbegin()}; it != src.cend(); ++it) { + auto const byte{static_cast(*it)}; + if (0 == num_continuation_bytes_to_validate) { + if (utf8_utils_internal::is_ascii_char(byte)) { + escape_handler(it); + } else if (false + == utf8_utils_internal::parse_and_validate_lead_byte( + byte, + num_continuation_bytes_to_validate, + code_point, + code_point_lower_bound, + code_point_upper_bound + )) + { + return false; + } + } else { + if (false == utf8_utils_internal::is_valid_utf8_continuation_byte(byte)) { + return false; + } + code_point = utf8_utils_internal::parse_continuation_byte(code_point, byte); + --num_continuation_bytes_to_validate; + if (0 == num_continuation_bytes_to_validate + && (code_point < code_point_lower_bound || code_point_upper_bound < code_point)) + { + return false; + } + } + } + + if (0 != num_continuation_bytes_to_validate) { + // Incomplete UTF-8 character + return false; + } + + return true; +} +} // namespace clp + +#endif // CLP_UTF8_UTILS_HPP diff --git a/components/core/tests/test-ffi_utils.cpp b/components/core/tests/test-utf8_utils.cpp similarity index 68% rename from components/core/tests/test-ffi_utils.cpp rename to components/core/tests/test-utf8_utils.cpp index ff190bcf6..94e45cac0 100644 --- a/components/core/tests/test-ffi_utils.cpp +++ b/components/core/tests/test-utf8_utils.cpp @@ -11,9 +11,10 @@ #include #include "../src/clp/ffi/utils.hpp" +#include "../src/clp/utf8_utils.hpp" -using clp::ffi::is_utf8_encoded; using clp::ffi::validate_and_escape_utf8_string; +using 
clp::is_utf8_encoded; namespace { /** @@ -44,35 +45,35 @@ auto generate_utf8_byte_sequence(uint32_t code_point, size_t num_continuation_by -> std::string { REQUIRE((1 <= num_continuation_bytes && num_continuation_bytes <= 3)); std::vector encoded_bytes; - while (true) { + while (encoded_bytes.size() < num_continuation_bytes) { auto const least_significant_byte{static_cast(code_point)}; - if (encoded_bytes.size() < num_continuation_bytes) { - constexpr uint8_t cContinuationPayloadMask{0x3F}; // 0b0011_1111 - constexpr uint8_t cContinuationSignature{0x80}; // 0b1000_0000 - constexpr uint8_t cNumContinuationBytePayloadBits{6}; - encoded_bytes.push_back(static_cast( - (least_significant_byte & cContinuationPayloadMask) | cContinuationSignature - )); - code_point >>= cNumContinuationBytePayloadBits; - } else { - constexpr uint8_t cHeaderPayloadMask{0x1F}; // 0b0001_1111 - constexpr int8_t cHeaderSignature{static_cast(0xC0)}; // 0b1100_0000 - auto const num_bits_shift{num_continuation_bytes - 1}; - auto const header_payload_mask{ - static_cast(cHeaderPayloadMask >> num_bits_shift) - }; - auto const header_signature{static_cast(cHeaderSignature >> num_bits_shift)}; - encoded_bytes.push_back(static_cast( - (least_significant_byte & header_payload_mask) | header_signature - )); - break; - } + encoded_bytes.push_back(static_cast( + (least_significant_byte & ~clp::cContinuationByteMask) | clp::cContinuationByte + )); + code_point >>= clp::cNumContinuationByteCodePointBits; } + + uint8_t lead_byte_code_point_mask{}; + uint8_t lead_byte_header{}; + if (1 == num_continuation_bytes) { + lead_byte_code_point_mask = static_cast(~clp::cTwoByteUtf8CharHeaderMask); + lead_byte_header = clp::cTwoByteUtf8CharHeader; + } else if (2 == num_continuation_bytes) { + lead_byte_code_point_mask = static_cast(~clp::cThreeByteUtf8CharHeaderMask); + lead_byte_header = clp::cThreeByteUtf8CharHeader; + } else { // 3 == num_continuation_bytes + lead_byte_code_point_mask = 
static_cast(~clp::cFourByteUtf8CharHeaderMask); + lead_byte_header = clp::cFourByteUtf8CharHeader; + } + encoded_bytes.push_back(static_cast( + (static_cast(code_point) & lead_byte_code_point_mask) | lead_byte_header + )); + return {encoded_bytes.rbegin(), encoded_bytes.rend()}; } } // namespace -TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") { +TEST_CASE("escape_utf8_string_basic", "[utf8_utils]") { std::string test_str; std::optional actual; @@ -114,7 +115,7 @@ TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") { REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str))); } -TEST_CASE("escape_utf8_string_with_invalid_continuation", "[ffi][utils]") { +TEST_CASE("escape_utf8_string_with_invalid_continuation", "[utf8_utils]") { std::string test_str; auto const valid_utf8_byte_sequence = GENERATE( @@ -124,13 +125,13 @@ TEST_CASE("escape_utf8_string_with_invalid_continuation", "[ffi][utils]") { ); // Test incomplete continuation bytes - auto const it_begin{valid_utf8_byte_sequence.cbegin()}; + auto const begin_it{valid_utf8_byte_sequence.cbegin()}; std::string const valid{"Valid"}; - for (auto it_end{valid_utf8_byte_sequence.cend() - 1}; - valid_utf8_byte_sequence.cbegin() != it_end; - --it_end) + for (auto end_it{valid_utf8_byte_sequence.cend() - 1}; + valid_utf8_byte_sequence.cbegin() != end_it; + --end_it) { - std::string const incomplete_byte_sequence{it_begin, it_end}; + std::string const incomplete_byte_sequence{begin_it, end_it}; test_str = valid + incomplete_byte_sequence; REQUIRE((false == is_utf8_encoded(test_str))); @@ -141,56 +142,67 @@ TEST_CASE("escape_utf8_string_with_invalid_continuation", "[ffi][utils]") { REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); } - // Test invalid header byte + // Test invalid lead byte test_str = valid_utf8_byte_sequence; - constexpr char cInvalidHeaderByte{'\xFF'}; - test_str.front() = cInvalidHeaderByte; + constexpr char cInvalidLeadByte{'\xFF'}; + 
test_str.front() = cInvalidLeadByte; REQUIRE((false == is_utf8_encoded(test_str))); REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); // Test invalid continuation bytes for (size_t idx{1}; idx < valid_utf8_byte_sequence.size(); ++idx) { test_str = valid_utf8_byte_sequence; - constexpr uint8_t cInvalidateMask{0x40}; - test_str.at(idx) |= cInvalidateMask; + constexpr uint8_t cInvalidContinuationByteMask{0x40}; + test_str.at(idx) |= cInvalidContinuationByteMask; REQUIRE((false == is_utf8_encoded(test_str))); REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value())); } } -TEST_CASE("validate_utf8_code_point_ranges", "[ffi][utils]") { +TEST_CASE("validate_utf8_code_point_ranges", "[utf8_utils]") { // Test 1 byte encoding code point range - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - for (uint32_t code_point{0}; code_point <= 0x7F; ++code_point) { + for (auto code_point{clp::cOneByteUtf8CharCodePointLowerBound}; + code_point <= clp::cOneByteUtf8CharCodePointUpperBound; + ++code_point) + { + REQUIRE(is_utf8_encoded(std::string{static_cast(code_point)})); REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 1)))); REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2)))); REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)))); } // Test 2 byte encoding code point range - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - for (uint32_t code_point{0x80}; code_point <= 0x7FF; ++code_point) { + for (auto code_point{clp::cTwoByteUtf8CharCodePointLowerBound}; + code_point <= clp::cTwoByteUtf8CharCodePointUpperBound; + ++code_point) + { REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 1))); REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2)))); REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)))); } // Test 3 byte encoding code 
point range - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - for (uint32_t code_point{0x800}; code_point <= 0xFFFF; ++code_point) { + for (auto code_point{clp::cThreeByteUtf8CharCodePointLowerBound}; + code_point <= clp::cThreeByteUtf8CharCodePointUpperBound; + ++code_point) + { REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2))); REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)))); } // Test 4 byte encoding code point range - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - for (uint32_t code_point{0x1'0000}; code_point <= 0x10'FFFF; ++code_point) { + for (auto code_point{clp::cFourByteUtf8CharCodePointLowerBound}; + code_point <= clp::cFourByteUtf8CharCodePointUpperBound; + ++code_point) + { REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))); } // Test 4 byte encoding code point out of range // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - for (uint32_t code_point{0x10'FFFF + 1}; code_point <= 0x1F'FFFF; ++code_point) { + for (auto code_point{clp::cFourByteUtf8CharCodePointUpperBound + 1}; code_point <= 0x1F'FFFF; + ++code_point) + { REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)))); } } From ebc1c4f0caf52f2060a1935b5b0c8515d74d9a04 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Wed, 26 Jun 2024 16:29:59 -0400 Subject: [PATCH 11/13] Use auto --- components/core/src/clp/utf8_utils.cpp | 6 +++--- components/core/src/clp/utf8_utils.hpp | 19 ++++++++++--------- components/core/tests/test-utf8_utils.cpp | 5 +++-- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/components/core/src/clp/utf8_utils.cpp b/components/core/src/clp/utf8_utils.cpp index 08a03f608..06fafd659 100644 --- a/components/core/src/clp/utf8_utils.cpp +++ b/components/core/src/clp/utf8_utils.cpp @@ -44,12 +44,12 @@ auto is_ascii_char(uint8_t byte) -> bool { } auto 
is_valid_utf8_continuation_byte(uint8_t byte) -> bool { - return (byte & cContinuationByteMask) == cContinuationByte; + return (byte & cUtf8ContinuationByteMask) == cUtf8ContinuationByteHeader; } auto parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t { - return (code_point << cNumContinuationByteCodePointBits) - + (continuation_byte & cContinuationByteCodePointMask); + return (code_point << cUtf8NumContinuationByteCodePointBits) + + (continuation_byte & cUtf8ContinuationByteCodePointMask); } } // namespace utf8_utils_internal } // namespace clp diff --git a/components/core/src/clp/utf8_utils.hpp b/components/core/src/clp/utf8_utils.hpp index c3dc8177a..fe9569b00 100644 --- a/components/core/src/clp/utf8_utils.hpp +++ b/components/core/src/clp/utf8_utils.hpp @@ -8,12 +8,12 @@ namespace clp { // Constants // Lead byte signature -constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8}; // 0b1111_1xxx -constexpr uint8_t cFourByteUtf8CharHeader{0xF0}; // 0b1111_0xxx -constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0}; // 0b1111_xxxx -constexpr uint8_t cThreeByteUtf8CharHeader{0xE0}; // 0b1110_xxxx constexpr uint8_t cTwoByteUtf8CharHeaderMask{0xE0}; // 0b111x_xxxx constexpr uint8_t cTwoByteUtf8CharHeader{0xC0}; // 0b110x_xxxx +constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0}; // 0b1111_xxxx +constexpr uint8_t cThreeByteUtf8CharHeader{0xE0}; // 0b1110_xxxx +constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8}; // 0b1111_1xxx +constexpr uint8_t cFourByteUtf8CharHeader{0xF0}; // 0b1111_0xxx // Code point ranges (inclusive) constexpr uint32_t cOneByteUtf8CharCodePointLowerBound{0}; @@ -26,10 +26,10 @@ constexpr uint32_t cFourByteUtf8CharCodePointLowerBound{0x1'0000}; constexpr uint32_t cFourByteUtf8CharCodePointUpperBound{0x10'FFFF}; // Continuation byte -constexpr uint32_t cContinuationByteMask{0xC0}; -constexpr uint32_t cContinuationByte{0x80}; -constexpr uint32_t cContinuationByteCodePointMask{0x3F}; -constexpr uint8_t 
cNumContinuationByteCodePointBits{6}; +constexpr uint32_t cUtf8ContinuationByteMask{0xC0}; +constexpr uint32_t cUtf8ContinuationByteHeader{0x80}; +constexpr uint32_t cUtf8ContinuationByteCodePointMask{0x3F}; +constexpr uint8_t cUtf8NumContinuationByteCodePointBits{6}; /** * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using @@ -101,7 +101,8 @@ auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> uint32_t code_point_lower_bound{}; uint32_t code_point_upper_bound{}; - for (std::string_view::const_iterator it{src.cbegin()}; it != src.cend(); ++it) { + // NOLINTNEXTLINE(readability-qualified-auto) + for (auto it{src.cbegin()}; it != src.cend(); ++it) { auto const byte{static_cast(*it)}; if (0 == num_continuation_bytes_to_validate) { if (utf8_utils_internal::is_ascii_char(byte)) { diff --git a/components/core/tests/test-utf8_utils.cpp b/components/core/tests/test-utf8_utils.cpp index 94e45cac0..77324eaf9 100644 --- a/components/core/tests/test-utf8_utils.cpp +++ b/components/core/tests/test-utf8_utils.cpp @@ -48,9 +48,10 @@ auto generate_utf8_byte_sequence(uint32_t code_point, size_t num_continuation_by while (encoded_bytes.size() < num_continuation_bytes) { auto const least_significant_byte{static_cast(code_point)}; encoded_bytes.push_back(static_cast( - (least_significant_byte & ~clp::cContinuationByteMask) | clp::cContinuationByte + (least_significant_byte & ~clp::cUtf8ContinuationByteMask) + | clp::cUtf8ContinuationByteHeader )); - code_point >>= clp::cNumContinuationByteCodePointBits; + code_point >>= clp::cUtf8NumContinuationByteCodePointBits; } uint8_t lead_byte_code_point_mask{}; From 690379c9b05c7aa6c32f6f0b26a6f06594ecf0b1 Mon Sep 17 00:00:00 2001 From: LinZhihao-723 Date: Wed, 26 Jun 2024 16:43:07 -0400 Subject: [PATCH 12/13] Add append option --- components/core/src/clp/ffi/utils.cpp | 23 +++++++++++++++-------- components/core/src/clp/ffi/utils.hpp | 10 ++++++++++ 2 files changed, 25 
insertions(+), 8 deletions(-) diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp index 3f77564d2..c85c47701 100644 --- a/components/core/src/clp/ffi/utils.cpp +++ b/components/core/src/clp/ffi/utils.cpp @@ -16,10 +16,17 @@ using std::string_view; namespace clp::ffi { auto validate_and_escape_utf8_string(string_view raw) -> std::optional { - string_view::const_iterator next_char_to_copy_it{raw.cbegin()}; std::optional ret_val; auto& escaped{ret_val.emplace()}; escaped.reserve(raw.size() + (raw.size() / 2)); + if (false == validate_and_append_escaped_utf8_string(raw, escaped)) { + return std::nullopt; + } + return ret_val; +} + +auto validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool { + string_view::const_iterator next_char_to_copy_it{src.cbegin()}; auto escape_handler = [&](string_view::const_iterator it) -> void { // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the last byte @@ -63,20 +70,20 @@ auto validate_and_escape_utf8_string(string_view raw) -> std::optional { } } if (escape_required) { - escaped.append(next_char_to_copy_it, it); - escaped += escaped_char; + dst.append(next_char_to_copy_it, it); + dst += escaped_char; next_char_to_copy_it = it + 1; } }; - if (false == validate_utf8_string(raw, escape_handler)) { - return std::nullopt; + if (false == validate_utf8_string(src, escape_handler)) { + return false; } - if (raw.cend() != next_char_to_copy_it) { - escaped.append(next_char_to_copy_it, raw.cend()); + if (src.cend() != next_char_to_copy_it) { + dst.append(next_char_to_copy_it, src.cend()); } - return ret_val; + return true; } } // namespace clp::ffi diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp index 160ed687b..8a90169a1 100644 --- a/components/core/src/clp/ffi/utils.hpp +++ b/components/core/src/clp/ffi/utils.hpp @@ -15,6 +15,16 @@ namespace clp::ffi { */ [[nodiscard]] auto 
validate_and_escape_utf8_string(std::string_view raw ) -> std::optional; + +/** + * Validates whether the given string is UTF-8 encoded, and append the src to the dst by escaping + * any characters to make the string compatible with the JSON specification. + * @param src The source string to validate and escape. + * @param dst Outputs the destination string with escaped src appended. + * @return Whether the src is a valid UTF-8 encoded string. + */ +[[nodiscard]] auto +validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool; } // namespace clp::ffi #endif // CLP_FFI_UTILS_HPP From 118ae4d0757d31398fff3be306af2fd980d5a74b Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Wed, 26 Jun 2024 23:59:19 -0400 Subject: [PATCH 13/13] Update components/core/src/clp/ffi/utils.hpp Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- components/core/src/clp/ffi/utils.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp index 8a90169a1..26823da9c 100644 --- a/components/core/src/clp/ffi/utils.hpp +++ b/components/core/src/clp/ffi/utils.hpp @@ -17,11 +17,12 @@ namespace clp::ffi { ) -> std::optional; /** - * Validates whether the given string is UTF-8 encoded, and append the src to the dst by escaping - * any characters to make the string compatible with the JSON specification. - * @param src The source string to validate and escape. - * @param dst Outputs the destination string with escaped src appended. - * @return Whether the src is a valid UTF-8 encoded string. + * Validates whether `src` is UTF-8 encoded, and appends `src` to `dst` while escaping any + * characters to make the appended string compatible with the JSON specification. + * @param src The string to validate and escape. + * @param dst Returns `dst` with an escaped version of `src` appended. 
+ * @return Whether `src` is a valid UTF-8-encoded string. NOTE: Even if `src` is not UTF-8 encoded, + * `dst` may be modified. */ [[nodiscard]] auto validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool;