From 51baa2c6227e18ac1ab5dace722073fe9ec18270 Mon Sep 17 00:00:00 2001
From: LinZhihao-723 <zh.lin@mail.utoronto.ca>
Date: Thu, 20 Jun 2024 04:26:26 -0400
Subject: [PATCH 01/13] Implement utf8 string escape

---
 components/core/CMakeLists.txt           |   7 +-
 components/core/src/clp/ffi/utils.cpp    | 231 +++++++++++++++++++++++
 components/core/src/clp/ffi/utils.hpp    |  18 ++
 components/core/tests/test-ffi_utils.cpp | 168 +++++++++++++++++
 4 files changed, 423 insertions(+), 1 deletion(-)
 create mode 100644 components/core/src/clp/ffi/utils.cpp
 create mode 100644 components/core/src/clp/ffi/utils.hpp
 create mode 100644 components/core/tests/test-ffi_utils.cpp
diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
index 99d3c8469..e5f999aa2 100644
--- a/components/core/CMakeLists.txt
+++ b/components/core/CMakeLists.txt
@@ -324,6 +324,8 @@ set(SOURCE_FILES_unitTest
         src/clp/ffi/search/Subquery.hpp
         src/clp/ffi/search/WildcardToken.cpp
         src/clp/ffi/search/WildcardToken.hpp
+        src/clp/ffi/utils.cpp
+        src/clp/ffi/utils.hpp
         src/clp/FileDescriptor.cpp
         src/clp/FileDescriptor.hpp
         src/clp/FileReader.cpp
@@ -471,7 +473,10 @@ set(SOURCE_FILES_unitTest
         tests/test-string_utils.cpp
         tests/test-TimestampPattern.cpp
         tests/test-Utils.cpp
-        )
+        src/clp/ffi/utils.hpp
+        src/clp/ffi/utils.cpp
+        tests/test-ffi_utils.cpp
+)
 add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest})
 target_include_directories(unitTest
         PRIVATE
diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp
new file mode 100644
index 000000000..b4677b47e
--- /dev/null
+++ b/components/core/src/clp/ffi/utils.cpp
@@ -0,0 +1,231 @@
+#include "utils.hpp"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <utility>
+
+using std::string;
+using std::string_view;
+
+namespace clp::ffi {
+namespace {
+/*
+ * @param byte
+ * @return Whether the input byte is a valid utf8 continuation byte. A valid utf8 continuation byte
+ * should match 0b10xx_xxxx.
+ */
+[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;
+
+/**
+ * Appends a single-byte utf8 character into the given string, and escapes it if necessary.
+ * @param character Single-byte utf8 character.
+ * @parma escaped_string Input string where the character(s) are appended to.
+ */
+auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void;
+
+/**
+ * Validates whether the given code point is a valid UTF8 encoding with the given length.
+ * The valid range is defined as following:
+ * .---------------------------------------------.
+ * | Length | First Code Point | Last Code Point |
+ * |--------|------------------|-----------------|
+ * | 1 Byte | 0x00             | 0x7F            |
+ * | 2 Byte | 0x80             | 0x7FF           |
+ * | 3 Byte | 0x8FF            | 0xFFFF          |
+ * | 4 Byte | 0x10000          | 0x10FFFF        |
+ * |--------|------------------|-----------------|
+ * @param code_point
+ * @param encoding_length
+ * @return Whether the code point is a valid encoding.
+ */
+[[nodiscard]] auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool;
+
+/**
+ * Updates the code point by applying the payload of the given continuation byte.
+ * @param code_point
+ * @param continuation_byte
+ * @return Updated code point.
+ */
+[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
+
+auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {
+    constexpr uint8_t cContinuationByteMask{0xC0};
+    constexpr uint8_t cValidMaskedContinuationByte{0x80};
+    return (byte & cContinuationByteMask) == cValidMaskedContinuationByte;
+}
+
+auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void {
+    switch (character) {
+        // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+        case 0x08:
+            escaped_string.push_back('\\');
+            escaped_string.push_back('b');
+            break;
+        case 0x09:
+            escaped_string.push_back('\\');
+            escaped_string.push_back('t');
+            break;
+        case 0x0A:
+            escaped_string.push_back('\\');
+            escaped_string.push_back('n');
+            break;
+        case 0x0C:
+            escaped_string.push_back('\\');
+            escaped_string.push_back('f');
+            break;
+        case 0x0D:
+            escaped_string.push_back('\\');
+            escaped_string.push_back('r');
+            break;
+        case 0x22:
+            escaped_string.push_back('\\');
+            escaped_string.push_back('\"');
+            break;
+        case 0x5C:
+            escaped_string.push_back('\\');
+            escaped_string.push_back('\\');
+            break;
+        // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+        default: {
+            constexpr uint8_t cControlCharacter{0x1F};
+            if (cControlCharacter >= character) {
+                // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the
+                // last byte used by `snprintf` to append '\0'
+                constexpr size_t cControlCharacterBufSize{7};
+                std::array<char, cControlCharacterBufSize> buf{};
+                std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", character);
+                escaped_string.append(buf.cbegin(), buf.cend() - 1);
+            } else {
+                escaped_string.push_back(static_cast<char>(character));
+            }
+            break;
+        }
+    }
+}
+
+auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool {
+    switch (encoding_length) {
+        // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+        case 1:
+            return code_point <= 0x7F;
+        case 2:
+            return (0x80 <= code_point && code_point <= 0x7FF);
+        case 3:
+            return (0x800 <= code_point && code_point <= 0xFFFF);
+        case 4:
+            return (0x1'0000 <= code_point && code_point <= 0x10'FFFF);
+        // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+        default:
+            return false;
+    }
+}
+
+auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t {
+    constexpr uint32_t cContinuationBytePayloadMask{0x3F};
+    constexpr uint8_t cNumContinuationBytePayloadBits{6};
+    return (code_point << cNumContinuationBytePayloadBits)
+           + (continuation_byte & cContinuationBytePayloadMask);
+}
+}  // namespace
+
+auto escape_utf8_string(string_view raw) -> std::optional<string> {
+    string_view::const_iterator bookmark_it{};
+    size_t encoding_length{};
+    enum class State : uint8_t {
+        HeadByteToValidate = 0,
+        OneContinuationByteToValidate,
+        TwoContinuationBytesToValidate,
+        ThreeContinuationBytesToValidate
+    };
+    State state{State::HeadByteToValidate};
+    string escaped;
+    escaped.reserve(raw.size() + (raw.size() >> 2));
+
+    uint32_t code_point{};
+    auto validate_encoding_length_and_set_state
+            = [&encoding_length, &state, &code_point](uint8_t byte) -> bool {
+        constexpr uint8_t cThreeByteContinuationMask{0xF8};  // 0b1111_1xxx
+        constexpr uint8_t cValidThreeByteContinuation{0xF0};  // 0b1111_0xxx
+        constexpr uint8_t cTwoByteContinuationMask{0xF0};  // 0b1111_xxxx
+        constexpr uint8_t cValidTwoByteContinuation{0xE0};  // 0b1110_xxxx
+        constexpr uint8_t cOneByteContinuationMask{0xE0};  // 0b111x_xxxx
+        constexpr uint8_t cValidOneByteContinuation{0xC0};  // 0b110x_xxxx
+        if ((byte & cThreeByteContinuationMask) == cValidThreeByteContinuation) {
+            encoding_length = 4;
+            code_point = (~cThreeByteContinuationMask & byte);
+            state = State::ThreeContinuationBytesToValidate;
+        } else if ((byte & cTwoByteContinuationMask) == cValidTwoByteContinuation) {
+            encoding_length = 3;
+            code_point = (~cTwoByteContinuationMask & byte);
+            state = State::TwoContinuationBytesToValidate;
+        } else if ((byte & cOneByteContinuationMask) == cValidOneByteContinuation) {
+            encoding_length = 2;
+            code_point = (~cOneByteContinuationMask & byte);
+            state = State::OneContinuationByteToValidate;
+        } else {
+            return false;
+        }
+        return true;
+    };
+
+    // For multibyte encoded values, we will incrementally build the code point, and validate its
+    // range in the end.
+    for (string_view::const_iterator it{raw.cbegin()}; it != raw.cend(); ++it) {
+        auto const byte{static_cast<uint8_t>(*it)};
+        switch (state) {
+            case State::HeadByteToValidate: {
+                if (is_valid_code_point(static_cast<uint32_t>(byte), 1)) {
+                    escape_and_append_single_byte_utf8_char(byte, escaped);
+                } else {
+                    if (false == validate_encoding_length_and_set_state(byte)) {
+                        return std::nullopt;
+                    }
+                    bookmark_it = it;
+                }
+                break;
+            }
+            case State::OneContinuationByteToValidate:
+                if (false == is_valid_utf8_continuation_byte(byte)) {
+                    return std::nullopt;
+                }
+                code_point = update_code_point(code_point, byte);
+
+                if (false == is_valid_code_point(code_point, encoding_length)) {
+                    return std::nullopt;
+                }
+                escaped.append(bookmark_it, bookmark_it + encoding_length);
+                state = State::HeadByteToValidate;
+                break;
+            case State::TwoContinuationBytesToValidate:
+                if (false == is_valid_utf8_continuation_byte(byte)) {
+                    return std::nullopt;
+                }
+                code_point = update_code_point(code_point, byte);
+                state = State::OneContinuationByteToValidate;
+                break;
+            case State::ThreeContinuationBytesToValidate:
+                if (false == is_valid_utf8_continuation_byte(byte)) {
+                    return std::nullopt;
+                }
+                code_point = update_code_point(code_point, byte);
+                state = State::TwoContinuationBytesToValidate;
+                break;
+            default:
+                return std::nullopt;
+        }
+    }
+
+    if (State::HeadByteToValidate != state) {
+        // Incomplete multibyte UTF8 sequence
+        return std::nullopt;
+    }
+
+    return std::move(escaped);
+}
+}  // namespace clp::ffi
diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp
new file mode 100644
index 000000000..cd1a60340
--- /dev/null
+++ b/components/core/src/clp/ffi/utils.hpp
@@ -0,0 +1,18 @@
+#ifndef CLP_FFI_UTILS_HPP
+#define CLP_FFI_UTILS_HPP
+
+#include <optional>
+#include <string>
+#include <string_view>
+
+namespace clp::ffi {
+/**
+ * Escapes a UTF8 encoded string.
+ * @param raw The raw string to escape.
+ * @return The escaped string on success.
+ * @return std::nullopt if the string contains none-UTF8 encoded byte sequence.
+ */
+[[nodiscard]] auto escape_utf8_string(std::string_view raw) -> std::optional<std::string>;
+}  // namespace clp::ffi
+
+#endif  // CLP_UTILS_HPP
diff --git a/components/core/tests/test-ffi_utils.cpp b/components/core/tests/test-ffi_utils.cpp
new file mode 100644
index 000000000..372200f14
--- /dev/null
+++ b/components/core/tests/test-ffi_utils.cpp
@@ -0,0 +1,168 @@
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <random>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include <Catch2/single_include/catch2/catch.hpp>
+#include <json/single_include/nlohmann/json.hpp>
+
+#include "../src/clp/ffi/utils.hpp"
+
+using clp::ffi::escape_utf8_string;
+
+namespace {
+/**
+ * Gets an expected escaped string by first convert the raw string into a json string and then dumps
+ * the a printable string using nlohmann::json.
+ * @param raw
+ * @return Escaped string dumped by nlohmann::json, with surrounding '"' dropped.
+ */
+[[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string;
+
+auto get_expected_escaped_string(std::string_view raw) -> std::string {
+    nlohmann::json const json_str = raw;  // Don't use '{}' initializer
+    auto const dumped_str{json_str.dump()};
+    return {dumped_str.begin() + 1, dumped_str.end() - 1};
+}
+}  // namespace
+
+TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") {
+    std::string test_str;
+    std::optional<std::string> actual;
+
+    // Test empty string
+    actual = escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    // Test string that has nothing to escape
+    test_str = "This string has nothing to escape :)";
+    actual = escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    // Test string with all single byte UTF8 characters, which include all characters we escape
+    test_str.clear();
+    for (uint8_t i{0}; i <= static_cast<uint8_t>(INT8_MAX); ++i) {
+        test_str.push_back(static_cast<char>(i));
+    }
+    // Shuffle characters randomly, ensure control characters are not grouped together.
+    // NOLINTNEXTLINE(cert-msc32-c, cert-msc51-cpp)
+    std::shuffle(test_str.begin(), test_str.end(), std::default_random_engine{});
+    actual = escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    // Test valid UTF8 chars with continuation bytes
+    std::vector<std::string> const valid_utf8{
+            "\n",
+            "\xF0\xA0\x80\x8F",  // https://en.wiktionary.org/wiki/%F0%A0%80%8F
+            "a",
+            "\xE4\xB8\xAD",  // https://en.wiktionary.org/wiki/%E4%B8%AD
+            "\x1F",
+            "\xC2\xA2",  // ¢
+            "\\"
+    };
+    test_str.clear();
+    for (auto const& str : valid_utf8) {
+        test_str.append(str);
+    }
+    actual = escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+}
+
+TEST_CASE("escape_utf8_string_with_continuation", "[ffi][utils]") {
+    std::string test_str;
+    std::optional<std::string> actual;
+
+    // Test UTF8 code point range validation
+    auto const valid_code_point_lower_bound = GENERATE(
+            std::string_view{"\xC2\x80"},
+            std::string_view{"\xE0\xA0\x80"},
+            std::string_view{"\xF0\x90\x80\x80"}
+    );
+
+    auto const valid_code_point_upper_bound = GENERATE(
+            std::string_view{"\xDF\xBF"},
+            std::string_view{"\xEF\xBF\xBF"},
+            std::string_view{"\xF4\x8F\xBF\xBF"}
+    );
+
+    test_str = valid_code_point_lower_bound;
+    actual = escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    test_str = valid_code_point_upper_bound;
+    actual = escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    // Test invalid code point: 0x7F (only need one byte)
+    test_str = "\xC1\xBF";
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    test_str = "\xE0\x81\xBF";
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    test_str = "\xF0\x81\x81\xBF";
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    // Test invalid code point: 0x73 (only need one byte)
+    test_str = "\xC1\xB3";
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    test_str = "\xE0\x81\xB3";
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    test_str = "\xF0\x81\x81\xB3";
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    // Test invalid code point: 0x7FF (only need 2 bytes)
+    test_str = "\xE0\x9F\xBF";
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    test_str = "\xF0\x80\x9F\xBF";
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    // Test invalid code point: 0x7F3 (only need 2 bytes)
+    test_str = "\xE0\x9F\xB3";
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    test_str = "\xF0\x80\x9F\xB3";
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    // Test invalid code point: 0xFFFF (only need 3 bytes)
+    test_str = "\xF0\x8F\xBF\xBF";
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    // Test invalid code point: 0xFFF3 (only need 3 bytes)
+    test_str = "\xF0\x8F\xBF\xB3";
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    // Test incomplete continuation bytes
+    std::string_view::const_iterator const it_begin{valid_code_point_lower_bound.cbegin()};
+    std::string const valid{"Valid"};
+    for (std::string_view::const_iterator it_end{valid_code_point_lower_bound.cend() - 1};
+         valid_code_point_lower_bound.cbegin() != it_end;
+         --it_end)
+    {
+        std::string const incomplete_byte_sequence{it_begin, it_end};
+        REQUIRE((false == escape_utf8_string(valid + incomplete_byte_sequence).has_value()));
+        REQUIRE((false == escape_utf8_string(incomplete_byte_sequence + valid).has_value()));
+    }
+
+    // Test invalid header byte
+    test_str = valid_code_point_lower_bound;
+    constexpr char cInvalidHeaderByte{'\xFF'};
+    test_str.front() = cInvalidHeaderByte;
+    REQUIRE((false == escape_utf8_string(test_str).has_value()));
+
+    // Test invalid continuation bytes
+    for (size_t idx{1}; idx < valid_code_point_lower_bound.size(); ++idx) {
+        test_str = valid_code_point_lower_bound;
+        constexpr uint8_t cInvalidateMask{0x40};
+        test_str.at(idx) |= cInvalidateMask;
+        REQUIRE((false == escape_utf8_string(test_str).has_value()));
+    }
+}

From 90a01632b7131f90856116c09b9924dc6817ffe9 Mon Sep 17 00:00:00 2001
From: LinZhihao-723 <zh.lin@mail.utoronto.ca>
Date: Thu, 20 Jun 2024 04:30:31 -0400
Subject: [PATCH 02/13] Refactoring

---
 components/core/src/clp/ffi/utils.cpp    | 462 +++++++++++------------
 components/core/src/clp/ffi/utils.hpp    |  38 +-
 components/core/tests/test-ffi_utils.cpp | 341 ++++++++---------
 3 files changed, 424 insertions(+), 417 deletions(-)

diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp
index b4677b47e..43cd4269c 100644
--- a/components/core/src/clp/ffi/utils.cpp
+++ b/components/core/src/clp/ffi/utils.cpp
@@ -1,231 +1,231 @@
-#include "utils.hpp"
-
-#include <array>
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
-#include <optional>
-#include <string>
-#include <string_view>
-#include <tuple>
-#include <utility>
-
-using std::string;
-using std::string_view;
-
-namespace clp::ffi {
-namespace {
-/*
- * @param byte
- * @return Whether the input byte is a valid utf8 continuation byte. A valid utf8 continuation byte
- * should match 0b10xx_xxxx.
- */
-[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;
-
-/**
- * Appends a single-byte utf8 character into the given string, and escapes it if necessary.
- * @param character Single-byte utf8 character.
- * @parma escaped_string Input string where the character(s) are appended to.
- */
-auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void;
-
-/**
- * Validates whether the given code point is a valid UTF8 encoding with the given length.
- * The valid range is defined as following:
- * .---------------------------------------------.
- * | Length | First Code Point | Last Code Point |
- * |--------|------------------|-----------------|
- * | 1 Byte | 0x00             | 0x7F            |
- * | 2 Byte | 0x80             | 0x7FF           |
- * | 3 Byte | 0x8FF            | 0xFFFF          |
- * | 4 Byte | 0x10000          | 0x10FFFF        |
- * |--------|------------------|-----------------|
- * @param code_point
- * @param encoding_length
- * @return Whether the code point is a valid encoding.
- */
-[[nodiscard]] auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool;
-
-/**
- * Updates the code point by applying the payload of the given continuation byte.
- * @param code_point
- * @param continuation_byte
- * @return Updated code point.
- */
-[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
-
-auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {
-    constexpr uint8_t cContinuationByteMask{0xC0};
-    constexpr uint8_t cValidMaskedContinuationByte{0x80};
-    return (byte & cContinuationByteMask) == cValidMaskedContinuationByte;
-}
-
-auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void {
-    switch (character) {
-        // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-        case 0x08:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('b');
-            break;
-        case 0x09:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('t');
-            break;
-        case 0x0A:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('n');
-            break;
-        case 0x0C:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('f');
-            break;
-        case 0x0D:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('r');
-            break;
-        case 0x22:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('\"');
-            break;
-        case 0x5C:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('\\');
-            break;
-        // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-        default: {
-            constexpr uint8_t cControlCharacter{0x1F};
-            if (cControlCharacter >= character) {
-                // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the
-                // last byte used by `snprintf` to append '\0'
-                constexpr size_t cControlCharacterBufSize{7};
-                std::array<char, cControlCharacterBufSize> buf{};
-                std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", character);
-                escaped_string.append(buf.cbegin(), buf.cend() - 1);
-            } else {
-                escaped_string.push_back(static_cast<char>(character));
-            }
-            break;
-        }
-    }
-}
-
-auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool {
-    switch (encoding_length) {
-        // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-        case 1:
-            return code_point <= 0x7F;
-        case 2:
-            return (0x80 <= code_point && code_point <= 0x7FF);
-        case 3:
-            return (0x800 <= code_point && code_point <= 0xFFFF);
-        case 4:
-            return (0x1'0000 <= code_point && code_point <= 0x10'FFFF);
-        // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-        default:
-            return false;
-    }
-}
-
-auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t {
-    constexpr uint32_t cContinuationBytePayloadMask{0x3F};
-    constexpr uint8_t cNumContinuationBytePayloadBits{6};
-    return (code_point << cNumContinuationBytePayloadBits)
-           + (continuation_byte & cContinuationBytePayloadMask);
-}
-}  // namespace
-
-auto escape_utf8_string(string_view raw) -> std::optional<string> {
-    string_view::const_iterator bookmark_it{};
-    size_t encoding_length{};
-    enum class State : uint8_t {
-        HeadByteToValidate = 0,
-        OneContinuationByteToValidate,
-        TwoContinuationBytesToValidate,
-        ThreeContinuationBytesToValidate
-    };
-    State state{State::HeadByteToValidate};
-    string escaped;
-    escaped.reserve(raw.size() + (raw.size() >> 2));
-
-    uint32_t code_point{};
-    auto validate_encoding_length_and_set_state
-            = [&encoding_length, &state, &code_point](uint8_t byte) -> bool {
-        constexpr uint8_t cThreeByteContinuationMask{0xF8};  // 0b1111_1xxx
-        constexpr uint8_t cValidThreeByteContinuation{0xF0};  // 0b1111_0xxx
-        constexpr uint8_t cTwoByteContinuationMask{0xF0};  // 0b1111_xxxx
-        constexpr uint8_t cValidTwoByteContinuation{0xE0};  // 0b1110_xxxx
-        constexpr uint8_t cOneByteContinuationMask{0xE0};  // 0b111x_xxxx
-        constexpr uint8_t cValidOneByteContinuation{0xC0};  // 0b110x_xxxx
-        if ((byte & cThreeByteContinuationMask) == cValidThreeByteContinuation) {
-            encoding_length = 4;
-            code_point = (~cThreeByteContinuationMask & byte);
-            state = State::ThreeContinuationBytesToValidate;
-        } else if ((byte & cTwoByteContinuationMask) == cValidTwoByteContinuation) {
-            encoding_length = 3;
-            code_point = (~cTwoByteContinuationMask & byte);
-            state = State::TwoContinuationBytesToValidate;
-        } else if ((byte & cOneByteContinuationMask) == cValidOneByteContinuation) {
-            encoding_length = 2;
-            code_point = (~cOneByteContinuationMask & byte);
-            state = State::OneContinuationByteToValidate;
-        } else {
-            return false;
-        }
-        return true;
-    };
-
-    // For multibyte encoded values, we will incrementally build the code point, and validate its
-    // range in the end.
-    for (string_view::const_iterator it{raw.cbegin()}; it != raw.cend(); ++it) {
-        auto const byte{static_cast<uint8_t>(*it)};
-        switch (state) {
-            case State::HeadByteToValidate: {
-                if (is_valid_code_point(static_cast<uint32_t>(byte), 1)) {
-                    escape_and_append_single_byte_utf8_char(byte, escaped);
-                } else {
-                    if (false == validate_encoding_length_and_set_state(byte)) {
-                        return std::nullopt;
-                    }
-                    bookmark_it = it;
-                }
-                break;
-            }
-            case State::OneContinuationByteToValidate:
-                if (false == is_valid_utf8_continuation_byte(byte)) {
-                    return std::nullopt;
-                }
-                code_point = update_code_point(code_point, byte);
-
-                if (false == is_valid_code_point(code_point, encoding_length)) {
-                    return std::nullopt;
-                }
-                escaped.append(bookmark_it, bookmark_it + encoding_length);
-                state = State::HeadByteToValidate;
-                break;
-            case State::TwoContinuationBytesToValidate:
-                if (false == is_valid_utf8_continuation_byte(byte)) {
-                    return std::nullopt;
-                }
-                code_point = update_code_point(code_point, byte);
-                state = State::OneContinuationByteToValidate;
-                break;
-            case State::ThreeContinuationBytesToValidate:
-                if (false == is_valid_utf8_continuation_byte(byte)) {
-                    return std::nullopt;
-                }
-                code_point = update_code_point(code_point, byte);
-                state = State::TwoContinuationBytesToValidate;
-                break;
-            default:
-                return std::nullopt;
-        }
-    }
-
-    if (State::HeadByteToValidate != state) {
-        // Incomplete multibyte UTF8 sequence
-        return std::nullopt;
-    }
-
-    return std::move(escaped);
-}
-}  // namespace clp::ffi
+#include "utils.hpp"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <utility>
+
+using std::string;
+using std::string_view;
+
+namespace clp::ffi {
+namespace {
+/**
+ * @param byte
+ * @return Whether the input byte is a valid utf8 continuation byte. A valid utf8 continuation byte
+ * should match 0b10xx_xxxx.
+ */
+[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;
+
+/**
+ * Appends a single-byte utf8 character into the given string, and escapes it if necessary.
+ * @param character Single-byte utf8 character.
+ * @param escaped_string String to which the character, escaped if necessary, is appended.
+ */
+auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void;
+
+/**
+ * Validates whether the given code point is a valid UTF8 encoding with the given length.
+ * The valid range is defined as follows:
+ * .---------------------------------------------.
+ * | Length | First Code Point | Last Code Point |
+ * |--------|------------------|-----------------|
+ * | 1 Byte | 0x00             | 0x7F            |
+ * | 2 Byte | 0x80             | 0x7FF           |
+ * | 3 Byte | 0x800            | 0xFFFF          |
+ * | 4 Byte | 0x10000          | 0x10FFFF        |
+ * |--------|------------------|-----------------|
+ * @param code_point
+ * @param encoding_length
+ * @return Whether the code point is a valid encoding.
+ */
+[[nodiscard]] auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool;
+
+/**
+ * Updates the code point by applying the payload of the given continuation byte.
+ * @param code_point
+ * @param continuation_byte
+ * @return Updated code point.
+ */
+[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
+
+auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {  // True iff `byte` is 0b10xx_xxxx.
+    constexpr uint8_t cContinuationByteMask{0xC0};  // Selects the two most significant bits.
+    constexpr uint8_t cValidMaskedContinuationByte{0x80};  // Those two bits must read `10`.
+    return (byte & cContinuationByteMask) == cValidMaskedContinuationByte;
+}
+
+auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void {
+    switch (character) {  // Bytes that have a two-character short escape are listed case by case.
+        // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+        case 0x08:  // Backspace, appended as a backslash followed by 'b'.
+            escaped_string.push_back('\\');
+            escaped_string.push_back('b');
+            break;
+        case 0x09:  // Horizontal tab, appended as a backslash followed by 't'.
+            escaped_string.push_back('\\');
+            escaped_string.push_back('t');
+            break;
+        case 0x0A:  // Line feed, appended as a backslash followed by 'n'.
+            escaped_string.push_back('\\');
+            escaped_string.push_back('n');
+            break;
+        case 0x0C:  // Form feed, appended as a backslash followed by 'f'.
+            escaped_string.push_back('\\');
+            escaped_string.push_back('f');
+            break;
+        case 0x0D:  // Carriage return, appended as a backslash followed by 'r'.
+            escaped_string.push_back('\\');
+            escaped_string.push_back('r');
+            break;
+        case 0x22:  // Double quote, appended as a backslash followed by the quote itself.
+            escaped_string.push_back('\\');
+            escaped_string.push_back('\"');
+            break;
+        case 0x5C:  // Backslash, appended as two backslashes.
+            escaped_string.push_back('\\');
+            escaped_string.push_back('\\');
+            break;
+        // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+        default: {  // Every byte without a short escape ends up here.
+            constexpr uint8_t cControlCharacter{0x1F};  // Bytes up to this value get a hex escape.
+            if (cControlCharacter >= character) {
+                // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the
+                // last byte used by `snprintf` to append '\0'
+                constexpr size_t cControlCharacterBufSize{7};
+                std::array<char, cControlCharacterBufSize> buf{};
+                std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", character);
+                escaped_string.append(buf.cbegin(), buf.cend() - 1);  // Leave out the final '\0'.
+            } else {
+                escaped_string.push_back(static_cast<char>(character));  // Appended unchanged.
+            }
+            break;
+        }
+    }
+}
+
+auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool {
+    switch (encoding_length) {
+        // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+        case 1:
+            return code_point <= 0x7F;
+        case 2:
+            return (0x80 <= code_point && code_point <= 0x7FF);
+        case 3:
+            return (0x800 <= code_point && code_point <= 0xFFFF);
+        case 4:
+            return (0x1'0000 <= code_point && code_point <= 0x10'FFFF);
+        // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+        default:
+            return false;
+    }
+}
+
+auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t {
+    constexpr uint32_t cContinuationBytePayloadMask{0x3F};
+    constexpr uint8_t cNumContinuationBytePayloadBits{6};
+    return (code_point << cNumContinuationBytePayloadBits)
+           + (continuation_byte & cContinuationBytePayloadMask);
+}
+}  // namespace
+
+auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
+    string_view::const_iterator bookmark_it{};
+    size_t encoding_length{};
+    enum class State : uint8_t {
+        HeadByteToValidate = 0,
+        OneContinuationByteToValidate,
+        TwoContinuationBytesToValidate,
+        ThreeContinuationBytesToValidate
+    };
+    State state{State::HeadByteToValidate};
+    string escaped;
+    escaped.reserve(raw.size() + (raw.size() >> 2));
+
+    uint32_t code_point{};
+    auto validate_encoding_length_and_set_state
+            = [&encoding_length, &state, &code_point](uint8_t byte) -> bool {
+        constexpr uint8_t cThreeByteContinuationMask{0xF8};  // 0b1111_1xxx
+        constexpr uint8_t cValidThreeByteContinuation{0xF0};  // 0b1111_0xxx
+        constexpr uint8_t cTwoByteContinuationMask{0xF0};  // 0b1111_xxxx
+        constexpr uint8_t cValidTwoByteContinuation{0xE0};  // 0b1110_xxxx
+        constexpr uint8_t cOneByteContinuationMask{0xE0};  // 0b111x_xxxx
+        constexpr uint8_t cValidOneByteContinuation{0xC0};  // 0b110x_xxxx
+        if ((byte & cThreeByteContinuationMask) == cValidThreeByteContinuation) {
+            encoding_length = 4;
+            code_point = (~cThreeByteContinuationMask & byte);
+            state = State::ThreeContinuationBytesToValidate;
+        } else if ((byte & cTwoByteContinuationMask) == cValidTwoByteContinuation) {
+            encoding_length = 3;
+            code_point = (~cTwoByteContinuationMask & byte);
+            state = State::TwoContinuationBytesToValidate;
+        } else if ((byte & cOneByteContinuationMask) == cValidOneByteContinuation) {
+            encoding_length = 2;
+            code_point = (~cOneByteContinuationMask & byte);
+            state = State::OneContinuationByteToValidate;
+        } else {
+            return false;
+        }
+        return true;
+    };
+
+    // For multibyte encoded values, we will incrementally build the code point, and validate its
+    // range in the end.
+    for (string_view::const_iterator it{raw.cbegin()}; it != raw.cend(); ++it) {
+        auto const byte{static_cast<uint8_t>(*it)};
+        switch (state) {
+            case State::HeadByteToValidate: {
+                if (is_valid_code_point(static_cast<uint32_t>(byte), 1)) {
+                    escape_and_append_single_byte_utf8_char(byte, escaped);
+                } else {
+                    if (false == validate_encoding_length_and_set_state(byte)) {
+                        return std::nullopt;
+                    }
+                    bookmark_it = it;
+                }
+                break;
+            }
+            case State::OneContinuationByteToValidate:
+                if (false == is_valid_utf8_continuation_byte(byte)) {
+                    return std::nullopt;
+                }
+                code_point = update_code_point(code_point, byte);
+
+                if (false == is_valid_code_point(code_point, encoding_length)) {
+                    return std::nullopt;
+                }
+                escaped.append(bookmark_it, bookmark_it + encoding_length);
+                state = State::HeadByteToValidate;
+                break;
+            case State::TwoContinuationBytesToValidate:
+                if (false == is_valid_utf8_continuation_byte(byte)) {
+                    return std::nullopt;
+                }
+                code_point = update_code_point(code_point, byte);
+                state = State::OneContinuationByteToValidate;
+                break;
+            case State::ThreeContinuationBytesToValidate:
+                if (false == is_valid_utf8_continuation_byte(byte)) {
+                    return std::nullopt;
+                }
+                code_point = update_code_point(code_point, byte);
+                state = State::TwoContinuationBytesToValidate;
+                break;
+            default:
+                return std::nullopt;
+        }
+    }
+
+    if (State::HeadByteToValidate != state) {
+        // Incomplete multibyte UTF8 sequence
+        return std::nullopt;
+    }
+
+    return std::move(escaped);
+}
+}  // namespace clp::ffi
diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp
index cd1a60340..acd977b80 100644
--- a/components/core/src/clp/ffi/utils.hpp
+++ b/components/core/src/clp/ffi/utils.hpp
@@ -1,18 +1,20 @@
-#ifndef CLP_FFI_UTILS_HPP
-#define CLP_FFI_UTILS_HPP
-
-#include <optional>
-#include <string>
-#include <string_view>
-
-namespace clp::ffi {
-/**
- * Escapes a UTF8 encoded string.
- * @param raw The raw string to escape.
- * @return The escaped string on success.
- * @return std::nullopt if the string contains none-UTF8 encoded byte sequence.
- */
-[[nodiscard]] auto escape_utf8_string(std::string_view raw) -> std::optional<std::string>;
-}  // namespace clp::ffi
-
-#endif  // CLP_UTILS_HPP
+#ifndef CLP_FFI_UTILS_HPP
+#define CLP_FFI_UTILS_HPP
+
+#include <optional>
+#include <string>
+#include <string_view>
+
+namespace clp::ffi {
+/**
+ * Validates whether the given string is UTF8 encoded, and escapes any characters to generate to
+ * make the string human readable.
+ * @param raw The raw string to escape.
+ * @return The escaped string on success.
+ * @return std::nullopt if the string contains none-UTF8 encoded byte sequence.
+ */
+[[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw
+) -> std::optional<std::string>;
+}  // namespace clp::ffi
+
+#endif  // CLP_UTILS_HPP
diff --git a/components/core/tests/test-ffi_utils.cpp b/components/core/tests/test-ffi_utils.cpp
index 372200f14..4deb16865 100644
--- a/components/core/tests/test-ffi_utils.cpp
+++ b/components/core/tests/test-ffi_utils.cpp
@@ -1,168 +1,173 @@
-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-#include <optional>
-#include <random>
-#include <string>
-#include <string_view>
-#include <vector>
-
-#include <Catch2/single_include/catch2/catch.hpp>
-#include <json/single_include/nlohmann/json.hpp>
-
-#include "../src/clp/ffi/utils.hpp"
-
-using clp::ffi::escape_utf8_string;
-
-namespace {
-/**
- * Gets an expected escaped string by first convert the raw string into a json string and then dumps
- * the a printable string using nlohmann::json.
- * @param raw
- * @return Escaped string dumped by nlohmann::json, with surrounding '"' dropped.
- */
-[[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string;
-
-auto get_expected_escaped_string(std::string_view raw) -> std::string {
-    nlohmann::json const json_str = raw;  // Don't use '{}' initializer
-    auto const dumped_str{json_str.dump()};
-    return {dumped_str.begin() + 1, dumped_str.end() - 1};
-}
-}  // namespace
-
-TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") {
-    std::string test_str;
-    std::optional<std::string> actual;
-
-    // Test empty string
-    actual = escape_utf8_string(test_str);
-    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
-
-    // Test string that has nothing to escape
-    test_str = "This string has nothing to escape :)";
-    actual = escape_utf8_string(test_str);
-    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
-
-    // Test string with all single byte UTF8 characters, which include all characters we escape
-    test_str.clear();
-    for (uint8_t i{0}; i <= static_cast<uint8_t>(INT8_MAX); ++i) {
-        test_str.push_back(static_cast<char>(i));
-    }
-    // Shuffle characters randomly, ensure control characters are not grouped together.
-    // NOLINTNEXTLINE(cert-msc32-c, cert-msc51-cpp)
-    std::shuffle(test_str.begin(), test_str.end(), std::default_random_engine{});
-    actual = escape_utf8_string(test_str);
-    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
-
-    // Test valid UTF8 chars with continuation bytes
-    std::vector<std::string> const valid_utf8{
-            "\n",
-            "\xF0\xA0\x80\x8F",  // https://en.wiktionary.org/wiki/%F0%A0%80%8F
-            "a",
-            "\xE4\xB8\xAD",  // https://en.wiktionary.org/wiki/%E4%B8%AD
-            "\x1F",
-            "\xC2\xA2",  // ¢
-            "\\"
-    };
-    test_str.clear();
-    for (auto const& str : valid_utf8) {
-        test_str.append(str);
-    }
-    actual = escape_utf8_string(test_str);
-    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
-
-}
-
-TEST_CASE("escape_utf8_string_with_continuation", "[ffi][utils]") {
-    std::string test_str;
-    std::optional<std::string> actual;
-
-    // Test UTF8 code point range validation
-    auto const valid_code_point_lower_bound = GENERATE(
-            std::string_view{"\xC2\x80"},
-            std::string_view{"\xE0\xA0\x80"},
-            std::string_view{"\xF0\x90\x80\x80"}
-    );
-
-    auto const valid_code_point_upper_bound = GENERATE(
-            std::string_view{"\xDF\xBF"},
-            std::string_view{"\xEF\xBF\xBF"},
-            std::string_view{"\xF4\x8F\xBF\xBF"}
-    );
-
-    test_str = valid_code_point_lower_bound;
-    actual = escape_utf8_string(test_str);
-    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
-
-    test_str = valid_code_point_upper_bound;
-    actual = escape_utf8_string(test_str);
-    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
-
-    // Test invalid code point: 0x7F (only need one byte)
-    test_str = "\xC1\xBF";
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    test_str = "\xE0\x81\xBF";
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    test_str = "\xF0\x81\x81\xBF";
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    // Test invalid code point: 0x73 (only need one byte)
-    test_str = "\xC1\xB3";
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    test_str = "\xE0\x81\xB3";
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    test_str = "\xF0\x81\x81\xB3";
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    // Test invalid code point: 0x7FF (only need 2 bytes)
-    test_str = "\xE0\x9F\xBF";
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    test_str = "\xF0\x80\x9F\xBF";
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    // Test invalid code point: 0x7F3 (only need 2 bytes)
-    test_str = "\xE0\x9F\xB3";
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    test_str = "\xF0\x80\x9F\xB3";
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    // Test invalid code point: 0xFFFF (only need 3 bytes)
-    test_str = "\xF0\x8F\xBF\xBF";
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    // Test invalid code point: 0xFFF3 (only need 3 bytes)
-    test_str = "\xF0\x8F\xBF\xB3";
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    // Test incomplete continuation bytes
-    std::string_view::const_iterator const it_begin{valid_code_point_lower_bound.cbegin()};
-    std::string const valid{"Valid"};
-    for (std::string_view::const_iterator it_end{valid_code_point_lower_bound.cend() - 1};
-         valid_code_point_lower_bound.cbegin() != it_end;
-         --it_end)
-    {
-        std::string const incomplete_byte_sequence{it_begin, it_end};
-        REQUIRE((false == escape_utf8_string(valid + incomplete_byte_sequence).has_value()));
-        REQUIRE((false == escape_utf8_string(incomplete_byte_sequence + valid).has_value()));
-    }
-
-    // Test invalid header byte
-    test_str = valid_code_point_lower_bound;
-    constexpr char cInvalidHeaderByte{'\xFF'};
-    test_str.front() = cInvalidHeaderByte;
-    REQUIRE((false == escape_utf8_string(test_str).has_value()));
-
-    // Test invalid continuation bytes
-    for (size_t idx{1}; idx < valid_code_point_lower_bound.size(); ++idx) {
-        test_str = valid_code_point_lower_bound;
-        constexpr uint8_t cInvalidateMask{0x40};
-        test_str.at(idx) |= cInvalidateMask;
-        REQUIRE((false == escape_utf8_string(test_str).has_value()));
-    }
-}
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <random>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include <Catch2/single_include/catch2/catch.hpp>
+#include <json/single_include/nlohmann/json.hpp>
+
+#include "../src/clp/ffi/utils.hpp"
+
+using clp::ffi::validate_and_escape_utf8_string;
+
+namespace {
+/**
+ * Gets an expected escaped string by first convert the raw string into a json string and then dumps
+ * the a printable string using nlohmann::json.
+ * @param raw
+ * @return Escaped string dumped by nlohmann::json, with surrounding '"' dropped.
+ */
+[[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string;
+
+auto get_expected_escaped_string(std::string_view raw) -> std::string {
+    nlohmann::json const json_str = raw;  // Don't use '{}' initializer
+    auto const dumped_str{json_str.dump()};
+    return {dumped_str.begin() + 1, dumped_str.end() - 1};
+}
+}  // namespace
+
+TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") {
+    std::string test_str;
+    std::optional<std::string> actual;
+
+    // Test empty string
+    actual = validate_and_escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    // Test string that has nothing to escape
+    test_str = "This string has nothing to escape :)";
+    actual = validate_and_escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    // Test string with all single byte UTF8 characters, which include all characters we escape
+    test_str.clear();
+    for (uint8_t i{0}; i <= static_cast<uint8_t>(INT8_MAX); ++i) {
+        test_str.push_back(static_cast<char>(i));
+    }
+    // Shuffle characters randomly, ensure control characters are not grouped together.
+    // NOLINTNEXTLINE(cert-msc32-c, cert-msc51-cpp)
+    std::shuffle(test_str.begin(), test_str.end(), std::default_random_engine{});
+    actual = validate_and_escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    // Test valid UTF8 chars with continuation bytes
+    std::vector<std::string> const valid_utf8{
+            "\n",
+            "\xF0\xA0\x80\x8F",  // https://en.wiktionary.org/wiki/%F0%A0%80%8F
+            "a",
+            "\xE4\xB8\xAD",  // https://en.wiktionary.org/wiki/%E4%B8%AD
+            "\x1F",
+            "\xC2\xA2",  // ¢
+            "\\"
+    };
+    test_str.clear();
+    for (auto const& str : valid_utf8) {
+        test_str.append(str);
+    }
+    actual = validate_and_escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+}
+
+TEST_CASE("escape_utf8_string_with_continuation", "[ffi][utils]") {
+    std::string test_str;
+    std::optional<std::string> actual;
+
+    // Test UTF8 code point range validation
+    auto const valid_code_point_lower_bound = GENERATE(
+            std::string_view{"\xC2\x80"},
+            std::string_view{"\xE0\xA0\x80"},
+            std::string_view{"\xF0\x90\x80\x80"}
+    );
+
+    auto const valid_code_point_upper_bound = GENERATE(
+            std::string_view{"\xDF\xBF"},
+            std::string_view{"\xEF\xBF\xBF"},
+            std::string_view{"\xF4\x8F\xBF\xBF"}
+    );
+
+    test_str = valid_code_point_lower_bound;
+    actual = validate_and_escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    test_str = valid_code_point_upper_bound;
+    actual = validate_and_escape_utf8_string(test_str);
+    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
+
+    // Test invalid code point: 0x7F (only need one byte)
+    test_str = "\xC1\xBF";
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    test_str = "\xE0\x81\xBF";
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    test_str = "\xF0\x81\x81\xBF";
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    // Test invalid code point: 0x73 (only need one byte)
+    test_str = "\xC1\xB3";
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    test_str = "\xE0\x81\xB3";
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    test_str = "\xF0\x81\x81\xB3";
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    // Test invalid code point: 0x7FF (only need 2 bytes)
+    test_str = "\xE0\x9F\xBF";
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    test_str = "\xF0\x80\x9F\xBF";
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    // Test invalid code point: 0x7F3 (only need 2 bytes)
+    test_str = "\xE0\x9F\xB3";
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    test_str = "\xF0\x80\x9F\xB3";
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    // Test invalid code point: 0xFFFF (only need 3 bytes)
+    test_str = "\xF0\x8F\xBF\xBF";
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    // Test invalid code point: 0xFFF3 (only need 3 bytes)
+    test_str = "\xF0\x8F\xBF\xB3";
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    // Test incomplete continuation bytes
+    std::string_view::const_iterator const it_begin{valid_code_point_lower_bound.cbegin()};
+    std::string const valid{"Valid"};
+    for (std::string_view::const_iterator it_end{valid_code_point_lower_bound.cend() - 1};
+         valid_code_point_lower_bound.cbegin() != it_end;
+         --it_end)
+    {
+        std::string const incomplete_byte_sequence{it_begin, it_end};
+        REQUIRE(
+                (false
+                 == validate_and_escape_utf8_string(valid + incomplete_byte_sequence).has_value())
+        );
+        REQUIRE(
+                (false
+                 == validate_and_escape_utf8_string(incomplete_byte_sequence + valid).has_value())
+        );
+    }
+
+    // Test invalid header byte
+    test_str = valid_code_point_lower_bound;
+    constexpr char cInvalidHeaderByte{'\xFF'};
+    test_str.front() = cInvalidHeaderByte;
+    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+    // Test invalid continuation bytes
+    for (size_t idx{1}; idx < valid_code_point_lower_bound.size(); ++idx) {
+        test_str = valid_code_point_lower_bound;
+        constexpr uint8_t cInvalidateMask{0x40};
+        test_str.at(idx) |= cInvalidateMask;
+        REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+    }
+}

From 690d1b4a231abd5d6ff435e18a28d9530749522c Mon Sep 17 00:00:00 2001
From: LinZhihao-723 <zh.lin@mail.utoronto.ca>
Date: Thu, 20 Jun 2024 04:50:01 -0400
Subject: [PATCH 03/13] Update cmake

---
 components/core/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
index e5f999aa2..c8ad15f79 100644
--- a/components/core/CMakeLists.txt
+++ b/components/core/CMakeLists.txt
@@ -473,8 +473,6 @@ set(SOURCE_FILES_unitTest
         tests/test-string_utils.cpp
         tests/test-TimestampPattern.cpp
         tests/test-Utils.cpp
-        src/clp/ffi/utils.hpp
-        src/clp/ffi/utils.cpp
         tests/test-ffi_utils.cpp
 )
 add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest})

From fd3b91cf46c8b800479afd13d856b3a3e886d16c Mon Sep 17 00:00:00 2001
From: LinZhihao-723 <zh.lin@mail.utoronto.ca>
Date: Sat, 22 Jun 2024 17:18:25 -0400
Subject: [PATCH 04/13] Update cmake format

---
 components/core/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
index c8ad15f79..50abbc295 100644
--- a/components/core/CMakeLists.txt
+++ b/components/core/CMakeLists.txt
@@ -474,7 +474,7 @@ set(SOURCE_FILES_unitTest
         tests/test-TimestampPattern.cpp
         tests/test-Utils.cpp
         tests/test-ffi_utils.cpp
-)
+        )
 add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest})
 target_include_directories(unitTest
         PRIVATE

From ea9a924dcbef98f2e301b7171206b9ff0900cdc4 Mon Sep 17 00:00:00 2001
From: LinZhihao-723 <zh.lin@mail.utoronto.ca>
Date: Sat, 22 Jun 2024 17:20:58 -0400
Subject: [PATCH 05/13] Let compiler figure out which variable to capture

---
 components/core/src/clp/ffi/utils.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp
index 43cd4269c..56fd7eb7c 100644
--- a/components/core/src/clp/ffi/utils.cpp
+++ b/components/core/src/clp/ffi/utils.cpp
@@ -148,8 +148,7 @@ auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
     escaped.reserve(raw.size() + (raw.size() >> 2));
 
     uint32_t code_point{};
-    auto validate_encoding_length_and_set_state
-            = [&encoding_length, &state, &code_point](uint8_t byte) -> bool {
+    auto validate_encoding_length_and_set_state = [&](uint8_t byte) -> bool {
         constexpr uint8_t cThreeByteContinuationMask{0xF8};  // 0b1111_1xxx
         constexpr uint8_t cValidThreeByteContinuation{0xF0};  // 0b1111_0xxx
         constexpr uint8_t cTwoByteContinuationMask{0xF0};  // 0b1111_xxxx

From 191c4ff2bb2967361d37e9f0b40e75bd78cae4c5 Mon Sep 17 00:00:00 2001
From: LinZhihao-723 <zh.lin@mail.utoronto.ca>
Date: Mon, 24 Jun 2024 19:01:47 -0400
Subject: [PATCH 06/13] Refactoring according to the code review comments

---
 components/core/src/clp/ffi/utils.cpp    | 304 ++++++++---------------
 components/core/src/clp/ffi/utils.hpp    | 123 ++++++++-
 components/core/tests/test-ffi_utils.cpp | 182 ++++++++------
 3 files changed, 335 insertions(+), 274 deletions(-)

diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp
index 56fd7eb7c..e074311b4 100644
--- a/components/core/src/clp/ffi/utils.cpp
+++ b/components/core/src/clp/ffi/utils.cpp
@@ -8,122 +8,131 @@
 #include <string>
 #include <string_view>
 #include <tuple>
-#include <utility>
 
 using std::string;
 using std::string_view;
 
 namespace clp::ffi {
-namespace {
-/*
- * @param byte
- * @return Whether the input byte is a valid utf8 continuation byte. A valid utf8 continuation byte
- * should match 0b10xx_xxxx.
- */
-[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;
+auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
+    string_view::const_iterator bookmark{raw.cbegin()};
+    string escaped;
+    escaped.reserve(raw.size() + (raw.size() / 2));
+
+    auto escape_handler = [&](string_view::const_iterator it) -> void {
+        // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the last byte
+        // used by `snprintf` to append '\0'
+        constexpr size_t cControlCharacterBufSize{7};
+        std::array<char, cControlCharacterBufSize> buf{};
+        std::string_view escaped_content;
+        bool escape_required{true};
+        switch (*it) {
+            case '\b':
+                escaped_content = "\\b";
+                break;
+            case '\t':
+                escaped_content = "\\t";
+                break;
+            case '\n':
+                escaped_content = "\\n";
+                break;
+            case '\f':
+                escaped_content = "\\f";
+                break;
+            case '\r':
+                escaped_content = "\\r";
+                break;
+            case '\\':
+                escaped_content = "\\\\";
+                break;
+            case '"':
+                escaped_content = "\\\"";
+                break;
+            default: {
+                constexpr uint8_t cLargestControlCharacter{0x1F};
+                auto const byte{static_cast<uint8_t>(*it)};
+                if (cLargestControlCharacter >= byte) {
+                    std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", byte);
+                    escaped_content = {buf.data(), buf.size() - 1};
+                } else {
+                    escape_required = false;
+                }
+                break;
+            }
+        }
+        if (escape_required) {
+            escaped.append(bookmark, it);
+            escaped.append(escaped_content.cbegin(), escaped_content.cend());
+            bookmark = it + 1;
+        }
+    };
 
-/**
- * Appends a single-byte utf8 character into the given string, and escapes it if necessary.
- * @param character Single-byte utf8 character.
- * @parma escaped_string Input string where the character(s) are appended to.
- */
-auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void;
+    if (false == generic_validate_utf8_string(raw, escape_handler)) {
+        return std::nullopt;
+    }
 
-/**
- * Validates whether the given code point is a valid UTF8 encoding with the given length.
- * The valid range is defined as following:
- * .---------------------------------------------.
- * | Length | First Code Point | Last Code Point |
- * |--------|------------------|-----------------|
- * | 1 Byte | 0x00             | 0x7F            |
- * | 2 Byte | 0x80             | 0x7FF           |
- * | 3 Byte | 0x8FF            | 0xFFFF          |
- * | 4 Byte | 0x10000          | 0x10FFFF        |
- * |--------|------------------|-----------------|
- * @param code_point
- * @param encoding_length
- * @return Whether the code point is a valid encoding.
- */
-[[nodiscard]] auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool;
+    if (raw.cend() != bookmark) {
+        escaped.append(bookmark, raw.cend());
+    }
 
-/**
- * Updates the code point by applying the payload of the given continuation byte.
- * @param code_point
- * @param continuation_byte
- * @return Updated code point.
- */
-[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
+    return escaped;
+}
 
-auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {
-    constexpr uint8_t cContinuationByteMask{0xC0};
-    constexpr uint8_t cValidMaskedContinuationByte{0x80};
-    return (byte & cContinuationByteMask) == cValidMaskedContinuationByte;
+auto is_utf8_encoded(string_view str) -> bool {
+    auto escape_handler = []([[maybe_unused]] string_view::const_iterator it) -> void {};
+    return generic_validate_utf8_string(str, escape_handler);
 }
 
-auto escape_and_append_single_byte_utf8_char(uint8_t character, string& escaped_string) -> void {
-    switch (character) {
+namespace utils_hpp {
+auto validate_header_byte_and_set_code_point(
+        uint8_t header,
+        size_t& num_continuation_bytes,
+        uint32_t& code_point,
+        uint32_t& code_point_lower_bound,
+        uint32_t& code_point_upper_bound
+) -> bool {
+    constexpr uint8_t cThreeByteContinuationMask{0xF8};  // 0b1111_1xxx
+    constexpr uint8_t cValidThreeByteContinuation{0xF0};  // 0b1111_0xxx
+    constexpr uint8_t cTwoByteContinuationMask{0xF0};  // 0b1111_xxxx
+    constexpr uint8_t cValidTwoByteContinuation{0xE0};  // 0b1110_xxxx
+    constexpr uint8_t cOneByteContinuationMask{0xE0};  // 0b111x_xxxx
+    constexpr uint8_t cValidOneByteContinuation{0xC0};  // 0b110x_xxxx
+
+    if ((header & cThreeByteContinuationMask) == cValidThreeByteContinuation) {
+        num_continuation_bytes = 3;
+        code_point = (~cThreeByteContinuationMask & header);
         // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-        case 0x08:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('b');
-            break;
-        case 0x09:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('t');
-            break;
-        case 0x0A:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('n');
-            break;
-        case 0x0C:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('f');
-            break;
-        case 0x0D:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('r');
-            break;
-        case 0x22:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('\"');
-            break;
-        case 0x5C:
-            escaped_string.push_back('\\');
-            escaped_string.push_back('\\');
-            break;
+        code_point_lower_bound = 0x1'0000;
+        code_point_upper_bound = 0x10'FFFF;
         // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-        default: {
-            constexpr uint8_t cControlCharacter{0x1F};
-            if (cControlCharacter >= character) {
-                // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the
-                // last byte used by `snprintf` to append '\0'
-                constexpr size_t cControlCharacterBufSize{7};
-                std::array<char, cControlCharacterBufSize> buf{};
-                std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", character);
-                escaped_string.append(buf.cbegin(), buf.cend() - 1);
-            } else {
-                escaped_string.push_back(static_cast<char>(character));
-            }
-            break;
-        }
-    }
-}
-
-auto is_valid_code_point(uint32_t code_point, size_t encoding_length) -> bool {
-    switch (encoding_length) {
+    } else if ((header & cTwoByteContinuationMask) == cValidTwoByteContinuation) {
+        num_continuation_bytes = 2;
+        code_point = (~cTwoByteContinuationMask & header);
         // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-        case 1:
-            return code_point <= 0x7F;
-        case 2:
-            return (0x80 <= code_point && code_point <= 0x7FF);
-        case 3:
-            return (0x800 <= code_point && code_point <= 0xFFFF);
-        case 4:
-            return (0x1'0000 <= code_point && code_point <= 0x10'FFFF);
+        code_point_lower_bound = 0x800;
+        code_point_upper_bound = 0xFFFF;
         // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-        default:
-            return false;
+    } else if ((header & cOneByteContinuationMask) == cValidOneByteContinuation) {
+        num_continuation_bytes = 1;
+        code_point = (~cOneByteContinuationMask & header);
+        // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+        code_point_lower_bound = 0x80;
+        code_point_upper_bound = 0x7FF;
+        // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+    } else {
+        return false;
     }
+    return true;
+}
+
+auto is_ascii_char(uint8_t byte) -> bool {
+    constexpr uint8_t cLargestValidASCIIChar{0x7F};
+    return cLargestValidASCIIChar >= byte;
+}
+
+auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {
+    constexpr uint8_t cContinuationByteMask{0xC0};
+    constexpr uint8_t cValidMaskedContinuationByte{0x80};
+    return (byte & cContinuationByteMask) == cValidMaskedContinuationByte;
 }
 
 auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t {
@@ -132,99 +141,6 @@ auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32
     return (code_point << cNumContinuationBytePayloadBits)
            + (continuation_byte & cContinuationBytePayloadMask);
 }
-}  // namespace
-
-auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
-    string_view::const_iterator bookmark_it{};
-    size_t encoding_length{};
-    enum class State : uint8_t {
-        HeadByteToValidate = 0,
-        OneContinuationByteToValidate,
-        TwoContinuationBytesToValidate,
-        ThreeContinuationBytesToValidate
-    };
-    State state{State::HeadByteToValidate};
-    string escaped;
-    escaped.reserve(raw.size() + (raw.size() >> 2));
-
-    uint32_t code_point{};
-    auto validate_encoding_length_and_set_state = [&](uint8_t byte) -> bool {
-        constexpr uint8_t cThreeByteContinuationMask{0xF8};  // 0b1111_1xxx
-        constexpr uint8_t cValidThreeByteContinuation{0xF0};  // 0b1111_0xxx
-        constexpr uint8_t cTwoByteContinuationMask{0xF0};  // 0b1111_xxxx
-        constexpr uint8_t cValidTwoByteContinuation{0xE0};  // 0b1110_xxxx
-        constexpr uint8_t cOneByteContinuationMask{0xE0};  // 0b111x_xxxx
-        constexpr uint8_t cValidOneByteContinuation{0xC0};  // 0b110x_xxxx
-        if ((byte & cThreeByteContinuationMask) == cValidThreeByteContinuation) {
-            encoding_length = 4;
-            code_point = (~cThreeByteContinuationMask & byte);
-            state = State::ThreeContinuationBytesToValidate;
-        } else if ((byte & cTwoByteContinuationMask) == cValidTwoByteContinuation) {
-            encoding_length = 3;
-            code_point = (~cTwoByteContinuationMask & byte);
-            state = State::TwoContinuationBytesToValidate;
-        } else if ((byte & cOneByteContinuationMask) == cValidOneByteContinuation) {
-            encoding_length = 2;
-            code_point = (~cOneByteContinuationMask & byte);
-            state = State::OneContinuationByteToValidate;
-        } else {
-            return false;
-        }
-        return true;
-    };
-
-    // For multibyte encoded values, we will incrementally build the code point, and validate its
-    // range in the end.
-    for (string_view::const_iterator it{raw.cbegin()}; it != raw.cend(); ++it) {
-        auto const byte{static_cast<uint8_t>(*it)};
-        switch (state) {
-            case State::HeadByteToValidate: {
-                if (is_valid_code_point(static_cast<uint32_t>(byte), 1)) {
-                    escape_and_append_single_byte_utf8_char(byte, escaped);
-                } else {
-                    if (false == validate_encoding_length_and_set_state(byte)) {
-                        return std::nullopt;
-                    }
-                    bookmark_it = it;
-                }
-                break;
-            }
-            case State::OneContinuationByteToValidate:
-                if (false == is_valid_utf8_continuation_byte(byte)) {
-                    return std::nullopt;
-                }
-                code_point = update_code_point(code_point, byte);
-
-                if (false == is_valid_code_point(code_point, encoding_length)) {
-                    return std::nullopt;
-                }
-                escaped.append(bookmark_it, bookmark_it + encoding_length);
-                state = State::HeadByteToValidate;
-                break;
-            case State::TwoContinuationBytesToValidate:
-                if (false == is_valid_utf8_continuation_byte(byte)) {
-                    return std::nullopt;
-                }
-                code_point = update_code_point(code_point, byte);
-                state = State::OneContinuationByteToValidate;
-                break;
-            case State::ThreeContinuationBytesToValidate:
-                if (false == is_valid_utf8_continuation_byte(byte)) {
-                    return std::nullopt;
-                }
-                code_point = update_code_point(code_point, byte);
-                state = State::TwoContinuationBytesToValidate;
-                break;
-            default:
-                return std::nullopt;
-        }
-    }
+}  // namespace utils_hpp
 
-    if (State::HeadByteToValidate != state) {
-        // Incomplete multibyte UTF8 sequence
-        return std::nullopt;
-    }
-
-    return std::move(escaped);
-}
 }  // namespace clp::ffi
diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp
index acd977b80..81059d587 100644
--- a/components/core/src/clp/ffi/utils.hpp
+++ b/components/core/src/clp/ffi/utils.hpp
@@ -1,13 +1,15 @@
 #ifndef CLP_FFI_UTILS_HPP
 #define CLP_FFI_UTILS_HPP
 
+#include <cstddef>
+#include <cstdint>
 #include <optional>
 #include <string>
 #include <string_view>
 
 namespace clp::ffi {
 /**
- * Validates whether the given string is UTF8 encoded, and escapes any characters to generate to
+ * Validates whether the given string is UTF-8 encoded, and escapes any characters to generate to
  * make the string human readable.
  * @param raw The raw string to escape.
  * @return The escaped string on success.
@@ -15,6 +17,125 @@ namespace clp::ffi {
  */
 [[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw
 ) -> std::optional<std::string>;
+
+/**
+ * @param str
+ * @return Whether the input is a valid UTF-8 encoded string.
+ */
+[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool;
+
+/**
+ * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using
+ * the given handler.
+ * @tparam EscapeHandler Method to optionally escape any ASCII character in the string. Signature:
+ * (std::string_view::const_iterator it_ascii_char) -> void
+ * @param src
+ * @param escape_handler
+ * @return Whether the input is a valid UTF-8 encoded string.
+ */
+template <typename EscapeHandler>
+[[nodiscard]] auto
+generic_validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool;
+
+namespace utils_hpp {
+/**
+ * Validates whether the given byte is a valid UTF-8 header with continuation bytes, and set code
+ * point and code point range accordingly.
+ * The valid code point range is defined as following:
+ * .----------------------------------------------------------.
+ * | Continuation Length | First Code Point | Last Code Point |
+ * |---------------------|------------------|-----------------|
+ * | 1 Byte              | 0x80             | 0x7FF           |
+ * | 2 Byte              | 0x800            | 0xFFFF          |
+ * | 3 Byte              | 0x10000          | 0x10FFFF        |
+ * |---------------------|------------------|-----------------|
+ * @param header Input byte to validate
+ * @param num_continuation_bytes Outputs the number of continuation bytes corresponded to the header
+ * byte, if the header is valid.
+ * @param code_point Outputs the code extracted from the header byte, if the header is valid.
+ * @param code_point_lower_bound Outputs the lower bound of the valid code point range corresponded
+ * with the header byte, if the header if valid.
+ * @param code_point_upper_bound Outputs the upper bound of the valid code point range corresponded
+ * with the header byte, if the header if valid.
+ * @return Whether the input byte is a valid header byte.
+ */
+[[nodiscard]] auto validate_header_byte_and_set_code_point(
+        uint8_t header,
+        size_t& num_continuation_bytes,
+        uint32_t& code_point,
+        uint32_t& code_point_lower_bound,
+        uint32_t& code_point_upper_bound
+) -> bool;
+
+/**
+ * @param byte
+ * @return Whether the given byte is a valid ASCII character.
+ */
+[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool;
+
+/*
+ * @param byte
+ * @return Whether the input byte is a valid UTF-8 continuation byte. A valid UTF-8 continuation
+ * byte should match 0b10xx_xxxx.
+ */
+[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;
+
+/**
+ * Updates the code point by applying the payload of the given continuation byte.
+ * @param code_point
+ * @param continuation_byte
+ * @return Updated code point.
+ */
+[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
+}  // namespace utils_hpp
+
+template <typename EscapeHandler>
+auto generic_validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool {
+    size_t num_continuation_bytes_to_validate{0};
+    uint32_t code_point{};
+    uint32_t code_point_lower_bound{};
+    uint32_t code_point_upper_bound{};
+
+    for (std::string_view::const_iterator it{src.cbegin()}; it != src.cend(); ++it) {
+        auto const byte{static_cast<uint8_t>(*it)};
+        if (0 == num_continuation_bytes_to_validate) {
+            if (utils_hpp::is_ascii_char(byte)) {
+                escape_handler(it);
+            } else {
+                if (false
+                    == utils_hpp::validate_header_byte_and_set_code_point(
+                            byte,
+                            num_continuation_bytes_to_validate,
+                            code_point,
+                            code_point_lower_bound,
+                            code_point_upper_bound
+                    ))
+                {
+                    return false;
+                }
+            }
+        } else {
+            if (false == utils_hpp::is_valid_utf8_continuation_byte(byte)) {
+                return false;
+            }
+            code_point = utils_hpp::update_code_point(code_point, byte);
+            --num_continuation_bytes_to_validate;
+            if (0 != num_continuation_bytes_to_validate) {
+                continue;
+            }
+            if (code_point < code_point_lower_bound || code_point_upper_bound < code_point) {
+                return false;
+            }
+        }
+    }
+
+    if (0 != num_continuation_bytes_to_validate) {
+        // Incomplete continuation byte sequence
+        return false;
+    }
+
+    return true;
+}
 }  // namespace clp::ffi
 
 #endif  // CLP_UTILS_HPP
diff --git a/components/core/tests/test-ffi_utils.cpp b/components/core/tests/test-ffi_utils.cpp
index 4deb16865..d2eb0d11f 100644
--- a/components/core/tests/test-ffi_utils.cpp
+++ b/components/core/tests/test-ffi_utils.cpp
@@ -12,6 +12,7 @@
 
 #include "../src/clp/ffi/utils.hpp"
 
+using clp::ffi::is_utf8_encoded;
 using clp::ffi::validate_and_escape_utf8_string;
 
 namespace {
@@ -23,11 +24,54 @@ namespace {
  */
 [[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string;
 
+/**
+ * Generates a UTF-8 encoded byte sequence of a given code point with the given number of
+ * continuation bytes. The range of the code point is not validated, which means the generated byte
+ * sequence can be overlong.
+ * @param code_point
+ * @param num_continuation_bytes
+ * @return The encoded UTF-8 byte sequence.
+ */
+[[nodiscard]] auto
+generate_utf8_byte_sequence(uint32_t code_point, size_t num_continuation_bytes) -> std::string;
+
 auto get_expected_escaped_string(std::string_view raw) -> std::string {
     nlohmann::json const json_str = raw;  // Don't use '{}' initializer
     auto const dumped_str{json_str.dump()};
+    // Strip the quotes that nlohmann::json adds
     return {dumped_str.begin() + 1, dumped_str.end() - 1};
 }
+
+auto generate_utf8_byte_sequence(uint32_t code_point, size_t num_continuation_bytes)
+        -> std::string {
+    REQUIRE((1 <= num_continuation_bytes && num_continuation_bytes <= 3));
+    std::vector<char> encoded_bytes;
+    while (true) {
+        auto const least_significant_byte{static_cast<uint8_t>(code_point)};
+        if (encoded_bytes.size() < num_continuation_bytes) {
+            constexpr uint8_t cContinuationPayloadMask{0x3F};  // 0b0011_1111
+            constexpr uint8_t cContinuationSignature{0x80};  // 0b1000_0000
+            constexpr uint8_t cNumContinuationBytePayloadBits{6};
+            encoded_bytes.push_back(static_cast<char>(
+                    (least_significant_byte & cContinuationPayloadMask) | cContinuationSignature
+            ));
+            code_point >>= cNumContinuationBytePayloadBits;
+        } else {
+            constexpr uint8_t cHeaderPayloadMask{0x1F};  // 0b0001_1111
+            constexpr int8_t cHeaderSignature{static_cast<int8_t>(0xC0)};  // 0b1100_0000
+            auto const num_bits_shift{num_continuation_bytes - 1};
+            auto const header_payload_mask{
+                    static_cast<uint8_t>(cHeaderPayloadMask >> num_bits_shift)
+            };
+            auto const header_signature{static_cast<uint8_t>(cHeaderSignature >> num_bits_shift)};
+            encoded_bytes.push_back(static_cast<char>(
+                    (least_significant_byte & header_payload_mask) | header_signature
+            ));
+            break;
+        }
+    }
+    return {encoded_bytes.rbegin(), encoded_bytes.rend()};
+}
 }  // namespace
 
 TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") {
@@ -43,12 +87,12 @@ TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") {
     actual = validate_and_escape_utf8_string(test_str);
     REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
 
-    // Test string with all single byte UTF8 characters, which include all characters we escape
+    // Test string with all single byte UTF-8 characters, including those we escape.
     test_str.clear();
     for (uint8_t i{0}; i <= static_cast<uint8_t>(INT8_MAX); ++i) {
         test_str.push_back(static_cast<char>(i));
     }
-    // Shuffle characters randomly, ensure control characters are not grouped together.
+    // Shuffle characters randomly
     // NOLINTNEXTLINE(cert-msc32-c, cert-msc51-cpp)
     std::shuffle(test_str.begin(), test_str.end(), std::default_random_engine{});
     actual = validate_and_escape_utf8_string(test_str);
@@ -72,102 +116,82 @@ TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") {
     REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
 }
 
-TEST_CASE("escape_utf8_string_with_continuation", "[ffi][utils]") {
+TEST_CASE("escape_utf8_string_with_invalid_continuation", "[ffi][utils]") {
     std::string test_str;
-    std::optional<std::string> actual;
 
-    // Test UTF8 code point range validation
-    auto const valid_code_point_lower_bound = GENERATE(
-            std::string_view{"\xC2\x80"},
-            std::string_view{"\xE0\xA0\x80"},
-            std::string_view{"\xF0\x90\x80\x80"}
+    auto const valid_utf8_byte_sequence = GENERATE(
+            generate_utf8_byte_sequence(0x80, 1),
+            generate_utf8_byte_sequence(0x800, 2),
+            generate_utf8_byte_sequence(0x1'0000, 3)
     );
 
-    auto const valid_code_point_upper_bound = GENERATE(
-            std::string_view{"\xDF\xBF"},
-            std::string_view{"\xEF\xBF\xBF"},
-            std::string_view{"\xF4\x8F\xBF\xBF"}
-    );
-
-    test_str = valid_code_point_lower_bound;
-    actual = validate_and_escape_utf8_string(test_str);
-    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
-
-    test_str = valid_code_point_upper_bound;
-    actual = validate_and_escape_utf8_string(test_str);
-    REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
-
-    // Test invalid code point: 0x7F (only need one byte)
-    test_str = "\xC1\xBF";
-    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
-
-    test_str = "\xE0\x81\xBF";
-    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
-
-    test_str = "\xF0\x81\x81\xBF";
-    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
-
-    // Test invalid code point: 0x73 (only need one byte)
-    test_str = "\xC1\xB3";
-    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
-
-    test_str = "\xE0\x81\xB3";
-    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
-
-    test_str = "\xF0\x81\x81\xB3";
-    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
-
-    // Test invalid code point: 0x7FF (only need 2 bytes)
-    test_str = "\xE0\x9F\xBF";
-    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
-
-    test_str = "\xF0\x80\x9F\xBF";
-    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
-
-    // Test invalid code point: 0x7F3 (only need 2 bytes)
-    test_str = "\xE0\x9F\xB3";
-    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
-
-    test_str = "\xF0\x80\x9F\xB3";
-    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
-
-    // Test invalid code point: 0xFFFF (only need 3 bytes)
-    test_str = "\xF0\x8F\xBF\xBF";
-    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
-
-    // Test invalid code point: 0xFFF3 (only need 3 bytes)
-    test_str = "\xF0\x8F\xBF\xB3";
-    REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
-
     // Test incomplete continuation bytes
-    std::string_view::const_iterator const it_begin{valid_code_point_lower_bound.cbegin()};
+    auto const it_begin{valid_utf8_byte_sequence.cbegin()};
     std::string const valid{"Valid"};
-    for (std::string_view::const_iterator it_end{valid_code_point_lower_bound.cend() - 1};
-         valid_code_point_lower_bound.cbegin() != it_end;
+    for (auto it_end{valid_utf8_byte_sequence.cend() - 1};
+         valid_utf8_byte_sequence.cbegin() != it_end;
          --it_end)
     {
         std::string const incomplete_byte_sequence{it_begin, it_end};
-        REQUIRE(
-                (false
-                 == validate_and_escape_utf8_string(valid + incomplete_byte_sequence).has_value())
-        );
-        REQUIRE(
-                (false
-                 == validate_and_escape_utf8_string(incomplete_byte_sequence + valid).has_value())
-        );
+
+        test_str = valid + incomplete_byte_sequence;
+        REQUIRE((false == is_utf8_encoded(test_str)));
+        REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
+
+        test_str = incomplete_byte_sequence + valid;
+        REQUIRE((false == is_utf8_encoded(test_str)));
+        REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
     }
 
     // Test invalid header byte
-    test_str = valid_code_point_lower_bound;
+    test_str = valid_utf8_byte_sequence;
     constexpr char cInvalidHeaderByte{'\xFF'};
     test_str.front() = cInvalidHeaderByte;
     REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
 
     // Test invalid continuation bytes
-    for (size_t idx{1}; idx < valid_code_point_lower_bound.size(); ++idx) {
-        test_str = valid_code_point_lower_bound;
+    for (size_t idx{1}; idx < valid_utf8_byte_sequence.size(); ++idx) {
+        test_str = valid_utf8_byte_sequence;
         constexpr uint8_t cInvalidateMask{0x40};
         test_str.at(idx) |= cInvalidateMask;
+        REQUIRE((false == is_utf8_encoded(test_str)));
         REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
     }
 }
+
+TEST_CASE("validate_utf8_code_point_ranges", "[ffi][utils]") {
+    // Test 1 byte encoding code point range
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+    for (uint32_t code_point{0}; code_point <= 0x7F; ++code_point) {
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 1))));
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2))));
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))));
+    }
+
+    // Test 2 byte encoding code point range
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+    for (uint32_t code_point{0x80}; code_point <= 0x7FF; ++code_point) {
+        REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 1)));
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2))));
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))));
+    }
+
+    // Test 3 byte encoding code point range
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+    for (uint32_t code_point{0x800}; code_point <= 0xFFFF; ++code_point) {
+        REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2)));
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))));
+    }
+
+    // Test 4 byte encoding code point range
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+    for (uint32_t code_point{0x1'0000}; code_point <= 0x10'FFFF; ++code_point) {
+        REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)));
+    }
+
+    // Test 4 byte encoding code point out of range
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
+    for (uint32_t code_point{0x10'FFFF + 1}; code_point <= 0x1F'FFFF; ++code_point) {
+        REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))));
+    }
+}

From db6b089abea85b14e9cd5a87429ca63e67692efb Mon Sep 17 00:00:00 2001
From: LinZhihao-723 <zh.lin@mail.utoronto.ca>
Date: Mon, 24 Jun 2024 19:04:46 -0400
Subject: [PATCH 07/13] Update comments

---
 components/core/src/clp/ffi/utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp
index 81059d587..c29f21826 100644
--- a/components/core/src/clp/ffi/utils.hpp
+++ b/components/core/src/clp/ffi/utils.hpp
@@ -10,7 +10,7 @@
 namespace clp::ffi {
 /**
  * Validates whether the given string is UTF-8 encoded, and escapes any characters to generate to
- * make the string human readable.
+ * make the string compatible with JSON specification.
  * @param raw The raw string to escape.
  * @return The escaped string on success.
  * @return std::nullopt if the string contains none-UTF8 encoded byte sequence.

From 6696553114c8af231353ad9fe6e185a4b930cadc Mon Sep 17 00:00:00 2001
From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
Date: Wed, 26 Jun 2024 01:16:01 -0400
Subject: [PATCH 08/13] Apply suggestions from code review

Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com>
---
 components/core/src/clp/ffi/utils.hpp    | 48 ++++++++++--------------
 components/core/tests/test-ffi_utils.cpp | 11 +++---
 2 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp
index c29f21826..af65a4c84 100644
--- a/components/core/src/clp/ffi/utils.hpp
+++ b/components/core/src/clp/ffi/utils.hpp
@@ -9,11 +9,11 @@
 
 namespace clp::ffi {
 /**
- * Validates whether the given string is UTF-8 encoded, and escapes any characters to generate to
- * make the string compatible with JSON specification.
+ * Validates whether the given string is UTF-8 encoded, and escapes any characters to make the
+ * string compatible with the JSON specification.
  * @param raw The raw string to escape.
  * @return The escaped string on success.
- * @return std::nullopt if the string contains none-UTF8 encoded byte sequence.
+ * @return std::nullopt if the string contains any non-UTF-8-encoded byte sequences.
  */
 [[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw
 ) -> std::optional<std::string>;
@@ -27,8 +27,7 @@ namespace clp::ffi {
 /**
  * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using
  * the given handler.
- * @tparam EscapeHandler Method to optionally escape any ASCII character in the string. Signature:
- * (std::string_view::const_iterator it_ascii_char) -> void
+ * @tparam EscapeHandler Method to optionally escape any ASCII character in the string.
  * @param src
  * @param escape_handler
  * @return Whether the input is a valid UTF-8 encoded string.
@@ -39,25 +38,16 @@ generic_validate_utf8_string(std::string_view src, EscapeHandler escape_handler)
 
 namespace utils_hpp {
 /**
- * Validates whether the given byte is a valid UTF-8 header with continuation bytes, and set code
- * point and code point range accordingly.
- * The valid code point range is defined as following:
- * .----------------------------------------------------------.
- * | Continuation Length | First Code Point | Last Code Point |
- * |---------------------|------------------|-----------------|
- * | 1 Byte              | 0x80             | 0x7FF           |
- * | 2 Byte              | 0x800            | 0xFFFF          |
- * | 3 Byte              | 0x10000          | 0x10FFFF        |
- * |---------------------|------------------|-----------------|
- * @param header Input byte to validate
- * @param num_continuation_bytes Outputs the number of continuation bytes corresponded to the header
- * byte, if the header is valid.
- * @param code_point Outputs the code extracted from the header byte, if the header is valid.
- * @param code_point_lower_bound Outputs the lower bound of the valid code point range corresponded
- * with the header byte, if the header if valid.
- * @param code_point_upper_bound Outputs the upper bound of the valid code point range corresponded
- * with the header byte, if the header if valid.
- * @return Whether the input byte is a valid header byte.
+ * Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses
+ * the byte, and returns the parsed properties as well as associated properties.
+ * @param header Byte to validate.
+ * @param num_continuation_bytes Returns the number of continuation bytes expected.
+ * @param code_point Returns the code point bits parsed from the lead byte.
+ * @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8
+ * character.
+ * @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8
+ * character.
+ * @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character.
  */
 [[nodiscard]] auto validate_header_byte_and_set_code_point(
         uint8_t header,
@@ -75,16 +65,16 @@ namespace utils_hpp {
 
 /*
  * @param byte
- * @return Whether the input byte is a valid UTF-8 continuation byte. A valid UTF-8 continuation
- * byte should match 0b10xx_xxxx.
+ * @return Whether the input byte is a valid UTF-8 continuation byte.
  */
 [[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;
 
 /**
- * Updates the code point by applying the payload of the given continuation byte.
+ * Parses the code-point bits from the given continuation byte and combines them with the given
+ * code point.
  * @param code_point
  * @param continuation_byte
- * @return Updated code point.
+ * @return The updated code point.
  */
 [[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
 }  // namespace utils_hpp
@@ -130,7 +120,7 @@ auto generic_validate_utf8_string(std::string_view src, EscapeHandler escape_han
     }
 
     if (0 != num_continuation_bytes_to_validate) {
-        // Incomplete continuation byte sequence
+        // Incomplete UTF-8 character
         return false;
     }
 
diff --git a/components/core/tests/test-ffi_utils.cpp b/components/core/tests/test-ffi_utils.cpp
index d2eb0d11f..536f329c8 100644
--- a/components/core/tests/test-ffi_utils.cpp
+++ b/components/core/tests/test-ffi_utils.cpp
@@ -20,14 +20,14 @@ namespace {
  * Gets an expected escaped string by first convert the raw string into a json string and then dumps
  * the a printable string using nlohmann::json.
  * @param raw
- * @return Escaped string dumped by nlohmann::json, with surrounding '"' dropped.
+ * @return The input string after escaping any characters that are invalid in JSON strings.
  */
 [[nodiscard]] auto get_expected_escaped_string(std::string_view raw) -> std::string;
 
 /**
- * Generates a UTF-8 encoded byte sequence of a given code point with the given number of
- * continuation bytes. The range of the code point is not validated, which means the generated byte
- * sequence can be overlong.
+ * Generates a UTF-8 encoded byte sequence with the given code point and number of continuation
+ * bytes. The range of the code point is not validated, which means the generated byte sequence can
+ * be invalid (overlong or exceeding the valid range of UTF-8 code points).
  * @param code_point
  * @param num_continuation_bytes
  * @return The encoded UTF-8 byte sequence.
@@ -98,7 +98,7 @@ TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") {
     actual = validate_and_escape_utf8_string(test_str);
     REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
 
-    // Test valid UTF8 chars with continuation bytes
+    // Test valid UTF-8 chars with continuation bytes
     std::vector<std::string> const valid_utf8{
             "\n",
             "\xF0\xA0\x80\x8F",  // https://en.wiktionary.org/wiki/%F0%A0%80%8F
@@ -147,6 +147,7 @@ TEST_CASE("escape_utf8_string_with_invalid_continuation", "[ffi][utils]") {
     test_str = valid_utf8_byte_sequence;
     constexpr char cInvalidHeaderByte{'\xFF'};
     test_str.front() = cInvalidHeaderByte;
+    REQUIRE((false == is_utf8_encoded(test_str)));
     REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
 
     // Test invalid continuation bytes

From 9aefc250f128d14a30f70a06eae5d426a8b0a906 Mon Sep 17 00:00:00 2001
From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
Date: Wed, 26 Jun 2024 01:43:52 -0400
Subject: [PATCH 09/13] Apply suggestions from code review

Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com>
---
 components/core/tests/test-ffi_utils.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/components/core/tests/test-ffi_utils.cpp b/components/core/tests/test-ffi_utils.cpp
index 536f329c8..ff190bcf6 100644
--- a/components/core/tests/test-ffi_utils.cpp
+++ b/components/core/tests/test-ffi_utils.cpp
@@ -17,8 +17,6 @@ using clp::ffi::validate_and_escape_utf8_string;
 
 namespace {
 /**
- * Gets an expected escaped string by first convert the raw string into a json string and then dumps
- * the a printable string using nlohmann::json.
  * @param raw
  * @return The input string after escaping any characters that are invalid in JSON strings.
  */

From d47fc77434884b766a076484fd75fbcbcc4a12fc Mon Sep 17 00:00:00 2001
From: LinZhihao-723 <zh.lin@mail.utoronto.ca>
Date: Wed, 26 Jun 2024 05:10:53 -0400
Subject: [PATCH 10/13] Apply code review comments

---
 components/core/CMakeLists.txt                |   4 +-
 components/core/src/clp/ffi/utils.cpp         | 106 +++----------
 components/core/src/clp/ffi/utils.hpp         | 113 +-------------
 components/core/src/clp/utf8_utils.cpp        |  55 +++++++
 components/core/src/clp/utf8_utils.hpp        | 143 ++++++++++++++++++
 ...test-ffi_utils.cpp => test-utf8_utils.cpp} | 102 +++++++------
 6 files changed, 280 insertions(+), 243 deletions(-)
 create mode 100644 components/core/src/clp/utf8_utils.cpp
 create mode 100644 components/core/src/clp/utf8_utils.hpp
 rename components/core/tests/{test-ffi_utils.cpp => test-utf8_utils.cpp} (68%)

diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
index 50abbc295..ed15ef132 100644
--- a/components/core/CMakeLists.txt
+++ b/components/core/CMakeLists.txt
@@ -438,6 +438,8 @@ set(SOURCE_FILES_unitTest
         src/clp/TraceableException.hpp
         src/clp/time_types.hpp
         src/clp/type_utils.hpp
+        src/clp/utf8_utils.cpp
+        src/clp/utf8_utils.hpp
         src/clp/Utils.cpp
         src/clp/Utils.hpp
         src/clp/VariableDictionaryEntry.cpp
@@ -472,8 +474,8 @@ set(SOURCE_FILES_unitTest
         tests/test-StreamingCompression.cpp
         tests/test-string_utils.cpp
         tests/test-TimestampPattern.cpp
+        tests/test-utf8_utils.cpp
         tests/test-Utils.cpp
-        tests/test-ffi_utils.cpp
         )
 add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest})
 target_include_directories(unitTest
diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp
index e074311b4..3f77564d2 100644
--- a/components/core/src/clp/ffi/utils.cpp
+++ b/components/core/src/clp/ffi/utils.cpp
@@ -9,13 +9,16 @@
 #include <string_view>
 #include <tuple>
 
+#include "../utf8_utils.hpp"
+
 using std::string;
 using std::string_view;
 
 namespace clp::ffi {
 auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
-    string_view::const_iterator bookmark{raw.cbegin()};
-    string escaped;
+    string_view::const_iterator next_char_to_copy_it{raw.cbegin()};
+    std::optional<std::string> ret_val;
+    auto& escaped{ret_val.emplace()};
     escaped.reserve(raw.size() + (raw.size() / 2));
 
     auto escape_handler = [&](string_view::const_iterator it) -> void {
@@ -23,36 +26,36 @@ auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
         // used by `snprintf` to append '\0'
         constexpr size_t cControlCharacterBufSize{7};
         std::array<char, cControlCharacterBufSize> buf{};
-        std::string_view escaped_content;
+        std::string_view escaped_char;
         bool escape_required{true};
         switch (*it) {
             case '\b':
-                escaped_content = "\\b";
+                escaped_char = "\\b";
                 break;
             case '\t':
-                escaped_content = "\\t";
+                escaped_char = "\\t";
                 break;
             case '\n':
-                escaped_content = "\\n";
+                escaped_char = "\\n";
                 break;
             case '\f':
-                escaped_content = "\\f";
+                escaped_char = "\\f";
                 break;
             case '\r':
-                escaped_content = "\\r";
+                escaped_char = "\\r";
                 break;
             case '\\':
-                escaped_content = "\\\\";
+                escaped_char = "\\\\";
                 break;
             case '"':
-                escaped_content = "\\\"";
+                escaped_char = "\\\"";
                 break;
             default: {
                 constexpr uint8_t cLargestControlCharacter{0x1F};
                 auto const byte{static_cast<uint8_t>(*it)};
                 if (cLargestControlCharacter >= byte) {
                     std::ignore = snprintf(buf.data(), buf.size(), "\\u00%02x", byte);
-                    escaped_content = {buf.data(), buf.size() - 1};
+                    escaped_char = {buf.data(), buf.size() - 1};
                 } else {
                     escape_required = false;
                 }
@@ -60,87 +63,20 @@ auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
             }
         }
         if (escape_required) {
-            escaped.append(bookmark, it);
-            escaped.append(escaped_content.cbegin(), escaped_content.cend());
-            bookmark = it + 1;
+            escaped.append(next_char_to_copy_it, it);
+            escaped += escaped_char;
+            next_char_to_copy_it = it + 1;
         }
     };
 
-    if (false == generic_validate_utf8_string(raw, escape_handler)) {
+    if (false == validate_utf8_string(raw, escape_handler)) {
         return std::nullopt;
     }
 
-    if (raw.cend() != bookmark) {
-        escaped.append(bookmark, raw.cend());
-    }
-
-    return escaped;
-}
-
-auto is_utf8_encoded(string_view str) -> bool {
-    auto escape_handler = []([[maybe_unused]] string_view::const_iterator it) -> void {};
-    return generic_validate_utf8_string(str, escape_handler);
-}
-
-namespace utils_hpp {
-auto validate_header_byte_and_set_code_point(
-        uint8_t header,
-        size_t& num_continuation_bytes,
-        uint32_t& code_point,
-        uint32_t& code_point_lower_bound,
-        uint32_t& code_point_upper_bound
-) -> bool {
-    constexpr uint8_t cThreeByteContinuationMask{0xF8};  // 0b1111_1xxx
-    constexpr uint8_t cValidThreeByteContinuation{0xF0};  // 0b1111_0xxx
-    constexpr uint8_t cTwoByteContinuationMask{0xF0};  // 0b1111_xxxx
-    constexpr uint8_t cValidTwoByteContinuation{0xE0};  // 0b1110_xxxx
-    constexpr uint8_t cOneByteContinuationMask{0xE0};  // 0b111x_xxxx
-    constexpr uint8_t cValidOneByteContinuation{0xC0};  // 0b110x_xxxx
-
-    if ((header & cThreeByteContinuationMask) == cValidThreeByteContinuation) {
-        num_continuation_bytes = 3;
-        code_point = (~cThreeByteContinuationMask & header);
-        // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-        code_point_lower_bound = 0x1'0000;
-        code_point_upper_bound = 0x10'FFFF;
-        // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-    } else if ((header & cTwoByteContinuationMask) == cValidTwoByteContinuation) {
-        num_continuation_bytes = 2;
-        code_point = (~cTwoByteContinuationMask & header);
-        // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-        code_point_lower_bound = 0x800;
-        code_point_upper_bound = 0xFFFF;
-        // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-    } else if ((header & cOneByteContinuationMask) == cValidOneByteContinuation) {
-        num_continuation_bytes = 1;
-        code_point = (~cOneByteContinuationMask & header);
-        // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-        code_point_lower_bound = 0x80;
-        code_point_upper_bound = 0x7FF;
-        // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-    } else {
-        return false;
+    if (raw.cend() != next_char_to_copy_it) {
+        escaped.append(next_char_to_copy_it, raw.cend());
     }
-    return true;
-}
-
-auto is_ascii_char(uint8_t byte) -> bool {
-    constexpr uint8_t cLargestValidASCIIChar{0x7F};
-    return cLargestValidASCIIChar >= byte;
-}
 
-auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {
-    constexpr uint8_t cContinuationByteMask{0xC0};
-    constexpr uint8_t cValidMaskedContinuationByte{0x80};
-    return (byte & cContinuationByteMask) == cValidMaskedContinuationByte;
+    return ret_val;
 }
-
-auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t {
-    constexpr uint32_t cContinuationBytePayloadMask{0x3F};
-    constexpr uint8_t cNumContinuationBytePayloadBits{6};
-    return (code_point << cNumContinuationBytePayloadBits)
-           + (continuation_byte & cContinuationBytePayloadMask);
-}
-}  // namespace utils_hpp
-
 }  // namespace clp::ffi
diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp
index af65a4c84..160ed687b 100644
--- a/components/core/src/clp/ffi/utils.hpp
+++ b/components/core/src/clp/ffi/utils.hpp
@@ -1,8 +1,6 @@
 #ifndef CLP_FFI_UTILS_HPP
 #define CLP_FFI_UTILS_HPP
 
-#include <cstddef>
-#include <cstdint>
 #include <optional>
 #include <string>
 #include <string_view>
@@ -17,115 +15,6 @@ namespace clp::ffi {
  */
 [[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw
 ) -> std::optional<std::string>;
-
-/**
- * @param str
- * @return Whether the input is a valid UTF-8 encoded string.
- */
-[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool;
-
-/**
- * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using
- * the given handler.
- * @tparam EscapeHandler Method to optionally escape any ASCII character in the string.
- * @param src
- * @param escape_handler
- * @return Whether the input is a valid UTF-8 encoded string.
- */
-template <typename EscapeHandler>
-[[nodiscard]] auto
-generic_validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool;
-
-namespace utils_hpp {
-/**
- * Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses
- * the byte, and returns the parsed properties as well as associated properties.
- * @param header Byte to validate.
- * @param num_continuation_bytes Returns the number of continuation bytes expected.
- * @param code_point Returns the code point bits parsed from the lead byte.
- * @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8
- * character.
- * @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8
- * character.
- * @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character.
- */
-[[nodiscard]] auto validate_header_byte_and_set_code_point(
-        uint8_t header,
-        size_t& num_continuation_bytes,
-        uint32_t& code_point,
-        uint32_t& code_point_lower_bound,
-        uint32_t& code_point_upper_bound
-) -> bool;
-
-/**
- * @param byte
- * @return Whether the given byte is a valid ASCII character.
- */
-[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool;
-
-/*
- * @param byte
- * @return Whether the input byte is a valid UTF-8 continuation byte.
- */
-[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;
-
-/**
- * Parses the code-point bits from the given continuation byte and combines them with the given
- * code point.
- * @param code_point
- * @param continuation_byte
- * @return The updated code point.
- */
-[[nodiscard]] auto update_code_point(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
-}  // namespace utils_hpp
-
-template <typename EscapeHandler>
-auto generic_validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool {
-    size_t num_continuation_bytes_to_validate{0};
-    uint32_t code_point{};
-    uint32_t code_point_lower_bound{};
-    uint32_t code_point_upper_bound{};
-
-    for (std::string_view::const_iterator it{src.cbegin()}; it != src.cend(); ++it) {
-        auto const byte{static_cast<uint8_t>(*it)};
-        if (0 == num_continuation_bytes_to_validate) {
-            if (utils_hpp::is_ascii_char(byte)) {
-                escape_handler(it);
-            } else {
-                if (false
-                    == utils_hpp::validate_header_byte_and_set_code_point(
-                            byte,
-                            num_continuation_bytes_to_validate,
-                            code_point,
-                            code_point_lower_bound,
-                            code_point_upper_bound
-                    ))
-                {
-                    return false;
-                }
-            }
-        } else {
-            if (false == utils_hpp::is_valid_utf8_continuation_byte(byte)) {
-                return false;
-            }
-            code_point = utils_hpp::update_code_point(code_point, byte);
-            --num_continuation_bytes_to_validate;
-            if (0 != num_continuation_bytes_to_validate) {
-                continue;
-            }
-            if (code_point < code_point_lower_bound || code_point_upper_bound < code_point) {
-                return false;
-            }
-        }
-    }
-
-    if (0 != num_continuation_bytes_to_validate) {
-        // Incomplete UTF-8 character
-        return false;
-    }
-
-    return true;
-}
 }  // namespace clp::ffi
 
-#endif  // CLP_UTILS_HPP
+#endif  // CLP_FFI_UTILS_HPP
diff --git a/components/core/src/clp/utf8_utils.cpp b/components/core/src/clp/utf8_utils.cpp
new file mode 100644
index 000000000..08a03f608
--- /dev/null
+++ b/components/core/src/clp/utf8_utils.cpp
@@ -0,0 +1,55 @@
+#include "utf8_utils.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <string_view>
+
+namespace clp {
+auto is_utf8_encoded(std::string_view str) -> bool {
+    auto escape_handler = []([[maybe_unused]] std::string_view::const_iterator it) -> void {};
+    return validate_utf8_string(str, escape_handler);  // No-op handler: validate only
+}
+
+namespace utf8_utils_internal {
+auto parse_and_validate_lead_byte(
+        uint8_t byte,
+        size_t& num_continuation_bytes,
+        uint32_t& code_point,
+        uint32_t& code_point_lower_bound,
+        uint32_t& code_point_upper_bound
+) -> bool {
+    if ((byte & cFourByteUtf8CharHeaderMask) == cFourByteUtf8CharHeader) {  // 0b1111_0xxx
+        num_continuation_bytes = 3;
+        code_point = (~cFourByteUtf8CharHeaderMask & byte);  // Lead byte's payload bits
+        code_point_lower_bound = cFourByteUtf8CharCodePointLowerBound;
+        code_point_upper_bound = cFourByteUtf8CharCodePointUpperBound;
+    } else if ((byte & cThreeByteUtf8CharHeaderMask) == cThreeByteUtf8CharHeader) {  // 0b1110_xxxx
+        num_continuation_bytes = 2;
+        code_point = (~cThreeByteUtf8CharHeaderMask & byte);  // Lead byte's payload bits
+        code_point_lower_bound = cThreeByteUtf8CharCodePointLowerBound;
+        code_point_upper_bound = cThreeByteUtf8CharCodePointUpperBound;
+    } else if ((byte & cTwoByteUtf8CharHeaderMask) == cTwoByteUtf8CharHeader) {  // 0b110x_xxxx
+        num_continuation_bytes = 1;
+        code_point = (~cTwoByteUtf8CharHeaderMask & byte);  // Lead byte's payload bits
+        code_point_lower_bound = cTwoByteUtf8CharCodePointLowerBound;
+        code_point_upper_bound = cTwoByteUtf8CharCodePointUpperBound;
+    } else {
+        return false;  // Not a multi-byte lead byte (e.g., a continuation byte or 0b1111_1xxx)
+    }
+    return true;
+}
+
+auto is_ascii_char(uint8_t byte) -> bool {
+    return cOneByteUtf8CharCodePointUpperBound >= byte;  // ASCII: code points 0x00-0x7F
+}
+
+auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {
+    return (byte & cContinuationByteMask) == cContinuationByte;
+}
+
+auto parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t {
+    return (code_point << cNumContinuationByteCodePointBits)
+           + (continuation_byte & cContinuationByteCodePointMask);
+}
+}  // namespace utf8_utils_internal
+}  // namespace clp
diff --git a/components/core/src/clp/utf8_utils.hpp b/components/core/src/clp/utf8_utils.hpp
new file mode 100644
index 000000000..c3dc8177a
--- /dev/null
+++ b/components/core/src/clp/utf8_utils.hpp
@@ -0,0 +1,143 @@
+#ifndef CLP_UTF8_UTILS_HPP
+#define CLP_UTF8_UTILS_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <string_view>
+
+namespace clp {
+// Constants
+// Lead byte signature
+constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8};  // 0b1111_1xxx
+constexpr uint8_t cFourByteUtf8CharHeader{0xF0};  // 0b1111_0xxx
+constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0};  // 0b1111_xxxx
+constexpr uint8_t cThreeByteUtf8CharHeader{0xE0};  // 0b1110_xxxx
+constexpr uint8_t cTwoByteUtf8CharHeaderMask{0xE0};  // 0b111x_xxxx
+constexpr uint8_t cTwoByteUtf8CharHeader{0xC0};  // 0b110x_xxxx
+
+// Code point ranges (inclusive)
+constexpr uint32_t cOneByteUtf8CharCodePointLowerBound{0};
+constexpr uint32_t cOneByteUtf8CharCodePointUpperBound{0x7F};
+constexpr uint32_t cTwoByteUtf8CharCodePointLowerBound{0x80};
+constexpr uint32_t cTwoByteUtf8CharCodePointUpperBound{0x7FF};
+constexpr uint32_t cThreeByteUtf8CharCodePointLowerBound{0x800};
+constexpr uint32_t cThreeByteUtf8CharCodePointUpperBound{0xFFFF};
+constexpr uint32_t cFourByteUtf8CharCodePointLowerBound{0x1'0000};
+constexpr uint32_t cFourByteUtf8CharCodePointUpperBound{0x10'FFFF};
+
+// Continuation byte
+constexpr uint32_t cContinuationByteMask{0xC0};
+constexpr uint32_t cContinuationByte{0x80};
+constexpr uint32_t cContinuationByteCodePointMask{0x3F};
+constexpr uint8_t cNumContinuationByteCodePointBits{6};
+
+/**
+ * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using
+ * the given handler.
+ * @tparam EscapeHandler Method to optionally escape any ASCII character in the string.
+ * @param src String to validate.
+ * @param escape_handler Called with an iterator to each ASCII character as it is validated.
+ * @return Whether the input is a valid UTF-8 encoded string.
+ */
+template <typename EscapeHandler>
+requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator>
+[[nodiscard]] auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool;
+
+/**
+ * @param str
+ * @return Whether the input is a valid UTF-8 encoded string.
+ */
+[[nodiscard]] auto is_utf8_encoded(std::string_view str) -> bool;
+
+namespace utf8_utils_internal {
+/**
+ * Validates whether the given byte is a valid lead byte for a multi-byte UTF-8 character, parses
+ * the byte, and returns the parsed properties as well as associated properties.
+ * @param byte Byte to validate.
+ * @param num_continuation_bytes Returns the number of continuation bytes expected.
+ * @param code_point Returns the code point bits parsed from the lead byte.
+ * @param code_point_lower_bound Returns the lower bound of the code point range for the UTF-8
+ * character.
+ * @param code_point_upper_bound Returns the upper bound of the code point range for the UTF-8
+ * character.
+ * @return Whether the input byte is a valid lead byte for a multi-byte UTF-8 character.
+ */
+[[nodiscard]] auto parse_and_validate_lead_byte(
+        uint8_t byte,
+        size_t& num_continuation_bytes,
+        uint32_t& code_point,
+        uint32_t& code_point_lower_bound,
+        uint32_t& code_point_upper_bound
+) -> bool;
+
+/**
+ * @param byte
+ * @return Whether the given byte is a valid ASCII character.
+ */
+[[nodiscard]] auto is_ascii_char(uint8_t byte) -> bool;
+
+/**
+ * @param byte
+ * @return Whether the input byte is a valid UTF-8 continuation byte.
+ */
+[[nodiscard]] auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool;
+
+/**
+ * Parses the code-point bits from the given continuation byte and combines them with the given
+ * code point.
+ * @param code_point Code-point bits accumulated from the preceding bytes of the character.
+ * @param continuation_byte
+ * @return The updated code point.
+ */
+[[nodiscard]] auto
+parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t;
+}  // namespace utf8_utils_internal
+
+template <typename EscapeHandler>
+requires std::is_invocable_v<EscapeHandler, std::string_view::const_iterator>
+auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) -> bool {
+    size_t num_continuation_bytes_to_validate{0};  // 0 => the next byte starts a new character
+    uint32_t code_point{};
+    uint32_t code_point_lower_bound{};
+    uint32_t code_point_upper_bound{};
+
+    for (std::string_view::const_iterator it{src.cbegin()}; it != src.cend(); ++it) {
+        auto const byte{static_cast<uint8_t>(*it)};
+        if (0 == num_continuation_bytes_to_validate) {
+            if (utf8_utils_internal::is_ascii_char(byte)) {
+                escape_handler(it);  // Only ASCII characters are passed to the handler
+            } else if (false
+                       == utf8_utils_internal::parse_and_validate_lead_byte(
+                               byte,
+                               num_continuation_bytes_to_validate,
+                               code_point,
+                               code_point_lower_bound,
+                               code_point_upper_bound
+                       ))
+            {
+                return false;  // Invalid lead byte
+            }
+        } else {
+            if (false == utf8_utils_internal::is_valid_utf8_continuation_byte(byte)) {
+                return false;  // Expected a continuation byte
+            }
+            code_point = utf8_utils_internal::parse_continuation_byte(code_point, byte);
+            --num_continuation_bytes_to_validate;
+            if (0 == num_continuation_bytes_to_validate
+                && (code_point < code_point_lower_bound || code_point_upper_bound < code_point))
+            {
+                return false;  // Overlong or > U+10FFFF (NOTE: surrogates aren't rejected)
+            }
+        }
+    }
+
+    if (0 != num_continuation_bytes_to_validate) {
+        // Incomplete UTF-8 character
+        return false;
+    }
+
+    return true;
+}
+}  // namespace clp
+
+#endif  // CLP_UTF8_UTILS_HPP
diff --git a/components/core/tests/test-ffi_utils.cpp b/components/core/tests/test-utf8_utils.cpp
similarity index 68%
rename from components/core/tests/test-ffi_utils.cpp
rename to components/core/tests/test-utf8_utils.cpp
index ff190bcf6..94e45cac0 100644
--- a/components/core/tests/test-ffi_utils.cpp
+++ b/components/core/tests/test-utf8_utils.cpp
@@ -11,9 +11,10 @@
 #include <json/single_include/nlohmann/json.hpp>
 
 #include "../src/clp/ffi/utils.hpp"
+#include "../src/clp/utf8_utils.hpp"
 
-using clp::ffi::is_utf8_encoded;
 using clp::ffi::validate_and_escape_utf8_string;
+using clp::is_utf8_encoded;
 
 namespace {
 /**
@@ -44,35 +45,35 @@ auto generate_utf8_byte_sequence(uint32_t code_point, size_t num_continuation_by
         -> std::string {
     REQUIRE((1 <= num_continuation_bytes && num_continuation_bytes <= 3));
     std::vector<char> encoded_bytes;
-    while (true) {
+    while (encoded_bytes.size() < num_continuation_bytes) {
         auto const least_significant_byte{static_cast<uint8_t>(code_point)};
-        if (encoded_bytes.size() < num_continuation_bytes) {
-            constexpr uint8_t cContinuationPayloadMask{0x3F};  // 0b0011_1111
-            constexpr uint8_t cContinuationSignature{0x80};  // 0b1000_0000
-            constexpr uint8_t cNumContinuationBytePayloadBits{6};
-            encoded_bytes.push_back(static_cast<char>(
-                    (least_significant_byte & cContinuationPayloadMask) | cContinuationSignature
-            ));
-            code_point >>= cNumContinuationBytePayloadBits;
-        } else {
-            constexpr uint8_t cHeaderPayloadMask{0x1F};  // 0b0001_1111
-            constexpr int8_t cHeaderSignature{static_cast<int8_t>(0xC0)};  // 0b1100_0000
-            auto const num_bits_shift{num_continuation_bytes - 1};
-            auto const header_payload_mask{
-                    static_cast<uint8_t>(cHeaderPayloadMask >> num_bits_shift)
-            };
-            auto const header_signature{static_cast<uint8_t>(cHeaderSignature >> num_bits_shift)};
-            encoded_bytes.push_back(static_cast<char>(
-                    (least_significant_byte & header_payload_mask) | header_signature
-            ));
-            break;
-        }
+        encoded_bytes.push_back(static_cast<char>(
+                (least_significant_byte & ~clp::cContinuationByteMask) | clp::cContinuationByte
+        ));
+        code_point >>= clp::cNumContinuationByteCodePointBits;
     }
+
+    uint8_t lead_byte_code_point_mask{};
+    uint8_t lead_byte_header{};
+    if (1 == num_continuation_bytes) {
+        lead_byte_code_point_mask = static_cast<uint8_t>(~clp::cTwoByteUtf8CharHeaderMask);
+        lead_byte_header = clp::cTwoByteUtf8CharHeader;
+    } else if (2 == num_continuation_bytes) {
+        lead_byte_code_point_mask = static_cast<uint8_t>(~clp::cThreeByteUtf8CharHeaderMask);
+        lead_byte_header = clp::cThreeByteUtf8CharHeader;
+    } else {  // 3 == num_continuation_bytes
+        lead_byte_code_point_mask = static_cast<uint8_t>(~clp::cFourByteUtf8CharHeaderMask);
+        lead_byte_header = clp::cFourByteUtf8CharHeader;
+    }
+    encoded_bytes.push_back(static_cast<char>(
+            (static_cast<uint8_t>(code_point) & lead_byte_code_point_mask) | lead_byte_header
+    ));
+
     return {encoded_bytes.rbegin(), encoded_bytes.rend()};
 }
 }  // namespace
 
-TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") {
+TEST_CASE("escape_utf8_string_basic", "[utf8_utils]") {
     std::string test_str;
     std::optional<std::string> actual;
 
@@ -114,7 +115,7 @@ TEST_CASE("escape_utf8_string_basic", "[ffi][utils]") {
     REQUIRE((actual.has_value() && actual.value() == get_expected_escaped_string(test_str)));
 }
 
-TEST_CASE("escape_utf8_string_with_invalid_continuation", "[ffi][utils]") {
+TEST_CASE("escape_utf8_string_with_invalid_continuation", "[utf8_utils]") {
     std::string test_str;
 
     auto const valid_utf8_byte_sequence = GENERATE(
@@ -124,13 +125,13 @@ TEST_CASE("escape_utf8_string_with_invalid_continuation", "[ffi][utils]") {
     );
 
     // Test incomplete continuation bytes
-    auto const it_begin{valid_utf8_byte_sequence.cbegin()};
+    auto const begin_it{valid_utf8_byte_sequence.cbegin()};
     std::string const valid{"Valid"};
-    for (auto it_end{valid_utf8_byte_sequence.cend() - 1};
-         valid_utf8_byte_sequence.cbegin() != it_end;
-         --it_end)
+    for (auto end_it{valid_utf8_byte_sequence.cend() - 1};
+         valid_utf8_byte_sequence.cbegin() != end_it;
+         --end_it)
     {
-        std::string const incomplete_byte_sequence{it_begin, it_end};
+        std::string const incomplete_byte_sequence{begin_it, end_it};
 
         test_str = valid + incomplete_byte_sequence;
         REQUIRE((false == is_utf8_encoded(test_str)));
@@ -141,56 +142,67 @@ TEST_CASE("escape_utf8_string_with_invalid_continuation", "[ffi][utils]") {
         REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
     }
 
-    // Test invalid header byte
+    // Test invalid lead byte
     test_str = valid_utf8_byte_sequence;
-    constexpr char cInvalidHeaderByte{'\xFF'};
-    test_str.front() = cInvalidHeaderByte;
+    constexpr char cInvalidLeadByte{'\xFF'};
+    test_str.front() = cInvalidLeadByte;
     REQUIRE((false == is_utf8_encoded(test_str)));
     REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
 
     // Test invalid continuation bytes
     for (size_t idx{1}; idx < valid_utf8_byte_sequence.size(); ++idx) {
         test_str = valid_utf8_byte_sequence;
-        constexpr uint8_t cInvalidateMask{0x40};
-        test_str.at(idx) |= cInvalidateMask;
+        constexpr uint8_t cInvalidContinuationByteMask{0x40};
+        test_str.at(idx) |= cInvalidContinuationByteMask;
         REQUIRE((false == is_utf8_encoded(test_str)));
         REQUIRE((false == validate_and_escape_utf8_string(test_str).has_value()));
     }
 }
 
-TEST_CASE("validate_utf8_code_point_ranges", "[ffi][utils]") {
+TEST_CASE("validate_utf8_code_point_ranges", "[utf8_utils]") {
     // Test 1 byte encoding code point range
-    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-    for (uint32_t code_point{0}; code_point <= 0x7F; ++code_point) {
+    for (auto code_point{clp::cOneByteUtf8CharCodePointLowerBound};
+         code_point <= clp::cOneByteUtf8CharCodePointUpperBound;
+         ++code_point)
+    {
+        REQUIRE(is_utf8_encoded(std::string{static_cast<char>(code_point)}));
         REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 1))));
         REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2))));
         REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))));
     }
 
     // Test 2 byte encoding code point range
-    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-    for (uint32_t code_point{0x80}; code_point <= 0x7FF; ++code_point) {
+    for (auto code_point{clp::cTwoByteUtf8CharCodePointLowerBound};
+         code_point <= clp::cTwoByteUtf8CharCodePointUpperBound;
+         ++code_point)
+    {
         REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 1)));
         REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2))));
         REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))));
     }
 
     // Test 3 byte encoding code point range
-    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-    for (uint32_t code_point{0x800}; code_point <= 0xFFFF; ++code_point) {
+    for (auto code_point{clp::cThreeByteUtf8CharCodePointLowerBound};
+         code_point <= clp::cThreeByteUtf8CharCodePointUpperBound;
+         ++code_point)
+    {
         REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 2)));
         REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))));
     }
 
     // Test 4 byte encoding code point range
-    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-    for (uint32_t code_point{0x1'0000}; code_point <= 0x10'FFFF; ++code_point) {
+    for (auto code_point{clp::cFourByteUtf8CharCodePointLowerBound};
+         code_point <= clp::cFourByteUtf8CharCodePointUpperBound;
+         ++code_point)
+    {
         REQUIRE(is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3)));
     }
 
     // Test 4 byte encoding code point out of range
     // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers)
-    for (uint32_t code_point{0x10'FFFF + 1}; code_point <= 0x1F'FFFF; ++code_point) {
+    for (auto code_point{clp::cFourByteUtf8CharCodePointUpperBound + 1}; code_point <= 0x1F'FFFF;
+         ++code_point)
+    {
         REQUIRE((false == is_utf8_encoded(generate_utf8_byte_sequence(code_point, 3))));
     }
 }

From ebc1c4f0caf52f2060a1935b5b0c8515d74d9a04 Mon Sep 17 00:00:00 2001
From: LinZhihao-723 <zh.lin@mail.utoronto.ca>
Date: Wed, 26 Jun 2024 16:29:59 -0400
Subject: [PATCH 11/13] Use auto

---
 components/core/src/clp/utf8_utils.cpp    |  6 +++---
 components/core/src/clp/utf8_utils.hpp    | 19 ++++++++++---------
 components/core/tests/test-utf8_utils.cpp |  5 +++--
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/components/core/src/clp/utf8_utils.cpp b/components/core/src/clp/utf8_utils.cpp
index 08a03f608..06fafd659 100644
--- a/components/core/src/clp/utf8_utils.cpp
+++ b/components/core/src/clp/utf8_utils.cpp
@@ -44,12 +44,12 @@ auto is_ascii_char(uint8_t byte) -> bool {
 }
 
 auto is_valid_utf8_continuation_byte(uint8_t byte) -> bool {
-    return (byte & cContinuationByteMask) == cContinuationByte;
+    return (byte & cUtf8ContinuationByteMask) == cUtf8ContinuationByteHeader;
 }
 
 auto parse_continuation_byte(uint32_t code_point, uint8_t continuation_byte) -> uint32_t {
-    return (code_point << cNumContinuationByteCodePointBits)
-           + (continuation_byte & cContinuationByteCodePointMask);
+    return (code_point << cUtf8NumContinuationByteCodePointBits)
+           + (continuation_byte & cUtf8ContinuationByteCodePointMask);
 }
 }  // namespace utf8_utils_internal
 }  // namespace clp
diff --git a/components/core/src/clp/utf8_utils.hpp b/components/core/src/clp/utf8_utils.hpp
index c3dc8177a..fe9569b00 100644
--- a/components/core/src/clp/utf8_utils.hpp
+++ b/components/core/src/clp/utf8_utils.hpp
@@ -8,12 +8,12 @@
 namespace clp {
 // Constants
 // Lead byte signature
-constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8};  // 0b1111_1xxx
-constexpr uint8_t cFourByteUtf8CharHeader{0xF0};  // 0b1111_0xxx
-constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0};  // 0b1111_xxxx
-constexpr uint8_t cThreeByteUtf8CharHeader{0xE0};  // 0b1110_xxxx
 constexpr uint8_t cTwoByteUtf8CharHeaderMask{0xE0};  // 0b111x_xxxx
 constexpr uint8_t cTwoByteUtf8CharHeader{0xC0};  // 0b110x_xxxx
+constexpr uint8_t cThreeByteUtf8CharHeaderMask{0xF0};  // 0b1111_xxxx
+constexpr uint8_t cThreeByteUtf8CharHeader{0xE0};  // 0b1110_xxxx
+constexpr uint8_t cFourByteUtf8CharHeaderMask{0xF8};  // 0b1111_1xxx
+constexpr uint8_t cFourByteUtf8CharHeader{0xF0};  // 0b1111_0xxx
 
 // Code point ranges (inclusive)
 constexpr uint32_t cOneByteUtf8CharCodePointLowerBound{0};
@@ -26,10 +26,10 @@ constexpr uint32_t cFourByteUtf8CharCodePointLowerBound{0x1'0000};
 constexpr uint32_t cFourByteUtf8CharCodePointUpperBound{0x10'FFFF};
 
 // Continuation byte
-constexpr uint32_t cContinuationByteMask{0xC0};
-constexpr uint32_t cContinuationByte{0x80};
-constexpr uint32_t cContinuationByteCodePointMask{0x3F};
-constexpr uint8_t cNumContinuationByteCodePointBits{6};
+constexpr uint32_t cUtf8ContinuationByteMask{0xC0};
+constexpr uint32_t cUtf8ContinuationByteHeader{0x80};
+constexpr uint32_t cUtf8ContinuationByteCodePointMask{0x3F};
+constexpr uint8_t cUtf8NumContinuationByteCodePointBits{6};
 
 /**
  * Validates whether the given string is UTF-8 encoded, optionally escaping ASCII characters using
@@ -101,7 +101,8 @@ auto validate_utf8_string(std::string_view src, EscapeHandler escape_handler) ->
     uint32_t code_point_lower_bound{};
     uint32_t code_point_upper_bound{};
 
-    for (std::string_view::const_iterator it{src.cbegin()}; it != src.cend(); ++it) {
+    // NOLINTNEXTLINE(readability-qualified-auto)
+    for (auto it{src.cbegin()}; it != src.cend(); ++it) {
         auto const byte{static_cast<uint8_t>(*it)};
         if (0 == num_continuation_bytes_to_validate) {
             if (utf8_utils_internal::is_ascii_char(byte)) {
diff --git a/components/core/tests/test-utf8_utils.cpp b/components/core/tests/test-utf8_utils.cpp
index 94e45cac0..77324eaf9 100644
--- a/components/core/tests/test-utf8_utils.cpp
+++ b/components/core/tests/test-utf8_utils.cpp
@@ -48,9 +48,10 @@ auto generate_utf8_byte_sequence(uint32_t code_point, size_t num_continuation_by
     while (encoded_bytes.size() < num_continuation_bytes) {
         auto const least_significant_byte{static_cast<uint8_t>(code_point)};
         encoded_bytes.push_back(static_cast<char>(
-                (least_significant_byte & ~clp::cContinuationByteMask) | clp::cContinuationByte
+                (least_significant_byte & ~clp::cUtf8ContinuationByteMask)
+                | clp::cUtf8ContinuationByteHeader
         ));
-        code_point >>= clp::cNumContinuationByteCodePointBits;
+        code_point >>= clp::cUtf8NumContinuationByteCodePointBits;
     }
 
     uint8_t lead_byte_code_point_mask{};

From 690379c9b05c7aa6c32f6f0b26a6f06594ecf0b1 Mon Sep 17 00:00:00 2001
From: LinZhihao-723 <zh.lin@mail.utoronto.ca>
Date: Wed, 26 Jun 2024 16:43:07 -0400
Subject: [PATCH 12/13] Add validate_and_append_escaped_utf8_string (append variant)

---
 components/core/src/clp/ffi/utils.cpp | 23 +++++++++++++++--------
 components/core/src/clp/ffi/utils.hpp | 10 ++++++++++
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/components/core/src/clp/ffi/utils.cpp b/components/core/src/clp/ffi/utils.cpp
index 3f77564d2..c85c47701 100644
--- a/components/core/src/clp/ffi/utils.cpp
+++ b/components/core/src/clp/ffi/utils.cpp
@@ -16,10 +16,17 @@ using std::string_view;
 
 namespace clp::ffi {
 auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
-    string_view::const_iterator next_char_to_copy_it{raw.cbegin()};
     std::optional<std::string> ret_val;
     auto& escaped{ret_val.emplace()};
     escaped.reserve(raw.size() + (raw.size() / 2));
+    if (false == validate_and_append_escaped_utf8_string(raw, escaped)) {
+        return std::nullopt;
+    }
+    return ret_val;
+}
+
+auto validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool {
+    string_view::const_iterator next_char_to_copy_it{src.cbegin()};
 
     auto escape_handler = [&](string_view::const_iterator it) -> void {
         // Allocate 6 + 1 size buffer to format control characters as "\u00bb", with the last byte
@@ -63,20 +70,20 @@ auto validate_and_escape_utf8_string(string_view raw) -> std::optional<string> {
             }
         }
         if (escape_required) {
-            escaped.append(next_char_to_copy_it, it);
-            escaped += escaped_char;
+            dst.append(next_char_to_copy_it, it);
+            dst += escaped_char;
             next_char_to_copy_it = it + 1;
         }
     };
 
-    if (false == validate_utf8_string(raw, escape_handler)) {
-        return std::nullopt;
+    if (false == validate_utf8_string(src, escape_handler)) {
+        return false;
     }
 
-    if (raw.cend() != next_char_to_copy_it) {
-        escaped.append(next_char_to_copy_it, raw.cend());
+    if (src.cend() != next_char_to_copy_it) {
+        dst.append(next_char_to_copy_it, src.cend());
     }
 
-    return ret_val;
+    return true;
 }
 }  // namespace clp::ffi
diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp
index 160ed687b..8a90169a1 100644
--- a/components/core/src/clp/ffi/utils.hpp
+++ b/components/core/src/clp/ffi/utils.hpp
@@ -15,6 +15,16 @@ namespace clp::ffi {
  */
 [[nodiscard]] auto validate_and_escape_utf8_string(std::string_view raw
 ) -> std::optional<std::string>;
+
+/**
+ * Validates whether the given string is UTF-8 encoded, and append the src to the dst by escaping
+ * any characters to make the string compatible with the JSON specification.
+ * @param src The source string to validate and escape.
+ * @param dst Outputs the destination string with escaped src appended.
+ * @return Whether the src is a valid UTF-8 encoded string.
+ */
+[[nodiscard]] auto
+validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool;
 }  // namespace clp::ffi
 
 #endif  // CLP_FFI_UTILS_HPP

From 118ae4d0757d31398fff3be306af2fd980d5a74b Mon Sep 17 00:00:00 2001
From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com>
Date: Wed, 26 Jun 2024 23:59:19 -0400
Subject: [PATCH 13/13] Clarify validate_and_append_escaped_utf8_string docstring

Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com>
---
 components/core/src/clp/ffi/utils.hpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/components/core/src/clp/ffi/utils.hpp b/components/core/src/clp/ffi/utils.hpp
index 8a90169a1..26823da9c 100644
--- a/components/core/src/clp/ffi/utils.hpp
+++ b/components/core/src/clp/ffi/utils.hpp
@@ -17,11 +17,12 @@ namespace clp::ffi {
 ) -> std::optional<std::string>;
 
 /**
- * Validates whether the given string is UTF-8 encoded, and append the src to the dst by escaping
- * any characters to make the string compatible with the JSON specification.
- * @param src The source string to validate and escape.
- * @param dst Outputs the destination string with escaped src appended.
- * @return Whether the src is a valid UTF-8 encoded string.
+ * Validates whether `src` is UTF-8 encoded, and appends `src` to `dst` while escaping any
+ * characters to make the appended string compatible with the JSON specification.
+ * @param src The string to validate and escape.
+ * @param dst Returns `dst` with an escaped version of `src` appended.
+ * @return Whether `src` is a valid UTF-8-encoded string. NOTE: Even if `src` is not UTF-8 encoded,
+ * `dst` may be modified.
  */
 [[nodiscard]] auto
 validate_and_append_escaped_utf8_string(std::string_view src, std::string& dst) -> bool;