From 5c7d8f5c92c0dd40528f316e112c3c87a50d0dfe Mon Sep 17 00:00:00 2001
From: Eduardo Silva <eduardo@calyptia.com>
Date: Tue, 15 Oct 2024 16:42:17 -0600
Subject: [PATCH 1/8] utils: improve performance of JSON escaping

The following change in the utils write utility, improve the handling
of characters that needs escaping by optimizing the character check
with a lookup table.

Signed-off-by: Eduardo Silva <eduardo@calyptia.com>
---
 src/flb_utils.c | 73 ++++++++++++++++++++++++++-----------------------
 1 file changed, 39 insertions(+), 34 deletions(-)

diff --git a/src/flb_utils.c b/src/flb_utils.c
index 010254f23c5..ba1661c1751 100644
--- a/src/flb_utils.c
+++ b/src/flb_utils.c
@@ -756,6 +756,30 @@ static inline void encoded_to_buf(char *out, const char *in, int len)
     }
 }
 
+/* Structure to hold escape sequence */
+struct escape_seq {
+    const char *seq;
+};
+
+/* Lookup table for escape sequences */
+static const struct escape_seq json_escape_table[128] = {
+    ['\"'] = {"\\\""},
+    ['\\'] = {"\\\\"},
+    ['\n'] = {"\\n"},
+    ['\r'] = {"\\r"},
+    ['\t'] = {"\\t"},
+    ['\b'] = {"\\b"},
+    ['\f'] = {"\\f"},
+    [0x00] = {"\\u0000"}, [0x01] = {"\\u0001"}, [0x02] = {"\\u0002"}, [0x03] = {"\\u0003"},
+    [0x04] = {"\\u0004"}, [0x05] = {"\\u0005"}, [0x06] = {"\\u0006"}, [0x07] = {"\\u0007"},
+    [0x0B] = {"\\u000b"}, [0x0E] = {"\\u000e"}, [0x0F] = {"\\u000f"},
+    [0x10] = {"\\u0010"}, [0x11] = {"\\u0011"}, [0x12] = {"\\u0012"}, [0x13] = {"\\u0013"},
+    [0x14] = {"\\u0014"}, [0x15] = {"\\u0015"}, [0x16] = {"\\u0016"}, [0x17] = {"\\u0017"},
+    [0x18] = {"\\u0018"}, [0x19] = {"\\u0019"}, [0x1A] = {"\\u001a"}, [0x1B] = {"\\u001b"},
+    [0x1C] = {"\\u001c"}, [0x1D] = {"\\u001d"}, [0x1E] = {"\\u001e"}, [0x1F] = {"\\u001f"},
+    [0x7F] = {"\\u007f"}
+};
+
 /*
  * Write string pointed by 'str' to the destination buffer 'buf'. It's make sure
  * to escape special characters and convert utf-8 byte characters to string
@@ -795,43 +819,25 @@ int flb_utils_write_str(char *buf, int *off, size_t size,
         }
 
         c = (uint32_t) str[i];
-        if (c == '\"') {
-            *p++ = '\\';
-            *p++ = '\"';
-        }
-        else if (c == '\\') {
-            *p++ = '\\';
-            *p++ = '\\';
-        }
-        else if (c == '\n') {
-            *p++ = '\\';
-            *p++ = 'n';
-        }
-        else if (c == '\r') {
-            *p++ = '\\';
-            *p++ = 'r';
-        }
-        else if (c == '\t') {
-            *p++ = '\\';
-            *p++ = 't';
-        }
-        else if (c == '\b') {
-            *p++ = '\\';
-            *p++ = 'b';
-        }
-        else if (c == '\f') {
-            *p++ = '\\';
-            *p++ = 'f';
-        }
-        else if (c < 32 || c == 0x7f) {
-            if ((available - written) < 6) {
-                return FLB_FALSE;
+
+        /* Use the lookup table for known escape sequences */
+        if (c < 128 && json_escape_table[c].seq) {
+            /*
+             * check the length for the string, for simple escaping is always
+             * 2 bytes and 6 bytes for unicode
+             */
+            if (json_escape_table[c].seq[1] == 'u') {
+                len = 6;
+            }
+            else {
+                len = 2;
             }
-            len = snprintf(tmp, sizeof(tmp) - 1, "\\u%.4hhx", (unsigned char) c);
+
             if ((available - written) < len) {
                 return FLB_FALSE;
             }
-            encoded_to_buf(p, tmp, len);
+
+            memcpy(p, json_escape_table[c].seq, len);
             p += len;
         }
         else if (c >= 0x80 && c <= 0xFFFF) {
@@ -978,7 +984,6 @@ int flb_utils_write_str(char *buf, int *off, size_t size,
     return FLB_TRUE;
 }
 
-
 int flb_utils_write_str_buf(const char *str, size_t str_len, char **out, size_t *out_size)
 {
     int ret;

From 4c5ae0b40ab8832a6166a8c8b0936d9879c12613 Mon Sep 17 00:00:00 2001
From: Eduardo Silva <eduardo@calyptia.com>
Date: Thu, 17 Oct 2024 19:37:29 -0600
Subject: [PATCH 2/8] simd: new SIMD support for SSE and NEON

Signed-off-by: Eduardo Silva <eduardo@calyptia.com>
---
 include/fluent-bit/flb_simd.h | 250 ++++++++++++++++++++++++++++++++++
 1 file changed, 250 insertions(+)
 create mode 100644 include/fluent-bit/flb_simd.h

diff --git a/include/fluent-bit/flb_simd.h b/include/fluent-bit/flb_simd.h
new file mode 100644
index 00000000000..0f66d5e1bed
--- /dev/null
+++ b/include/fluent-bit/flb_simd.h
@@ -0,0 +1,250 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+
+/*  Fluent Bit
+ *  ==========
+ *  Copyright (C) 2015-2024 The Fluent Bit Authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#ifndef FLB_SIMD_H
+#define FLB_SIMD_H
+
+#include <stdint.h>
+#include <stdbool.h>
+
+/* Only enable SIMD support if it has not been explicity disabled */
+#ifndef FLB_SIMD_DISABLED
+
+#if (defined(__x86_64__) || defined(_M_AMD64))
+/*
+ * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
+ * that compilers targeting this architecture understand SSE2 intrinsics.
+ *
+ * We use emmintrin.h rather than the comprehensive header immintrin.h in
+ * order to exclude extensions beyond SSE2. This is because MSVC, at least,
+ * will allow the use of intrinsics that haven't been enabled at compile
+ * time.
+ */
+#include <emmintrin.h>
+#define FLB_SIMD_SSE2
+typedef __m128i flb_vector8;
+typedef __m128i flb_vector32;
+
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+/*
+ * We use the Neon instructions if the compiler provides access to them (as
+ * indicated by __ARM_NEON) and we are on aarch64.  While Neon support is
+ * technically optional for aarch64, it appears that all available 64-bit
+ * hardware does have it.  Neon exists in some 32-bit hardware too, but we
+ * could not realistically use it there without a run-time check, which seems
+ * not worth the trouble for now.
+ */
+#include <arm_neon.h>
+#define FLB_SIMD_NEON
+typedef uint8x16_t flb_vector8;
+typedef uint32x4_t flb_vector32;
+
+#else
+/*
+ * If no SIMD instructions are available, we can in some cases emulate vector
+ * operations using bitwise operations on unsigned integers.  Note that many
+ * of the functions in this file presently do not have non-SIMD
+ * implementations.  In particular, none of the functions involving Vector32
+ * are implemented without SIMD since it's likely not worthwhile to represent
+ * two 32-bit integers using a uint64.
+ */
+#define FLB_SIMD_NONE
+typedef uint64 flb_vector8;
+#endif
+
+#else
+#define FLB_SIMD_NONE
+#endif /* FLB_SIMD_DISABLED */
+
+/* element-wise comparisons to a scalar */
+static inline bool flb_vector8_has(const flb_vector8 v, const uint8_t c);
+static inline bool flb_vector8_has_zero(const flb_vector8 v);
+static inline bool flb_vector8_has_le(const flb_vector8 v, const uint8_t c);
+static inline bool flb_vector8_is_highbit_set(const flb_vector8 v);
+
+/*
+ * Load a chunk of memory into the given vector.
+ */
+static inline void flb_vector8_load(flb_vector8 *v, const uint8_t *s)
+{
+#if defined(FLB_SIMD_SSE2)
+	*v = _mm_loadu_si128((const __m128i *) s);
+#elif defined(FLB_SIMD_NEON)
+	*v = vld1q_u8(s);
+#else
+	memcpy(v, s, sizeof(flb_vector8));
+#endif
+}
+
+/*
+ * Convenience function equivalent to vector8_has(v, 0)
+ */
+static inline bool flb_vector8_has_zero(const flb_vector8 v)
+{
+#if defined(FLB_SIMD_NONE)
+	/*
+	 * We cannot call vector8_has() here, because that would lead to a
+	 * circular definition.
+	 */
+	return flb_vector8_has_le(v, 0);
+#else
+	return flb_vector8_has(v, 0);
+#endif
+}
+
+
+/*
+ * Return the result of subtracting the respective elements of the input
+ * vectors using saturation (i.e., if the operation would yield a value less
+ * than zero, zero is returned instead).  For more information on saturation
+ * arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
+ */
+#ifndef FLB_SIMD_NONE
+static inline flb_vector8 flb_vector8_ssub(const flb_vector8 v1, const flb_vector8 v2)
+{
+#ifdef FLB_SIMD_SSE2
+	return _mm_subs_epu8(v1, v2);
+#elif defined(FLB_SIMD_NEON)
+	return vqsubq_u8(v1, v2);
+#endif
+}
+#endif							/* ! USE_NO_SIMD */
+
+/*
+ * Return a vector with all bits set in each lane where the corresponding
+ * lanes in the inputs are equal.
+ */
+#ifndef FLB_SIMD_NONE
+static inline flb_vector8 flb_vector8_eq(const flb_vector8 v1, const flb_vector8 v2)
+{
+#ifdef FLB_SIMD_SSE2
+	return _mm_cmpeq_epi8(v1, v2);
+#elif defined(FLB_SIMD_NEON)
+	return vceqq_u8(v1, v2);
+#endif
+}
+#endif							/* ! USE_NO_SIMD */
+
+#ifndef FLB_SIMD_NONE
+static inline flb_vector32 flb_vector32_eq(const flb_vector32 v1, const flb_vector32 v2)
+{
+#ifdef FLB_SIMD_SSE2
+	return _mm_cmpeq_epi32(v1, v2);
+#elif defined(FLB_SIMD_NEON)
+	return vceqq_u32(v1, v2);
+#endif
+}
+#endif
+
+/*
+ * Create a vector with all elements set to the same value.
+ */
+static inline flb_vector8 flb_vector8_broadcast(const uint8_t c)
+{
+#if defined(FLB_SIMD_SSE2)
+	return _mm_set1_epi8(c);
+#elif defined(FLB_SIMD_NEON)
+	return vdupq_n_u8(c);
+#else
+	return ~UINT64CONST(0) / 0xFF * c;
+#endif
+}
+
+/*
+ * Return true if the high bit of any element is set
+ */
+static inline bool flb_vector8_is_highbit_set(const flb_vector8 v)
+{
+#ifdef FLB_SIMD_SSE2
+	return _mm_movemask_epi8(v) != 0;
+#elif defined(FLB_SIMD_NEON)
+	return vmaxvq_u8(v) > 0x7F;
+#else
+	return v & flb_vector8_broadcast(0x80);
+#endif
+}
+
+/*
+ * Return true if any elements in the vector are equal to the given scalar.
+ */
+static inline bool flb_vector8_has(const flb_vector8 v, const uint8_t c)
+{
+	bool result;
+
+#if defined(FLB_SIMD_NONE)
+	/* any bytes in v equal to c will evaluate to zero via XOR */
+	result = flb_vector8_has_zero(v ^ flb_vector8_broadcast(c));
+#else
+	result = flb_vector8_is_highbit_set(flb_vector8_eq(v, flb_vector8_broadcast(c)));
+#endif
+
+	return result;
+}
+
+static inline bool flb_vector8_has_le(const flb_vector8 v, const uint8_t c)
+{
+	bool result = false;
+
+#if defined(FLB_SIMD_NONE)
+
+	/*
+	 * To find bytes <= c, we can use bitwise operations to find bytes < c+1,
+	 * but it only works if c+1 <= 128 and if the highest bit in v is not set.
+	 * Adapted from
+	 * https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
+	 */
+	if ((int64) v >= 0 && c < 0x80)
+		result = (v - flb_vector8_broadcast(c + 1)) & ~v & flb_vector8_broadcast(0x80);
+	else {
+		/* one byte at a time */
+        int i;
+		for (i = 0; i < sizeof(flb_vector8); i++)
+		{
+			if (((const uint8 *) &v)[i] <= c) {
+				result = true;
+				break;
+			}
+		}
+	}
+#else
+	/*
+	 * Use saturating subtraction to find bytes <= c, which will present as
+	 * NUL bytes.  This approach is a workaround for the lack of unsigned
+	 * comparison instructions on some architectures.
+	 */
+	result = flb_vector8_has_zero(flb_vector8_ssub(v, flb_vector8_broadcast(c)));
+#endif
+
+	return result;
+}
+
+static inline char *flb_simd_info()
+{
+    #if defined(FLB_SIMD_SSE2)
+        return "SSE2";
+    #elif defined(FLB_SIMD_NEON)
+        return "NEON";
+    #elif defined(FLB_SIMD_NONE)
+        return "none";
+    #elif defined(FLB_SIMD_DISABLED)
+        return "none (disabled)";
+    #endif
+}
+
+#endif /* FLB_SIMD_H */

From 47b22a2ec3cca917257a53933362b2131ca827c1 Mon Sep 17 00:00:00 2001
From: Eduardo Silva <eduardo@calyptia.com>
Date: Thu, 17 Oct 2024 19:38:49 -0600
Subject: [PATCH 3/8] utils: do JSON escaping with SIMD support (faster!)

Signed-off-by: Eduardo Silva <eduardo@calyptia.com>
---
 src/flb_utils.c | 373 +++++++++++++++++++++++++++++-------------------
 1 file changed, 225 insertions(+), 148 deletions(-)

diff --git a/src/flb_utils.c b/src/flb_utils.c
index ba1661c1751..80cb3c6252d 100644
--- a/src/flb_utils.c
+++ b/src/flb_utils.c
@@ -35,6 +35,7 @@
 #include <fluent-bit/flb_output.h>
 #include <fluent-bit/flb_utils.h>
 #include <fluent-bit/flb_utf8.h>
+#include <fluent-bit/flb_simd.h>
 
 #ifdef FLB_HAVE_AWS_ERROR_REPORTER
 #include <fluent-bit/aws/flb_aws_error_reporter.h>
@@ -577,7 +578,7 @@ int64_t flb_utils_size_to_bytes(const char *size)
 
     if (tmp[0] == 'K') {
         /* set upper bound (2**64/KB)/2 to avoid overflows */
-        if (val >= 9223372036854775 || val <= -9223372036854774)
+        if (val >= 9223372036854775.0 || val <= -9223372036854774.0)
         {
             return -1;
         }
@@ -785,201 +786,277 @@ static const struct escape_seq json_escape_table[128] = {
  * to escape special characters and convert utf-8 byte characters to string
  * representation.
  */
-int flb_utils_write_str(char *buf, int *off, size_t size,
-                        const char *str, size_t str_len)
+
+
+int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_t str_len)
 {
-    int i;
-    int b;
-    int ret;
-    int written = 0;
-    int required;
-    int len;
-    int hex_bytes;
-    int is_valid;
-    int utf_sequence_number;
-    int utf_sequence_length;
-    uint32_t codepoint;
-    uint32_t state = 0;
+    int i, b, ret, len, hex_bytes, utf_sequence_length, utf_sequence_number;
+    int is_valid, copypos = 0, vlen;
+    uint32_t codepoint, state = 0;
     char tmp[16];
     size_t available;
     uint32_t c;
     char *p;
     uint8_t *s;
+    off_t offset = 0;
+
+    available = size - *off;
 
-    available = (size - *off);
-    required = str_len;
-    if (available <= required) {
+    /* Ensure we have some minimum space in the buffer */
+    if (available < str_len) {
         return FLB_FALSE;
     }
 
     p = buf + *off;
-    for (i = 0; i < str_len; i++) {
-        if ((available - written) < 2) {
-            return FLB_FALSE;
-        }
+    vlen = str_len & ~(sizeof(flb_vector8) - 1);  // Align length for SIMD
 
-        c = (uint32_t) str[i];
+    for (i = 0;;) {
+        /* SIMD optimization: Process chunk of input string */
+        for (; i < vlen; i += sizeof(flb_vector8)) {
+            flb_vector8 chunk;
+            flb_vector8_load(&chunk, (const uint8_t *)&str[i]);
 
-        /* Use the lookup table for known escape sequences */
-        if (c < 128 && json_escape_table[c].seq) {
             /*
-             * check the length for the string, for simple escaping is always
-             * 2 bytes and 6 bytes for unicode
+             * Look for the special characters we are interested in,
+             * if they are found we break the loop and escape them
+             * in a char-by-char basis. Otherwise the do a bulk copy
              */
-            if (json_escape_table[c].seq[1] == 'u') {
-                len = 6;
-            }
-            else {
-                len = 2;
+            if (flb_vector8_has_le(chunk, (unsigned char)0x1F) ||
+                flb_vector8_has(chunk, (unsigned char)'"') ||
+                flb_vector8_has(chunk, (unsigned char)'\\') ||
+                flb_vector8_has(chunk, (unsigned char)'\n') ||
+                flb_vector8_has(chunk, (unsigned char)'\r') ||
+                flb_vector8_has(chunk, (unsigned char)'\t') ||
+                flb_vector8_has(chunk, (unsigned char)'\b') ||
+                flb_vector8_has(chunk, (unsigned char)'\f')) {
+                break;
             }
+        }
 
-            if ((available - written) < len) {
+        /* Copy the chunk processed so far */
+        if (copypos < i) {
+            /* check if we have enough space */
+            if (available < i - copypos) {
                 return FLB_FALSE;
             }
 
-            memcpy(p, json_escape_table[c].seq, len);
-            p += len;
+            /* copy and adjust pointers */
+            memcpy(p, &str[copypos], i - copypos);
+            p += i - copypos;
+            offset += i - copypos;
+            available -= (i - copypos);
+            copypos = i;
         }
-        else if (c >= 0x80 && c <= 0xFFFF) {
-            hex_bytes = flb_utf8_len(str + i);
-            if (available - written < 6) {
-                return FLB_FALSE;
-            }
 
-            if (i + hex_bytes > str_len) {
-                break; /* skip truncated UTF-8 */
+        /* Process remaining characters one by one */
+        for (b = 0; b < sizeof(flb_vector8); b++) {
+            if (i == str_len) {
+                /* all characters has been processed */
+                goto done;
             }
 
-            state = FLB_UTF8_ACCEPT;
-            codepoint = 0;
+            c = (uint32_t) str[i];
 
-            for (b = 0; b < hex_bytes; b++) {
-                s = (unsigned char *) str + i + b;
-                ret = flb_utf8_decode(&state, &codepoint, *s);
-                if (ret == 0) {
-                    break;
+            /* Use lookup table for escaping known sequences */
+            if (c < 128 && json_escape_table[c].seq) {
+                /*
+                 * All characters in the table have a lenght of 2 or 6 bytes,
+                 * just check if the second byte starts with 'u' so we know
+                 * it's unicode and needs 6 bytes of space.
+                 */
+                if (json_escape_table[c].seq[1] == 'u') {
+                    len = 6;
+                }
+                else {
+                    len = 2;
                 }
-            }
 
-            if (state != FLB_UTF8_ACCEPT) {
-                /* Invalid UTF-8 hex, just skip utf-8 bytes */
-                flb_warn("[pack] invalid UTF-8 bytes found, skipping bytes");
-            }
-            else {
-                len = snprintf(tmp, sizeof(tmp) - 1, "\\u%.4x", codepoint);
-                if ((available - written) < len) {
+                /* check if we have anough space */
+                if (available < len) {
                     return FLB_FALSE;
                 }
-                encoded_to_buf(p, tmp, len);
+
+                /* copy the escape sequence */
+                memcpy(p, json_escape_table[c].seq, len);
                 p += len;
+                offset += len;
+                available -= len;
             }
-            i += (hex_bytes - 1);
-        }
-        else if (c > 0xFFFF) {
-            utf_sequence_length = flb_utf8_len(str + i);
+            /* Handle UTF-8 sequences from 0x80 to 0xFFFF */
+            else if (c >= 0x80 && c <= 0xFFFF) {
+                hex_bytes = flb_utf8_len(&str[i]);
+
+                /* Handle invalid or truncated sequence */
+                if (hex_bytes == 0 || i + hex_bytes > str_len) {
+                    /* check for the minimum space required */
+                    if (available < 3) {
+                        return FLB_FALSE;
+                    }
+
+                    /* insert replacement character (U+FFFD) */
+                    p[0] = 0xEF;
+                    p[1] = 0xBF;
+                    p[2] = 0xBD;
+                    p += 3;
+                    offset += 3;
+                    available -= 3;
+
+                    /* skip the original byte */
+                    i++;
+                    continue;
+                }
 
-            if (i + utf_sequence_length > str_len) {
-                break; /* skip truncated UTF-8 */
-            }
+                // Decode UTF-8 sequence
+                state = FLB_UTF8_ACCEPT;
+                codepoint = 0;
 
-            is_valid = FLB_TRUE;
-            for (utf_sequence_number = 0; utf_sequence_number < utf_sequence_length;
-                utf_sequence_number++) {
-                /* Leading characters must start with bits 11 */
-                if (utf_sequence_number == 0 && ((str[i] & 0xC0) != 0xC0)) {
-                    /* Invalid unicode character. replace */
-                    flb_debug("[pack] unexpected UTF-8 leading byte, "
-                             "substituting character with replacement character");
-                    tmp[utf_sequence_number] = str[i];
-                    ++i; /* Consume invalid leading byte */
-                    utf_sequence_length = utf_sequence_number + 1;
-                    is_valid = FLB_FALSE;
-                    break;
+                for (b = 0; b < hex_bytes; b++) {
+                    s = (unsigned char *) &str[i + b];
+                    ret = flb_utf8_decode(&state, &codepoint, *s);
+                    if (ret == 0) {
+                        break;
+                    }
                 }
-                /* Trailing characters must start with bits 10 */
-                else if (utf_sequence_number > 0 && ((str[i] & 0xC0) != 0x80)) {
-                    /* Invalid unicode character. replace */
-                    flb_debug("[pack] unexpected UTF-8 continuation byte, "
-                             "substituting character with replacement character");
-                    /* This byte, i, is the start of the next unicode character */
-                    utf_sequence_length = utf_sequence_number;
-                    is_valid = FLB_FALSE;
-                    break;
+
+                if (state != FLB_UTF8_ACCEPT) {
+                    flb_warn("[pack] Invalid UTF-8 bytes found, skipping.");
+                }
+                else {
+                    len = snprintf(tmp, sizeof(tmp), "\\u%.4x", codepoint);
+                    if (available < len) {
+                        return FLB_FALSE;  // Not enough space
+                    }
+                    memcpy(p, tmp, len);
+                    p += len;
+                    offset += len;
+                    available -= len;
                 }
 
-                tmp[utf_sequence_number] = str[i];
-                ++i;
+                i += hex_bytes;
             }
-            --i;
+            /* Handle sequences beyond 0xFFFF */
+            else if (c > 0xFFFF) {
+                utf_sequence_length = flb_utf8_len(str + i);
 
-            if (is_valid) {
-                if (available - written < utf_sequence_length) {
-                    return FLB_FALSE;
+                /* skip truncated UTF-8 ? */
+                if (i + utf_sequence_length > str_len) {
+                    i++;
+                    break;
                 }
 
-                encoded_to_buf(p, tmp, utf_sequence_length);
-                p += utf_sequence_length;
-            }
-            else {
-                if (available - written < utf_sequence_length * 3) {
-                    return FLB_FALSE;
+                is_valid = FLB_TRUE;
+                for (utf_sequence_number = 0; utf_sequence_number < utf_sequence_length; utf_sequence_number++) {
+                    /* Leading characters must start with bits 11 */
+                    if (utf_sequence_number == 0 && ((str[i] & 0xC0) != 0xC0)) {
+                        /* Invalid unicode character. replace */
+                        flb_debug("[pack] unexpected UTF-8 leading byte, "
+                                  "substituting character with replacement character");
+                        tmp[utf_sequence_number] = str[i];
+                        i++; /* Consume invalid leading byte */
+                        utf_sequence_length = utf_sequence_number + 1;
+                        is_valid = FLB_FALSE;
+                        break;
+                    }
+                    /* Trailing characters must start with bits 10 */
+                    else if (utf_sequence_number > 0 && ((str[i] & 0xC0) != 0x80)) {
+                        /* Invalid unicode character. replace */
+                        flb_debug("[pack] unexpected UTF-8 continuation byte, "
+                                "substituting character with replacement character");
+                        /* This byte, i, is the start of the next unicode character */
+                        utf_sequence_length = utf_sequence_number;
+                        is_valid = FLB_FALSE;
+                        break;
+                    }
+
+                    tmp[utf_sequence_number] = str[i];
+                    ++i;
                 }
 
-                /*
-                 * Utf-8 sequence is invalid. Map fragments to private use area
-                 * codepoints in range:
-                 * 0x<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR>00 to
-                 * 0x<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR>FF
-                 */
-                for (b = 0; b < utf_sequence_length; ++b) {
+                --i;
+
+                if (is_valid) {
+                    if (available < utf_sequence_length) {
+                        return FLB_FALSE;  // Not enough space
+                    }
+
+                    encoded_to_buf(p, tmp, utf_sequence_length);
+                    p += utf_sequence_length;
+                    offset += utf_sequence_length;
+                    available -= utf_sequence_length;
+                }
+                else {
+                    if (available < utf_sequence_length * 3) {
+                        return FLB_FALSE;
+                    }
+
                     /*
-                     * Utf-8 private block invalid hex mapping. Format unicode charpoint
-                     * in the following format:
-                     *
-                     *      +--------+--------+--------+
-                     *      |1110PPPP|10PPPPHH|10HHHHHH|
-                     *      +--------+--------+--------+
-                     *
-                     * Where:
-                     *   P is FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR bits (1 byte)
-                     *   H is Utf-8 fragment hex bits (1 byte)
-                     *   1 is bit 1
-                     *   0 is bit 0
-                     */
-
-                    /* unicode codepoint start */
-                    *p = 0xE0;
-
-                    /* print unicode private block header first 4 bits */
-                    *p |= FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR >> 4;
-                    ++p;
-
-                    /* unicode codepoint middle */
-                    *p = 0x80;
-
-                    /* print end of unicode private block header last 4 bits */
-                    *p |= ((FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR << 2) & 0x3f);
-
-                    /* print hex fragment first 2 bits */
-                    *p |= (tmp[b] >> 6) & 0x03;
-                    ++p;
-
-                    /* unicode codepoint middle */
-                    *p = 0x80;
-
-                    /* print hex fragment last 6 bits */
-                    *p |= tmp[b] & 0x3f;
-                    ++p;
+                    * Utf-8 sequence is invalid. Map fragments to private use area
+                    * codepoints in range:
+                    * 0x<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR>00 to
+                    * 0x<FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR>FF
+                    */
+                    for (b = 0; b < utf_sequence_length; ++b) {
+                        /*
+                        * Utf-8 private block invalid hex mapping. Format unicode charpoint
+                        * in the following format:
+                        *
+                        *      +--------+--------+--------+
+                        *      |1110PPPP|10PPPPHH|10HHHHHH|
+                        *      +--------+--------+--------+
+                        *
+                        * Where:
+                        *   P is FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR bits (1 byte)
+                        *   H is Utf-8 fragment hex bits (1 byte)
+                        *   1 is bit 1
+                        *   0 is bit 0
+                        */
+
+                        /* unicode codepoint start */
+                        *p = 0xE0;
+
+                        /* print unicode private block header first 4 bits */
+                        *p |= FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR >> 4;
+                        ++p;
+
+                        /* unicode codepoint middle */
+                        *p = 0x80;
+
+                        /* print end of unicode private block header last 4 bits */
+                        *p |= ((FLB_UTILS_FRAGMENT_PRIVATE_BLOCK_DESCRIPTOR << 2) & 0x3f);
+
+                        /* print hex fragment first 2 bits */
+                        *p |= (tmp[b] >> 6) & 0x03;
+                        ++p;
+
+                        /* unicode codepoint middle */
+                        *p = 0x80;
+
+                        /* print hex fragment last 6 bits */
+                        *p |= tmp[b] & 0x3f;
+                        ++p;
+
+                        offset += 3;
+                        available -= 3;
+                    }
                 }
             }
+            else {
+                if (available < 1) {
+                    return FLB_FALSE;  // No space for a single byte
+                }
+                *p++ = c;
+                offset++;
+                available--;
+            }
+
+            i++;
         }
-        else {
-            *p++ = c;
-        }
-        written = (p - (buf + *off));
+
+        copypos = i;
     }
 
-    *off += written;
+done:
+    *off += offset;  // Update the buffer offset
 
     return FLB_TRUE;
 }

From 34f365ccd528ab8eb49eb699e8d15e5a5554dad5 Mon Sep 17 00:00:00 2001
From: Eduardo Silva <eduardo@calyptia.com>
Date: Sat, 19 Oct 2024 16:38:05 -0600
Subject: [PATCH 4/8] utils: code cleanup and fix comments

Signed-off-by: Eduardo Silva <eduardo@calyptia.com>
---
 src/flb_utils.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/flb_utils.c b/src/flb_utils.c
index 80cb3c6252d..211632260e3 100644
--- a/src/flb_utils.c
+++ b/src/flb_utils.c
@@ -787,7 +787,6 @@ static const struct escape_seq json_escape_table[128] = {
  * representation.
  */
 
-
 int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_t str_len)
 {
     int i, b, ret, len, hex_bytes, utf_sequence_length, utf_sequence_number;
@@ -808,8 +807,9 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
     }
 
     p = buf + *off;
-    vlen = str_len & ~(sizeof(flb_vector8) - 1);  // Align length for SIMD
 
+    /* align length to the nearest multiple of the vector size for safe SIMD processing */
+    vlen = str_len & ~(sizeof(flb_vector8) - 1);
     for (i = 0;;) {
         /* SIMD optimization: Process chunk of input string */
         for (; i < vlen; i += sizeof(flb_vector8)) {
@@ -821,14 +821,9 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
              * if they are found we break the loop and escape them
              * in a char-by-char basis. Otherwise the do a bulk copy
              */
-            if (flb_vector8_has_le(chunk, (unsigned char)0x1F) ||
-                flb_vector8_has(chunk, (unsigned char)'"') ||
-                flb_vector8_has(chunk, (unsigned char)'\\') ||
-                flb_vector8_has(chunk, (unsigned char)'\n') ||
-                flb_vector8_has(chunk, (unsigned char)'\r') ||
-                flb_vector8_has(chunk, (unsigned char)'\t') ||
-                flb_vector8_has(chunk, (unsigned char)'\b') ||
-                flb_vector8_has(chunk, (unsigned char)'\f')) {
+            if (flb_vector8_has_le(chunk, (unsigned char) 0x1F) ||
+                flb_vector8_has(chunk, (unsigned char)    '"') ||
+                flb_vector8_has(chunk, (unsigned char)    '\\')) {
                 break;
             }
         }
@@ -850,7 +845,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
 
         /* Process remaining characters one by one */
         for (b = 0; b < sizeof(flb_vector8); b++) {
-            if (i == str_len) {
+            if (i >= str_len) {
                 /* all characters has been processed */
                 goto done;
             }
@@ -906,7 +901,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
                     continue;
                 }
 
-                // Decode UTF-8 sequence
+                /* decode UTF-8 sequence */
                 state = FLB_UTF8_ACCEPT;
                 codepoint = 0;
 
@@ -922,6 +917,7 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
                     flb_warn("[pack] Invalid UTF-8 bytes found, skipping.");
                 }
                 else {
+
                     len = snprintf(tmp, sizeof(tmp), "\\u%.4x", codepoint);
                     if (available < len) {
                         return FLB_FALSE;  // Not enough space
@@ -1042,7 +1038,8 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
             }
             else {
                 if (available < 1) {
-                    return FLB_FALSE;  // No space for a single byte
+                    /*  no space for a single byte */
+                    return FLB_FALSE;
                 }
                 *p++ = c;
                 offset++;
@@ -1056,7 +1053,8 @@ int flb_utils_write_str(char *buf, int *off, size_t size, const char *str, size_
     }
 
 done:
-    *off += offset;  // Update the buffer offset
+    /* update the buffer offset */
+    *off += offset;
 
     return FLB_TRUE;
 }

From 173ddae45ffffa9251f7e803d415724d5f29d9f8 Mon Sep 17 00:00:00 2001
From: Eduardo Silva <eduardo@calyptia.com>
Date: Sat, 19 Oct 2024 16:38:41 -0600
Subject: [PATCH 5/8] simd: use new defs and fix fallback mechanism

Signed-off-by: Eduardo Silva <eduardo@calyptia.com>
---
 include/fluent-bit/flb_simd.h | 70 ++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 29 deletions(-)

diff --git a/include/fluent-bit/flb_simd.h b/include/fluent-bit/flb_simd.h
index 0f66d5e1bed..9ae53b115fe 100644
--- a/include/fluent-bit/flb_simd.h
+++ b/include/fluent-bit/flb_simd.h
@@ -23,8 +23,11 @@
 #include <stdint.h>
 #include <stdbool.h>
 
+#include <fluent-bit/flb_info.h>
+#define UINT64CONST(x) (x##ULL)
+
 /* Only enable SIMD support if it has not been explicity disabled */
-#ifndef FLB_SIMD_DISABLED
+#ifdef FLB_HAVE_SIMD
 
 #if (defined(__x86_64__) || defined(_M_AMD64))
 /*
@@ -38,6 +41,7 @@
  */
 #include <emmintrin.h>
 #define FLB_SIMD_SSE2
+
 typedef __m128i flb_vector8;
 typedef __m128i flb_vector32;
 
@@ -65,11 +69,14 @@ typedef uint32x4_t flb_vector32;
  * two 32-bit integers using a uint64.
  */
 #define FLB_SIMD_NONE
-typedef uint64 flb_vector8;
+typedef uint64_t flb_vector8;
 #endif
 
 #else
 #define FLB_SIMD_NONE
+
+/* Original code aims to handle this as a uint64_t to search */
+typedef uint8_t flb_vector8;
 #endif /* FLB_SIMD_DISABLED */
 
 /* element-wise comparisons to a scalar */
@@ -88,7 +95,7 @@ static inline void flb_vector8_load(flb_vector8 *v, const uint8_t *s)
 #elif defined(FLB_SIMD_NEON)
 	*v = vld1q_u8(s);
 #else
-	memcpy(v, s, sizeof(flb_vector8));
+	memset(v, 0, sizeof(flb_vector8));
 #endif
 }
 
@@ -124,7 +131,7 @@ static inline flb_vector8 flb_vector8_ssub(const flb_vector8 v1, const flb_vecto
 	return vqsubq_u8(v1, v2);
 #endif
 }
-#endif							/* ! USE_NO_SIMD */
+#endif /* ! FLB_SIMD_NONE */
 
 /*
  * Return a vector with all bits set in each lane where the corresponding
@@ -139,7 +146,7 @@ static inline flb_vector8 flb_vector8_eq(const flb_vector8 v1, const flb_vector8
 	return vceqq_u8(v1, v2);
 #endif
 }
-#endif							/* ! USE_NO_SIMD */
+#endif /* ! FLB_SIMD_NONE */
 
 #ifndef FLB_SIMD_NONE
 static inline flb_vector32 flb_vector32_eq(const flb_vector32 v1, const flb_vector32 v2)
@@ -150,7 +157,7 @@ static inline flb_vector32 flb_vector32_eq(const flb_vector32 v1, const flb_vect
 	return vceqq_u32(v1, v2);
 #endif
 }
-#endif
+#endif /* ! FLB_SIMD_NONE */
 
 /*
  * Create a vector with all elements set to the same value.
@@ -185,11 +192,10 @@ static inline bool flb_vector8_is_highbit_set(const flb_vector8 v)
  */
 static inline bool flb_vector8_has(const flb_vector8 v, const uint8_t c)
 {
-	bool result;
+	bool result = false;
 
 #if defined(FLB_SIMD_NONE)
-	/* any bytes in v equal to c will evaluate to zero via XOR */
-	result = flb_vector8_has_zero(v ^ flb_vector8_broadcast(c));
+	 return flb_vector8_has_zero(v ^ flb_vector8_broadcast(c));
 #else
 	result = flb_vector8_is_highbit_set(flb_vector8_eq(v, flb_vector8_broadcast(c)));
 #endif
@@ -202,26 +208,28 @@ static inline bool flb_vector8_has_le(const flb_vector8 v, const uint8_t c)
 	bool result = false;
 
 #if defined(FLB_SIMD_NONE)
-
 	/*
-	 * To find bytes <= c, we can use bitwise operations to find bytes < c+1,
-	 * but it only works if c+1 <= 128 and if the highest bit in v is not set.
-	 * Adapted from
-	 * https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
+	 *  To find bytes <= c, we can use bitwise operations to find bytes < c+1,
+	 *  but it only works if c+1 <= 128 and if the highest bit in v is not set.
+	 *
+	 *  Adapted from
+	 *
+	 *    https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
 	 */
-	if ((int64) v >= 0 && c < 0x80)
+	if ((int64_t) v >= 0 && c < 0x80) {
 		result = (v - flb_vector8_broadcast(c + 1)) & ~v & flb_vector8_broadcast(0x80);
+	}
 	else {
-		/* one byte at a time */
-        int i;
-		for (i = 0; i < sizeof(flb_vector8); i++)
-		{
-			if (((const uint8 *) &v)[i] <= c) {
+		size_t i;
+			for (i = 0; i < sizeof(flb_vector8); i++) {
+			if (((const uint8_t *) &v)[i] <= c) {
 				result = true;
 				break;
 			}
 		}
 	}
+
+	return result;
 #else
 	/*
 	 * Use saturating subtraction to find bytes <= c, which will present as
@@ -236,15 +244,19 @@ static inline bool flb_vector8_has_le(const flb_vector8 v, const uint8_t c)
 
 static inline char *flb_simd_info()
 {
-    #if defined(FLB_SIMD_SSE2)
-        return "SSE2";
-    #elif defined(FLB_SIMD_NEON)
-        return "NEON";
-    #elif defined(FLB_SIMD_NONE)
-        return "none";
-    #elif defined(FLB_SIMD_DISABLED)
-        return "none (disabled)";
+	#ifdef FLB_HAVE_SIMD
+		#if defined(FLB_SIMD_SSE2)
+			return "SSE2";
+		#elif defined(FLB_SIMD_NEON)
+			return "NEON";
+		#elif defined(FLB_SIMD_NONE)
+			return "none";
+		#else
+			return "unknown";
+		#endif
+	#else
+        return "disabled";
     #endif
 }
 
-#endif /* FLB_SIMD_H */
+#endif /* FLB_HAVE_SIMD */

From 0a2bd90337bca07adb0043902b8c2f300176199d Mon Sep 17 00:00:00 2001
From: Eduardo Silva <eduardo@calyptia.com>
Date: Sat, 19 Oct 2024 16:43:21 -0600
Subject: [PATCH 6/8] build: add new build option FLB_SIMD (default: off)

Signed-off-by: Eduardo Silva <eduardo@calyptia.com>
---
 CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e6c5ac2613..eff73766e7d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -141,6 +141,7 @@ option(FLB_SIGNV4              "Enable AWS Signv4 support"    Yes)
 option(FLB_AWS                 "Enable AWS support"           Yes)
 option(FLB_STATIC_CONF         "Build binary using static configuration")
 option(FLB_STREAM_PROCESSOR    "Enable Stream Processor"                    Yes)
+option(FLB_SIMD                "Enable SIMD support"                         No)
 option(FLB_CORO_STACK_SIZE     "Set coroutine stack size")
 option(FLB_AVRO_ENCODER        "Build with Avro encoding support"            No)
 option(FLB_AWS_ERROR_REPORTER  "Build with aws error reporting support"      No)
@@ -181,6 +182,10 @@ option(FLB_EVENT_LOOP_KQUEUE    "Enable kqueue(2) event loop backend" No)
 option(FLB_EVENT_LOOP_SELECT    "Enable select(2) event loop backend" No)
 option(FLB_EVENT_LOOP_LIBEVENT  "Enable libevent event loop backend"  No)
 
+# SIMD support
+if(FLB_SIMD)
+  FLB_DEFINITION(FLB_HAVE_SIMD)
+endif()
 
 if(DEFINED FLB_NIGHTLY_BUILD AND NOT "${FLB_NIGHTLY_BUILD}" STREQUAL "")
   FLB_DEFINITION_VAL(FLB_NIGHTLY_BUILD ${FLB_NIGHTLY_BUILD})

From 2c341e56764a3cdd99dce681b2e60a3262609ba2 Mon Sep 17 00:00:00 2001
From: Eduardo Silva <eduardo@calyptia.com>
Date: Sat, 19 Oct 2024 16:43:43 -0600
Subject: [PATCH 7/8] bin: print SIMD status

Signed-off-by: Eduardo Silva <eduardo@calyptia.com>
---
 src/flb_engine.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/flb_engine.c b/src/flb_engine.c
index 911c1389a79..bedc28477c5 100644
--- a/src/flb_engine.c
+++ b/src/flb_engine.c
@@ -54,6 +54,7 @@
 #include <fluent-bit/flb_downstream.h>
 #include <fluent-bit/flb_ring_buffer.h>
 #include <fluent-bit/flb_notification.h>
+#include <fluent-bit/flb_simd.h>
 
 #ifdef FLB_HAVE_METRICS
 #include <fluent-bit/flb_metrics_exporter.h>
@@ -799,6 +800,9 @@ int flb_engine_start(struct flb_config *config)
         return -1;
     }
 
+    /* Internals */
+    flb_info("[simd    ] %s", flb_simd_info());
+
     /* Init Metrics engine */
     cmt_initialize();
     flb_info("[cmetrics] version=%s", cmt_version());

From 652ac2f046d4604c5844933a9d4f0b71094dab70 Mon Sep 17 00:00:00 2001
From: Eduardo Silva <eduardo@calyptia.com>
Date: Sat, 19 Oct 2024 16:44:14 -0600
Subject: [PATCH 8/8] workflows: build with FLB_SIMD=On|Off

Signed-off-by: Eduardo Silva <eduardo@calyptia.com>
---
 .github/workflows/unit-tests.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml
index 0fa50f11890..36d1f85d841 100644
--- a/.github/workflows/unit-tests.yaml
+++ b/.github/workflows/unit-tests.yaml
@@ -42,6 +42,8 @@ jobs:
           - "-DFLB_COVERAGE=On"
           - "-DFLB_SANITIZE_MEMORY=On"
           - "-DFLB_SANITIZE_THREAD=On"
+          - "-DFLB_SIMD=On"
+          - "-DFLB_SIMD=Off"
         compiler:
           - gcc
           - clang