From bf4bf7cc6b11b50b0dc39897e5050587dccfa28e Mon Sep 17 00:00:00 2001
From: "Node.js GitHub Bot" <github-bot@iojs.org>
Date: Sun, 18 Aug 2024 00:29:37 +0000
Subject: [PATCH] deps: update simdutf to 5.5.0

PR-URL: https://github.com/nodejs/node/pull/54434
Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com>
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
Reviewed-By: Marco Ippolito <marcoippolito54@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Michael Dawson <midawson@redhat.com>
---
 deps/simdutf/simdutf.cpp | 1757 ++++++++++++++------------------------
 deps/simdutf/simdutf.h   |  231 +++--
 2 files changed, 778 insertions(+), 1210 deletions(-)
diff --git a/deps/simdutf/simdutf.cpp b/deps/simdutf/simdutf.cpp
index 2e685aab872f93..b04ec9773e9215 100644
--- a/deps/simdutf/simdutf.cpp
+++ b/deps/simdutf/simdutf.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on 2024-08-09 09:52:29 -0400. Do not edit! */
+/* auto-generated on 2024-09-04 18:13:32 +0200. Do not edit! */
 /* begin file src/simdutf.cpp */
 #include "simdutf.h"
 // We include base64_tables once.
@@ -5643,7 +5643,7 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length, base64
       buffer[idx] = uint8_t(code);
       if (is_eight_byte(c) && code <= 63) {
         idx++;
-      } else if (code > 64 || !is_eight_byte(c)) {
+      } else if (code > 64 || !scalar::base64::is_eight_byte(c)) {
         return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
       } else {
         // We have a space or a newline. We ignore it.
@@ -5698,7 +5698,7 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length, base64
   }
 }
 
-// like base64_tail_decode, but it will not write past the end of the ouput buffer.
+// like base64_tail_decode, but it will not write past the end of the output buffer.
 // outlen is modified to reflect the number of bytes written.
 // This functions assumes that the padding (=) has been removed.
 template <class char_type>
@@ -5747,7 +5747,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
       buffer[idx] = uint8_t(code);
       if (is_eight_byte(c) && code <= 63) {
         idx++;
-      } else if (code > 64 || !is_eight_byte(c)) {
+      } else if (code > 64 || !scalar::base64::is_eight_byte(c)) {
         outlen = size_t(dst - dstinit);
         return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
       } else {
@@ -5908,6 +5908,99 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_optio
 
 #endif
 /* end file src/scalar/base64.h */
+/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
+#ifndef SIMDUTF_LATIN1_TO_UTF8_H
+#define SIMDUTF_LATIN1_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf8 {
+
+inline size_t convert(const char* buf, size_t len, char* utf8_output) {
+  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
+  size_t pos = 0;
+  size_t utf8_pos = 0;
+  while (pos < len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything
+      if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII
+        size_t final_pos = pos + 16;
+        while(pos < final_pos) {
+          utf8_output[utf8_pos++] = char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+
+    unsigned char byte = data[pos];
+    if((byte & 0x80) == 0) { // if ASCII
+      // will generate one UTF-8 bytes
+      utf8_output[utf8_pos++] = char(byte);
+      pos++;
+    } else {
+      // will generate two UTF-8 bytes
+      utf8_output[utf8_pos++] = char((byte>>6) | 0b11000000);
+      utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000);
+      pos++;
+    }
+  }
+  return utf8_pos;
+}
+
+inline size_t convert_safe(const char* buf, size_t len, char* utf8_output, size_t utf8_len) {
+  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
+  size_t pos = 0;
+  size_t skip_pos = 0;
+  size_t utf8_pos = 0;
+  while (pos < len && utf8_pos < utf8_len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos >= skip_pos && pos + 16 <= len && utf8_pos + 16 <= utf8_len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything
+      if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII
+        ::memcpy(utf8_output + utf8_pos, buf + pos, 16);
+        utf8_pos += 16;
+        pos += 16;
+      } else {
+				// At least one of the next 16 bytes are not ASCII, we will process them one by one
+        skip_pos = pos + 16;
+      }
+    } else {
+      const auto byte = data[pos];
+      if((byte & 0x80) == 0) { // if ASCII
+        // will generate one UTF-8 bytes
+        utf8_output[utf8_pos++] = char(byte);
+        pos++;
+      } else if (utf8_pos + 2 <= utf8_len) {
+        // will generate two UTF-8 bytes
+        utf8_output[utf8_pos++] = char((byte>>6) | 0b11000000);
+        utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000);
+        pos++;
+      } else {
+        break;
+      }
+    }
+  }
+  return utf8_pos;
+}
+
+} // latin1_to_utf8 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
 
 namespace simdutf {
 bool implementation::supported_by_runtime_system() const {
@@ -6074,7 +6167,7 @@ class detect_best_supported_implementation_on_first_use final : public implement
   }
 
   simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_latin1_to_utf8(buf, len,utf8_output);
+    return set_best()->convert_latin1_to_utf8(buf, len, utf8_output);
   }
 
   simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
@@ -7247,6 +7340,31 @@ simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, si
 }
 
 
+
+simdutf_warn_unused size_t convert_latin1_to_utf8_safe(const char * buf, size_t len, char* utf8_output, size_t utf8_len) noexcept {
+  const auto start{utf8_output};
+
+  while (true) {
+    // convert_latin1_to_utf8 will never write more than input length * 2
+    auto read_len = std::min(len, utf8_len >> 1);
+    if (read_len <= 16) {
+      break;
+    }
+
+    const auto write_len = simdutf::convert_latin1_to_utf8(buf, read_len, utf8_output);
+
+    utf8_output += write_len;
+    utf8_len -= write_len;
+    buf += read_len;
+    len -= read_len;
+  }
+
+  utf8_output += scalar::latin1_to_utf8::convert_safe(buf, len, utf8_output, utf8_len);
+
+  return utf8_output - start;
+}
+
+
 simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept {
   return base64_to_binary_safe_impl<char>(input, length, output, outlen, options);
 }
@@ -13578,59 +13696,6 @@ inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf
 #endif
 /* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
 
-/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
-#ifndef SIMDUTF_LATIN1_TO_UTF8_H
-#define SIMDUTF_LATIN1_TO_UTF8_H
-
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace latin1_to_utf8 {
-
-inline size_t convert(const char* buf, size_t len, char* utf8_output) {
-  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything
-      if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. all of them are zero, then everything is ASCII
-        size_t final_pos = pos + 16;
-        while(pos < final_pos) {
-          *utf8_output++ = char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-
-    unsigned char byte = data[pos];
-    if((byte & 0x80) == 0) { // if ASCII
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(byte);
-      pos++;
-    } else {
-      // will generate two UTF-8 bytes
-      *utf8_output++ = char((byte>>6) | 0b11000000);
-      *utf8_output++ = char((byte & 0b111111) | 0b10000000);
-      pos++;
-    }
-  }
-  return utf8_output - start;
-}
-
-} // latin1_to_utf8 namespace
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
-
-#endif
-/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
 /* begin file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
 #ifndef SIMDUTF_LATIN1_TO_UTF16_H
 #define SIMDUTF_LATIN1_TO_UTF16_H
@@ -14127,26 +14192,32 @@ inline size_t convert_valid(const char32_t *buf, size_t len, char *latin1_output
   size_t pos = 0;
 
   while (pos < len) {
-  utf32_char = (uint32_t)data[pos];
-
-  if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF00FFFFFF00) == 0) {
-      *latin1_output++ = char(buf[pos]);
-      *latin1_output++ = char(buf[pos+1]);
-      pos += 2;
-      continue;
-    }
-  }
-  *latin1_output++ = (char)(utf32_char & 0xFF);
-  pos++;
-
+      utf32_char = (uint32_t) data[pos];
+
+      if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1
+          uint64_t v;
+          ::memcpy(&v, data + pos, sizeof(uint64_t));
+          if ((v & 0xFFFFFF00FFFFFF00) == 0) {
+              *latin1_output++ = char(buf[pos]);
+              *latin1_output++ = char(buf[pos + 1]);
+              pos += 2;
+              continue;
+          } else {
+              // output can not be represented in latin1
+              return 0;
+          }
+      }
+      if ((utf32_char & 0xFFFFFF00) == 0) {
+          *latin1_output++ = char(utf32_char);
+      } else {
+          // output can not be represented in latin1
+          return 0;
+      }
+      pos++;
   }
   return latin1_output - start;
 }
 
-
 } // utf32_to_latin1 namespace
 } // unnamed namespace
 } // namespace scalar
@@ -14272,213 +14343,6 @@ simdutf_really_inline uint16x8_t convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in
   return composed;
 }
 
-/* begin file src/arm64/arm_detect_encodings.cpp */
-template<class checker>
-// len is known to be a multiple of 2 when this is called
-int arm_detect_encodings(const char * buf, size_t len) {
-    const char* start = buf;
-    const char* end = buf + len;
-
-    bool is_utf8 = true;
-    bool is_utf16 = true;
-    bool is_utf32 = true;
-
-    int out = 0;
-
-    const auto v_d8 = simd8<uint8_t>::splat(0xd8);
-    const auto v_f8 = simd8<uint8_t>::splat(0xf8);
-
-    uint32x4_t currentmax = vmovq_n_u32(0x0);
-
-    checker check{};
-
-    while(end - buf >= 64) {
-        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
-        uint16x8_t secondin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + simd16<uint16_t>::SIZE / sizeof(char16_t));
-        uint16x8_t thirdin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 2*simd16<uint16_t>::SIZE / sizeof(char16_t));
-        uint16x8_t fourthin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 3*simd16<uint16_t>::SIZE / sizeof(char16_t));
-
-        const auto u0 = simd16<uint16_t>(in);
-        const auto u1 = simd16<uint16_t>(secondin);
-        const auto u2 = simd16<uint16_t>(thirdin);
-        const auto u3 = simd16<uint16_t>(fourthin);
-
-        const auto v0 = u0.shr<8>();
-        const auto v1 = u1.shr<8>();
-        const auto v2 = u2.shr<8>();
-        const auto v3 = u3.shr<8>();
-
-        const auto in16 = simd16<uint16_t>::pack(v0, v1);
-        const auto nextin16 = simd16<uint16_t>::pack(v2, v3);
-
-        const uint64_t surrogates_wordmask0 = ((in16 & v_f8) == v_d8).to_bitmask64();
-        const uint64_t surrogates_wordmask1 = ((nextin16 & v_f8) == v_d8).to_bitmask64();
-
-        // Check for surrogates
-        if (surrogates_wordmask0 != 0 || surrogates_wordmask1 != 0) {
-            // Cannot be UTF8
-            is_utf8 = false;
-            // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates
-            // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
-            // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant
-            // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
-            // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units.
-
-            if (((surrogates_wordmask0 | surrogates_wordmask1) & 0xf0f0f0f0f0f0f0f0) != 0) {
-                is_utf32 = false;
-                // Code from arm_validate_utf16le.cpp
-                // Not efficient, we do not process surrogates_wordmask1
-                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
-                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
-
-                const auto v_fc = simd8<uint8_t>::splat(0xfc);
-                const auto v_dc = simd8<uint8_t>::splat(0xdc);
-
-                const uint64_t V0 = ~surrogates_wordmask0;
-
-                const auto vH0 = ((in16 & v_fc) ==  v_dc);
-                const uint64_t H0 = vH0.to_bitmask64();
-
-                const uint64_t L0 = ~H0 & surrogates_wordmask0;
-
-                const uint64_t a0 = L0 & (H0 >> 4);
-
-                const uint64_t b0 = a0 << 4;
-
-                const uint64_t c0 = V0 | a0 | b0;
-                if (c0 == ~0ull) {
-                    input += 16;
-                } else if (c0 == 0xfffffffffffffffull) {
-                    input += 15;
-                } else {
-                    is_utf16 = false;
-                    break;
-                }
-
-                while (input + 16 < end16) {
-                    const auto in0 = simd16<uint16_t>(input);
-                    const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
-                    const auto t0 = in0.shr<8>();
-                    const auto t1 = in1.shr<8>();
-                    const simd8<uint8_t> in_16 = simd16<uint16_t>::pack(t0, t1);
-
-                    const uint64_t surrogates_wordmask = ((in_16 & v_f8) == v_d8).to_bitmask64();
-                    if(surrogates_wordmask == 0) {
-                        input += 16;
-                    } else {
-                        const uint64_t V = ~surrogates_wordmask;
-
-                        const auto vH = ((in_16 & v_fc) ==  v_dc);
-                        const uint64_t H = vH.to_bitmask64();
-
-                        const uint64_t L = ~H & surrogates_wordmask;
-
-                        const uint64_t a = L & (H >> 4);
-
-                        const uint64_t b = a << 4;
-
-                        const uint64_t c = V | a | b;
-                        if (c == ~0ull) {
-                            input += 16;
-                        } else if (c == 0xfffffffffffffffull) {
-                            input += 15;
-                        } else {
-                            is_utf16 = false;
-                            break;
-                        }
-                    }
-                }
-            } else {
-                is_utf16 = false;
-                // Check for UTF-32
-                if (len % 4 == 0) {
-                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
-                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
-
-                    // Must start checking for surrogates
-                    uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
-                    const uint32x4_t offset = vmovq_n_u32(0xffff2000);
-                    const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
-
-                    const uint32x4_t in32 =  vreinterpretq_u32_u16(in);
-                    const uint32x4_t secondin32 =  vreinterpretq_u32_u16(secondin);
-                    const uint32x4_t thirdin32 =  vreinterpretq_u32_u16(thirdin);
-                    const uint32x4_t fourthin32 =  vreinterpretq_u32_u16(fourthin);
-
-                    currentmax = vmaxq_u32(in32,currentmax);
-                    currentmax = vmaxq_u32(secondin32,currentmax);
-                    currentmax = vmaxq_u32(thirdin32,currentmax);
-                    currentmax = vmaxq_u32(fourthin32,currentmax);
-
-                    currentoffsetmax = vmaxq_u32(vaddq_u32(in32, offset), currentoffsetmax);
-                    currentoffsetmax = vmaxq_u32(vaddq_u32(secondin32, offset), currentoffsetmax);
-                    currentoffsetmax = vmaxq_u32(vaddq_u32(thirdin32, offset), currentoffsetmax);
-                    currentoffsetmax = vmaxq_u32(vaddq_u32(fourthin32, offset), currentoffsetmax);
-
-                    while (input + 4 < end32) {
-                        const uint32x4_t in_32 = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
-                        currentmax = vmaxq_u32(in_32,currentmax);
-                        currentoffsetmax = vmaxq_u32(vaddq_u32(in_32, offset), currentoffsetmax);
-                        input += 4;
-                    }
-
-                    uint32x4_t forbidden_words = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-                    if(vmaxvq_u32(forbidden_words) != 0) {
-                        is_utf32 = false;
-                    }
-                } else {
-                    is_utf32 = false;
-                }
-            }
-            break;
-        }
-        // If no surrogate, validate under other encodings as well
-
-        // UTF-32 validation
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax);
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax);
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax);
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin),currentmax);
-
-        // UTF-8 validation
-        // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
-        simd::simd8x64<uint8_t> in8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(secondin), vreinterpretq_u8_u16(thirdin), vreinterpretq_u8_u16(fourthin));
-        check.check_next_input(in8);
-
-        buf += 64;
-    }
-
-    // Check which encodings are possible
-
-    if (is_utf8) {
-        if (static_cast<size_t>(buf - start) != len) {
-            uint8_t block[64]{};
-            std::memset(block, 0x20, 64);
-            std::memcpy(block, buf, len - (buf - start));
-            simd::simd8x64<uint8_t> in(block);
-            check.check_next_input(in);
-        }
-        if (!check.errors()) {
-            out |= simdutf::encoding_type::UTF8;
-        }
-    }
-
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
-        out |= simdutf::encoding_type::UTF16_LE;
-    }
-
-    if (is_utf32 && (len % 4 == 0)) {
-        const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
-        uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
-        if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
-            out |= simdutf::encoding_type::UTF32_LE;
-        }
-    }
-
-    return out;
-}
-/* end file src/arm64/arm_detect_encodings.cpp */
-
 /* begin file src/arm64/arm_validate_utf16.cpp */
 template <endianness big_endian>
 const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
@@ -14985,6 +14849,12 @@ size_t convert_masked_utf8_to_utf16(const char *input,
     // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
     // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
     uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+    // added to fix issue https://github.com/simdutf/simdutf/issues/514
+    // We only want to write 2 * 16-bit code units when that is actually what we have.
+    // Unfortunately, we cannot trust the input. So it is possible to get 0xff as an input byte
+    // and it should not result in a surrogate pair. We need to check for that.
+    uint32_t permbuffer[4];
+    vst1q_u32(permbuffer, perm);
     // Mask the low and middle bytes
     // 00000000 00000000 00000000 0ddddddd
     uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f));
@@ -15040,11 +14910,14 @@ size_t convert_masked_utf8_to_utf16(const char *input,
     // Attempting to shuffle and store would be complex, just scalarize.
     uint32_t buffer[4];
     vst1q_u32(buffer, selected);
-    // Test for the top bit of the surrogate mask.
-    const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : 0x00800000;
+    // Test for the top bit of the surrogate mask. Remove due to issue 514
+    // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : 0x00800000;
     for (size_t i = 0; i < 3; i++) {
       // Surrogate
-      if (buffer[i] & SURROGATE_MASK) {
+      // Used to be if (buffer[i] & SURROGATE_MASK) {
+      // See discussion above.
+      // patch for issue https://github.com/simdutf/simdutf/issues/514
+      if((permbuffer[i] & 0xf8000000) == 0xf0000000) {
         utf16_output[0] = uint16_t(buffer[i] >> 16);
         utf16_output[1] = uint16_t(buffer[i] & 0xFFFF);
         utf16_output += 2;
@@ -15083,14 +14956,14 @@ size_t convert_masked_utf8_to_utf32(const char *input,
   // This results in more instructions but, potentially, also higher speeds.
   //
   // We first try a few fast paths.
-  if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
-    // We process in chunks of 16 bytes.
+  if(utf8_end_of_code_point_mask == 0xfff) {
+    // We process in chunks of 12 bytes.
     // use fast implementation in src/simdutf/arm64/simd.h
     // Ideally the compiler can keep the tables in registers.
     simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
     temp.store_ascii_as_utf32_tbl(utf32_out);
-    utf32_output += 16; // We wrote 16 32-bit characters.
-    return 16; // We consumed 16 bytes.
+    utf32_output += 12; // We wrote 12 32-bit characters.
+    return 12; // We consumed 12 bytes.
   }
   if(input_utf8_end_of_code_point_mask == 0x924) {
     // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte UTF-32 code units.
@@ -15259,11 +15132,11 @@ size_t convert_masked_utf8_to_latin1(const char *input,
 
   // We first try a few fast paths.
   // The obvious first test is ASCII, which actually consumes the full 16.
-  if((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) {
-    // We process in chunks of 16 bytes
+  if(utf8_end_of_code_point_mask == 0xfff) {
+    // We process in chunks of 12 bytes
     vst1q_u8(reinterpret_cast<uint8_t*>(latin1_output), in);
-    latin1_output += 16; // We wrote 16 18-bit characters.
-    return 16; // We consumed 16 bytes.
+    latin1_output += 12; // We wrote 12 18-bit characters.
+    return 12; // We consumed 12 bytes.
   }
   /// We do not have a fast path available, or the fast path is unimportant, so we fallback.
   const uint8_t idx =
@@ -16832,6 +16705,7 @@ static inline void compress(uint8x16_t data, uint16_t mask, char *output) {
 struct block64 {
   uint8x16_t chunks[4];
 };
+
 static_assert(sizeof(block64) == 64, "block64 is not 64 bytes");
 template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
   uint8x16_t v0f = vdupq_n_u8(0xf);
@@ -16853,6 +16727,7 @@ template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
   uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f);
   uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f);
   uint8x16_t lo_nibbles3 = vandq_u8(b->chunks[3], v0f);
+
   // Needed by the decoding step.
   uint8x16_t hi_nibbles0 = vshrq_n_u8(b->chunks[0], 4);
   uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4);
@@ -16862,20 +16737,16 @@ template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
   if (base64_url) {
     lut_lo =
-        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                                0x70, 0x61, 0xe1, 0xf4, 0xf5, 0xa5, 0xf4, 0xf4);
+        simdutf_make_uint8x16_t(0x3a,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x61,0xe1,0xf4,0xe5,0xa5,0xf4,0xf4);
   } else {
     lut_lo =
-        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                                0x70, 0x61, 0xe1, 0xb4, 0xf5, 0xe5, 0xf4, 0xb4);
+        simdutf_make_uint8x16_t(0x3a,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x61,0xe1,0xb4,0xe5,0xe5,0xf4,0xb4);
   }
 #else
   if (base64_url) {
-    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-              0x70, 0x61, 0xe1, 0xf4, 0xf5, 0xa5, 0xf4, 0xf4};
+    lut_lo = uint8x16_t{0x3a,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x61,0xe1,0xf4,0xe5,0xa5,0xf4,0xf4};
   } else {
-    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-              0x70, 0x61, 0xe1, 0xb4, 0xf5, 0xe5, 0xf4, 0xb4};
+    lut_lo = uint8x16_t{0x3a,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x61,0xe1,0xb4,0xe5,0xe5,0xf4,0xb4};
   }
 #endif
   uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0);
@@ -17047,19 +16918,22 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen,
                               base64_options options) {
   const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
                                         : tables::base64::to_base64_value;
+  size_t equallocation = srclen; // location of the first padding character if any
   // skip trailing spaces
-  while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) {
+  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) {
     srclen--;
   }
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
+    equallocation = srclen - 1;
     srclen--;
     equalsigns = 1;
     // skip trailing spaces
-    while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) {
+    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) {
       srclen--;
     }
     if (srclen > 0 && src[srclen - 1] == '=') {
+      equallocation = srclen - 1;
       srclen--;
       equalsigns = 2;
     }
@@ -17079,21 +16953,23 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen,
       src += 64;
       bool error = false;
       uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
-      if(badcharmask)
-      if (error) {
-        src -= 64;
-
-        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
-          src++;
+      if(badcharmask){
+        if (error) {
+          src -= 64;
+          while (src < srcend && scalar::base64::is_eight_byte(*src) && to_base64[uint8_t(*src)] <= 64) {
+            src++;
+          }
+          if(src < srcend){
+            // should never happen
+          }
+          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
-        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
       }
 
       if (badcharmask != 0) {
         // optimization opportunity: check for simple masks like those made of
         // continuous 1s followed by continuous 0s. And masks containing a
         // single bad character.
-
         bufferptr += compress_block(&b, badcharmask, bufferptr);
       } else {
         // optimization opportunity: if bufferptr == buffer and mask == 0, we
@@ -17120,7 +16996,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen,
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
       uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
-      if (val > 64) {
+      if (!scalar::base64::is_eight_byte(*src) || val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
       }
       bufferptr += (val <= 63);
@@ -17163,7 +17039,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen,
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
         uint8_t val = to_base64[uint8_t(*src)];
-        if (val > 64) {
+        if (!scalar::base64::is_eight_byte(*src) || val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
         buffer_start[leftover] = char(val);
@@ -17215,13 +17091,14 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen,
       // additional checks
       if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
         r.error = error_code::INVALID_BASE64_CHARACTER;
+        r.count = equallocation;
       }
     }
     return r;
   }
   if(equalsigns > 0) {
     if((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, size_t(dst - dstinit)};
+      return {INVALID_BASE64_CHARACTER, equallocation};
     }
   }
   return {SUCCESS, size_t(dst - dstinit)};
@@ -17879,6 +17756,9 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if(utf8_continuation_mask & 1) {
+            return 0; // error
+          }
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -17953,14 +17833,14 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
-          if (errors()) {
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if (errors() || (utf8_continuation_mask & 1)) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
           }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -18198,10 +18078,10 @@ using namespace simd;
       size_t pos = 0;
       char32_t* start{utf32_output};
       // In the worst case, we have the haswell kernel which can cause an overflow of
-      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+      // 8 words when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
       // much more than 8 bytes. However, you cannot generally assume that you have valid
-      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+      // UTF-8 input, so we are going to go back from the end counting 16 leading bytes,
       // to give us a good margin.
       size_t leading_byte = 0;
       size_t margin = size;
@@ -18231,6 +18111,9 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if(utf8_continuation_mask & 1) {
+            return 0; // we have an error
+          }
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -18304,12 +18187,12 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
-          if (errors()) {
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if (errors() || (utf8_continuation_mask & 1)) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
           }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -18858,15 +18741,16 @@ simdutf_warn_unused int implementation::detect_encodings(const char * input, siz
   // If there is a BOM, then we trust it.
   auto bom_encoding = simdutf::BOM::check_bom(input, length);
   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  if (length % 2 == 0) {
-    return arm_detect_encodings<utf8_validation::utf8_checker>(input, length);
-  } else {
-    if (implementation::validate_utf8(input, length)) {
-      return simdutf::encoding_type::UTF8;
-    } else {
-      return simdutf::encoding_type::unspecified;
-    }
+  // todo: reimplement as a one-pass algorithm.
+  int out = 0;
+  if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
+  if((length % 2) == 0) {
+    if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
+  }
+  if((length % 4) == 0) {
+    if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
   }
+  return out;
 }
 
 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
@@ -18886,6 +18770,10 @@ simdutf_warn_unused result implementation::validate_ascii_with_errors(const char
 }
 
 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid. protected the implementation from nullptr.
+    return true;
+  }
   const char16_t* tail = arm_validate_utf16<endianness::LITTLE>(buf, len);
   if (tail) {
     return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
@@ -18895,7 +18783,11 @@ simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, s
 }
 
 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = arm_validate_utf16<endianness::BIG>(buf, len);
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid. protected the implementation from nullptr.
+    return true;
+  }
+  const char16_t *tail = arm_validate_utf16<endianness::BIG>(buf, len);
   if (tail) {
     return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
   } else {
@@ -18930,6 +18822,10 @@ simdutf_warn_unused result implementation::validate_utf16be_with_errors(const ch
 }
 
 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid. protected the implementation from nullptr.
+    return true;
+  }
   const char32_t* tail = arm_validate_utf32le(buf, len);
   if (tail) {
     return scalar::utf32::validate(tail, len - (tail - buf));
@@ -19596,6 +19492,8 @@ size_t implementation::binary_to_base64(const char * input, size_t length, char*
 
 
 
+#include <cstdint>
+#include <cstring>
 
 namespace simdutf {
 namespace fallback {
@@ -19604,6 +19502,7 @@ simdutf_warn_unused int implementation::detect_encodings(const char * input, siz
   // If there is a BOM, then we trust it.
   auto bom_encoding = simdutf::BOM::check_bom(input, length);
   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
+  // todo: reimplement as a one-pass algorithm.
   int out = 0;
   if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
   if((length % 2) == 0) {
@@ -19612,7 +19511,6 @@ simdutf_warn_unused int implementation::detect_encodings(const char * input, siz
   if((length % 4) == 0) {
     if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
   }
-
   return out;
 }
 
@@ -19657,7 +19555,7 @@ simdutf_warn_unused result implementation::validate_utf32_with_errors(const char
 }
 
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::latin1_to_utf8::convert(buf,len,utf8_output);
+  return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
@@ -19669,7 +19567,7 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char*
 }
 
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::latin1_to_utf32::convert(buf,len,utf32_output);
+  return scalar::latin1_to_utf32::convert(buf, len, utf32_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept {
@@ -19870,7 +19768,31 @@ simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t lengt
 }
 
 simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept {
-  return scalar::latin1::utf8_length_from_latin1(input,length);
+  size_t answer = length;
+  size_t i = 0;
+  auto pop = [](uint64_t v) {
+    return (size_t)(((v>>7) & UINT64_C(0x0101010101010101)) * UINT64_C(0x0101010101010101) >> 56);
+  };
+  for(; i + 32 <= length; i += 32) {
+    uint64_t v;
+    memcpy(&v, input + i, 8);
+    answer += pop(v);
+    memcpy(&v, input + i + 8, sizeof(v));
+    answer += pop(v);
+    memcpy(&v, input + i + 16, sizeof(v));
+    answer += pop(v);
+    memcpy(&v, input + i + 24, sizeof(v));
+    answer += pop(v);
+  }
+  for(; i + 8 <= length; i += 8) {
+    uint64_t v;
+    memcpy(&v, input + i, sizeof(v));
+    answer += pop(v);
+  }
+  for(; i + 1 <= length; i += 1) {
+      answer += static_cast<uint8_t>(input[i]) >> 7;
+  }
+  return answer;
 }
 
 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
@@ -19924,12 +19846,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   size_t equallocation = length; // location of the first padding character if any
   size_t equalsigns = 0;
   if(length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
     length -= 1;
     equalsigns++;
     while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) {
       length--;
     }
     if(length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
       equalsigns++;
       length -= 1;
     }
@@ -19961,12 +19885,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char16_t * inp
   size_t equallocation = length; // location of the first padding character if any
   size_t equalsigns = 0;
   if(length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
     length -= 1;
     equalsigns++;
     while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) {
       length--;
     }
     if(length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
       equalsigns++;
       length -= 1;
     }
@@ -23078,19 +23004,22 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                               base64_options options) {
   const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
                                         : tables::base64::to_base64_value;
+  size_t equallocation = srclen; // location of the first padding character if any
   size_t equalsigns = 0;
   // skip trailing spaces
-  while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) {
+  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) {
     srclen--;
   }
   if (srclen > 0 && src[srclen - 1] == '=') {
+    equallocation = srclen - 1;
     srclen--;
     equalsigns = 1;
     // skip trailing spaces
-    while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) {
+    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) {
       srclen--;
     }
     if (srclen > 0 && src[srclen - 1] == '=') {
+      equallocation = srclen - 1;
       srclen--;
       equalsigns = 2;
     }
@@ -23113,7 +23042,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
       uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
+        while (src < srcend && scalar::base64::is_eight_byte(*src) && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -23151,7 +23080,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
       uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
-      if (val > 64) {
+      if (!scalar::base64::is_eight_byte(*src) || val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
       }
       bufferptr += (val <= 63);
@@ -23192,7 +23121,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
         uint8_t val = to_base64[uint8_t(*src)];
-        if (val > 64) {
+        if (!scalar::base64::is_eight_byte(*src) || val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
         buffer_start[leftover] = char(val);
@@ -23244,13 +23173,14 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
       // additional checks
       if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
         r.error = error_code::INVALID_BASE64_CHARACTER;
+        r.count = equallocation;
       }
     }
     return r;
   }
   if(equalsigns > 0) {
     if((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, size_t(dst - dstinit)};
+      return {INVALID_BASE64_CHARACTER, equallocation};
     }
   }
   return {SUCCESS, size_t(dst - dstinit)};
@@ -23272,136 +23202,17 @@ implementation::detect_encodings(const char *input,
                                  size_t length) const noexcept {
   // If there is a BOM, then we trust it.
   auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  // todo: convert to a one-pass algorithm
   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  if (length % 2 == 0) {
-    const char *buf = input;
-
-    const char *start = buf;
-    const char *end = input + length;
-
-    bool is_utf8 = true;
-    bool is_utf16 = true;
-    bool is_utf32 = true;
-
-    int out = 0;
-
-    avx512_utf8_checker checker{};
-    __m512i currentmax = _mm512_setzero_si512();
-    while (end - buf >= 64) {
-      __m512i in = _mm512_loadu_si512((__m512i *)buf);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates =
-          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if (surrogates) {
-        is_utf8 = false;
-
-        // Can still be either UTF-16LE or UTF-32 depending on the positions
-        // of the surrogates To be valid UTF-32, a surrogate cannot be in the
-        // two most significant bytes of any 32-bit word. On the other hand, to
-        // be valid UTF-16LE, at least one surrogate must be in the two most
-        // significant bytes of a 32-bit word since they always come in pairs in
-        // UTF-16LE. Note that we always proceed in multiple of 4 before this
-        // point so there is no offset in 32-bit code units.
-
-        if ((surrogates & 0xaaaaaaaa) != 0) {
-          is_utf32 = false;
-          __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(
-              diff, _mm512_set1_epi16(uint16_t(0x0400)));
-          __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-          // high must be followed by low
-          if ((highsurrogates << 1) != lowsurrogates) {
-            return simdutf::encoding_type::unspecified;
-          }
-
-          bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-          if (ends_with_high) {
-            buf +=
-                31 *
-                sizeof(char16_t); // advance only by 31 code units so that we start
-                                  // with the high surrogate on the next round.
-          } else {
-            buf += 32 * sizeof(char16_t);
-          }
-          is_utf16 = validate_utf16le(reinterpret_cast<const char16_t *>(buf),
-                                      (end - buf) / sizeof(char16_t));
-          if (!is_utf16) {
-            return simdutf::encoding_type::unspecified;
-
-          } else {
-            return simdutf::encoding_type::UTF16_LE;
-          }
-
-        } else {
-          is_utf16 = false;
-          // Check for UTF-32
-          if (length % 4 == 0) {
-            const char32_t *input32 = reinterpret_cast<const char32_t *>(buf);
-            const char32_t *end32 =
-                reinterpret_cast<const char32_t *>(start) + length / 4;
-            if (validate_utf32(input32, end32 - input32)) {
-              return simdutf::encoding_type::UTF32_LE;
-            }
-          }
-          return simdutf::encoding_type::unspecified;
-        }
-      }
-      // If no surrogate, validate under other encodings as well
-
-      // UTF-32 validation
-      currentmax = _mm512_max_epu32(in, currentmax);
-
-      // UTF-8 validation
-      checker.check_next_input(in);
-
-      buf += 64;
-    }
-
-    // Check which encodings are possible
-
-    if (is_utf8) {
-      size_t current_length = static_cast<size_t>(buf - start);
-      if (current_length != length) {
-                const __m512i utf8 = _mm512_maskz_loadu_epi8(
-            (UINT64_C(1) << (length - current_length)) - 1, (const __m512i *)buf);
-        checker.check_next_input(utf8);
-      }
-      checker.check_eof();
-      if (!checker.errors()) {
-        out |= simdutf::encoding_type::UTF8;
-      }
-    }
-
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(
-                        reinterpret_cast<const char16_t *>(buf),
-                        (length - (buf - start)) / 2)) {
-      out |= simdutf::encoding_type::UTF16_LE;
-    }
-
-    if (is_utf32 && (length % 4 == 0)) {
-      size_t leftover = length - static_cast<size_t>(buf - start);
-      currentmax = _mm512_max_epu32(
-          _mm512_maskz_loadu_epi8(
-              (UINT64_C(1) << leftover) - 1,
-              (const __m512i *)buf),
-          currentmax);
-      __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff),
-                                _MM_CMPINT_GT);
-      if (outside_range == 0) {
-          out |= simdutf::encoding_type::UTF32_LE;
-        } else {
-        }
-        //}
-      /*} else {
-        out |= simdutf::encoding_type::UTF32_LE;
-      }*/
-    }
-
-    return out;
-  } else if (implementation::validate_utf8(input, length)) {
-    return simdutf::encoding_type::UTF8;
-  } else {
-    return simdutf::encoding_type::unspecified;
+  int out = 0;
+  if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
+  if((length % 2) == 0) {
+    if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
   }
+  if((length % 4) == 0) {
+    if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
+  }
+  return out;
 }
 
 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
@@ -23673,7 +23484,8 @@ simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, siz
   if (tail) {
     return scalar::utf32::validate(tail, len - (tail - buf));
   } else {
-    return false;
+    // we come here if there was an error, or buf was nullptr which may happen for empty input.
+    return len == 0;
   }
 }
 
@@ -23769,7 +23581,7 @@ simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(co
     return {simdutf::SUCCESS, written};
   }
   size_t pos = obuf - buf;
-  result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, buf + pos, len - pos, olatin1_output);
+  result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, buf + pos, len - pos, latin1_output);
   res.count += pos;
   return res;
 }
@@ -24506,60 +24318,68 @@ simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char *
   const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
   size_t answer = length / sizeof(__m512i) * sizeof(__m512i);
   size_t i = 0;
-  unsigned char v_0xFF = 0xff;
-  __m512i eight_64bits = _mm512_setzero_si512();
-  while (i + sizeof(__m512i) <= length) {
-    __m512i runner = _mm512_setzero_si512();
-    size_t iterations = (length - i) / sizeof(__m512i);
-    if (iterations > 255) {
-      iterations = 255;
-    }
-    size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i);
-    for (; i + 4*sizeof(__m512i) <= max_i; i += 4*sizeof(__m512i)) {
-            // Load four __m512i vectors
-            __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
-            __m512i input2 = _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
-            __m512i input3 = _mm512_loadu_si512((const __m512i *)(str + i + 2*sizeof(__m512i)));
-            __m512i input4 = _mm512_loadu_si512((const __m512i *)(str + i + 3*sizeof(__m512i)));
-
-            // Generate four masks
-            __mmask64 mask1 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1);
-            __mmask64 mask2 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2);
-            __mmask64 mask3 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3);
-            __mmask64 mask4 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4);
-            // Apply the masks and subtract from the runner
-            __m512i not_ascii1 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF);
-            __m512i not_ascii2 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF);
-            __m512i not_ascii3 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF);
-            __m512i not_ascii4 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF);
-
-            runner = _mm512_sub_epi8(runner, not_ascii1);
-            runner = _mm512_sub_epi8(runner, not_ascii2);
-            runner = _mm512_sub_epi8(runner, not_ascii3);
-            runner = _mm512_sub_epi8(runner, not_ascii4);
-    }
-
-    for (; i <= max_i; i += sizeof(__m512i)) {
-      __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));
-
-      __mmask64 mask = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input);
-      __m512i not_ascii = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF);
-      runner = _mm512_sub_epi8(runner, not_ascii);
+  if(answer >= 2048) { // long strings optimization
+    unsigned char v_0xFF = 0xff;
+    __m512i eight_64bits = _mm512_setzero_si512();
+    while (i + sizeof(__m512i) <= length) {
+      __m512i runner = _mm512_setzero_si512();
+      size_t iterations = (length - i) / sizeof(__m512i);
+      if (iterations > 255) {
+        iterations = 255;
+      }
+      size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i);
+      for (; i + 4*sizeof(__m512i) <= max_i; i += 4*sizeof(__m512i)) {
+              // Load four __m512i vectors
+              __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
+              __m512i input2 = _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
+              __m512i input3 = _mm512_loadu_si512((const __m512i *)(str + i + 2*sizeof(__m512i)));
+              __m512i input4 = _mm512_loadu_si512((const __m512i *)(str + i + 3*sizeof(__m512i)));
+
+              // Generate four masks
+              __mmask64 mask1 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1);
+              __mmask64 mask2 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2);
+              __mmask64 mask3 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3);
+              __mmask64 mask4 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4);
+              // Apply the masks and subtract from the runner
+              __m512i not_ascii1 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF);
+              __m512i not_ascii2 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF);
+              __m512i not_ascii3 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF);
+              __m512i not_ascii4 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF);
+
+              runner = _mm512_sub_epi8(runner, not_ascii1);
+              runner = _mm512_sub_epi8(runner, not_ascii2);
+              runner = _mm512_sub_epi8(runner, not_ascii3);
+              runner = _mm512_sub_epi8(runner, not_ascii4);
+      }
+
+      for (; i <= max_i; i += sizeof(__m512i)) {
+        __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));
+
+        __mmask64 mask = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input);
+        __m512i not_ascii = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF);
+        runner = _mm512_sub_epi8(runner, not_ascii);
+      }
+
+      eight_64bits = _mm512_add_epi64(eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512()));
+    }
+
+    __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0);
+    __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1);
+    answer += (size_t)_mm256_extract_epi64(first_half, 0) +
+              (size_t)_mm256_extract_epi64(first_half, 1) +
+              (size_t)_mm256_extract_epi64(first_half, 2) +
+              (size_t)_mm256_extract_epi64(first_half, 3) +
+              (size_t)_mm256_extract_epi64(second_half, 0) +
+              (size_t)_mm256_extract_epi64(second_half, 1) +
+              (size_t)_mm256_extract_epi64(second_half, 2) +
+              (size_t)_mm256_extract_epi64(second_half, 3);
+  } else if (answer > 0) {
+    for(; i + sizeof(__m512i) <= length; i += sizeof(__m512i)) {
+      __m512i latin = _mm512_loadu_si512((const __m512i*)(str + i));
+      uint64_t non_ascii = _mm512_movepi8_mask(latin);
+      answer += count_ones(non_ascii);
     }
-
-    eight_64bits = _mm512_add_epi64(eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512()));
   }
-
-  __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0);
-  __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1);
-  answer += (size_t)_mm256_extract_epi64(first_half, 0) +
-            (size_t)_mm256_extract_epi64(first_half, 1) +
-            (size_t)_mm256_extract_epi64(first_half, 2) +
-            (size_t)_mm256_extract_epi64(first_half, 3) +
-            (size_t)_mm256_extract_epi64(second_half, 0) +
-            (size_t)_mm256_extract_epi64(second_half, 1) +
-            (size_t)_mm256_extract_epi64(second_half, 2) +
-            (size_t)_mm256_extract_epi64(second_half, 3);
   return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast<const char *>(str + i), length - i);
 }
 
@@ -24725,195 +24545,6 @@ simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t>
   return simd8<bool>(is_third_byte | is_fourth_byte);
 }
 
-/* begin file src/haswell/avx2_detect_encodings.cpp */
-template<class checker>
-// len is known to be a multiple of 2 when this is called
-int avx2_detect_encodings(const char * buf, size_t len) {
-    const char* start = buf;
-    const char* end = buf + len;
-
-    bool is_utf8 = true;
-    bool is_utf16 = true;
-    bool is_utf32 = true;
-
-    int out = 0;
-
-    const auto v_d8 = simd8<uint8_t>::splat(0xd8);
-    const auto v_f8 = simd8<uint8_t>::splat(0xf8);
-
-    __m256i currentmax = _mm256_setzero_si256();
-
-    checker check{};
-
-    while(end - buf >= 64) {
-        __m256i in = _mm256_loadu_si256((__m256i*)buf);
-        __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
-
-        const auto u0 = simd16<uint16_t>(in);
-        const auto u1 = simd16<uint16_t>(nextin);
-
-        const auto v0 = u0.shr<8>();
-        const auto v1 = u1.shr<8>();
-
-        const auto in16 = simd16<uint16_t>::pack(v0, v1);
-
-        const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8;
-        uint32_t surrogates_bitmask0 = surrogates_wordmask0.to_bitmask();
-
-        // Check for surrogates
-        if (surrogates_bitmask0 != 0x0) {
-            // Cannot be UTF8
-            is_utf8 = false;
-            // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates
-            // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
-            // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant
-            // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
-            // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units.
-
-            if ((surrogates_bitmask0 & 0xaaaaaaaa) != 0) {
-                is_utf32 = false;
-                // Code from avx2_validate_utf16le.cpp
-                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
-                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
-
-                const auto v_fc = simd8<uint8_t>::splat(0xfc);
-                const auto v_dc = simd8<uint8_t>::splat(0xdc);
-
-                const uint32_t V0 = ~surrogates_bitmask0;
-
-                const auto    vH0 = (in16 & v_fc) == v_dc;
-                const uint32_t H0 = vH0.to_bitmask();
-
-                const uint32_t L0 = ~H0 & surrogates_bitmask0;
-
-                const uint32_t a0 = L0 & (H0 >> 1);
-                const uint32_t b0 = a0 << 1;
-                const uint32_t c0 = V0 | a0 | b0;
-
-                if (c0 == 0xffffffff) {
-                    input += simd16<uint16_t>::ELEMENTS * 2;
-                } else if (c0 == 0x7fffffff) {
-                    input += simd16<uint16_t>::ELEMENTS * 2 - 1;
-                } else {
-                    return simdutf::encoding_type::unspecified;
-                }
-
-                while (input + simd16<uint16_t>::ELEMENTS * 2 < end16) {
-                    const auto in0 = simd16<uint16_t>(input);
-                    const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
-
-                    const auto t0 = in0.shr<8>();
-                    const auto t1 = in1.shr<8>();
-
-                    const auto in_16 = simd16<uint16_t>::pack(t0, t1);
-
-                    const auto surrogates_wordmask = (in_16 & v_f8) == v_d8;
-                    const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
-                    if (surrogates_bitmask == 0x0) {
-                        input += simd16<uint16_t>::ELEMENTS * 2;
-                    } else {
-                        const uint32_t V = ~surrogates_bitmask;
-
-                        const auto    vH = (in_16 & v_fc) == v_dc;
-                        const uint32_t H = vH.to_bitmask();
-
-                        const uint32_t L = ~H & surrogates_bitmask;
-
-                        const uint32_t a = L & (H >> 1);
-
-                        const uint32_t b = a << 1;
-
-                        const uint32_t c = V | a | b;
-
-                        if (c == 0xffffffff) {
-                            input += simd16<uint16_t>::ELEMENTS * 2;
-                        } else if (c == 0x7fffffff) {
-                            input += simd16<uint16_t>::ELEMENTS * 2 - 1;
-                        } else {
-                            return simdutf::encoding_type::unspecified;
-                        }
-                    }
-                }
-            } else {
-                is_utf16 = false;
-                // Check for UTF-32
-                if (len % 4 == 0) {
-                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
-                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
-
-                    // Must start checking for surrogates
-                    __m256i currentoffsetmax = _mm256_setzero_si256();
-                    const __m256i offset = _mm256_set1_epi32(0xffff2000);
-                    const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
-
-                    currentmax = _mm256_max_epu32(in, currentmax);
-                    currentmax = _mm256_max_epu32(nextin, currentmax);
-
-                    currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
-                    currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(nextin, offset), currentoffsetmax);
-
-                    while (input + 8 < end32) {
-                        const __m256i in32 = _mm256_loadu_si256((__m256i *)input);
-                        currentmax = _mm256_max_epu32(in32,currentmax);
-                        currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in32, offset), currentoffsetmax);
-                        input += 8;
-                    }
-
-                    __m256i forbidden_words = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-                    if(_mm256_testz_si256(forbidden_words, forbidden_words) == 0) {
-                        return simdutf::encoding_type::unspecified;
-                    }
-                } else {
-                    return simdutf::encoding_type::unspecified;
-                }
-            }
-            break;
-        }
-        // If no surrogate, validate under other encodings as well
-
-        // UTF-32 validation
-        currentmax = _mm256_max_epu32(in, currentmax);
-        currentmax = _mm256_max_epu32(nextin, currentmax);
-
-        // UTF-8 validation
-        // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
-        simd::simd8x64<uint8_t> in8(in, nextin);
-        check.check_next_input(in8);
-
-        buf += 64;
-    }
-
-    // Check which encodings are possible
-
-    if (is_utf8) {
-        if (static_cast<size_t>(buf - start) != len) {
-            uint8_t block[64]{};
-            std::memset(block, 0x20, 64);
-            std::memcpy(block, buf, len - (buf - start));
-            simd::simd8x64<uint8_t> in(block);
-            check.check_next_input(in);
-        }
-        if (!check.errors()) {
-            out |= simdutf::encoding_type::UTF8;
-        }
-    }
-
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
-        out |= simdutf::encoding_type::UTF16_LE;
-    }
-
-    if (is_utf32 && (len % 4 == 0)) {
-        const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
-        __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
-        if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
-            out |= simdutf::encoding_type::UTF32_LE;
-        }
-    }
-
-    return out;
-}
-/* end file src/haswell/avx2_detect_encodings.cpp */
-
 /* begin file src/haswell/avx2_validate_utf16.cpp */
 /*
     In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning.
@@ -24962,9 +24593,6 @@ int avx2_detect_encodings(const char * buf, size_t len) {
 */
 template <endianness big_endian>
 const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
-    if (simdutf_unlikely(size == 0)) {
-        return input;
-    }
     const char16_t* end = input + size;
 
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -25126,9 +24754,6 @@ const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size)
    - nullptr if an error was detected.
 */
 const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
-    if (simdutf_unlikely(size == 0)) {
-        return input;
-    }
     const char32_t* end = input + size;
 
     const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
@@ -25158,9 +24783,6 @@ const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
 
 
 const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size) {
-    if (simdutf_unlikely(size == 0)) {
-        return result(error_code::SUCCESS, 0);
-    }
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -25356,8 +24978,8 @@ size_t convert_masked_utf8_to_utf16(const char *input,
   const __m128i in = _mm_loadu_si128((__m128i *)input);
   const uint16_t input_utf8_end_of_code_point_mask =
       utf8_end_of_code_point_mask & 0xfff;
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
+  if(utf8_end_of_code_point_mask  == 0xfff) {
+    // We process the data in chunks of 12 bytes.
     __m256i ascii = _mm256_cvtepu8_epi16(in);
     if (big_endian) {
       const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
@@ -25365,8 +24987,8 @@ size_t convert_masked_utf8_to_utf16(const char *input,
       ascii = _mm256_shuffle_epi8(ascii, swap256);
     }
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
-    utf16_output += 16; // We wrote 16 16-bit characters.
-    return 16; // We consumed 16 bytes.
+    utf16_output += 12; // We wrote 12 16-bit characters.
+    return 12; // We consumed 12 bytes.
   }
   if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
     // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte UTF-16 code units.
@@ -25536,12 +25158,12 @@ size_t convert_masked_utf8_to_utf32(const char *input,
   const __m128i in = _mm_loadu_si128((__m128i *)input);
   const uint16_t input_utf8_end_of_code_point_mask =
       utf8_end_of_code_point_mask & 0xfff;
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
+  if(utf8_end_of_code_point_mask == 0xfff) {
+    // We process the data in chunks of 12 bytes.
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu8_epi32(in));
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output+8), _mm256_cvtepu8_epi32(_mm_srli_si128(in,8)));
-    utf32_output += 16; // We wrote 16 32-bit characters.
-    return 16; // We consumed 16 bytes.
+    utf32_output += 12; // We wrote 12 32-bit characters.
+    return 12; // We consumed 12 bytes.
   }
   if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
     // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte UTF-32 code units.
@@ -27175,11 +26797,11 @@ size_t convert_masked_utf8_to_latin1(const char *input,
   const uint16_t input_utf8_end_of_code_point_mask =
       utf8_end_of_code_point_mask & 0xfff; // we are only processing 12 bytes in case it is not all ASCII
 
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
+  if(utf8_end_of_code_point_mask == 0xfff) {
+    // We process the data in chunks of 12 bytes.
     _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
-    latin1_output += 16; // We wrote 16 characters.
-    return 16; // We consumed 16 bytes.
+    latin1_output += 12; // We wrote 12 characters.
+    return 12; // We consumed 1 bytes.
   }
   /// We do not have a fast path available, so we fallback.
   const uint8_t idx =
@@ -27466,9 +27088,8 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
 
   if (base64_url) {
     check_asso =
-        _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
-                         0x7, 0xB, 0x6, 0xB, 0x12, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1,
-                         0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0x6, 0xB, 0x12);
+        _mm256_setr_epi8(0xD,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x3,0x7,0xB,0xE,0xB,0x6,
+        0xD,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x3,0x7,0xB,0xE,0xB,0x6);
   } else {
 
     check_asso = _mm256_setr_epi8(
@@ -27479,13 +27100,8 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   __m256i check_values;
   if (base64_url) {
     check_values = _mm256_setr_epi8(
-        0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0xCF),
-        uint8_t(0xBF), uint8_t(0xD3), uint8_t(0xA6), uint8_t(0xB5),
-        uint8_t(0x86), uint8_t(0xD0), uint8_t(0x80), uint8_t(0xB0),
-        uint8_t(0x80), 0x0, 0x0, 0x0, uint8_t(0x80), uint8_t(0x80),
-        uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD3),
-        uint8_t(0xA6), uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD0),
-        uint8_t(0x80), uint8_t(0xB0), uint8_t(0x80), 0x0, 0x0);
+        uint8_t(0x80),uint8_t(0x80),uint8_t(0x80),uint8_t(0x80),uint8_t(0xCF),uint8_t(0xBF),uint8_t(0xB6),uint8_t(0xA6),uint8_t(0xB5),uint8_t(0xA1),0x0,uint8_t(0x80),0x0,uint8_t(0x80),0x0,uint8_t(0x80),
+        uint8_t(0x80),uint8_t(0x80),uint8_t(0x80),uint8_t(0x80),uint8_t(0xCF),uint8_t(0xBF),uint8_t(0xB6),uint8_t(0xA6),uint8_t(0xB5),uint8_t(0xA1),0x0,uint8_t(0x80),0x0,uint8_t(0x80),0x0,uint8_t(0x80));
   } else {
     check_values = _mm256_setr_epi8(
         int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
@@ -27496,7 +27112,7 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
         int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
         int8_t(0x91), int8_t(0x80));
   }
-  const __m256i shifted = _mm256_srli_epi32(*src, 3);
+  const __m256i shifted =_mm256_srli_epi32(*src, 3);
   const __m256i delta_hash =
       _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
   const __m256i check_hash =
@@ -27603,22 +27219,22 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                               base64_options options) {
   const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
                                         : tables::base64::to_base64_value;
+  size_t equallocation = srclen; // location of the first padding character if any
   // skip trailing spaces
-  while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) {
+  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) {
     srclen--;
   }
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
+    equallocation = srclen - 1;
     srclen--;
     equalsigns = 1;
     // skip trailing spaces
-    while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) {
-      srclen--;
-    }
-    while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) {
+    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) {
       srclen--;
     }
     if (srclen > 0 && src[srclen - 1] == '=') {
+      equallocation = srclen - 1;
       srclen--;
       equalsigns = 2;
     }
@@ -27644,7 +27260,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
       uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
+        while (src < srcend && scalar::base64::is_eight_byte(*src) && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -27692,7 +27308,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
       uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
-      if (val > 64) {
+      if (!scalar::base64::is_eight_byte(*src) || val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
       }
       bufferptr += (val <= 63);
@@ -27739,7 +27355,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
         uint8_t val = to_base64[uint8_t(*src)];
-        if (val > 64) {
+        if (!scalar::base64::is_eight_byte(*src) || val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
         buffer_start[leftover] = char(val);
@@ -27790,13 +27406,14 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
       // additional checks
       if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
         r.error = error_code::INVALID_BASE64_CHARACTER;
+        r.count = equallocation;
       }
     }
     return r;
   }
   if(equalsigns > 0) {
     if((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, size_t(dst - dstinit)};
+      return {INVALID_BASE64_CHARACTER, equallocation};
     }
   }
   return {SUCCESS, size_t(dst - dstinit)};
@@ -28455,6 +28072,9 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if(utf8_continuation_mask & 1) {
+            return 0; // error
+          }
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -28529,14 +28149,14 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
-          if (errors()) {
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if (errors() || (utf8_continuation_mask & 1)) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
           }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -28774,10 +28394,10 @@ using namespace simd;
       size_t pos = 0;
       char32_t* start{utf32_output};
       // In the worst case, we have the haswell kernel which can cause an overflow of
-      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+      // 8 words when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
       // much more than 8 bytes. However, you cannot generally assume that you have valid
-      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+      // UTF-8 input, so we are going to go back from the end counting 16 leading bytes,
       // to give us a good margin.
       size_t leading_byte = 0;
       size_t margin = size;
@@ -28807,6 +28427,9 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if(utf8_continuation_mask & 1) {
+            return 0; // we have an error
+          }
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -28880,12 +28503,12 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
-          if (errors()) {
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if (errors() || (utf8_continuation_mask & 1)) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
           }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -29431,15 +29054,15 @@ simdutf_warn_unused int implementation::detect_encodings(const char * input, siz
   // If there is a BOM, then we trust it.
   auto bom_encoding = simdutf::BOM::check_bom(input, length);
   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  if (length % 2 == 0) {
-    return avx2_detect_encodings<utf8_validation::utf8_checker>(input, length);
-  } else {
-    if (implementation::validate_utf8(input, length)) {
-      return simdutf::encoding_type::UTF8;
-    } else {
-      return simdutf::encoding_type::unspecified;
-    }
+  int out = 0;
+  if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
+  if((length % 2) == 0) {
+    if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
+  }
+  if((length % 4) == 0) {
+    if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
   }
+  return out;
 }
 
 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
@@ -29459,7 +29082,12 @@ simdutf_warn_unused result implementation::validate_ascii_with_errors(const char
 }
 
 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid UTF-16. protect the implementation from
+    // handling nullptr
+    return true;
+  }
+  const char16_t *tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
   if (tail) {
     return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
   } else {
@@ -29468,6 +29096,11 @@ simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, s
 }
 
 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid UTF-16. protect the implementation from
+    // handling nullptr
+    return true;
+  }
   const char16_t* tail = avx2_validate_utf16<endianness::BIG>(buf, len);
   if (tail) {
     return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
@@ -29497,6 +29130,11 @@ simdutf_warn_unused result implementation::validate_utf16be_with_errors(const ch
 }
 
 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid UTF-32. protect the implementation from
+    // handling nullptr
+    return true;
+  }
   const char32_t* tail = avx2_validate_utf32le(buf, len);
   if (tail) {
     return scalar::utf32::validate(tail, len - (tail - buf));
@@ -29506,6 +29144,11 @@ simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, siz
 }
 
 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid UTF-32. protect the implementation from
+    // handling nullptr
+    return result(error_code::SUCCESS, 0);
+  }
   result res = avx2_validate_utf32le_with_errors(buf, len);
   if (res.count != len) {
     result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
@@ -30036,40 +29679,48 @@ simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char *i
   const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
   size_t answer = len / sizeof(__m256i) * sizeof(__m256i);
   size_t i = 0;
-  __m256i four_64bits = _mm256_setzero_si256();
-  while (i + sizeof(__m256i) <= len) {
-    __m256i runner = _mm256_setzero_si256();
-    // We can do up to 255 loops without overflow.
-    size_t iterations = (len - i) / sizeof(__m256i);
-    if (iterations > 255) {
-      iterations = 255;
-    }
-    size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i);
-    for (; i + 4*sizeof(__m256i) <= max_i; i += 4*sizeof(__m256i)) {
-      __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i));
-      __m256i input2 = _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i)));
-      __m256i input3 = _mm256_loadu_si256((const __m256i *)(data + i + 2*sizeof(__m256i)));
-      __m256i input4 = _mm256_loadu_si256((const __m256i *)(data + i + 3*sizeof(__m256i)));
-      __m256i input12 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1),
-              _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2));
-      __m256i input23 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3),
-              _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4));
-      __m256i input1234 = _mm256_add_epi8(input12, input23);
-      runner = _mm256_sub_epi8(
-          runner, input1234);
-    }
-    for (; i <= max_i; i += sizeof(__m256i)) {
-      __m256i input_256_chunk = _mm256_loadu_si256((const __m256i *)(data + i));
-      runner = _mm256_sub_epi8(
-          runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk));
-    }
-    four_64bits = _mm256_add_epi64(
-        four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256()));
-  }
-  answer += _mm256_extract_epi64(four_64bits, 0) +
-            _mm256_extract_epi64(four_64bits, 1) +
-            _mm256_extract_epi64(four_64bits, 2) +
-            _mm256_extract_epi64(four_64bits, 3);
+  if(answer >= 2048) { // long strings optimization
+    __m256i four_64bits = _mm256_setzero_si256();
+    while (i + sizeof(__m256i) <= len) {
+      __m256i runner = _mm256_setzero_si256();
+      // We can do up to 255 loops without overflow.
+      size_t iterations = (len - i) / sizeof(__m256i);
+      if (iterations > 255) {
+        iterations = 255;
+      }
+      size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i);
+      for (; i + 4*sizeof(__m256i) <= max_i; i += 4*sizeof(__m256i)) {
+        __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i));
+        __m256i input2 = _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i)));
+        __m256i input3 = _mm256_loadu_si256((const __m256i *)(data + i + 2*sizeof(__m256i)));
+        __m256i input4 = _mm256_loadu_si256((const __m256i *)(data + i + 3*sizeof(__m256i)));
+        __m256i input12 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1),
+                _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2));
+        __m256i input23 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3),
+                _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4));
+        __m256i input1234 = _mm256_add_epi8(input12, input23);
+        runner = _mm256_sub_epi8(
+            runner, input1234);
+      }
+      for (; i <= max_i; i += sizeof(__m256i)) {
+        __m256i input_256_chunk = _mm256_loadu_si256((const __m256i *)(data + i));
+        runner = _mm256_sub_epi8(
+            runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk));
+      }
+      four_64bits = _mm256_add_epi64(
+          four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256()));
+    }
+    answer += _mm256_extract_epi64(four_64bits, 0) +
+              _mm256_extract_epi64(four_64bits, 1) +
+              _mm256_extract_epi64(four_64bits, 2) +
+              _mm256_extract_epi64(four_64bits, 3);
+  } else if (answer > 0) {
+    for(; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) {
+      __m256i latin = _mm256_loadu_si256((const __m256i*)(data + i));
+      uint32_t non_ascii = _mm256_movemask_epi8(latin);
+      answer += count_ones(non_ascii);
+    }
+  }
   return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast<const char *>(data + i), len - i);
 }
 
@@ -30854,6 +30505,9 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if(utf8_continuation_mask & 1) {
+            return 0; // error
+          }
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -30928,14 +30582,14 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
-          if (errors()) {
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if (errors() || (utf8_continuation_mask & 1)) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
           }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -31173,10 +30827,10 @@ using namespace simd;
       size_t pos = 0;
       char32_t* start{utf32_output};
       // In the worst case, we have the haswell kernel which can cause an overflow of
-      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+      // 8 words when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
       // much more than 8 bytes. However, you cannot generally assume that you have valid
-      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+      // UTF-8 input, so we are going to go back from the end counting 16 leading bytes,
       // to give us a good margin.
       size_t leading_byte = 0;
       size_t margin = size;
@@ -31206,6 +30860,9 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if(utf8_continuation_mask & 1) {
+            return 0; // we have an error
+          }
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -31279,12 +30936,12 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
-          if (errors()) {
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if (errors() || (utf8_continuation_mask & 1)) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
           }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -31456,6 +31113,7 @@ simdutf_warn_unused int implementation::detect_encodings(const char * input, siz
   // If there is a BOM, then we trust it.
   auto bom_encoding = simdutf::BOM::check_bom(input, length);
   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
+  // todo: reimplement as a one-pass algorithm.
   int out = 0;
   if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
   if((length % 2) == 0) {
@@ -31689,12 +31347,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   size_t equallocation = length; // location of the first padding character if any
   size_t equalsigns = 0;
   if(length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
     length -= 1;
     equalsigns++;
     while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) {
       length--;
     }
     if(length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
       equalsigns++;
       length -= 1;
     }
@@ -31727,12 +31387,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char16_t * inp
   size_t equallocation = length; // location of the first padding character if any
   size_t equalsigns = 0;
   if(length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
     length -= 1;
     equalsigns++;
     while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) {
       length--;
     }
     if(length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
       equalsigns++;
       length -= 1;
     }
@@ -32083,10 +31745,11 @@ simdutf_really_inline static result rvv_validate_utf16_with_errors(const char16_
       break;
     }
   }
-  if (last - 0xD800u < 0x400u)
+  if (last - 0xD800u < 0x400u) {
     return result(error_code::SURROGATE, src - beg - 1); /* end on high surrogate */
-  else
+  } else {
     return result(error_code::SUCCESS, src - beg);
+  }
 }
 
 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *src, size_t len) const noexcept {
@@ -32122,11 +31785,14 @@ simdutf_warn_unused result implementation::validate_utf32_with_errors(const char
     vl = __riscv_vsetvl_e32m8(len);
     vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t*)src, vl);
     vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl);
-    long idx;
-    idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl);
-    if (idx >= 0) return result(error_code::TOO_LARGE, src - beg + idx);
-    idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl);
-    if (idx >= 0) return result(error_code::SURROGATE, src - beg + idx);
+    long idx1 = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl);
+    long idx2 = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl);
+    if(idx1 >= 0 && idx2 >= 0) {
+      if(idx1 <= idx2) { return result(error_code::TOO_LARGE, src - beg + idx1); }
+      else { return result(error_code::SURROGATE, src - beg + idx2); }
+    }
+    if (idx1 >= 0) { return result(error_code::TOO_LARGE, src - beg + idx1); }
+    if (idx2 >= 0) { return result(error_code::SURROGATE, src - beg + idx2); }
   }
   return result(error_code::SUCCESS, src - beg);
 }
@@ -32560,7 +32226,7 @@ simdutf_really_inline static result rvv_utf16_to_latin1_with_errors(const char16
     v = simdutf_byteflip<bflip>(v, vl);
     long idx = __riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 255, vl), vl);
     if (idx >= 0)
-      return result(error_code::TOO_LARGE, beg - src + idx);
+      return result(error_code::TOO_LARGE, src - beg + idx);
     __riscv_vse8_v_u8m4((uint8_t*)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl);
   }
   return result(error_code::SUCCESS, src - beg);
@@ -32812,7 +32478,11 @@ simdutf_really_inline static result rvv_utf16_to_utf32_with_errors(const char16_
         const vbool8_t diff = __riscv_vmxor_mm_b8(surhi, surlo, vl);
         const long idx = __riscv_vfirst_m_b8(diff, vl);
         if (idx >= 0) {
-          return result(error_code::SURROGATE, src - srcBeg + idx + 1);
+          uint16_t word = simdutf_byteflip<bflip>(src[idx]);
+          if(word < 0xD800 || word > 0xDBFF) {
+            return result(error_code::SURROGATE, src - srcBeg + idx + 1);
+          }
+          return result(error_code::SURROGATE, src - srcBeg + idx);
         }
     }
 
@@ -32962,10 +32632,15 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(con
       n -= vl, src += vl, dst += vlOut;
       continue;
     }
-
+    long idx1 = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl);
     vbool8_t sur = __riscv_vmseq_vx_u32m4_b8(__riscv_vand_vx_u32m4(v, 0xFFFFF800, vl), 0xD800, vl);
-    long idx = __riscv_vfirst_m_b8(sur, vl);
-    if (idx >= 0) return result(error_code::SURROGATE, src - srcBeg + idx);
+    long idx2 = __riscv_vfirst_m_b8(sur, vl);
+    if(idx1 >= 0 && idx2 >= 0) {
+      if(idx1 <= idx2) { return result(error_code::TOO_LARGE, src - srcBeg + idx1); }
+      else { return result(error_code::SURROGATE, src - srcBeg + idx2); }
+    }
+    if (idx1 >= 0) { return result(error_code::TOO_LARGE, src - srcBeg + idx1); }
+    if (idx2 >= 0) { return result(error_code::SURROGATE, src - srcBeg + idx2); }
 
     vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x10000-1, vl);
     long first = __riscv_vfirst_m_b8(m4, vl);
@@ -33040,18 +32715,21 @@ simdutf_really_inline static result rvv_convert_utf32_to_utf16_with_errors(const
     vl = __riscv_vsetvl_e32m4(len);
     vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t*)src, vl);
     vuint32m4_t off = __riscv_vadd_vx_u32m4(v, 0xFFFF2000, vl);
-    long idx;
-    idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(off, 0xFFFFF7FF, vl), vl);
-    if (idx >= 0) return result(error_code::SURROGATE, src - srcBeg + idx);
-    idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl);
+    long idx1 = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl);
+    long idx2 = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(off, 0xFFFFF7FF, vl), vl);
+    if (idx1 >= 0 && idx2 >= 0) {
+      if (idx1 <= idx2) return result(error_code::TOO_LARGE, src - srcBeg + idx1);
+      return result(error_code::SURROGATE, src - srcBeg + idx2);
+    }
+    if (idx1 >= 0) return result(error_code::TOO_LARGE, src - srcBeg + idx1);
+    if (idx2 >= 0) return result(error_code::SURROGATE, src - srcBeg + idx2);
+    long idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl);
     if (idx < 0) {
       vlOut = vl;
       vuint16m2_t n = simdutf_byteflip<bflip>(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut);
       __riscv_vse16_v_u16m2((uint16_t*)dst, n, vlOut);
       continue;
     }
-    idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl);
-    if (idx >= 0) return result(error_code::TOO_LARGE, src - srcBeg + idx);
     vlOut = rvv_utf32_store_utf16_m4<bflip>((uint16_t*)dst, v, vl, m4even);
   }
   return result(error_code::SUCCESS, dst - dstBeg);
@@ -33114,6 +32792,7 @@ simdutf_warn_unused int implementation::detect_encodings(const char *input, size
   auto bom_encoding = simdutf::BOM::check_bom(input, length);
   if (bom_encoding != encoding_type::unspecified)
     return bom_encoding;
+  // todo: reimplement as a one-pass algorithm.
   int out = 0;
   if (validate_utf8(input, length))
     out |= encoding_type::UTF8;
@@ -33156,12 +32835,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   size_t equallocation = length; // location of the first padding character if any
   size_t equalsigns = 0;
   if(length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
     length -= 1;
     equalsigns++;
     while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) {
       length--;
     }
     if(length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
       equalsigns++;
       length -= 1;
     }
@@ -33194,12 +32875,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char16_t * inp
   size_t equallocation = length; // location of the first padding character if any
   auto equalsigns = 0;
   if(length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
     length -= 1;
     equalsigns++;
     while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) {
       length--;
     }
     if(length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
       equalsigns++;
       length -= 1;
     }
@@ -33356,214 +33039,6 @@ inline void write_v_u16_11bits_to_utf8(
 } // namespace westmere
 } // namespace internal
 /* end file src/westmere/internal/loader.cpp */
-/* begin file src/westmere/sse_detect_encodings.cpp */
-template<class checker>
-// len is known to be a multiple of 2 when this is called
-int sse_detect_encodings(const char * buf, size_t len) {
-    const char* start = buf;
-    const char* end = buf + len;
-
-    bool is_utf8 = true;
-    bool is_utf16 = true;
-    bool is_utf32 = true;
-
-    int out = 0;
-
-    const auto v_d8 = simd8<uint8_t>::splat(0xd8);
-    const auto v_f8 = simd8<uint8_t>::splat(0xf8);
-
-    __m128i currentmax = _mm_setzero_si128();
-
-    checker check{};
-
-    while(end - buf >= 64) {
-        __m128i in = _mm_loadu_si128((__m128i*)buf);
-        __m128i secondin = _mm_loadu_si128((__m128i*)buf+1);
-        __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
-        __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
-
-        const auto u0 = simd16<uint16_t>(in);
-        const auto u1 = simd16<uint16_t>(secondin);
-        const auto u2 = simd16<uint16_t>(thirdin);
-        const auto u3 = simd16<uint16_t>(fourthin);
-
-        const auto v0 = u0.shr<8>();
-        const auto v1 = u1.shr<8>();
-        const auto v2 = u2.shr<8>();
-        const auto v3 = u3.shr<8>();
-
-        const auto in16 = simd16<uint16_t>::pack(v0, v1);
-        const auto nextin16 = simd16<uint16_t>::pack(v2, v3);
-
-        const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8;
-        const auto surrogates_wordmask1 = (nextin16 & v_f8) == v_d8;
-        uint16_t surrogates_bitmask0 = static_cast<uint16_t>(surrogates_wordmask0.to_bitmask());
-        uint16_t surrogates_bitmask1 = static_cast<uint16_t>(surrogates_wordmask1.to_bitmask());
-
-        // Check for surrogates
-        if (surrogates_bitmask0 != 0x0 || surrogates_bitmask1 != 0x0) {
-            // Cannot be UTF8
-            is_utf8 = false;
-            // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates
-            // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word.
-            // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant
-            // bytes of a 32-bit word since they always come in pairs in UTF-16LE.
-            // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units.
-
-            if (((surrogates_bitmask0 | surrogates_bitmask1) & 0xaaaa) != 0) {
-                is_utf32 = false;
-                // Code from sse_validate_utf16le.cpp
-                // Not efficient, we do not process surrogates_bitmask1
-                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
-                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
-
-                const auto v_fc = simd8<uint8_t>::splat(0xfc);
-                const auto v_dc = simd8<uint8_t>::splat(0xdc);
-
-                const uint16_t V0 = static_cast<uint16_t>(~surrogates_bitmask0);
-
-                const auto    vH0 = (in16 & v_fc) == v_dc;
-                const uint16_t H0 = static_cast<uint16_t>(vH0.to_bitmask());
-
-                const uint16_t L0 = static_cast<uint16_t>(~H0 & surrogates_bitmask0);
-
-                const uint16_t a0 = static_cast<uint16_t>(L0 & (H0 >> 1));
-
-                const uint16_t b0 = static_cast<uint16_t>(a0 << 1);
-
-                const uint16_t c0 = static_cast<uint16_t>(V0 | a0 | b0);
-
-                if (c0 == 0xffff) {
-                    input += 16;
-                } else if (c0 == 0x7fff) {
-                    input += 15;
-                } else {
-                    is_utf16 = false;
-                    break;
-                }
-
-                while (input + simd16<uint16_t>::SIZE * 2 < end16) {
-                    const auto in0 = simd16<uint16_t>(input);
-                    const auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
-
-                    const auto t0 = in0.shr<8>();
-                    const auto t1 = in1.shr<8>();
-
-                    const auto in_16 = simd16<uint16_t>::pack(t0, t1);
-
-                    const auto surrogates_wordmask = (in_16 & v_f8) == v_d8;
-                    const uint16_t surrogates_bitmask = static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
-                    if (surrogates_bitmask == 0x0) {
-                        input += 16;
-                    } else {
-                        const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
-
-                        const auto    vH = (in_16 & v_fc) == v_dc;
-                        const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
-
-                        const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
-
-                        const uint16_t a = static_cast<uint16_t>(L & (H >> 1));
-
-                        const uint16_t b = static_cast<uint16_t>(a << 1);
-
-                        const uint16_t c = static_cast<uint16_t>(V | a | b);
-
-                        if (c == 0xffff) {
-                            input += 16;
-                        } else if (c == 0x7fff) {
-                            input += 15;
-                        } else {
-                            is_utf16 = false;
-                            break;
-                        }
-                    }
-                }
-            } else {
-                is_utf16 = false;
-                // Check for UTF-32
-                if (len % 4 == 0) {
-                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
-                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
-
-                    // Must start checking for surrogates
-                    __m128i currentoffsetmax = _mm_setzero_si128();
-                    const __m128i offset = _mm_set1_epi32(0xffff2000);
-                    const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
-
-                    currentmax = _mm_max_epu32(in, currentmax);
-                    currentmax = _mm_max_epu32(secondin, currentmax);
-                    currentmax = _mm_max_epu32(thirdin, currentmax);
-                    currentmax = _mm_max_epu32(fourthin, currentmax);
-
-                    currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
-                    currentoffsetmax = _mm_max_epu32(_mm_add_epi32(secondin, offset), currentoffsetmax);
-                    currentoffsetmax = _mm_max_epu32(_mm_add_epi32(thirdin, offset), currentoffsetmax);
-                    currentoffsetmax = _mm_max_epu32(_mm_add_epi32(fourthin, offset), currentoffsetmax);
-
-                    while (input + 4 < end32) {
-                        const __m128i in32 = _mm_loadu_si128((__m128i *)input);
-                        currentmax = _mm_max_epu32(in32,currentmax);
-                        currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in32, offset), currentoffsetmax);
-                        input += 4;
-                    }
-
-                    __m128i forbidden_words = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-                    if(_mm_testz_si128(forbidden_words, forbidden_words) == 0) {
-                        is_utf32 = false;
-                    }
-                } else {
-                    is_utf32 = false;
-                }
-            }
-            break;
-        }
-        // If no surrogate, validate under other encodings as well
-
-        // UTF-32 validation
-        currentmax = _mm_max_epu32(in, currentmax);
-        currentmax = _mm_max_epu32(secondin, currentmax);
-        currentmax = _mm_max_epu32(thirdin, currentmax);
-        currentmax = _mm_max_epu32(fourthin, currentmax);
-
-        // UTF-8 validation
-        // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
-        simd::simd8x64<uint8_t> in8(in, secondin, thirdin, fourthin);
-        check.check_next_input(in8);
-
-        buf += 64;
-    }
-
-    // Check which encodings are possible
-
-    if (is_utf8) {
-        if (static_cast<size_t>(buf - start) != len) {
-            uint8_t block[64]{};
-            std::memset(block, 0x20, 64);
-            std::memcpy(block, buf, len - (buf - start));
-            simd::simd8x64<uint8_t> in(block);
-            check.check_next_input(in);
-        }
-        if (!check.errors()) {
-            out |= simdutf::encoding_type::UTF8;
-        }
-    }
-
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
-        out |= simdutf::encoding_type::UTF16_LE;
-    }
-
-    if (is_utf32 && (len % 4 == 0)) {
-        const __m128i standardmax = _mm_set1_epi32(0x10ffff);
-        __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
-        if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
-            out |= simdutf::encoding_type::UTF32_LE;
-        }
-    }
-
-    return out;
-}
-/* end file src/westmere/sse_detect_encodings.cpp */
 
 /* begin file src/westmere/sse_validate_utf16.cpp */
 /*
@@ -33613,9 +33088,6 @@ int sse_detect_encodings(const char * buf, size_t len) {
 */
 template <endianness big_endian>
 const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
-    if (simdutf_unlikely(size == 0)) {
-        return input;
-    }
     const char16_t* end = input + size;
 
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -33776,9 +33248,6 @@ const result sse_validate_utf16_with_errors(const char16_t* input, size_t size)
    - nullptr if an error was detected.
 */
 const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
-    if (size == 0) {
-        return input;
-    }
     const char32_t* end = input + size;
 
     const __m128i standardmax = _mm_set1_epi32(0x10ffff);
@@ -33808,9 +33277,6 @@ const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
 
 
 const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size) {
-    if (size == 0) {
-        return result(error_code::SUCCESS, 0);
-    }
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -34005,8 +33471,9 @@ size_t convert_masked_utf8_to_utf16(const char *input,
   const __m128i in = _mm_loadu_si128((__m128i *)input);
   const uint16_t input_utf8_end_of_code_point_mask =
       utf8_end_of_code_point_mask & 0xfff;
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
+  if(utf8_end_of_code_point_mask == 0xfff) {
+    // We process the data in chunks of 12 bytes.
+    // Note: using 16 bytes is unsafe, see issue_ossfuzz_71218
     __m128i ascii_first = _mm_cvtepu8_epi16(in);
     __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in,8));
     if (big_endian) {
@@ -34015,8 +33482,8 @@ size_t convert_masked_utf8_to_utf16(const char *input,
     }
     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first);
     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), ascii_second);
-    utf16_output += 16; // We wrote 16 16-bit characters.
-    return 16; // We consumed 16 bytes.
+    utf16_output += 12; // We wrote 12 16-bit characters.
+    return 12; // We consumed 12 bytes.
   }
   if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
     // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte UTF-16 code units.
@@ -34187,14 +33654,14 @@ size_t convert_masked_utf8_to_utf32(const char *input,
   const __m128i in = _mm_loadu_si128((__m128i *)input);
   const uint16_t input_utf8_end_of_code_point_mask =
       utf8_end_of_code_point_mask & 0xfff;
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
+  if(utf8_end_of_code_point_mask == 0xfff) {
+    // We process the data in chunks of 12 bytes.
     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu8_epi32(in));
     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu8_epi32(_mm_srli_si128(in,4)));
     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+8), _mm_cvtepu8_epi32(_mm_srli_si128(in,8)));
     _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+12), _mm_cvtepu8_epi32(_mm_srli_si128(in,12)));
-    utf32_output += 16; // We wrote 16 32-bit characters.
-    return 16; // We consumed 16 bytes.
+    utf32_output += 12; // We wrote 12 32-bit characters.
+    return 12; // We consumed 12 bytes.
   }
   if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
     // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte UTF-32 code units.
@@ -34316,11 +33783,11 @@ size_t convert_masked_utf8_to_latin1(const char *input,
   const __m128i in = _mm_loadu_si128((__m128i *)input);
   const uint16_t input_utf8_end_of_code_point_mask =
       utf8_end_of_code_point_mask & 0xfff; // we are only processing 12 bytes in case it is not all ASCII
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
+  if(utf8_end_of_code_point_mask == 0xfff) {
+    // We process the data in chunks of 12 bytes.
     _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
-    latin1_output += 16; // We wrote 16 characters.
-    return 16; // We consumed 16 bytes.
+    latin1_output += 12; // We wrote 12 characters.
+    return 12; // We consumed 12 bytes.
   }
   /// We do not have a fast path available, so we fallback.
   const uint8_t idx =
@@ -35986,8 +35453,7 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   }
   __m128i check_asso;
   if (base64_url) {
-    check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
-                               0x3, 0x7, 0xB, 0x6, 0xB, 0x12);
+    check_asso = _mm_setr_epi8(0xD,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x3,0x7,0xB,0xE,0xB,0x6);
   } else {
 
     check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
@@ -35995,11 +35461,9 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   }
   __m128i check_values;
   if (base64_url) {
-    check_values = _mm_setr_epi8(0x0, uint8_t(0x80), uint8_t(0x80),
-                                 uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF),
-                                 uint8_t(0xD3), uint8_t(0xA6), uint8_t(0xB5),
-                                 uint8_t(0x86), uint8_t(0xD0), uint8_t(0x80),
-                                 uint8_t(0xB0), uint8_t(0x80), 0x0, 0x0);
+    check_values = _mm_setr_epi8(uint8_t(0x80),uint8_t(0x80),uint8_t(0x80),uint8_t(0x80),uint8_t(0xCF),
+                   uint8_t(0xBF),uint8_t(0xB6),uint8_t(0xA6),uint8_t(0xB5),uint8_t(0xA1),0x0,uint8_t(0x80),
+                   0x0,uint8_t(0x80),0x0,uint8_t(0x80));
   } else {
 
     check_values =
@@ -36008,7 +35472,7 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
                       int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
                       int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
   }
-  const __m128i shifted = _mm_srli_epi32(*src, 3);
+  const __m128i shifted =_mm_srli_epi32(*src, 3);
 
   const __m128i delta_hash =
       _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted);
@@ -36138,19 +35602,22 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
                               base64_options options) {
   const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
                                         : tables::base64::to_base64_value;
+  size_t equallocation = srclen; // location of the first padding character if any
   // skip trailing spaces
-  while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) {
+  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) {
     srclen--;
   }
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
+    equallocation = srclen - 1;
     srclen--;
     equalsigns = 1;
     // skip trailing spaces
-    while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) {
+    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) {
       srclen--;
     }
     if (srclen > 0 && src[srclen - 1] == '=') {
+      equallocation = srclen - 1;
       srclen--;
       equalsigns = 2;
     }
@@ -36176,7 +35643,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
       uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
+        while (src < srcend && scalar::base64::is_eight_byte(*src) && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -36223,7 +35690,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
       uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
-      if (val > 64) {
+      if (!scalar::base64::is_eight_byte(*src) || val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
       }
       bufferptr += (val <= 63);
@@ -36270,7 +35737,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
         uint8_t val = to_base64[uint8_t(*src)];
-        if (val > 64) {
+        if (!scalar::base64::is_eight_byte(*src) || val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
         buffer_start[leftover] = char(val);
@@ -36323,13 +35790,14 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
       // additional checks
       if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
         r.error = error_code::INVALID_BASE64_CHARACTER;
+        r.count = equallocation;
       }
     }
     return r;
   }
   if(equalsigns > 0) {
     if((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, size_t(dst - dstinit)};
+      return {INVALID_BASE64_CHARACTER, equallocation};
     }
   }
   return {SUCCESS, size_t(dst - dstinit)};
@@ -36988,6 +36456,9 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if(utf8_continuation_mask & 1) {
+            return 0; // error
+          }
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -37062,14 +36533,14 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
-          if (errors()) {
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if (errors() || (utf8_continuation_mask & 1)) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
           }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -37307,10 +36778,10 @@ using namespace simd;
       size_t pos = 0;
       char32_t* start{utf32_output};
       // In the worst case, we have the haswell kernel which can cause an overflow of
-      // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+      // 8 words when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
       // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
       // much more than 8 bytes. However, you cannot generally assume that you have valid
-      // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+      // UTF-8 input, so we are going to go back from the end counting 16 leading bytes,
       // to give us a good margin.
       size_t leading_byte = 0;
       size_t margin = size;
@@ -37340,6 +36811,9 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
           uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if(utf8_continuation_mask & 1) {
+            return 0; // we have an error
+          }
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -37413,12 +36887,12 @@ using namespace simd;
             this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
             this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
           }
-          if (errors()) {
+          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+          if (errors() || (utf8_continuation_mask & 1)) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
           }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
           uint64_t utf8_leading_mask = ~utf8_continuation_mask;
           uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
           // We process in blocks of up to 12 bytes except possibly
@@ -37966,16 +37440,17 @@ namespace westmere {
 simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
   // If there is a BOM, then we trust it.
   auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  // todo: reimplement as a one-pass algorithm.
   if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  if (length % 2 == 0) {
-    return sse_detect_encodings<utf8_validation::utf8_checker>(input, length);
-  } else {
-    if (implementation::validate_utf8(input, length)) {
-      return simdutf::encoding_type::UTF8;
-    } else {
-      return simdutf::encoding_type::unspecified;
-    }
+  int out = 0;
+  if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
+  if((length % 2) == 0) {
+    if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
+  }
+  if((length % 4) == 0) {
+    if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
   }
+  return out;
 }
 
 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
@@ -37995,7 +37470,12 @@ simdutf_warn_unused result implementation::validate_ascii_with_errors(const char
 }
 
 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = sse_validate_utf16<endianness::LITTLE>(buf, len);
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid UTF-16. protect the implementation from
+    // handling nullptr
+    return true;
+  }
+  const char16_t *tail = sse_validate_utf16<endianness::LITTLE>(buf, len);
   if (tail) {
     return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
   } else {
@@ -38004,7 +37484,12 @@ simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, s
 }
 
 simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = sse_validate_utf16<endianness::BIG>(buf, len);
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid UTF-16. protect the implementation from
+    // handling nullptr
+    return true;
+  }
+  const char16_t *tail = sse_validate_utf16<endianness::BIG>(buf, len);
   if (tail) {
     return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
   } else {
@@ -38033,7 +37518,12 @@ simdutf_warn_unused result implementation::validate_utf16be_with_errors(const ch
 }
 
 simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
-  const char32_t* tail = sse_validate_utf32le(buf, len);
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid UTF-32. protect the implementation from
+    // handling nullptr
+    return true;
+  }
+  const char32_t *tail = sse_validate_utf32le(buf, len);
   if (tail) {
     return scalar::utf32::validate(tail, len - (tail - buf));
   } else {
@@ -38042,6 +37532,11 @@ simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, siz
 }
 
 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
+  if (len == 0) {
+    // empty input is valid UTF-32. protect the implementation from
+    // handling nullptr
+    return result(error_code::SUCCESS, 0);
+  }
   result res = sse_validate_utf32le_with_errors(buf, len);
   if (res.count != len) {
     result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
@@ -38565,46 +38060,62 @@ simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char *
   const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
   size_t answer = len / sizeof(__m128i) * sizeof(__m128i);
   size_t i = 0;
-  __m128i two_64bits = _mm_setzero_si128();
-  while (i + sizeof(__m128i) <= len) {
-    __m128i runner = _mm_setzero_si128();
-    size_t iterations = (len - i) / sizeof(__m128i);
-    if (iterations > 255) {
-      iterations = 255;
-    }
-    size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i);
-    for (; i + 4*sizeof(__m128i) <= max_i; i += 4*sizeof(__m128i)) {
-      __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i));
-      __m128i input2 = _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i)));
-      __m128i input3 = _mm_loadu_si128((const __m128i *)(str + i + 2*sizeof(__m128i)));
-      __m128i input4 = _mm_loadu_si128((const __m128i *)(str + i + 3*sizeof(__m128i)));
-      __m128i input12 = _mm_add_epi8(
-                                      _mm_cmpgt_epi8(
-                                                    _mm_setzero_si128(),
-                                                    input1),
-                                      _mm_cmpgt_epi8(
-                                                    _mm_setzero_si128(),
-                                                    input2));
-      __m128i input34 = _mm_add_epi8(
-                                      _mm_cmpgt_epi8(
-                                                    _mm_setzero_si128(),
-                                                    input3),
-                                      _mm_cmpgt_epi8(
-                                                    _mm_setzero_si128(),
-                                                    input4));
-      __m128i input1234 = _mm_add_epi8(input12, input34);
-      runner = _mm_sub_epi8(runner, input1234);
-    }
-    for (; i <= max_i; i += sizeof(__m128i)) {
-      __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i));
-      runner = _mm_sub_epi8(
-          runner, _mm_cmpgt_epi8(_mm_setzero_si128(), more_input));
-    }
-    two_64bits = _mm_add_epi64(
-        two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128()));
-  }
-  answer += _mm_extract_epi64(two_64bits, 0) +
-            _mm_extract_epi64(two_64bits, 1);
+  if(answer >= 2048) { // long strings optimization
+    __m128i two_64bits = _mm_setzero_si128();
+    while (i + sizeof(__m128i) <= len) {
+      __m128i runner = _mm_setzero_si128();
+      size_t iterations = (len - i) / sizeof(__m128i);
+      if (iterations > 255) {
+        iterations = 255;
+      }
+      size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i);
+      for (; i + 4*sizeof(__m128i) <= max_i; i += 4*sizeof(__m128i)) {
+        __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i));
+        __m128i input2 = _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i)));
+        __m128i input3 = _mm_loadu_si128((const __m128i *)(str + i + 2*sizeof(__m128i)));
+        __m128i input4 = _mm_loadu_si128((const __m128i *)(str + i + 3*sizeof(__m128i)));
+        __m128i input12 = _mm_add_epi8(
+                                        _mm_cmpgt_epi8(
+                                                      _mm_setzero_si128(),
+                                                      input1),
+                                        _mm_cmpgt_epi8(
+                                                      _mm_setzero_si128(),
+                                                      input2));
+        __m128i input34 = _mm_add_epi8(
+                                        _mm_cmpgt_epi8(
+                                                      _mm_setzero_si128(),
+                                                      input3),
+                                        _mm_cmpgt_epi8(
+                                                      _mm_setzero_si128(),
+                                                      input4));
+        __m128i input1234 = _mm_add_epi8(input12, input34);
+        runner = _mm_sub_epi8(runner, input1234);
+      }
+      for (; i <= max_i; i += sizeof(__m128i)) {
+        __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i));
+        runner = _mm_sub_epi8(
+            runner, _mm_cmpgt_epi8(_mm_setzero_si128(), more_input));
+      }
+      two_64bits = _mm_add_epi64(
+          two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128()));
+    }
+    answer += _mm_extract_epi64(two_64bits, 0) +
+              _mm_extract_epi64(two_64bits, 1);
+  } else if (answer > 0) { // short string optimization
+    for(; i + 2*sizeof(__m128i) <= len; i += 2*sizeof(__m128i)) {
+      __m128i latin = _mm_loadu_si128((const __m128i*)(input + i));
+      uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin);
+      answer += count_ones(non_ascii);
+      latin = _mm_loadu_si128((const __m128i*)(input + i)+1);
+      non_ascii = (uint16_t)_mm_movemask_epi8(latin);
+      answer += count_ones(non_ascii);
+    }
+    for(; i + sizeof(__m128i) <= len; i += sizeof(__m128i)) {
+      __m128i latin = _mm_loadu_si128((const __m128i*)(input + i));
+      uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin);
+      answer += count_ones(non_ascii);
+    }
+  }
   return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast<const char *>(str + i), len - i);
 }
 
diff --git a/deps/simdutf/simdutf.h b/deps/simdutf/simdutf.h
index b656861346e927..4b534c12563072 100644
--- a/deps/simdutf/simdutf.h
+++ b/deps/simdutf/simdutf.h
@@ -1,4 +1,4 @@
-/* auto-generated on 2024-08-09 09:52:29 -0400. Do not edit! */
+/* auto-generated on 2024-09-04 18:13:32 +0200. Do not edit! */
 /* begin file include/simdutf.h */
 #ifndef SIMDUTF_H
 #define SIMDUTF_H
@@ -20,6 +20,18 @@
 #endif
 #endif
 
+
+// C++ 23
+#if !defined(SIMDUTF_CPLUSPLUS23) && (SIMDUTF_CPLUSPLUS >= 202302L)
+#define SIMDUTF_CPLUSPLUS23 1
+#endif
+
+// C++ 20
+#if !defined(SIMDUTF_CPLUSPLUS20) && (SIMDUTF_CPLUSPLUS >= 202002L)
+#define SIMDUTF_CPLUSPLUS20 1
+#endif
+
+
 // C++ 17
 #if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L)
 #define SIMDUTF_CPLUSPLUS17 1
@@ -398,6 +410,8 @@
 #define SIMDUTF_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0)
 
 #if defined(SIMDUTF_REGULAR_VISUAL_STUDIO)
+  #define SIMDUTF_DEPRECATED __declspec(deprecated)
+
 
   #define simdutf_really_inline __forceinline
   #define simdutf_never_inline __declspec(noinline)
@@ -438,6 +452,8 @@
 #else
   #define simdutf_really_inline inline
 #endif
+
+  #define SIMDUTF_DEPRECATED __attribute__((deprecated))
   #define simdutf_never_inline inline __attribute__((noinline))
 
   #define simdutf_unused __attribute__((unused))
@@ -597,7 +613,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS
 #define SIMDUTF_SIMDUTF_VERSION_H
 
 /** The version of simdutf being used (major.minor.revision) */
-#define SIMDUTF_VERSION "5.3.4"
+#define SIMDUTF_VERSION "5.5.0"
 
 namespace simdutf {
 enum {
@@ -608,11 +624,11 @@ enum {
   /**
    * The minor version (major.MINOR.revision) of simdutf being used.
    */
-  SIMDUTF_VERSION_MINOR = 3,
+  SIMDUTF_VERSION_MINOR = 5,
   /**
    * The revision (major.minor.REVISION) of simdutf being used.
    */
-  SIMDUTF_VERSION_REVISION = 4
+  SIMDUTF_VERSION_REVISION = 0
 };
 } // namespace simdutf
 
@@ -1120,67 +1136,79 @@ simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcep
  */
 simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept;
 
-  /**
-   * Convert Latin1 string into UTF8 string.
-   *
-   * This function is suitable to work with inputs from untrusted sources.
-   *
-   * @param input         the Latin1 string to convert
-   * @param length        the length of the string in bytes
-   * @param latin1_output  the pointer to buffer that can hold conversion result
-   * @return the number of written char; 0 if conversion is not possible
-   */
-  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) noexcept;
+/**
+ * Convert Latin1 string into UTF8 string.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input         the Latin1 string to convert
+ * @param length        the length of the string in bytes
+ * @param utf8_output   the pointer to buffer that can hold conversion result
+ * @return the number of written char; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) noexcept;
 
+/**
+ * Convert Latin1 string into UTF8 string with output limit.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input         the Latin1 string to convert
+ * @param length        the length of the string in bytes
+ * @param utf8_output  	the pointer to buffer that can hold conversion result
+ * @param utf8_len      the maximum output length
+ * @return the number of written char; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_latin1_to_utf8_safe(const char * input, size_t length, char* utf8_output, size_t utf8_len) noexcept;
 
-    /**
-   * Convert possibly Latin1 string into UTF-16LE string.
-   *
-   * This function is suitable to work with inputs from untrusted sources.
-   *
-   * @param input         the Latin1  string to convert
-   * @param length        the length of the string in bytes
-   * @param utf16_buffer  the pointer to buffer that can hold conversion result
-   * @return the number of written char16_t; 0 if conversion is not possible
-   */
-  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept;
+/**
+ * Convert possibly Latin1 string into UTF-16LE string.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input         the Latin1  string to convert
+ * @param length        the length of the string in bytes
+ * @param utf16_buffer  the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept;
 
-  /**
-   * Convert Latin1 string into UTF-16BE string.
-   *
-   * This function is suitable to work with inputs from untrusted sources.
-   *
-   * @param input         the Latin1 string to convert
-   * @param length        the length of the string in bytes
-   * @param utf16_buffer  the pointer to buffer that can hold conversion result
-   * @return the number of written char16_t; 0 if conversion is not possible
-   */
-  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
+/**
+ * Convert Latin1 string into UTF-16BE string.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input         the Latin1 string to convert
+ * @param length        the length of the string in bytes
+ * @param utf16_buffer  the pointer to buffer that can hold conversion result
+ * @return the number of written char16_t; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
 
-  /**
-   * Convert Latin1 string into UTF-32 string.
-   *
-   * This function is suitable to work with inputs from untrusted sources.
-   *
-   * @param input         the Latin1 string to convert
-   * @param length        the length of the string in bytes
-   * @param utf32_buffer  the pointer to buffer that can hold conversion result
-   * @return the number of written char32_t; 0 if conversion is not possible
-   */
-  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
+/**
+ * Convert Latin1 string into UTF-32 string.
+ *
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input         the Latin1 string to convert
+ * @param length        the length of the string in bytes
+ * @param utf32_buffer  the pointer to buffer that can hold conversion result
+ * @return the number of written char32_t; 0 if conversion is not possible
+ */
+simdutf_warn_unused size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
 
- /**
-   * Convert possibly broken UTF-8 string into latin1 string.
-   *
-   * During the conversion also validation of the input string is done.
-   * This function is suitable to work with inputs from untrusted sources.
-   *
-   * @param input         the UTF-8 string to convert
-   * @param length        the length of the string in bytes
-   * @param latin1_output  the pointer to buffer that can hold conversion result
-   * @return the number of written char; 0 if the input was not valid UTF-8 string or if it cannot be represented as Latin1
-   */
-  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
+/**
+ * Convert possibly broken UTF-8 string into latin1 string.
+ *
+ * During the conversion also validation of the input string is done.
+ * This function is suitable to work with inputs from untrusted sources.
+ *
+ * @param input         the UTF-8 string to convert
+ * @param length        the length of the string in bytes
+ * @param latin1_output  the pointer to buffer that can hold conversion result
+ * @return the number of written char; 0 if the input was not valid UTF-8 string or if it cannot be represented as Latin1
+ */
+simdutf_warn_unused size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
 
 /**
  * Using native endianness, convert possibly broken UTF-8 string into a UTF-16 string.
@@ -1195,7 +1223,6 @@ simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_
  */
 simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept;
 
-
 /**
  * Using native endianness, convert a Latin1 string into a UTF-16 string.
  *
@@ -1314,19 +1341,23 @@ simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t leng
  */
 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept;
 
-    /**
-   * Convert valid UTF-8 string into latin1 string.
-   *
-   * This function assumes that the input string is valid UTF-8 and that it can be represented as Latin1.
-   *
-   * This function is not BOM-aware.
-   *
-   * @param input         the UTF-8 string to convert
-   * @param length        the length of the string in bytes
-   * @param latin1_output  the pointer to buffer that can hold conversion result
-   * @return the number of written char; 0 if the input was not valid UTF-8 string
-   */
-  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
+/**
+ * Convert valid UTF-8 string into latin1 string.
+ *
+ * This function assumes that the input string is valid UTF-8 and that it can be represented as Latin1.
+ * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use convert_utf8_to_latin1 instead.
+ * The function may be removed from the library in the future.
+ *
+ * This function is not BOM-aware.
+ *
+ * @param input         the UTF-8 string to convert
+ * @param length        the length of the string in bytes
+ * @param latin1_output  the pointer to buffer that can hold conversion result
+ * @return the number of written char; 0 if the input was not valid UTF-8 string
+ */
+simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
 
 
 /**
@@ -1635,6 +1666,10 @@ simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, s
  * Using native endianness, convert UTF-16 string into Latin1 string.
  *
  * This function assumes that the input string is valid UTF-16 and that it can be represented as Latin1.
+ * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use convert_utf16_to_latin1 instead.
+ * The function may be removed from the library in the future.
  *
  * This function is not BOM-aware.
  *
@@ -1649,6 +1684,10 @@ simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * input,
  * Convert valid UTF-16LE string into Latin1 string.
  *
  * This function assumes that the input string is valid UTF-16LE and that it can be represented as Latin1.
+ * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use convert_utf16le_to_latin1 instead.
+ * The function may be removed from the library in the future.
  *
  * This function is not BOM-aware.
  *
@@ -1663,6 +1702,10 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * inpu
  * Convert valid UTF-16BE string into Latin1 string.
  *
  * This function assumes that the input string is valid UTF-16BE and that it can be represented as Latin1.
+ * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use convert_utf16be_to_latin1 instead.
+ * The function may be removed from the library in the future.
  *
  * This function is not BOM-aware.
  *
@@ -1997,6 +2040,10 @@ simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t *
  * Convert valid UTF-32 string into Latin1 string.
  *
  * This function assumes that the input string is valid UTF-32 and that it can be represented as Latin1.
+ * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes.
+ *
+ * This function is for expert users only and not part of our public API. Use convert_utf32_to_latin1 instead.
+ * The function may be removed from the library in the future.
  *
  * This function is not BOM-aware.
  *
@@ -2323,7 +2370,6 @@ simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t leng
 
 // base64_options are used to specify the base64 encoding options.
 using base64_options = uint64_t;
-using base64_options = uint64_t;
 enum : base64_options {
   base64_default = 0, /* standard base64 format (with padding) */
   base64_url = 1, /* base64url format (no padding) */
@@ -2355,7 +2401,7 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
 simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept;
 
 /**
- * Convert a base64 input to a binary ouput.
+ * Convert a base64 input to a binary output.
  *
  * This function follows the WHATWG forgiving-base64 format, which means that it will
  * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
@@ -2398,7 +2444,7 @@ simdutf_warn_unused result base64_to_binary(const char * input, size_t length, c
 simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options = base64_default) noexcept;
 
 /**
- * Convert a binary input to a base64 ouput.
+ * Convert a binary input to a base64 output.
  *
  * The default option (simdutf::base64_default) uses the characters `+` and `/` as part of its alphabet.
  * Further, it adds padding (`=`) at the end of the output to ensure that the output length is a multiple of four.
@@ -2417,7 +2463,7 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_optio
 size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
 
 /**
- * Convert a base64 input to a binary ouput.
+ * Convert a base64 input to a binary output.
  *
  * This function follows the WHATWG forgiving-base64 format, which means that it will
  * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
@@ -2452,7 +2498,7 @@ size_t binary_to_base64(const char * input, size_t length, char* output, base64_
 simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default)  noexcept;
 
 /**
- * Convert a base64 input to a binary ouput.
+ * Convert a base64 input to a binary output.
  *
  * This function follows the WHATWG forgiving-base64 format, which means that it will
  * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
@@ -2692,13 +2738,12 @@ class implementation {
    *
    * @param input         the Latin1 string to convert
    * @param length        the length of the string in bytes
-   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @param utf8_output  the pointer to buffer that can hold conversion result
    * @return the number of written char; 0 if conversion is not possible
    */
   simdutf_warn_unused virtual size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) const noexcept = 0;
 
-
-    /**
+  /**
    * Convert possibly Latin1 string into UTF-16LE string.
    *
    * This function is suitable to work with inputs from untrusted sources.
@@ -2766,6 +2811,9 @@ class implementation {
    * Convert valid UTF-8 string into latin1 string.
    *
    * This function assumes that the input string is valid UTF-8 and that it can be represented as Latin1.
+   * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes.
+   *
+   * This function is for expert users only and not part of our public API. Use convert_utf8_to_latin1 instead.
    *
    * This function is not BOM-aware.
    *
@@ -2983,7 +3031,10 @@ class implementation {
    * Convert valid UTF-16LE string into Latin1 string.
    *
    * This function assumes that the input string is valid UTF-L16LE and that it can be represented as Latin1.
-
+   * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes.
+   *
+   * This function is for expert users only and not part of our public API. Use convert_utf16le_to_latin1 instead.
+   *
    * This function is not BOM-aware.
    *
    * @param input         the UTF-16LE string to convert
@@ -2997,6 +3048,9 @@ class implementation {
    * Convert valid UTF-16BE string into Latin1 string.
    *
    * This function assumes that the input string is valid UTF16-BE and that it can be represented as Latin1.
+   * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes.
+   *
+   * This function is for expert users only and not part of our public API. Use convert_utf16be_to_latin1 instead.
    *
    * This function is not BOM-aware.
    *
@@ -3246,7 +3300,10 @@ class implementation {
   /**
    * Convert valid UTF-32 string into Latin1 string.
    *
-   * This function assumes that the input string is valid UTF-32.
+   * This function assumes that the input string is valid UTF-32 and can be represented as Latin1.
+   * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes.
+   *
+   * This function is for expert users only and not part of our public API. Use convert_utf32_to_latin1 instead.
    *
    * This function is not BOM-aware.
    *
@@ -3598,7 +3655,7 @@ class implementation {
   simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept = 0;
 
   /**
-   * Convert a base64 input to a binary ouput.
+   * Convert a base64 input to a binary output.
    *
    * This function follows the WHATWG forgiving-base64 format, which means that it will
    * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
@@ -3623,7 +3680,7 @@ class implementation {
   simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0;
 
   /**
-   * Convert a base64 input to a binary ouput.
+   * Convert a base64 input to a binary output.
    *
    * This function follows the WHATWG forgiving-base64 format, which means that it will
    * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
@@ -3657,7 +3714,7 @@ class implementation {
   simdutf_warn_unused virtual size_t base64_length_from_binary(size_t length, base64_options options = base64_default) const noexcept = 0;
 
   /**
-   * Convert a binary input to a base64 ouput.
+   * Convert a binary input to a base64 output.
    *
    * The default option (simdutf::base64_default) uses the characters `+` and `/` as part of its alphabet.
    * Further, it adds padding (`=`) at the end of the output to ensure that the output length is a multiple of four.