From bf4bf7cc6b11b50b0dc39897e5050587dccfa28e Mon Sep 17 00:00:00 2001 From: "Node.js GitHub Bot" Date: Sun, 18 Aug 2024 00:29:37 +0000 Subject: [PATCH] deps: update simdutf to 5.5.0 PR-URL: https://github.com/nodejs/node/pull/54434 Reviewed-By: Rafael Gonzaga Reviewed-By: Luigi Pinca Reviewed-By: Marco Ippolito Reviewed-By: James M Snell Reviewed-By: Michael Dawson --- deps/simdutf/simdutf.cpp | 1757 ++++++++++++++------------------------ deps/simdutf/simdutf.h | 231 +++-- 2 files changed, 778 insertions(+), 1210 deletions(-) diff --git a/deps/simdutf/simdutf.cpp b/deps/simdutf/simdutf.cpp index 2e685aab872f93..b04ec9773e9215 100644 --- a/deps/simdutf/simdutf.cpp +++ b/deps/simdutf/simdutf.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2024-08-09 09:52:29 -0400. Do not edit! */ +/* auto-generated on 2024-09-04 18:13:32 +0200. Do not edit! */ /* begin file src/simdutf.cpp */ #include "simdutf.h" // We include base64_tables once. @@ -5643,7 +5643,7 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length, base64 buffer[idx] = uint8_t(code); if (is_eight_byte(c) && code <= 63) { idx++; - } else if (code > 64 || !is_eight_byte(c)) { + } else if (code > 64 || !scalar::base64::is_eight_byte(c)) { return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } else { // We have a space or a newline. We ignore it. @@ -5698,7 +5698,7 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length, base64 } } -// like base64_tail_decode, but it will not write past the end of the ouput buffer. +// like base64_tail_decode, but it will not write past the end of the output buffer. // outlen is modified to reflect the number of bytes written. // This functions assumes that the padding (=) has been removed. 
template @@ -5747,7 +5747,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src, buffer[idx] = uint8_t(code); if (is_eight_byte(c) && code <= 63) { idx++; - } else if (code > 64 || !is_eight_byte(c)) { + } else if (code > 64 || !scalar::base64::is_eight_byte(c)) { outlen = size_t(dst - dstinit); return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } else { @@ -5908,6 +5908,99 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_optio #endif /* end file src/scalar/base64.h */ +/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */ +#ifndef SIMDUTF_LATIN1_TO_UTF8_H +#define SIMDUTF_LATIN1_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace latin1_to_utf8 { + +inline size_t convert(const char* buf, size_t len, char* utf8_output) { + const unsigned char *data = reinterpret_cast(buf); + size_t pos = 0; + size_t utf8_pos = 0; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything + if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. 
all of them are zero, then everything is ASCII + size_t final_pos = pos + 16; + while(pos < final_pos) { + utf8_output[utf8_pos++] = char(buf[pos]); + pos++; + } + continue; + } + } + + unsigned char byte = data[pos]; + if((byte & 0x80) == 0) { // if ASCII + // will generate one UTF-8 bytes + utf8_output[utf8_pos++] = char(byte); + pos++; + } else { + // will generate two UTF-8 bytes + utf8_output[utf8_pos++] = char((byte>>6) | 0b11000000); + utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000); + pos++; + } + } + return utf8_pos; +} + +inline size_t convert_safe(const char* buf, size_t len, char* utf8_output, size_t utf8_len) { + const unsigned char *data = reinterpret_cast(buf); + size_t pos = 0; + size_t skip_pos = 0; + size_t utf8_pos = 0; + while (pos < len && utf8_pos < utf8_len) { + // try to convert the next block of 16 ASCII bytes + if (pos >= skip_pos && pos + 16 <= len && utf8_pos + 16 <= utf8_len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything + if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. 
all of them are zero, then everything is ASCII + ::memcpy(utf8_output + utf8_pos, buf + pos, 16); + utf8_pos += 16; + pos += 16; + } else { + // At least one of the next 16 bytes are not ASCII, we will process them one by one + skip_pos = pos + 16; + } + } else { + const auto byte = data[pos]; + if((byte & 0x80) == 0) { // if ASCII + // will generate one UTF-8 bytes + utf8_output[utf8_pos++] = char(byte); + pos++; + } else if (utf8_pos + 2 <= utf8_len) { + // will generate two UTF-8 bytes + utf8_output[utf8_pos++] = char((byte>>6) | 0b11000000); + utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000); + pos++; + } else { + break; + } + } + } + return utf8_pos; +} + +} // latin1_to_utf8 namespace +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */ namespace simdutf { bool implementation::supported_by_runtime_system() const { @@ -6074,7 +6167,7 @@ class detect_best_supported_implementation_on_first_use final : public implement } simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept final override { - return set_best()->convert_latin1_to_utf8(buf, len,utf8_output); + return set_best()->convert_latin1_to_utf8(buf, len, utf8_output); } simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override { @@ -7247,6 +7340,31 @@ simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, si } + +simdutf_warn_unused size_t convert_latin1_to_utf8_safe(const char * buf, size_t len, char* utf8_output, size_t utf8_len) noexcept { + const auto start{utf8_output}; + + while (true) { + // convert_latin1_to_utf8 will never write more than input length * 2 + auto read_len = std::min(len, utf8_len >> 1); + if (read_len <= 16) { + break; + } + + const auto write_len = simdutf::convert_latin1_to_utf8(buf, read_len, utf8_output); + + utf8_output 
+= write_len; + utf8_len -= write_len; + buf += read_len; + len -= read_len; + } + + utf8_output += scalar::latin1_to_utf8::convert_safe(buf, len, utf8_output, utf8_len); + + return utf8_output - start; +} + + simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept { return base64_to_binary_safe_impl(input, length, output, outlen, options); } @@ -13578,59 +13696,6 @@ inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf #endif /* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */ -/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */ -#ifndef SIMDUTF_LATIN1_TO_UTF8_H -#define SIMDUTF_LATIN1_TO_UTF8_H - -namespace simdutf { -namespace scalar { -namespace { -namespace latin1_to_utf8 { - -inline size_t convert(const char* buf, size_t len, char* utf8_output) { - const unsigned char *data = reinterpret_cast(buf); - size_t pos = 0; - char* start{utf8_output}; - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 1000 1000, so it makes sense to concatenate everything - if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, e.g. 
all of them are zero, then everything is ASCII - size_t final_pos = pos + 16; - while(pos < final_pos) { - *utf8_output++ = char(buf[pos]); - pos++; - } - continue; - } - } - - unsigned char byte = data[pos]; - if((byte & 0x80) == 0) { // if ASCII - // will generate one UTF-8 bytes - *utf8_output++ = char(byte); - pos++; - } else { - // will generate two UTF-8 bytes - *utf8_output++ = char((byte>>6) | 0b11000000); - *utf8_output++ = char((byte & 0b111111) | 0b10000000); - pos++; - } - } - return utf8_output - start; -} - -} // latin1_to_utf8 namespace -} // unnamed namespace -} // namespace scalar -} // namespace simdutf - -#endif -/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */ /* begin file src/scalar/latin1_to_utf16/latin1_to_utf16.h */ #ifndef SIMDUTF_LATIN1_TO_UTF16_H #define SIMDUTF_LATIN1_TO_UTF16_H @@ -14127,26 +14192,32 @@ inline size_t convert_valid(const char32_t *buf, size_t len, char *latin1_output size_t pos = 0; while (pos < len) { - utf32_char = (uint32_t)data[pos]; - - if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1 - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if ((v & 0xFFFFFF00FFFFFF00) == 0) { - *latin1_output++ = char(buf[pos]); - *latin1_output++ = char(buf[pos+1]); - pos += 2; - continue; - } - } - *latin1_output++ = (char)(utf32_char & 0xFF); - pos++; - + utf32_char = (uint32_t) data[pos]; + + if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are Latin1 + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF00FFFFFF00) == 0) { + *latin1_output++ = char(buf[pos]); + *latin1_output++ = char(buf[pos + 1]); + pos += 2; + continue; + } else { + // output can not be represented in latin1 + return 0; + } + } + if ((utf32_char & 0xFFFFFF00) == 0) { + *latin1_output++ = char(utf32_char); + } else { + // output can not be represented in latin1 + return 0; + } + pos++; } return latin1_output - start; } - } // utf32_to_latin1 namespace } 
// unnamed namespace } // namespace scalar @@ -14272,213 +14343,6 @@ simdutf_really_inline uint16x8_t convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in return composed; } -/* begin file src/arm64/arm_detect_encodings.cpp */ -template -// len is known to be a multiple of 2 when this is called -int arm_detect_encodings(const char * buf, size_t len) { - const char* start = buf; - const char* end = buf + len; - - bool is_utf8 = true; - bool is_utf16 = true; - bool is_utf32 = true; - - int out = 0; - - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - - uint32x4_t currentmax = vmovq_n_u32(0x0); - - checker check{}; - - while(end - buf >= 64) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - uint16x8_t secondin = vld1q_u16(reinterpret_cast(buf) + simd16::SIZE / sizeof(char16_t)); - uint16x8_t thirdin = vld1q_u16(reinterpret_cast(buf) + 2*simd16::SIZE / sizeof(char16_t)); - uint16x8_t fourthin = vld1q_u16(reinterpret_cast(buf) + 3*simd16::SIZE / sizeof(char16_t)); - - const auto u0 = simd16(in); - const auto u1 = simd16(secondin); - const auto u2 = simd16(thirdin); - const auto u3 = simd16(fourthin); - - const auto v0 = u0.shr<8>(); - const auto v1 = u1.shr<8>(); - const auto v2 = u2.shr<8>(); - const auto v3 = u3.shr<8>(); - - const auto in16 = simd16::pack(v0, v1); - const auto nextin16 = simd16::pack(v2, v3); - - const uint64_t surrogates_wordmask0 = ((in16 & v_f8) == v_d8).to_bitmask64(); - const uint64_t surrogates_wordmask1 = ((nextin16 & v_f8) == v_d8).to_bitmask64(); - - // Check for surrogates - if (surrogates_wordmask0 != 0 || surrogates_wordmask1 != 0) { - // Cannot be UTF8 - is_utf8 = false; - // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates - // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. 
- // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant - // bytes of a 32-bit word since they always come in pairs in UTF-16LE. - // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units. - - if (((surrogates_wordmask0 | surrogates_wordmask1) & 0xf0f0f0f0f0f0f0f0) != 0) { - is_utf32 = false; - // Code from arm_validate_utf16le.cpp - // Not efficient, we do not process surrogates_wordmask1 - const char16_t * input = reinterpret_cast(buf); - const char16_t* end16 = reinterpret_cast(start) + len/2; - - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - - const uint64_t V0 = ~surrogates_wordmask0; - - const auto vH0 = ((in16 & v_fc) == v_dc); - const uint64_t H0 = vH0.to_bitmask64(); - - const uint64_t L0 = ~H0 & surrogates_wordmask0; - - const uint64_t a0 = L0 & (H0 >> 4); - - const uint64_t b0 = a0 << 4; - - const uint64_t c0 = V0 | a0 | b0; - if (c0 == ~0ull) { - input += 16; - } else if (c0 == 0xfffffffffffffffull) { - input += 15; - } else { - is_utf16 = false; - break; - } - - while (input + 16 < end16) { - const auto in0 = simd16(input); - const auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - const simd8 in_16 = simd16::pack(t0, t1); - - const uint64_t surrogates_wordmask = ((in_16 & v_f8) == v_d8).to_bitmask64(); - if(surrogates_wordmask == 0) { - input += 16; - } else { - const uint64_t V = ~surrogates_wordmask; - - const auto vH = ((in_16 & v_fc) == v_dc); - const uint64_t H = vH.to_bitmask64(); - - const uint64_t L = ~H & surrogates_wordmask; - - const uint64_t a = L & (H >> 4); - - const uint64_t b = a << 4; - - const uint64_t c = V | a | b; - if (c == ~0ull) { - input += 16; - } else if (c == 0xfffffffffffffffull) { - input += 15; - } else { - is_utf16 = false; - break; - } - } - } - } else { - is_utf16 = false; - // Check for UTF-32 - if (len % 4 == 
0) { - const char32_t * input = reinterpret_cast(buf); - const char32_t* end32 = reinterpret_cast(start) + len/4; - - // Must start checking for surrogates - uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); - const uint32x4_t offset = vmovq_n_u32(0xffff2000); - const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); - - const uint32x4_t in32 = vreinterpretq_u32_u16(in); - const uint32x4_t secondin32 = vreinterpretq_u32_u16(secondin); - const uint32x4_t thirdin32 = vreinterpretq_u32_u16(thirdin); - const uint32x4_t fourthin32 = vreinterpretq_u32_u16(fourthin); - - currentmax = vmaxq_u32(in32,currentmax); - currentmax = vmaxq_u32(secondin32,currentmax); - currentmax = vmaxq_u32(thirdin32,currentmax); - currentmax = vmaxq_u32(fourthin32,currentmax); - - currentoffsetmax = vmaxq_u32(vaddq_u32(in32, offset), currentoffsetmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(secondin32, offset), currentoffsetmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(thirdin32, offset), currentoffsetmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(fourthin32, offset), currentoffsetmax); - - while (input + 4 < end32) { - const uint32x4_t in_32 = vld1q_u32(reinterpret_cast(input)); - currentmax = vmaxq_u32(in_32,currentmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(in_32, offset), currentoffsetmax); - input += 4; - } - - uint32x4_t forbidden_words = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(vmaxvq_u32(forbidden_words) != 0) { - is_utf32 = false; - } - } else { - is_utf32 = false; - } - } - break; - } - // If no surrogate, validate under other encodings as well - - // UTF-32 validation - currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax); - currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax); - currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax); - currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin),currentmax); - - // UTF-8 validation - // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h - 
simd::simd8x64 in8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(secondin), vreinterpretq_u8_u16(thirdin), vreinterpretq_u8_u16(fourthin)); - check.check_next_input(in8); - - buf += 64; - } - - // Check which encodings are possible - - if (is_utf8) { - if (static_cast(buf - start) != len) { - uint8_t block[64]{}; - std::memset(block, 0x20, 64); - std::memcpy(block, buf, len - (buf - start)); - simd::simd8x64 in(block); - check.check_next_input(in); - } - if (!check.errors()) { - out |= simdutf::encoding_type::UTF8; - } - } - - if (is_utf16 && scalar::utf16::validate(reinterpret_cast(buf), (len - (buf - start))/2)) { - out |= simdutf::encoding_type::UTF16_LE; - } - - if (is_utf32 && (len % 4 == 0)) { - const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); - uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); - if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast(buf), (len - (buf - start))/4)) { - out |= simdutf::encoding_type::UTF32_LE; - } - } - - return out; -} -/* end file src/arm64/arm_detect_encodings.cpp */ - /* begin file src/arm64/arm_validate_utf16.cpp */ template const char16_t* arm_validate_utf16(const char16_t* input, size_t size) { @@ -14985,6 +14849,12 @@ size_t convert_masked_utf8_to_utf16(const char *input, // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // added to fix issue https://github.com/simdutf/simdutf/issues/514 + // We only want to write 2 * 16-bit code units when that is actually what we have. + // Unfortunately, we cannot trust the input. So it is possible to get 0xff as an input byte + // and it should not result in a surrogate pair. We need to check for that. 
+ uint32_t permbuffer[4]; + vst1q_u32(permbuffer, perm); // Mask the low and middle bytes // 00000000 00000000 00000000 0ddddddd uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f)); @@ -15040,11 +14910,14 @@ size_t convert_masked_utf8_to_utf16(const char *input, // Attempting to shuffle and store would be complex, just scalarize. uint32_t buffer[4]; vst1q_u32(buffer, selected); - // Test for the top bit of the surrogate mask. - const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : 0x00800000; + // Test for the top bit of the surrogate mask. Remove due to issue 514 + // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : 0x00800000; for (size_t i = 0; i < 3; i++) { // Surrogate - if (buffer[i] & SURROGATE_MASK) { + // Used to be if (buffer[i] & SURROGATE_MASK) { + // See discussion above. + // patch for issue https://github.com/simdutf/simdutf/issues/514 + if((permbuffer[i] & 0xf8000000) == 0xf0000000) { utf16_output[0] = uint16_t(buffer[i] >> 16); utf16_output[1] = uint16_t(buffer[i] & 0xFFFF); utf16_output += 2; @@ -15083,14 +14956,14 @@ size_t convert_masked_utf8_to_utf32(const char *input, // This results in more instructions but, potentially, also higher speeds. // // We first try a few fast paths. - if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) { - // We process in chunks of 16 bytes. + if(utf8_end_of_code_point_mask == 0xfff) { + // We process in chunks of 12 bytes. // use fast implementation in src/simdutf/arm64/simd.h // Ideally the compiler can keep the tables in registers. simd8 temp{vreinterpretq_s8_u8(in)}; temp.store_ascii_as_utf32_tbl(utf32_out); - utf32_output += 16; // We wrote 16 32-bit characters. - return 16; // We consumed 16 bytes. + utf32_output += 12; // We wrote 12 32-bit characters. + return 12; // We consumed 12 bytes. } if(input_utf8_end_of_code_point_mask == 0x924) { // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte UTF-32 code units. 
@@ -15259,11 +15132,11 @@ size_t convert_masked_utf8_to_latin1(const char *input, // We first try a few fast paths. // The obvious first test is ASCII, which actually consumes the full 16. - if((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) { - // We process in chunks of 16 bytes + if(utf8_end_of_code_point_mask == 0xfff) { + // We process in chunks of 12 bytes vst1q_u8(reinterpret_cast(latin1_output), in); - latin1_output += 16; // We wrote 16 18-bit characters. - return 16; // We consumed 16 bytes. + latin1_output += 12; // We wrote 12 18-bit characters. + return 12; // We consumed 12 bytes. } /// We do not have a fast path available, or the fast path is unimportant, so we fallback. const uint8_t idx = @@ -16832,6 +16705,7 @@ static inline void compress(uint8x16_t data, uint16_t mask, char *output) { struct block64 { uint8x16_t chunks[4]; }; + static_assert(sizeof(block64) == 64, "block64 is not 64 bytes"); template uint64_t to_base64_mask(block64 *b, bool *error) { uint8x16_t v0f = vdupq_n_u8(0xf); @@ -16853,6 +16727,7 @@ template uint64_t to_base64_mask(block64 *b, bool *error) { uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f); uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f); uint8x16_t lo_nibbles3 = vandq_u8(b->chunks[3], v0f); + // Needed by the decoding step. 
uint8x16_t hi_nibbles0 = vshrq_n_u8(b->chunks[0], 4); uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4); @@ -16862,20 +16737,16 @@ template uint64_t to_base64_mask(block64 *b, bool *error) { #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO if (base64_url) { lut_lo = - simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x61, 0xe1, 0xf4, 0xf5, 0xa5, 0xf4, 0xf4); + simdutf_make_uint8x16_t(0x3a,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x61,0xe1,0xf4,0xe5,0xa5,0xf4,0xf4); } else { lut_lo = - simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x61, 0xe1, 0xb4, 0xf5, 0xe5, 0xf4, 0xb4); + simdutf_make_uint8x16_t(0x3a,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x61,0xe1,0xb4,0xe5,0xe5,0xf4,0xb4); } #else if (base64_url) { - lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x61, 0xe1, 0xf4, 0xf5, 0xa5, 0xf4, 0xf4}; + lut_lo = uint8x16_t{0x3a,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x61,0xe1,0xf4,0xe5,0xa5,0xf4,0xf4}; } else { - lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x61, 0xe1, 0xb4, 0xf5, 0xe5, 0xf4, 0xb4}; + lut_lo = uint8x16_t{0x3a,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x70,0x61,0xe1,0xb4,0xe5,0xe5,0xf4,0xb4}; } #endif uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0); @@ -17047,19 +16918,22 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen, base64_options options) { const uint8_t *to_base64 = base64_url ? 
tables::base64::to_base64_url_value : tables::base64::to_base64_value; + size_t equallocation = srclen; // location of the first padding character if any // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) { srclen--; } size_t equalsigns = 0; if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; srclen--; equalsigns = 1; // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) { srclen--; } if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; srclen--; equalsigns = 2; } @@ -17079,21 +16953,23 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen, src += 64; bool error = false; uint64_t badcharmask = to_base64_mask(&b, &error); - if(badcharmask) - if (error) { - src -= 64; - - while (src < srcend && to_base64[uint8_t(*src)] <= 64) { - src++; + if(badcharmask){ + if (error) { + src -= 64; + while (src < srcend && scalar::base64::is_eight_byte(*src) && to_base64[uint8_t(*src)] <= 64) { + src++; + } + if(src < srcend){ + // should never happen + } + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } if (badcharmask != 0) { // optimization opportunity: check for simple masks like those made of // continuous 1s followed by continuous 0s. And masks containing a // single bad character. 
- bufferptr += compress_block(&b, badcharmask, bufferptr); } else { // optimization opportunity: if bufferptr == buffer and mask == 0, we @@ -17120,7 +16996,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen, while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); - if (val > 64) { + if (!scalar::base64::is_eight_byte(*src) || val > 64) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } bufferptr += (val <= 63); @@ -17163,7 +17039,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen, if (leftover > 0) { while (leftover < 4 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; - if (val > 64) { + if (!scalar::base64::is_eight_byte(*src) || val > 64) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } buffer_start[leftover] = char(val); @@ -17215,13 +17091,14 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen, // additional checks if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { r.error = error_code::INVALID_BASE64_CHARACTER; + r.count = equallocation; } } return r; } if(equalsigns > 0) { if((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, size_t(dst - dstinit)}; + return {INVALID_BASE64_CHARACTER, equallocation}; } } return {SUCCESS, size_t(dst - dstinit)}; @@ -17879,6 +17756,9 @@ using namespace simd; this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if(utf8_continuation_mask & 1) { + return 0; // error + } uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -17953,14 +17833,14 @@ using namespace simd; this->check_utf8_bytes(input.chunks[2], input.chunks[1]); 
this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - if (errors()) { + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { // rewind_and_convert_with_errors will seek a potential error from in+pos onward, // with the ability to go back up to pos bytes, and read size-pos bytes forward. result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); res.count += pos; return res; } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -18198,10 +18078,10 @@ using namespace simd; size_t pos = 0; char32_t* start{utf32_output}; // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, + // 8 words when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // UTF-8 input, so we are going to go back from the end counting 16 leading bytes, // to give us a good margin. 
size_t leading_byte = 0; size_t margin = size; @@ -18231,6 +18111,9 @@ using namespace simd; this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if(utf8_continuation_mask & 1) { + return 0; // we have an error + } uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -18304,12 +18187,12 @@ using namespace simd; this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - if (errors()) { + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); res.count += pos; return res; } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -18858,15 +18741,16 @@ simdutf_warn_unused int implementation::detect_encodings(const char * input, siz // If there is a BOM, then we trust it. auto bom_encoding = simdutf::BOM::check_bom(input, length); if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - if (length % 2 == 0) { - return arm_detect_encodings(input, length); - } else { - if (implementation::validate_utf8(input, length)) { - return simdutf::encoding_type::UTF8; - } else { - return simdutf::encoding_type::unspecified; - } + // todo: reimplement as a one-pass algorithm. 
+ int out = 0; + if(validate_utf8(input, length)) { out |= encoding_type::UTF8; } + if((length % 2) == 0) { + if(validate_utf16le(reinterpret_cast(input), length/2)) { out |= encoding_type::UTF16_LE; } + } + if((length % 4) == 0) { + if(validate_utf32(reinterpret_cast(input), length/4)) { out |= encoding_type::UTF32_LE; } } + return out; } simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { @@ -18886,6 +18770,10 @@ simdutf_warn_unused result implementation::validate_ascii_with_errors(const char } simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. + return true; + } const char16_t* tail = arm_validate_utf16(buf, len); if (tail) { return scalar::utf16::validate(tail, len - (tail - buf)); @@ -18895,7 +18783,11 @@ simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, s } simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { - const char16_t* tail = arm_validate_utf16(buf, len); + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. + return true; + } + const char16_t *tail = arm_validate_utf16(buf, len); if (tail) { return scalar::utf16::validate(tail, len - (tail - buf)); } else { @@ -18930,6 +18822,10 @@ simdutf_warn_unused result implementation::validate_utf16be_with_errors(const ch } simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. 
+ return true; + } const char32_t* tail = arm_validate_utf32le(buf, len); if (tail) { return scalar::utf32::validate(tail, len - (tail - buf)); @@ -19596,6 +19492,8 @@ size_t implementation::binary_to_base64(const char * input, size_t length, char* +#include +#include namespace simdutf { namespace fallback { @@ -19604,6 +19502,7 @@ simdutf_warn_unused int implementation::detect_encodings(const char * input, siz // If there is a BOM, then we trust it. auto bom_encoding = simdutf::BOM::check_bom(input, length); if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } + // todo: reimplement as a one-pass algorithm. int out = 0; if(validate_utf8(input, length)) { out |= encoding_type::UTF8; } if((length % 2) == 0) { @@ -19612,7 +19511,6 @@ simdutf_warn_unused int implementation::detect_encodings(const char * input, siz if((length % 4) == 0) { if(validate_utf32(reinterpret_cast(input), length/4)) { out |= encoding_type::UTF32_LE; } } - return out; } @@ -19657,7 +19555,7 @@ simdutf_warn_unused result implementation::validate_utf32_with_errors(const char } simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) const noexcept { - return scalar::latin1_to_utf8::convert(buf,len,utf8_output); + return scalar::latin1_to_utf8::convert(buf, len, utf8_output); } simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept { @@ -19669,7 +19567,7 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* } simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept { - return scalar::latin1_to_utf32::convert(buf,len,utf32_output); + return scalar::latin1_to_utf32::convert(buf, len, utf32_output); } simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept { @@ 
-19870,7 +19768,31 @@ simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t lengt } simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * input, size_t length) const noexcept { - return scalar::latin1::utf8_length_from_latin1(input,length); + size_t answer = length; + size_t i = 0; + auto pop = [](uint64_t v) { + return (size_t)(((v>>7) & UINT64_C(0x0101010101010101)) * UINT64_C(0x0101010101010101) >> 56); + }; + for(; i + 32 <= length; i += 32) { + uint64_t v; + memcpy(&v, input + i, 8); + answer += pop(v); + memcpy(&v, input + i + 8, sizeof(v)); + answer += pop(v); + memcpy(&v, input + i + 16, sizeof(v)); + answer += pop(v); + memcpy(&v, input + i + 24, sizeof(v)); + answer += pop(v); + } + for(; i + 8 <= length; i += 8) { + uint64_t v; + memcpy(&v, input + i, sizeof(v)); + answer += pop(v); + } + for(; i + 1 <= length; i += 1) { + answer += static_cast(input[i]) >> 7; + } + return answer; } simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept { @@ -19924,12 +19846,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t equallocation = length; // location of the first padding character if any size_t equalsigns = 0; if(length > 0 && input[length - 1] == '=') { + equallocation = length - 1; length -= 1; equalsigns++; while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; } if(length > 0 && input[length - 1] == '=') { + equallocation = length - 1; equalsigns++; length -= 1; } @@ -19961,12 +19885,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char16_t * inp size_t equallocation = length; // location of the first padding character if any size_t equalsigns = 0; if(length > 0 && input[length - 1] == '=') { + equallocation = length - 1; length -= 1; equalsigns++; while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; } if(length 
> 0 && input[length - 1] == '=') { + equallocation = length - 1; equalsigns++; length -= 1; } @@ -23078,19 +23004,22 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) { const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value; + size_t equallocation = srclen; // location of the first padding character if any size_t equalsigns = 0; // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) { srclen--; } if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; srclen--; equalsigns = 1; // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) { srclen--; } if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; srclen--; equalsigns = 2; } @@ -23113,7 +23042,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, uint64_t badcharmask = to_base64_mask(&b, &error); if (error) { src -= 64; - while (src < srcend && to_base64[uint8_t(*src)] <= 64) { + while (src < srcend && scalar::base64::is_eight_byte(*src) && to_base64[uint8_t(*src)] <= 64) { src++; } return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; @@ -23151,7 +23080,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); - if (val > 64) { + if (!scalar::base64::is_eight_byte(*src) || val > 64) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } bufferptr += (val <= 63); @@ -23192,7 +23121,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t 
srclen, if (leftover > 0) { while (leftover < 4 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; - if (val > 64) { + if (!scalar::base64::is_eight_byte(*src) || val > 64) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } buffer_start[leftover] = char(val); @@ -23244,13 +23173,14 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, // additional checks if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { r.error = error_code::INVALID_BASE64_CHARACTER; + r.count = equallocation; } } return r; } if(equalsigns > 0) { if((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, size_t(dst - dstinit)}; + return {INVALID_BASE64_CHARACTER, equallocation}; } } return {SUCCESS, size_t(dst - dstinit)}; @@ -23272,136 +23202,17 @@ implementation::detect_encodings(const char *input, size_t length) const noexcept { // If there is a BOM, then we trust it. auto bom_encoding = simdutf::BOM::check_bom(input, length); + // todo: convert to a one-pass algorithm if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - if (length % 2 == 0) { - const char *buf = input; - - const char *start = buf; - const char *end = input + length; - - bool is_utf8 = true; - bool is_utf16 = true; - bool is_utf32 = true; - - int out = 0; - - avx512_utf8_checker checker{}; - __m512i currentmax = _mm512_setzero_si512(); - while (end - buf >= 64) { - __m512i in = _mm512_loadu_si512((__m512i *)buf); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if (surrogates) { - is_utf8 = false; - - // Can still be either UTF-16LE or UTF-32 depending on the positions - // of the surrogates To be valid UTF-32, a surrogate cannot be in the - // two most significant bytes of any 32-bit word. 
On the other hand, to - // be valid UTF-16LE, at least one surrogate must be in the two most - // significant bytes of a 32-bit word since they always come in pairs in - // UTF-16LE. Note that we always proceed in multiple of 4 before this - // point so there is no offset in 32-bit code units. - - if ((surrogates & 0xaaaaaaaa) != 0) { - is_utf32 = false; - __mmask32 highsurrogates = _mm512_cmplt_epu16_mask( - diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - return simdutf::encoding_type::unspecified; - } - - bool ends_with_high = ((highsurrogates & 0x80000000) != 0); - if (ends_with_high) { - buf += - 31 * - sizeof(char16_t); // advance only by 31 code units so that we start - // with the high surrogate on the next round. - } else { - buf += 32 * sizeof(char16_t); - } - is_utf16 = validate_utf16le(reinterpret_cast(buf), - (end - buf) / sizeof(char16_t)); - if (!is_utf16) { - return simdutf::encoding_type::unspecified; - - } else { - return simdutf::encoding_type::UTF16_LE; - } - - } else { - is_utf16 = false; - // Check for UTF-32 - if (length % 4 == 0) { - const char32_t *input32 = reinterpret_cast(buf); - const char32_t *end32 = - reinterpret_cast(start) + length / 4; - if (validate_utf32(input32, end32 - input32)) { - return simdutf::encoding_type::UTF32_LE; - } - } - return simdutf::encoding_type::unspecified; - } - } - // If no surrogate, validate under other encodings as well - - // UTF-32 validation - currentmax = _mm512_max_epu32(in, currentmax); - - // UTF-8 validation - checker.check_next_input(in); - - buf += 64; - } - - // Check which encodings are possible - - if (is_utf8) { - size_t current_length = static_cast(buf - start); - if (current_length != length) { - const __m512i utf8 = _mm512_maskz_loadu_epi8( - (UINT64_C(1) << (length - current_length)) - 1, (const __m512i *)buf); - checker.check_next_input(utf8); - } - 
checker.check_eof(); - if (!checker.errors()) { - out |= simdutf::encoding_type::UTF8; - } - } - - if (is_utf16 && scalar::utf16::validate( - reinterpret_cast(buf), - (length - (buf - start)) / 2)) { - out |= simdutf::encoding_type::UTF16_LE; - } - - if (is_utf32 && (length % 4 == 0)) { - size_t leftover = length - static_cast(buf - start); - currentmax = _mm512_max_epu32( - _mm512_maskz_loadu_epi8( - (UINT64_C(1) << leftover) - 1, - (const __m512i *)buf), - currentmax); - __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff), - _MM_CMPINT_GT); - if (outside_range == 0) { - out |= simdutf::encoding_type::UTF32_LE; - } else { - } - //} - /*} else { - out |= simdutf::encoding_type::UTF32_LE; - }*/ - } - - return out; - } else if (implementation::validate_utf8(input, length)) { - return simdutf::encoding_type::UTF8; - } else { - return simdutf::encoding_type::unspecified; + int out = 0; + if(validate_utf8(input, length)) { out |= encoding_type::UTF8; } + if((length % 2) == 0) { + if(validate_utf16le(reinterpret_cast(input), length/2)) { out |= encoding_type::UTF16_LE; } } + if((length % 4) == 0) { + if(validate_utf32(reinterpret_cast(input), length/4)) { out |= encoding_type::UTF32_LE; } + } + return out; } simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { @@ -23673,7 +23484,8 @@ simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, siz if (tail) { return scalar::utf32::validate(tail, len - (tail - buf)); } else { - return false; + // we come here if there was an error, or buf was nullptr which may happen for empty input. 
+ return len == 0; } } @@ -23769,7 +23581,7 @@ simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(co return {simdutf::SUCCESS, written}; } size_t pos = obuf - buf; - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, buf + pos, len - pos, olatin1_output); + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, buf + pos, len - pos, latin1_output); res.count += pos; return res; } @@ -24506,60 +24318,68 @@ simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * const uint8_t *str = reinterpret_cast(input); size_t answer = length / sizeof(__m512i) * sizeof(__m512i); size_t i = 0; - unsigned char v_0xFF = 0xff; - __m512i eight_64bits = _mm512_setzero_si512(); - while (i + sizeof(__m512i) <= length) { - __m512i runner = _mm512_setzero_si512(); - size_t iterations = (length - i) / sizeof(__m512i); - if (iterations > 255) { - iterations = 255; - } - size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); - for (; i + 4*sizeof(__m512i) <= max_i; i += 4*sizeof(__m512i)) { - // Load four __m512i vectors - __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i)); - __m512i input2 = _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i))); - __m512i input3 = _mm512_loadu_si512((const __m512i *)(str + i + 2*sizeof(__m512i))); - __m512i input4 = _mm512_loadu_si512((const __m512i *)(str + i + 3*sizeof(__m512i))); - - // Generate four masks - __mmask64 mask1 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1); - __mmask64 mask2 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2); - __mmask64 mask3 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3); - __mmask64 mask4 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4); - // Apply the masks and subtract from the runner - __m512i not_ascii1 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF); - __m512i not_ascii2 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF); - __m512i 
not_ascii3 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF); - __m512i not_ascii4 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF); - - runner = _mm512_sub_epi8(runner, not_ascii1); - runner = _mm512_sub_epi8(runner, not_ascii2); - runner = _mm512_sub_epi8(runner, not_ascii3); - runner = _mm512_sub_epi8(runner, not_ascii4); - } - - for (; i <= max_i; i += sizeof(__m512i)) { - __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i)); - - __mmask64 mask = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input); - __m512i not_ascii = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF); - runner = _mm512_sub_epi8(runner, not_ascii); + if(answer >= 2048) { // long strings optimization + unsigned char v_0xFF = 0xff; + __m512i eight_64bits = _mm512_setzero_si512(); + while (i + sizeof(__m512i) <= length) { + __m512i runner = _mm512_setzero_si512(); + size_t iterations = (length - i) / sizeof(__m512i); + if (iterations > 255) { + iterations = 255; + } + size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); + for (; i + 4*sizeof(__m512i) <= max_i; i += 4*sizeof(__m512i)) { + // Load four __m512i vectors + __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i)); + __m512i input2 = _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i))); + __m512i input3 = _mm512_loadu_si512((const __m512i *)(str + i + 2*sizeof(__m512i))); + __m512i input4 = _mm512_loadu_si512((const __m512i *)(str + i + 3*sizeof(__m512i))); + + // Generate four masks + __mmask64 mask1 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1); + __mmask64 mask2 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2); + __mmask64 mask3 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3); + __mmask64 mask4 = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4); + // Apply the masks and subtract from the runner + __m512i not_ascii1 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF); + __m512i not_ascii2 = 
_mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF); + __m512i not_ascii3 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF); + __m512i not_ascii4 = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF); + + runner = _mm512_sub_epi8(runner, not_ascii1); + runner = _mm512_sub_epi8(runner, not_ascii2); + runner = _mm512_sub_epi8(runner, not_ascii3); + runner = _mm512_sub_epi8(runner, not_ascii4); + } + + for (; i <= max_i; i += sizeof(__m512i)) { + __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i)); + + __mmask64 mask = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input); + __m512i not_ascii = _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF); + runner = _mm512_sub_epi8(runner, not_ascii); + } + + eight_64bits = _mm512_add_epi64(eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512())); + } + + __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0); + __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1); + answer += (size_t)_mm256_extract_epi64(first_half, 0) + + (size_t)_mm256_extract_epi64(first_half, 1) + + (size_t)_mm256_extract_epi64(first_half, 2) + + (size_t)_mm256_extract_epi64(first_half, 3) + + (size_t)_mm256_extract_epi64(second_half, 0) + + (size_t)_mm256_extract_epi64(second_half, 1) + + (size_t)_mm256_extract_epi64(second_half, 2) + + (size_t)_mm256_extract_epi64(second_half, 3); + } else if (answer > 0) { + for(; i + sizeof(__m512i) <= length; i += sizeof(__m512i)) { + __m512i latin = _mm512_loadu_si512((const __m512i*)(str + i)); + uint64_t non_ascii = _mm512_movepi8_mask(latin); + answer += count_ones(non_ascii); } - - eight_64bits = _mm512_add_epi64(eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512())); } - - __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0); - __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1); - answer += (size_t)_mm256_extract_epi64(first_half, 0) + - (size_t)_mm256_extract_epi64(first_half, 1) + - 
(size_t)_mm256_extract_epi64(first_half, 2) + - (size_t)_mm256_extract_epi64(first_half, 3) + - (size_t)_mm256_extract_epi64(second_half, 0) + - (size_t)_mm256_extract_epi64(second_half, 1) + - (size_t)_mm256_extract_epi64(second_half, 2) + - (size_t)_mm256_extract_epi64(second_half, 3); return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast(str + i), length - i); } @@ -24725,195 +24545,6 @@ simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 return simd8(is_third_byte | is_fourth_byte); } -/* begin file src/haswell/avx2_detect_encodings.cpp */ -template -// len is known to be a multiple of 2 when this is called -int avx2_detect_encodings(const char * buf, size_t len) { - const char* start = buf; - const char* end = buf + len; - - bool is_utf8 = true; - bool is_utf16 = true; - bool is_utf32 = true; - - int out = 0; - - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - - __m256i currentmax = _mm256_setzero_si256(); - - checker check{}; - - while(end - buf >= 64) { - __m256i in = _mm256_loadu_si256((__m256i*)buf); - __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1); - - const auto u0 = simd16(in); - const auto u1 = simd16(nextin); - - const auto v0 = u0.shr<8>(); - const auto v1 = u1.shr<8>(); - - const auto in16 = simd16::pack(v0, v1); - - const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8; - uint32_t surrogates_bitmask0 = surrogates_wordmask0.to_bitmask(); - - // Check for surrogates - if (surrogates_bitmask0 != 0x0) { - // Cannot be UTF8 - is_utf8 = false; - // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates - // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. - // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant - // bytes of a 32-bit word since they always come in pairs in UTF-16LE. 
- // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units. - - if ((surrogates_bitmask0 & 0xaaaaaaaa) != 0) { - is_utf32 = false; - // Code from avx2_validate_utf16le.cpp - const char16_t * input = reinterpret_cast(buf); - const char16_t* end16 = reinterpret_cast(start) + len/2; - - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - - const uint32_t V0 = ~surrogates_bitmask0; - - const auto vH0 = (in16 & v_fc) == v_dc; - const uint32_t H0 = vH0.to_bitmask(); - - const uint32_t L0 = ~H0 & surrogates_bitmask0; - - const uint32_t a0 = L0 & (H0 >> 1); - const uint32_t b0 = a0 << 1; - const uint32_t c0 = V0 | a0 | b0; - - if (c0 == 0xffffffff) { - input += simd16::ELEMENTS * 2; - } else if (c0 == 0x7fffffff) { - input += simd16::ELEMENTS * 2 - 1; - } else { - return simdutf::encoding_type::unspecified; - } - - while (input + simd16::ELEMENTS * 2 < end16) { - const auto in0 = simd16(input); - const auto in1 = simd16(input + simd16::ELEMENTS); - - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - - const auto in_16 = simd16::pack(t0, t1); - - const auto surrogates_wordmask = (in_16 & v_f8) == v_d8; - const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); - if (surrogates_bitmask == 0x0) { - input += simd16::ELEMENTS * 2; - } else { - const uint32_t V = ~surrogates_bitmask; - - const auto vH = (in_16 & v_fc) == v_dc; - const uint32_t H = vH.to_bitmask(); - - const uint32_t L = ~H & surrogates_bitmask; - - const uint32_t a = L & (H >> 1); - - const uint32_t b = a << 1; - - const uint32_t c = V | a | b; - - if (c == 0xffffffff) { - input += simd16::ELEMENTS * 2; - } else if (c == 0x7fffffff) { - input += simd16::ELEMENTS * 2 - 1; - } else { - return simdutf::encoding_type::unspecified; - } - } - } - } else { - is_utf16 = false; - // Check for UTF-32 - if (len % 4 == 0) { - const char32_t * input = reinterpret_cast(buf); - const char32_t* end32 = 
reinterpret_cast(start) + len/4; - - // Must start checking for surrogates - __m256i currentoffsetmax = _mm256_setzero_si256(); - const __m256i offset = _mm256_set1_epi32(0xffff2000); - const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); - - currentmax = _mm256_max_epu32(in, currentmax); - currentmax = _mm256_max_epu32(nextin, currentmax); - - currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax); - currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(nextin, offset), currentoffsetmax); - - while (input + 8 < end32) { - const __m256i in32 = _mm256_loadu_si256((__m256i *)input); - currentmax = _mm256_max_epu32(in32,currentmax); - currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in32, offset), currentoffsetmax); - input += 8; - } - - __m256i forbidden_words = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(_mm256_testz_si256(forbidden_words, forbidden_words) == 0) { - return simdutf::encoding_type::unspecified; - } - } else { - return simdutf::encoding_type::unspecified; - } - } - break; - } - // If no surrogate, validate under other encodings as well - - // UTF-32 validation - currentmax = _mm256_max_epu32(in, currentmax); - currentmax = _mm256_max_epu32(nextin, currentmax); - - // UTF-8 validation - // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h - simd::simd8x64 in8(in, nextin); - check.check_next_input(in8); - - buf += 64; - } - - // Check which encodings are possible - - if (is_utf8) { - if (static_cast(buf - start) != len) { - uint8_t block[64]{}; - std::memset(block, 0x20, 64); - std::memcpy(block, buf, len - (buf - start)); - simd::simd8x64 in(block); - check.check_next_input(in); - } - if (!check.errors()) { - out |= simdutf::encoding_type::UTF8; - } - } - - if (is_utf16 && scalar::utf16::validate(reinterpret_cast(buf), (len - (buf - start))/2)) { - out |= simdutf::encoding_type::UTF16_LE; - } - - if (is_utf32 && (len % 4 == 0)) { - const __m256i 
standardmax = _mm256_set1_epi32(0x10ffff); - __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax); - if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast(buf), (len - (buf - start))/4)) { - out |= simdutf::encoding_type::UTF32_LE; - } - } - - return out; -} -/* end file src/haswell/avx2_detect_encodings.cpp */ - /* begin file src/haswell/avx2_validate_utf16.cpp */ /* In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. @@ -24962,9 +24593,6 @@ int avx2_detect_encodings(const char * buf, size_t len) { */ template const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) { - if (simdutf_unlikely(size == 0)) { - return input; - } const char16_t* end = input + size; const auto v_d8 = simd8::splat(0xd8); @@ -25126,9 +24754,6 @@ const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) - nullptr if an error was detected. */ const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) { - if (simdutf_unlikely(size == 0)) { - return input; - } const char32_t* end = input + size; const __m256i standardmax = _mm256_set1_epi32(0x10ffff); @@ -25158,9 +24783,6 @@ const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) { const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size) { - if (simdutf_unlikely(size == 0)) { - return result(error_code::SUCCESS, 0); - } const char32_t* start = input; const char32_t* end = input + size; @@ -25356,8 +24978,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, const __m128i in = _mm_loadu_si128((__m128i *)input); const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff; - if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { - // We process the data in chunks of 16 bytes. + if(utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. 
__m256i ascii = _mm256_cvtepu8_epi16(in); if (big_endian) { const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, @@ -25365,8 +24987,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, ascii = _mm256_shuffle_epi8(ascii, swap256); } _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii); - utf16_output += 16; // We wrote 16 16-bit characters. - return 16; // We consumed 16 bytes. + utf16_output += 12; // We wrote 12 16-bit characters. + return 12; // We consumed 12 bytes. } if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte UTF-16 code units. @@ -25536,12 +25158,12 @@ size_t convert_masked_utf8_to_utf32(const char *input, const __m128i in = _mm_loadu_si128((__m128i *)input); const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff; - if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { - // We process the data in chunks of 16 bytes. + if(utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu8_epi32(in)); _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output+8), _mm256_cvtepu8_epi32(_mm_srli_si128(in,8))); - utf32_output += 16; // We wrote 16 32-bit characters. - return 16; // We consumed 16 bytes. + utf32_output += 12; // We wrote 12 32-bit characters. + return 12; // We consumed 12 bytes. } if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte UTF-32 code units. @@ -27175,11 +26797,11 @@ size_t convert_masked_utf8_to_latin1(const char *input, const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff; // we are only processing 12 bytes in case it is not all ASCII - if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { - // We process the data in chunks of 16 bytes. 
+ if(utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in); - latin1_output += 16; // We wrote 16 characters. - return 16; // We consumed 16 bytes. + latin1_output += 12; // We wrote 12 characters. + return 12; // We consumed 12 bytes. } /// We do not have a fast path available, so we fallback. const uint8_t idx = @@ -27466,9 +27088,8 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) { if (base64_url) { check_asso = - _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3, - 0x7, 0xB, 0x6, 0xB, 0x12, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1, - 0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0x6, 0xB, 0x12); + _mm256_setr_epi8(0xD,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x3,0x7,0xB,0xE,0xB,0x6, + 0xD,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x3,0x7,0xB,0xE,0xB,0x6); } else { check_asso = _mm256_setr_epi8( @@ -27479,13 +27100,8 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) { __m256i check_values; if (base64_url) { check_values = _mm256_setr_epi8( - 0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0xCF), - uint8_t(0xBF), uint8_t(0xD3), uint8_t(0xA6), uint8_t(0xB5), - uint8_t(0x86), uint8_t(0xD0), uint8_t(0x80), uint8_t(0xB0), - uint8_t(0x80), 0x0, 0x0, 0x0, uint8_t(0x80), uint8_t(0x80), - uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD3), - uint8_t(0xA6), uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD0), - uint8_t(0x80), uint8_t(0xB0), uint8_t(0x80), 0x0, 0x0); + uint8_t(0x80),uint8_t(0x80),uint8_t(0x80),uint8_t(0x80),uint8_t(0xCF),uint8_t(0xBF),uint8_t(0xB6),uint8_t(0xA6),uint8_t(0xB5),uint8_t(0xA1),0x0,uint8_t(0x80),0x0,uint8_t(0x80),0x0,uint8_t(0x80), + uint8_t(0x80),uint8_t(0x80),uint8_t(0x80),uint8_t(0x80),uint8_t(0xCF),uint8_t(0xBF),uint8_t(0xB6),uint8_t(0xA6),uint8_t(0xB5),uint8_t(0xA1),0x0,uint8_t(0x80),0x0,uint8_t(0x80),0x0,uint8_t(0x80)); } else { check_values = _mm256_setr_epi8( int8_t(0x80), int8_t(0x80), int8_t(0x80),
int8_t(0x80), int8_t(0xCF), @@ -27496,7 +27112,7 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) { int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)); } - const __m256i shifted = _mm256_srli_epi32(*src, 3); + const __m256i shifted =_mm256_srli_epi32(*src, 3); const __m256i delta_hash = _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted); const __m256i check_hash = @@ -27603,22 +27219,22 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) { const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value; + size_t equallocation = srclen; // location of the first padding character if any // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) { srclen--; } size_t equalsigns = 0; if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; srclen--; equalsigns = 1; // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; - } - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) { srclen--; } if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; srclen--; equalsigns = 2; } @@ -27644,7 +27260,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, uint64_t badcharmask = to_base64_mask(&b, &error); if (error) { src -= 64; - while (src < srcend && to_base64[uint8_t(*src)] <= 64) { + while (src < srcend && scalar::base64::is_eight_byte(*src) && to_base64[uint8_t(*src)] <= 64) { src++; } return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; @@ -27692,7 +27308,7 @@ result compress_decode_base64(char *dst, 
const chartype *src, size_t srclen, while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); - if (val > 64) { + if (!scalar::base64::is_eight_byte(*src) || val > 64) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } bufferptr += (val <= 63); @@ -27739,7 +27355,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, if (leftover > 0) { while (leftover < 4 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; - if (val > 64) { + if (!scalar::base64::is_eight_byte(*src) || val > 64) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } buffer_start[leftover] = char(val); @@ -27790,13 +27406,14 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, // additional checks if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { r.error = error_code::INVALID_BASE64_CHARACTER; + r.count = equallocation; } } return r; } if(equalsigns > 0) { if((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, size_t(dst - dstinit)}; + return {INVALID_BASE64_CHARACTER, equallocation}; } } return {SUCCESS, size_t(dst - dstinit)}; @@ -28455,6 +28072,9 @@ using namespace simd; this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if(utf8_continuation_mask & 1) { + return 0; // error + } uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -28529,14 +28149,14 @@ using namespace simd; this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - if (errors()) { + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { // rewind_and_convert_with_errors will seek 
a potential error from in+pos onward, // with the ability to go back up to pos bytes, and read size-pos bytes forward. result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); res.count += pos; return res; } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -28774,10 +28394,10 @@ using namespace simd; size_t pos = 0; char32_t* start{utf32_output}; // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, + // 8 words when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // UTF-8 input, so we are going to go back from the end counting 16 leading bytes, // to give us a good margin. 
size_t leading_byte = 0; size_t margin = size; @@ -28807,6 +28427,9 @@ using namespace simd; this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if(utf8_continuation_mask & 1) { + return 0; // we have an error + } uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -28880,12 +28503,12 @@ using namespace simd; this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - if (errors()) { + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); res.count += pos; return res; } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -29431,15 +29054,15 @@ simdutf_warn_unused int implementation::detect_encodings(const char * input, siz // If there is a BOM, then we trust it. 
auto bom_encoding = simdutf::BOM::check_bom(input, length); if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - if (length % 2 == 0) { - return avx2_detect_encodings(input, length); - } else { - if (implementation::validate_utf8(input, length)) { - return simdutf::encoding_type::UTF8; - } else { - return simdutf::encoding_type::unspecified; - } + int out = 0; + if(validate_utf8(input, length)) { out |= encoding_type::UTF8; } + if((length % 2) == 0) { + if(validate_utf16le(reinterpret_cast(input), length/2)) { out |= encoding_type::UTF16_LE; } + } + if((length % 4) == 0) { + if(validate_utf32(reinterpret_cast(input), length/4)) { out |= encoding_type::UTF32_LE; } } + return out; } simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { @@ -29459,7 +29082,12 @@ simdutf_warn_unused result implementation::validate_ascii_with_errors(const char } simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { - const char16_t* tail = avx2_validate_utf16(buf, len); + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-16. protect the implementation from + // handling nullptr + return true; + } + const char16_t *tail = avx2_validate_utf16(buf, len); if (tail) { return scalar::utf16::validate(tail, len - (tail - buf)); } else { @@ -29468,6 +29096,11 @@ simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, s } simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-16. 
protect the implementation from + // handling nullptr + return true; + } const char16_t* tail = avx2_validate_utf16(buf, len); if (tail) { return scalar::utf16::validate(tail, len - (tail - buf)); @@ -29497,6 +29130,11 @@ simdutf_warn_unused result implementation::validate_utf16be_with_errors(const ch } simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-32. protect the implementation from + // handling nullptr + return true; + } const char32_t* tail = avx2_validate_utf32le(buf, len); if (tail) { return scalar::utf32::validate(tail, len - (tail - buf)); @@ -29506,6 +29144,11 @@ simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, siz } simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-32. protect the implementation from + // handling nullptr + return result(error_code::SUCCESS, 0); + } result res = avx2_validate_utf32le_with_errors(buf, len); if (res.count != len) { result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count); @@ -30036,40 +29679,48 @@ simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char *i const uint8_t *data = reinterpret_cast(input); size_t answer = len / sizeof(__m256i) * sizeof(__m256i); size_t i = 0; - __m256i four_64bits = _mm256_setzero_si256(); - while (i + sizeof(__m256i) <= len) { - __m256i runner = _mm256_setzero_si256(); - // We can do up to 255 loops without overflow. 
- size_t iterations = (len - i) / sizeof(__m256i); - if (iterations > 255) { - iterations = 255; - } - size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i); - for (; i + 4*sizeof(__m256i) <= max_i; i += 4*sizeof(__m256i)) { - __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i)); - __m256i input2 = _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i))); - __m256i input3 = _mm256_loadu_si256((const __m256i *)(data + i + 2*sizeof(__m256i))); - __m256i input4 = _mm256_loadu_si256((const __m256i *)(data + i + 3*sizeof(__m256i))); - __m256i input12 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1), - _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2)); - __m256i input23 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3), - _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4)); - __m256i input1234 = _mm256_add_epi8(input12, input23); - runner = _mm256_sub_epi8( - runner, input1234); - } - for (; i <= max_i; i += sizeof(__m256i)) { - __m256i input_256_chunk = _mm256_loadu_si256((const __m256i *)(data + i)); - runner = _mm256_sub_epi8( - runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk)); - } - four_64bits = _mm256_add_epi64( - four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256())); - } - answer += _mm256_extract_epi64(four_64bits, 0) + - _mm256_extract_epi64(four_64bits, 1) + - _mm256_extract_epi64(four_64bits, 2) + - _mm256_extract_epi64(four_64bits, 3); + if(answer >= 2048) { // long strings optimization + __m256i four_64bits = _mm256_setzero_si256(); + while (i + sizeof(__m256i) <= len) { + __m256i runner = _mm256_setzero_si256(); + // We can do up to 255 loops without overflow. 
+ size_t iterations = (len - i) / sizeof(__m256i); + if (iterations > 255) { + iterations = 255; + } + size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i); + for (; i + 4*sizeof(__m256i) <= max_i; i += 4*sizeof(__m256i)) { + __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i)); + __m256i input2 = _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i))); + __m256i input3 = _mm256_loadu_si256((const __m256i *)(data + i + 2*sizeof(__m256i))); + __m256i input4 = _mm256_loadu_si256((const __m256i *)(data + i + 3*sizeof(__m256i))); + __m256i input12 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1), + _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2)); + __m256i input23 = _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3), + _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4)); + __m256i input1234 = _mm256_add_epi8(input12, input23); + runner = _mm256_sub_epi8( + runner, input1234); + } + for (; i <= max_i; i += sizeof(__m256i)) { + __m256i input_256_chunk = _mm256_loadu_si256((const __m256i *)(data + i)); + runner = _mm256_sub_epi8( + runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk)); + } + four_64bits = _mm256_add_epi64( + four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256())); + } + answer += _mm256_extract_epi64(four_64bits, 0) + + _mm256_extract_epi64(four_64bits, 1) + + _mm256_extract_epi64(four_64bits, 2) + + _mm256_extract_epi64(four_64bits, 3); + } else if (answer > 0) { + for(; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) { + __m256i latin = _mm256_loadu_si256((const __m256i*)(data + i)); + uint32_t non_ascii = _mm256_movemask_epi8(latin); + answer += count_ones(non_ascii); + } + } return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast(data + i), len - i); } @@ -30854,6 +30505,9 @@ using namespace simd; this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if(utf8_continuation_mask & 1) { 
+ return 0; // error + } uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -30928,14 +30582,14 @@ using namespace simd; this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - if (errors()) { + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { // rewind_and_convert_with_errors will seek a potential error from in+pos onward, // with the ability to go back up to pos bytes, and read size-pos bytes forward. result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); res.count += pos; return res; } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -31173,10 +30827,10 @@ using namespace simd; size_t pos = 0; char32_t* start{utf32_output}; // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, + // 8 words when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // UTF-8 input, so we are going to go back from the end counting 16 leading bytes, // to give us a good margin. 
size_t leading_byte = 0; size_t margin = size; @@ -31206,6 +30860,9 @@ using namespace simd; this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if(utf8_continuation_mask & 1) { + return 0; // we have an error + } uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -31279,12 +30936,12 @@ using namespace simd; this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - if (errors()) { + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); res.count += pos; return res; } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -31456,6 +31113,7 @@ simdutf_warn_unused int implementation::detect_encodings(const char * input, siz // If there is a BOM, then we trust it. auto bom_encoding = simdutf::BOM::check_bom(input, length); if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } + // todo: reimplement as a one-pass algorithm. 
int out = 0; if(validate_utf8(input, length)) { out |= encoding_type::UTF8; } if((length % 2) == 0) { @@ -31689,12 +31347,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t equallocation = length; // location of the first padding character if any size_t equalsigns = 0; if(length > 0 && input[length - 1] == '=') { + equallocation = length - 1; length -= 1; equalsigns++; while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; } if(length > 0 && input[length - 1] == '=') { + equallocation = length - 1; equalsigns++; length -= 1; } @@ -31727,12 +31387,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char16_t * inp size_t equallocation = length; // location of the first padding character if any size_t equalsigns = 0; if(length > 0 && input[length - 1] == '=') { + equallocation = length - 1; length -= 1; equalsigns++; while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; } if(length > 0 && input[length - 1] == '=') { + equallocation = length - 1; equalsigns++; length -= 1; } @@ -32083,10 +31745,11 @@ simdutf_really_inline static result rvv_validate_utf16_with_errors(const char16_ break; } } - if (last - 0xD800u < 0x400u) + if (last - 0xD800u < 0x400u) { return result(error_code::SURROGATE, src - beg - 1); /* end on high surrogate */ - else + } else { return result(error_code::SUCCESS, src - beg); + } } simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *src, size_t len) const noexcept { @@ -32122,11 +31785,14 @@ simdutf_warn_unused result implementation::validate_utf32_with_errors(const char vl = __riscv_vsetvl_e32m8(len); vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t*)src, vl); vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl); - long idx; - idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl); - if (idx >= 0) return result(error_code::TOO_LARGE, src - beg + idx); - idx = 
__riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl); - if (idx >= 0) return result(error_code::SURROGATE, src - beg + idx); + long idx1 = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl); + long idx2 = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl); + if(idx1 >= 0 && idx2 >= 0) { + if(idx1 <= idx2) { return result(error_code::TOO_LARGE, src - beg + idx1); } + else { return result(error_code::SURROGATE, src - beg + idx2); } + } + if (idx1 >= 0) { return result(error_code::TOO_LARGE, src - beg + idx1); } + if (idx2 >= 0) { return result(error_code::SURROGATE, src - beg + idx2); } } return result(error_code::SUCCESS, src - beg); } @@ -32560,7 +32226,7 @@ simdutf_really_inline static result rvv_utf16_to_latin1_with_errors(const char16 v = simdutf_byteflip(v, vl); long idx = __riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 255, vl), vl); if (idx >= 0) - return result(error_code::TOO_LARGE, beg - src + idx); + return result(error_code::TOO_LARGE, src - beg + idx); __riscv_vse8_v_u8m4((uint8_t*)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl); } return result(error_code::SUCCESS, src - beg); @@ -32812,7 +32478,11 @@ simdutf_really_inline static result rvv_utf16_to_utf32_with_errors(const char16_ const vbool8_t diff = __riscv_vmxor_mm_b8(surhi, surlo, vl); const long idx = __riscv_vfirst_m_b8(diff, vl); if (idx >= 0) { - return result(error_code::SURROGATE, src - srcBeg + idx + 1); + uint16_t word = simdutf_byteflip(src[idx]); + if(word < 0xD800 || word > 0xDBFF) { + return result(error_code::SURROGATE, src - srcBeg + idx + 1); + } + return result(error_code::SURROGATE, src - srcBeg + idx); } } @@ -32962,10 +32632,15 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(con n -= vl, src += vl, dst += vlOut; continue; } - + long idx1 = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl); vbool8_t sur = __riscv_vmseq_vx_u32m4_b8(__riscv_vand_vx_u32m4(v, 0xFFFFF800, vl), 
0xD800, vl); - long idx = __riscv_vfirst_m_b8(sur, vl); - if (idx >= 0) return result(error_code::SURROGATE, src - srcBeg + idx); + long idx2 = __riscv_vfirst_m_b8(sur, vl); + if(idx1 >= 0 && idx2 >= 0) { + if(idx1 <= idx2) { return result(error_code::TOO_LARGE, src - srcBeg + idx1); } + else { return result(error_code::SURROGATE, src - srcBeg + idx2); } + } + if (idx1 >= 0) { return result(error_code::TOO_LARGE, src - srcBeg + idx1); } + if (idx2 >= 0) { return result(error_code::SURROGATE, src - srcBeg + idx2); } vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x10000-1, vl); long first = __riscv_vfirst_m_b8(m4, vl); @@ -33040,18 +32715,21 @@ simdutf_really_inline static result rvv_convert_utf32_to_utf16_with_errors(const vl = __riscv_vsetvl_e32m4(len); vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t*)src, vl); vuint32m4_t off = __riscv_vadd_vx_u32m4(v, 0xFFFF2000, vl); - long idx; - idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(off, 0xFFFFF7FF, vl), vl); - if (idx >= 0) return result(error_code::SURROGATE, src - srcBeg + idx); - idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl); + long idx1 = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl); + long idx2 = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(off, 0xFFFFF7FF, vl), vl); + if (idx1 >= 0 && idx2 >= 0) { + if (idx1 <= idx2) return result(error_code::TOO_LARGE, src - srcBeg + idx1); + return result(error_code::SURROGATE, src - srcBeg + idx2); + } + if (idx1 >= 0) return result(error_code::TOO_LARGE, src - srcBeg + idx1); + if (idx2 >= 0) return result(error_code::SURROGATE, src - srcBeg + idx2); + long idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl); if (idx < 0) { vlOut = vl; vuint16m2_t n = simdutf_byteflip(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut); __riscv_vse16_v_u16m2((uint16_t*)dst, n, vlOut); continue; } - idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl); - if (idx >= 0) return 
result(error_code::TOO_LARGE, src - srcBeg + idx); vlOut = rvv_utf32_store_utf16_m4((uint16_t*)dst, v, vl, m4even); } return result(error_code::SUCCESS, dst - dstBeg); @@ -33114,6 +32792,7 @@ simdutf_warn_unused int implementation::detect_encodings(const char *input, size auto bom_encoding = simdutf::BOM::check_bom(input, length); if (bom_encoding != encoding_type::unspecified) return bom_encoding; + // todo: reimplement as a one-pass algorithm. int out = 0; if (validate_utf8(input, length)) out |= encoding_type::UTF8; @@ -33156,12 +32835,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t equallocation = length; // location of the first padding character if any size_t equalsigns = 0; if(length > 0 && input[length - 1] == '=') { + equallocation = length - 1; length -= 1; equalsigns++; while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; } if(length > 0 && input[length - 1] == '=') { + equallocation = length - 1; equalsigns++; length -= 1; } @@ -33194,12 +32875,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char16_t * inp size_t equallocation = length; // location of the first padding character if any auto equalsigns = 0; if(length > 0 && input[length - 1] == '=') { + equallocation = length - 1; length -= 1; equalsigns++; while(length > 0 && scalar::base64::is_ascii_white_space(input[length - 1])) { length--; } if(length > 0 && input[length - 1] == '=') { + equallocation = length - 1; equalsigns++; length -= 1; } @@ -33356,214 +33039,6 @@ inline void write_v_u16_11bits_to_utf8( } // namespace westmere } // namespace internal /* end file src/westmere/internal/loader.cpp */ -/* begin file src/westmere/sse_detect_encodings.cpp */ -template -// len is known to be a multiple of 2 when this is called -int sse_detect_encodings(const char * buf, size_t len) { - const char* start = buf; - const char* end = buf + len; - - bool is_utf8 = true; - bool is_utf16 = true; - bool 
is_utf32 = true; - - int out = 0; - - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - - __m128i currentmax = _mm_setzero_si128(); - - checker check{}; - - while(end - buf >= 64) { - __m128i in = _mm_loadu_si128((__m128i*)buf); - __m128i secondin = _mm_loadu_si128((__m128i*)buf+1); - __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2); - __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3); - - const auto u0 = simd16(in); - const auto u1 = simd16(secondin); - const auto u2 = simd16(thirdin); - const auto u3 = simd16(fourthin); - - const auto v0 = u0.shr<8>(); - const auto v1 = u1.shr<8>(); - const auto v2 = u2.shr<8>(); - const auto v3 = u3.shr<8>(); - - const auto in16 = simd16::pack(v0, v1); - const auto nextin16 = simd16::pack(v2, v3); - - const auto surrogates_wordmask0 = (in16 & v_f8) == v_d8; - const auto surrogates_wordmask1 = (nextin16 & v_f8) == v_d8; - uint16_t surrogates_bitmask0 = static_cast(surrogates_wordmask0.to_bitmask()); - uint16_t surrogates_bitmask1 = static_cast(surrogates_wordmask1.to_bitmask()); - - // Check for surrogates - if (surrogates_bitmask0 != 0x0 || surrogates_bitmask1 != 0x0) { - // Cannot be UTF8 - is_utf8 = false; - // Can still be either UTF-16LE or UTF-32 depending on the positions of the surrogates - // To be valid UTF-32, a surrogate cannot be in the two most significant bytes of any 32-bit word. - // On the other hand, to be valid UTF-16LE, at least one surrogate must be in the two most significant - // bytes of a 32-bit word since they always come in pairs in UTF-16LE. - // Note that we always proceed in multiple of 4 before this point so there is no offset in 32-bit code units. 
- - if (((surrogates_bitmask0 | surrogates_bitmask1) & 0xaaaa) != 0) { - is_utf32 = false; - // Code from sse_validate_utf16le.cpp - // Not efficient, we do not process surrogates_bitmask1 - const char16_t * input = reinterpret_cast(buf); - const char16_t* end16 = reinterpret_cast(start) + len/2; - - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - - const uint16_t V0 = static_cast(~surrogates_bitmask0); - - const auto vH0 = (in16 & v_fc) == v_dc; - const uint16_t H0 = static_cast(vH0.to_bitmask()); - - const uint16_t L0 = static_cast(~H0 & surrogates_bitmask0); - - const uint16_t a0 = static_cast(L0 & (H0 >> 1)); - - const uint16_t b0 = static_cast(a0 << 1); - - const uint16_t c0 = static_cast(V0 | a0 | b0); - - if (c0 == 0xffff) { - input += 16; - } else if (c0 == 0x7fff) { - input += 15; - } else { - is_utf16 = false; - break; - } - - while (input + simd16::SIZE * 2 < end16) { - const auto in0 = simd16(input); - const auto in1 = simd16(input + simd16::SIZE / sizeof(char16_t)); - - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - - const auto in_16 = simd16::pack(t0, t1); - - const auto surrogates_wordmask = (in_16 & v_f8) == v_d8; - const uint16_t surrogates_bitmask = static_cast(surrogates_wordmask.to_bitmask()); - if (surrogates_bitmask == 0x0) { - input += 16; - } else { - const uint16_t V = static_cast(~surrogates_bitmask); - - const auto vH = (in_16 & v_fc) == v_dc; - const uint16_t H = static_cast(vH.to_bitmask()); - - const uint16_t L = static_cast(~H & surrogates_bitmask); - - const uint16_t a = static_cast(L & (H >> 1)); - - const uint16_t b = static_cast(a << 1); - - const uint16_t c = static_cast(V | a | b); - - if (c == 0xffff) { - input += 16; - } else if (c == 0x7fff) { - input += 15; - } else { - is_utf16 = false; - break; - } - } - } - } else { - is_utf16 = false; - // Check for UTF-32 - if (len % 4 == 0) { - const char32_t * input = reinterpret_cast(buf); - const char32_t* end32 = 
reinterpret_cast(start) + len/4; - - // Must start checking for surrogates - __m128i currentoffsetmax = _mm_setzero_si128(); - const __m128i offset = _mm_set1_epi32(0xffff2000); - const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); - - currentmax = _mm_max_epu32(in, currentmax); - currentmax = _mm_max_epu32(secondin, currentmax); - currentmax = _mm_max_epu32(thirdin, currentmax); - currentmax = _mm_max_epu32(fourthin, currentmax); - - currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax); - currentoffsetmax = _mm_max_epu32(_mm_add_epi32(secondin, offset), currentoffsetmax); - currentoffsetmax = _mm_max_epu32(_mm_add_epi32(thirdin, offset), currentoffsetmax); - currentoffsetmax = _mm_max_epu32(_mm_add_epi32(fourthin, offset), currentoffsetmax); - - while (input + 4 < end32) { - const __m128i in32 = _mm_loadu_si128((__m128i *)input); - currentmax = _mm_max_epu32(in32,currentmax); - currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in32, offset), currentoffsetmax); - input += 4; - } - - __m128i forbidden_words = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if(_mm_testz_si128(forbidden_words, forbidden_words) == 0) { - is_utf32 = false; - } - } else { - is_utf32 = false; - } - } - break; - } - // If no surrogate, validate under other encodings as well - - // UTF-32 validation - currentmax = _mm_max_epu32(in, currentmax); - currentmax = _mm_max_epu32(secondin, currentmax); - currentmax = _mm_max_epu32(thirdin, currentmax); - currentmax = _mm_max_epu32(fourthin, currentmax); - - // UTF-8 validation - // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h - simd::simd8x64 in8(in, secondin, thirdin, fourthin); - check.check_next_input(in8); - - buf += 64; - } - - // Check which encodings are possible - - if (is_utf8) { - if (static_cast(buf - start) != len) { - uint8_t block[64]{}; - std::memset(block, 0x20, 64); - std::memcpy(block, buf, len - (buf - start)); - simd::simd8x64 in(block); - 
check.check_next_input(in); - } - if (!check.errors()) { - out |= simdutf::encoding_type::UTF8; - } - } - - if (is_utf16 && scalar::utf16::validate(reinterpret_cast(buf), (len - (buf - start))/2)) { - out |= simdutf::encoding_type::UTF16_LE; - } - - if (is_utf32 && (len % 4 == 0)) { - const __m128i standardmax = _mm_set1_epi32(0x10ffff); - __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); - if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast(buf), (len - (buf - start))/4)) { - out |= simdutf::encoding_type::UTF32_LE; - } - } - - return out; -} -/* end file src/westmere/sse_detect_encodings.cpp */ /* begin file src/westmere/sse_validate_utf16.cpp */ /* @@ -33613,9 +33088,6 @@ int sse_detect_encodings(const char * buf, size_t len) { */ template const char16_t* sse_validate_utf16(const char16_t* input, size_t size) { - if (simdutf_unlikely(size == 0)) { - return input; - } const char16_t* end = input + size; const auto v_d8 = simd8::splat(0xd8); @@ -33776,9 +33248,6 @@ const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) - nullptr if an error was detected. 
*/ const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) { - if (size == 0) { - return input; - } const char32_t* end = input + size; const __m128i standardmax = _mm_set1_epi32(0x10ffff); @@ -33808,9 +33277,6 @@ const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) { const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size) { - if (size == 0) { - return result(error_code::SUCCESS, 0); - } const char32_t* start = input; const char32_t* end = input + size; @@ -34005,8 +33471,9 @@ size_t convert_masked_utf8_to_utf16(const char *input, const __m128i in = _mm_loadu_si128((__m128i *)input); const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff; - if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { - // We process the data in chunks of 16 bytes. + if(utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + // Note: using 16 bytes is unsafe, see issue_ossfuzz_71218 __m128i ascii_first = _mm_cvtepu8_epi16(in); __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in,8)); if (big_endian) { @@ -34015,8 +33482,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, } _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first); _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), ascii_second); - utf16_output += 16; // We wrote 16 16-bit characters. - return 16; // We consumed 16 bytes. + utf16_output += 12; // We wrote 12 16-bit characters. + return 12; // We consumed 12 bytes. } if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) { // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte UTF-16 code units. 
@@ -34187,14 +33654,14 @@ size_t convert_masked_utf8_to_utf32(const char *input, const __m128i in = _mm_loadu_si128((__m128i *)input); const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff; - if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { - // We process the data in chunks of 16 bytes. + if(utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu8_epi32(in)); _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu8_epi32(_mm_srli_si128(in,4))); _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+8), _mm_cvtepu8_epi32(_mm_srli_si128(in,8))); _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+12), _mm_cvtepu8_epi32(_mm_srli_si128(in,12))); - utf32_output += 16; // We wrote 16 32-bit characters. - return 16; // We consumed 16 bytes. + utf32_output += 12; // We wrote 12 32-bit characters. + return 12; // We consumed 12 bytes. } if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte UTF-32 code units. @@ -34316,11 +33783,11 @@ size_t convert_masked_utf8_to_latin1(const char *input, const __m128i in = _mm_loadu_si128((__m128i *)input); const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff; // we are only processing 12 bytes in case it is not all ASCII - if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) { - // We process the data in chunks of 16 bytes. + if(utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in); - latin1_output += 16; // We wrote 16 characters. - return 16; // We consumed 16 bytes. + latin1_output += 12; // We wrote 12 characters. + return 12; // We consumed 12 bytes. } /// We do not have a fast path available, so we fallback. 
const uint8_t idx = @@ -35986,8 +35453,7 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) { } __m128i check_asso; if (base64_url) { - check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, - 0x3, 0x7, 0xB, 0x6, 0xB, 0x12); + check_asso = _mm_setr_epi8(0xD,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x1,0x3,0x7,0xB,0xE,0xB,0x6); } else { check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, @@ -35995,11 +35461,9 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) { } __m128i check_values; if (base64_url) { - check_values = _mm_setr_epi8(0x0, uint8_t(0x80), uint8_t(0x80), - uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), - uint8_t(0xD3), uint8_t(0xA6), uint8_t(0xB5), - uint8_t(0x86), uint8_t(0xD0), uint8_t(0x80), - uint8_t(0xB0), uint8_t(0x80), 0x0, 0x0); + check_values = _mm_setr_epi8(uint8_t(0x80),uint8_t(0x80),uint8_t(0x80),uint8_t(0x80),uint8_t(0xCF), + uint8_t(0xBF),uint8_t(0xB6),uint8_t(0xA6),uint8_t(0xB5),uint8_t(0xA1),0x0,uint8_t(0x80), + 0x0,uint8_t(0x80),0x0,uint8_t(0x80)); } else { check_values = @@ -36008,7 +35472,7 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) { int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)); } - const __m128i shifted = _mm_srli_epi32(*src, 3); + const __m128i shifted =_mm_srli_epi32(*src, 3); const __m128i delta_hash = _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted); @@ -36138,19 +35602,22 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) { const uint8_t *to_base64 = base64_url ? 
tables::base64::to_base64_url_value : tables::base64::to_base64_value; + size_t equallocation = srclen; // location of the first padding character if any // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) { srclen--; } size_t equalsigns = 0; if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; srclen--; equalsigns = 1; // skip trailing spaces - while (srclen > 0 && to_base64[uint8_t(src[srclen - 1])] == 64) { + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && to_base64[uint8_t(src[srclen - 1])] == 64) { srclen--; } if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; srclen--; equalsigns = 2; } @@ -36176,7 +35643,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, uint64_t badcharmask = to_base64_mask(&b, &error); if (error) { src -= 64; - while (src < srcend && to_base64[uint8_t(*src)] <= 64) { + while (src < srcend && scalar::base64::is_eight_byte(*src) && to_base64[uint8_t(*src)] <= 64) { src++; } return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; @@ -36223,7 +35690,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); - if (val > 64) { + if (!scalar::base64::is_eight_byte(*src) || val > 64) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } bufferptr += (val <= 63); @@ -36270,7 +35737,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, if (leftover > 0) { while (leftover < 4 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; - if (val > 64) { + if (!scalar::base64::is_eight_byte(*src) || val > 64) { return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; } 
buffer_start[leftover] = char(val); @@ -36323,13 +35790,14 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, // additional checks if((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { r.error = error_code::INVALID_BASE64_CHARACTER; + r.count = equallocation; } } return r; } if(equalsigns > 0) { if((size_t(dst - dstinit) % 3 == 0) || ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, size_t(dst - dstinit)}; + return {INVALID_BASE64_CHARACTER, equallocation}; } } return {SUCCESS, size_t(dst - dstinit)}; @@ -36988,6 +36456,9 @@ using namespace simd; this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if(utf8_continuation_mask & 1) { + return 0; // error + } uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -37062,14 +36533,14 @@ using namespace simd; this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - if (errors()) { + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { // rewind_and_convert_with_errors will seek a potential error from in+pos onward, // with the ability to go back up to pos bytes, and read size-pos bytes forward. 
result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf16_output); res.count += pos; return res; } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -37307,10 +36778,10 @@ using namespace simd; size_t pos = 0; char32_t* start{utf32_output}; // In the worst case, we have the haswell kernel which can cause an overflow of - // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, + // 8 words when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes, // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate // much more than 8 bytes. However, you cannot generally assume that you have valid - // UTF-8 input, so we are going to go back from the end counting 8 leading bytes, + // UTF-8 input, so we are going to go back from the end counting 16 leading bytes, // to give us a good margin. 
size_t leading_byte = 0; size_t margin = size; @@ -37340,6 +36811,9 @@ using namespace simd; this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if(utf8_continuation_mask & 1) { + return 0; // we have an error + } uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -37413,12 +36887,12 @@ using namespace simd; this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - if (errors()) { + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output); res.count += pos; return res; } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1; // We process in blocks of up to 12 bytes except possibly @@ -37966,16 +37440,17 @@ namespace westmere { simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept { // If there is a BOM, then we trust it. auto bom_encoding = simdutf::BOM::check_bom(input, length); + // todo: reimplement as a one-pass algorithm. 
if(bom_encoding != encoding_type::unspecified) { return bom_encoding; } - if (length % 2 == 0) { - return sse_detect_encodings(input, length); - } else { - if (implementation::validate_utf8(input, length)) { - return simdutf::encoding_type::UTF8; - } else { - return simdutf::encoding_type::unspecified; - } + int out = 0; + if(validate_utf8(input, length)) { out |= encoding_type::UTF8; } + if((length % 2) == 0) { + if(validate_utf16le(reinterpret_cast(input), length/2)) { out |= encoding_type::UTF16_LE; } + } + if((length % 4) == 0) { + if(validate_utf32(reinterpret_cast(input), length/4)) { out |= encoding_type::UTF32_LE; } } + return out; } simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { @@ -37995,7 +37470,12 @@ simdutf_warn_unused result implementation::validate_ascii_with_errors(const char } simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { - const char16_t* tail = sse_validate_utf16(buf, len); + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-16. protect the implementation from + // handling nullptr + return true; + } + const char16_t *tail = sse_validate_utf16(buf, len); if (tail) { return scalar::utf16::validate(tail, len - (tail - buf)); } else { @@ -38004,7 +37484,12 @@ simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, s } simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { - const char16_t* tail = sse_validate_utf16(buf, len); + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-16. 
protect the implementation from + // handling nullptr + return true; + } + const char16_t *tail = sse_validate_utf16(buf, len); if (tail) { return scalar::utf16::validate(tail, len - (tail - buf)); } else { @@ -38033,7 +37518,12 @@ simdutf_warn_unused result implementation::validate_utf16be_with_errors(const ch } simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { - const char32_t* tail = sse_validate_utf32le(buf, len); + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-32. protect the implementation from + // handling nullptr + return true; + } + const char32_t *tail = sse_validate_utf32le(buf, len); if (tail) { return scalar::utf32::validate(tail, len - (tail - buf)); } else { @@ -38042,6 +37532,11 @@ simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, siz } simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept { + if (len == 0) { + // empty input is valid UTF-32. 
protect the implementation from + // handling nullptr + return result(error_code::SUCCESS, 0); + } result res = sse_validate_utf32le_with_errors(buf, len); if (res.count != len) { result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count); @@ -38565,46 +38060,62 @@ simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char * const uint8_t *str = reinterpret_cast(input); size_t answer = len / sizeof(__m128i) * sizeof(__m128i); size_t i = 0; - __m128i two_64bits = _mm_setzero_si128(); - while (i + sizeof(__m128i) <= len) { - __m128i runner = _mm_setzero_si128(); - size_t iterations = (len - i) / sizeof(__m128i); - if (iterations > 255) { - iterations = 255; - } - size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i); - for (; i + 4*sizeof(__m128i) <= max_i; i += 4*sizeof(__m128i)) { - __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i)); - __m128i input2 = _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i))); - __m128i input3 = _mm_loadu_si128((const __m128i *)(str + i + 2*sizeof(__m128i))); - __m128i input4 = _mm_loadu_si128((const __m128i *)(str + i + 3*sizeof(__m128i))); - __m128i input12 = _mm_add_epi8( - _mm_cmpgt_epi8( - _mm_setzero_si128(), - input1), - _mm_cmpgt_epi8( - _mm_setzero_si128(), - input2)); - __m128i input34 = _mm_add_epi8( - _mm_cmpgt_epi8( - _mm_setzero_si128(), - input3), - _mm_cmpgt_epi8( - _mm_setzero_si128(), - input4)); - __m128i input1234 = _mm_add_epi8(input12, input34); - runner = _mm_sub_epi8(runner, input1234); - } - for (; i <= max_i; i += sizeof(__m128i)) { - __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i)); - runner = _mm_sub_epi8( - runner, _mm_cmpgt_epi8(_mm_setzero_si128(), more_input)); - } - two_64bits = _mm_add_epi64( - two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128())); - } - answer += _mm_extract_epi64(two_64bits, 0) + - _mm_extract_epi64(two_64bits, 1); + if(answer >= 2048) { // long strings optimization + __m128i two_64bits = 
_mm_setzero_si128(); + while (i + sizeof(__m128i) <= len) { + __m128i runner = _mm_setzero_si128(); + size_t iterations = (len - i) / sizeof(__m128i); + if (iterations > 255) { + iterations = 255; + } + size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i); + for (; i + 4*sizeof(__m128i) <= max_i; i += 4*sizeof(__m128i)) { + __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i)); + __m128i input2 = _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i))); + __m128i input3 = _mm_loadu_si128((const __m128i *)(str + i + 2*sizeof(__m128i))); + __m128i input4 = _mm_loadu_si128((const __m128i *)(str + i + 3*sizeof(__m128i))); + __m128i input12 = _mm_add_epi8( + _mm_cmpgt_epi8( + _mm_setzero_si128(), + input1), + _mm_cmpgt_epi8( + _mm_setzero_si128(), + input2)); + __m128i input34 = _mm_add_epi8( + _mm_cmpgt_epi8( + _mm_setzero_si128(), + input3), + _mm_cmpgt_epi8( + _mm_setzero_si128(), + input4)); + __m128i input1234 = _mm_add_epi8(input12, input34); + runner = _mm_sub_epi8(runner, input1234); + } + for (; i <= max_i; i += sizeof(__m128i)) { + __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i)); + runner = _mm_sub_epi8( + runner, _mm_cmpgt_epi8(_mm_setzero_si128(), more_input)); + } + two_64bits = _mm_add_epi64( + two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128())); + } + answer += _mm_extract_epi64(two_64bits, 0) + + _mm_extract_epi64(two_64bits, 1); + } else if (answer > 0) { // short string optimization + for(; i + 2*sizeof(__m128i) <= len; i += 2*sizeof(__m128i)) { + __m128i latin = _mm_loadu_si128((const __m128i*)(input + i)); + uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin); + answer += count_ones(non_ascii); + latin = _mm_loadu_si128((const __m128i*)(input + i)+1); + non_ascii = (uint16_t)_mm_movemask_epi8(latin); + answer += count_ones(non_ascii); + } + for(; i + sizeof(__m128i) <= len; i += sizeof(__m128i)) { + __m128i latin = _mm_loadu_si128((const __m128i*)(input + i)); + uint16_t non_ascii = 
(uint16_t)_mm_movemask_epi8(latin); + answer += count_ones(non_ascii); + } + } return answer + scalar::latin1::utf8_length_from_latin1(reinterpret_cast(str + i), len - i); } diff --git a/deps/simdutf/simdutf.h b/deps/simdutf/simdutf.h index b656861346e927..4b534c12563072 100644 --- a/deps/simdutf/simdutf.h +++ b/deps/simdutf/simdutf.h @@ -1,4 +1,4 @@ -/* auto-generated on 2024-08-09 09:52:29 -0400. Do not edit! */ +/* auto-generated on 2024-09-04 18:13:32 +0200. Do not edit! */ /* begin file include/simdutf.h */ #ifndef SIMDUTF_H #define SIMDUTF_H @@ -20,6 +20,18 @@ #endif #endif + +// C++ 23 +#if !defined(SIMDUTF_CPLUSPLUS23) && (SIMDUTF_CPLUSPLUS >= 202302L) +#define SIMDUTF_CPLUSPLUS23 1 +#endif + +// C++ 20 +#if !defined(SIMDUTF_CPLUSPLUS20) && (SIMDUTF_CPLUSPLUS >= 202002L) +#define SIMDUTF_CPLUSPLUS20 1 +#endif + + // C++ 17 #if !defined(SIMDUTF_CPLUSPLUS17) && (SIMDUTF_CPLUSPLUS >= 201703L) #define SIMDUTF_CPLUSPLUS17 1 @@ -398,6 +410,8 @@ #define SIMDUTF_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0) #if defined(SIMDUTF_REGULAR_VISUAL_STUDIO) + #define SIMDUTF_DEPRECATED __declspec(deprecated) + #define simdutf_really_inline __forceinline #define simdutf_never_inline __declspec(noinline) @@ -438,6 +452,8 @@ #else #define simdutf_really_inline inline #endif + + #define SIMDUTF_DEPRECATED __attribute__((deprecated)) #define simdutf_never_inline inline __attribute__((noinline)) #define simdutf_unused __attribute__((unused)) @@ -597,7 +613,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS #define SIMDUTF_SIMDUTF_VERSION_H /** The version of simdutf being used (major.minor.revision) */ -#define SIMDUTF_VERSION "5.3.4" +#define SIMDUTF_VERSION "5.5.0" namespace simdutf { enum { @@ -608,11 +624,11 @@ enum { /** * The minor version (major.MINOR.revision) of simdutf being used. */ - SIMDUTF_VERSION_MINOR = 3, + SIMDUTF_VERSION_MINOR = 5, /** * The revision (major.minor.REVISION) of simdutf being used. 
*/ - SIMDUTF_VERSION_REVISION = 4 + SIMDUTF_VERSION_REVISION = 0 }; } // namespace simdutf @@ -1120,67 +1136,79 @@ simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcep */ simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept; - /** - * Convert Latin1 string into UTF8 string. - * - * This function is suitable to work with inputs from untrusted sources. - * - * @param input the Latin1 string to convert - * @param length the length of the string in bytes - * @param latin1_output the pointer to buffer that can hold conversion result - * @return the number of written char; 0 if conversion is not possible - */ - simdutf_warn_unused size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) noexcept; +/** + * Convert Latin1 string into UTF8 string. + * + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf8_output the pointer to buffer that can hold conversion result + * @return the number of written char; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) noexcept; +/** + * Convert Latin1 string into UTF8 string with output limit. + * + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf8_output the pointer to buffer that can hold conversion result + * @param utf8_len the maximum output length + * @return the number of written char; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_latin1_to_utf8_safe(const char * input, size_t length, char* utf8_output, size_t utf8_len) noexcept; - /** - * Convert possibly Latin1 string into UTF-16LE string. 
- * - * This function is suitable to work with inputs from untrusted sources. - * - * @param input the Latin1 string to convert - * @param length the length of the string in bytes - * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return the number of written char16_t; 0 if conversion is not possible - */ - simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept; +/** + * Convert possibly Latin1 string into UTF-16LE string. + * + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept; - /** - * Convert Latin1 string into UTF-16BE string. - * - * This function is suitable to work with inputs from untrusted sources. - * - * @param input the Latin1 string to convert - * @param length the length of the string in bytes - * @param utf16_buffer the pointer to buffer that can hold conversion result - * @return the number of written char16_t; 0 if conversion is not possible - */ - simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept; +/** + * Convert Latin1 string into UTF-16BE string. + * + * This function is suitable to work with inputs from untrusted sources. 
+ * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf16_buffer the pointer to buffer that can hold conversion result + * @return the number of written char16_t; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept; - /** - * Convert Latin1 string into UTF-32 string. - * - * This function is suitable to work with inputs from untrusted sources. - * - * @param input the Latin1 string to convert - * @param length the length of the string in bytes - * @param utf32_buffer the pointer to buffer that can hold conversion result - * @return the number of written char32_t; 0 if conversion is not possible - */ - simdutf_warn_unused size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept; +/** + * Convert Latin1 string into UTF-32 string. + * + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the Latin1 string to convert + * @param length the length of the string in bytes + * @param utf32_buffer the pointer to buffer that can hold conversion result + * @return the number of written char32_t; 0 if conversion is not possible + */ +simdutf_warn_unused size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept; - /** - * Convert possibly broken UTF-8 string into latin1 string. - * - * During the conversion also validation of the input string is done. - * This function is suitable to work with inputs from untrusted sources. 
- * - * @param input the UTF-8 string to convert - * @param length the length of the string in bytes - * @param latin1_output the pointer to buffer that can hold conversion result - * @return the number of written char; 0 if the input was not valid UTF-8 string or if it cannot be represented as Latin1 - */ - simdutf_warn_unused size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept; +/** + * Convert possibly broken UTF-8 string into latin1 string. + * + * During the conversion also validation of the input string is done. + * This function is suitable to work with inputs from untrusted sources. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param latin1_output the pointer to buffer that can hold conversion result + * @return the number of written char; 0 if the input was not valid UTF-8 string or if it cannot be represented as Latin1 + */ +simdutf_warn_unused size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept; /** * Using native endianness, convert possibly broken UTF-8 string into a UTF-16 string. @@ -1195,7 +1223,6 @@ simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_ */ simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept; - /** * Using native endianness, convert a Latin1 string into a UTF-16 string. * @@ -1314,19 +1341,23 @@ simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t leng */ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept; - /** - * Convert valid UTF-8 string into latin1 string. - * - * This function assumes that the input string is valid UTF-8 and that it can be represented as Latin1. - * - * This function is not BOM-aware. 
- * - * @param input the UTF-8 string to convert - * @param length the length of the string in bytes - * @param latin1_output the pointer to buffer that can hold conversion result - * @return the number of written char; 0 if the input was not valid UTF-8 string - */ - simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept; +/** + * Convert valid UTF-8 string into latin1 string. + * + * This function assumes that the input string is valid UTF-8 and that it can be represented as Latin1. + * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes. + * + * This function is for expert users only and not part of our public API. Use convert_utf8_to_latin1 instead. + * The function may be removed from the library in the future. + * + * This function is not BOM-aware. + * + * @param input the UTF-8 string to convert + * @param length the length of the string in bytes + * @param latin1_output the pointer to buffer that can hold conversion result + * @return the number of written char; 0 if the input was not valid UTF-8 string + */ +simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept; /** @@ -1635,6 +1666,10 @@ simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, s * Using native endianness, convert UTF-16 string into Latin1 string. * * This function assumes that the input string is valid UTF-16 and that it can be represented as Latin1. + * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes. + * + * This function is for expert users only and not part of our public API. Use convert_utf16_to_latin1 instead. + * The function may be removed from the library in the future. * * This function is not BOM-aware. 
* @@ -1649,6 +1684,10 @@ simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * input, * Convert valid UTF-16LE string into Latin1 string. * * This function assumes that the input string is valid UTF-16LE and that it can be represented as Latin1. + * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes. + * + * This function is for expert users only and not part of our public API. Use convert_utf16le_to_latin1 instead. + * The function may be removed from the library in the future. * * This function is not BOM-aware. * @@ -1663,6 +1702,10 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * inpu * Convert valid UTF-16BE string into Latin1 string. * * This function assumes that the input string is valid UTF-16BE and that it can be represented as Latin1. + * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes. + * + * This function is for expert users only and not part of our public API. Use convert_utf16be_to_latin1 instead. + * The function may be removed from the library in the future. * * This function is not BOM-aware. * @@ -1997,6 +2040,10 @@ simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * * Convert valid UTF-32 string into Latin1 string. * * This function assumes that the input string is valid UTF-32 and that it can be represented as Latin1. + * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes. + * + * This function is for expert users only and not part of our public API. Use convert_utf32_to_latin1 instead. + * The function may be removed from the library in the future. * * This function is not BOM-aware. 
* @@ -2323,7 +2370,6 @@ simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t leng // base64_options are used to specify the base64 encoding options. using base64_options = uint64_t; -using base64_options = uint64_t; enum : base64_options { base64_default = 0, /* standard base64 format (with padding) */ base64_url = 1, /* base64url format (no padding) */ @@ -2355,7 +2401,7 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept; /** - * Convert a base64 input to a binary ouput. + * Convert a base64 input to a binary output. * * This function follows the WHATWG forgiving-base64 format, which means that it will * ignore any ASCII spaces in the input. You may provide a padded input (with one or two @@ -2398,7 +2444,7 @@ simdutf_warn_unused result base64_to_binary(const char * input, size_t length, c simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_options options = base64_default) noexcept; /** - * Convert a binary input to a base64 ouput. + * Convert a binary input to a base64 output. * * The default option (simdutf::base64_default) uses the characters `+` and `/` as part of its alphabet. * Further, it adds padding (`=`) at the end of the output to ensure that the output length is a multiple of four. @@ -2417,7 +2463,7 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length, base64_optio size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept; /** - * Convert a base64 input to a binary ouput. + * Convert a base64 input to a binary output. * * This function follows the WHATWG forgiving-base64 format, which means that it will * ignore any ASCII spaces in the input. 
You may provide a padded input (with one or two @@ -2452,7 +2498,7 @@ size_t binary_to_base64(const char * input, size_t length, char* output, base64_ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default) noexcept; /** - * Convert a base64 input to a binary ouput. + * Convert a base64 input to a binary output. * * This function follows the WHATWG forgiving-base64 format, which means that it will * ignore any ASCII spaces in the input. You may provide a padded input (with one or two @@ -2692,13 +2738,12 @@ class implementation { * * @param input the Latin1 string to convert * @param length the length of the string in bytes - * @param latin1_output the pointer to buffer that can hold conversion result + * @param utf8_output the pointer to buffer that can hold conversion result * @return the number of written char; 0 if conversion is not possible */ simdutf_warn_unused virtual size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) const noexcept = 0; - - /** + /** * Convert possibly Latin1 string into UTF-16LE string. * * This function is suitable to work with inputs from untrusted sources. @@ -2766,6 +2811,9 @@ class implementation { * Convert valid UTF-8 string into latin1 string. * * This function assumes that the input string is valid UTF-8 and that it can be represented as Latin1. + * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes. + * + * This function is for expert users only and not part of our public API. Use convert_utf8_to_latin1 instead. * * This function is not BOM-aware. * @@ -2983,7 +3031,10 @@ class implementation { * Convert valid UTF-16LE string into Latin1 string. * * This function assumes that the input string is valid UTF-L16LE and that it can be represented as Latin1. 
- + * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes. + * + * This function is for expert users only and not part of our public API. Use convert_utf16le_to_latin1 instead. + * * This function is not BOM-aware. * * @param input the UTF-16LE string to convert @@ -2997,6 +3048,9 @@ class implementation { * Convert valid UTF-16BE string into Latin1 string. * * This function assumes that the input string is valid UTF16-BE and that it can be represented as Latin1. + * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes. + * + * This function is for expert users only and not part of our public API. Use convert_utf16be_to_latin1 instead. * * This function is not BOM-aware. * @@ -3246,7 +3300,10 @@ class implementation { /** * Convert valid UTF-32 string into Latin1 string. * - * This function assumes that the input string is valid UTF-32. + * This function assumes that the input string is valid UTF-32 and can be represented as Latin1. + * If you violate this assumption, the result is implementation defined and may include system-dependent behavior such as crashes. + * + * This function is for expert users only and not part of our public API. Use convert_utf32_to_latin1 instead. * * This function is not BOM-aware. * @@ -3598,7 +3655,7 @@ class implementation { simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept = 0; /** - * Convert a base64 input to a binary ouput. + * Convert a base64 input to a binary output. * * This function follows the WHATWG forgiving-base64 format, which means that it will * ignore any ASCII spaces in the input. 
You may provide a padded input (with one or two @@ -3623,7 +3680,7 @@ class implementation { simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0; /** - * Convert a base64 input to a binary ouput. + * Convert a base64 input to a binary output. * * This function follows the WHATWG forgiving-base64 format, which means that it will * ignore any ASCII spaces in the input. You may provide a padded input (with one or two @@ -3657,7 +3714,7 @@ class implementation { simdutf_warn_unused virtual size_t base64_length_from_binary(size_t length, base64_options options = base64_default) const noexcept = 0; /** - * Convert a binary input to a base64 ouput. + * Convert a binary input to a base64 output. * * The default option (simdutf::base64_default) uses the characters `+` and `/` as part of its alphabet. * Further, it adds padding (`=`) at the end of the output to ensure that the output length is a multiple of four.