
[FR] Faster binary file checking with "ridiculously fast" SIMD UTF-8 validation #421

Closed
genivia-inc opened this issue Aug 21, 2024 · 3 comments
Labels
enhancement New feature or request

Comments

@genivia-inc
Member

genivia-inc commented Aug 21, 2024

Ugrep checks whether an input file is binary by checking if it contains a zero byte or invalid UTF-8. Binary file checking is important, because option -I (--ignore-binary) depends on it and because every search that matches a binary file reports "Binary file XYZ matches".
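In scalar terms, the binary check boils down to something like this (a minimal sketch of the idea, not the actual ugrep code; isutf8 stands for the UTF-8 validation function discussed below):

#include <cstring>

bool isutf8(const char *s, const char *e); // the UTF-8 validation check, see below

// sketch: a buffer is binary if it contains a NUL byte or is not valid UTF-8
bool is_binary(const char *s, const char *e)
{
  if (std::memchr(s, '\0', e - s) != nullptr)
    return true;          // a zero byte means binary
  return !isutf8(s, e);   // not valid UTF-8 also means binary
}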

To speed this up, "Ridiculously fast unicode (UTF-8) validation" comes to mind (sorry, not my choice of title), with details in the article of that name. It uses table lookups with SSE3/SSE4.1/AVX and NEON/AArch64.

However, my dilemma is that I can't rely on the availability of SSE3: the base ugrep version targets SSE2 on Intel CPUs, not SSE3 (or SSSE3 or higher). I don't want to suddenly require SSE3 when ugrep already ran fast on SSE2, nor add configure checks just for SSE3 to enable faster binary file checking (i.e. UTF-8 validation).

So I recently wrote a faster SIMD UTF-8 validation check that uses "bit bashing". It works fine with SSE2 and runs fast too. Unlike the "ridiculously fast" article's table lookup method, which requires SSE3, my proposed method runs with plain SSE2. Also very nice: the new method is faster than the table lookup method for SSE2, SSE3 and AVX2:

UPDATE: updated with a faster method that merges two tests into one to branch out of the loop.

SIMD algorithm                                                           time     throughput
table lookup method SSE4.1, compiled with -mavx2 -O2                     174ms    5.7GB/s
table lookup method SSE3, compiled with -msse3 -O2                       205ms    4.9GB/s
bitbash method SSE4.1 version, compiled with -mavx2 -O2                  161ms    6.2GB/s
bitbash method SSE3 version, compiled with -msse3 -O2                    195ms    5.1GB/s
bitbash method SSE2 version, compiled with -msse2 -O2                    204ms    4.9GB/s
bitbash method AVX2 version, compiled with -mavx2 -O2 (see next post)    117ms    8.5GB/s

Benchmark machine: macOS 12.7.4, Intel quad-core i7 2.9 GHz, 16GB 2133 MHz LPDDR3
Benchmark file: enwik9, 1,000,000,000 bytes, mostly ASCII with some UTF-8
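(Throughput is simply the file size divided by the time, e.g. 1 GB / 0.174 s ≈ 5.7 GB/s.)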

While the table lookup method and my proposed "bitbash" method both validate UTF-8, there is a small difference: "bitbash" does not flag surrogates and accepts some overlong encodings in the longer 3- and 4-byte UTF-8 sequences. That is fine for quickly validating UTF-8 to check whether the input is binary or text, since surrogates and overlongs are practically never present in text files and ugrep does not choke on them anyway.
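To make the bit-bashing idea concrete, here is a scalar rendition (rephrased with unsigned bytes) of the per-byte test that the SIMD code below vectorizes. It is my own illustration of the logic in the code comments, not code taken from ugrep:

bool isutf8_scalar(const char *s, const char *e)
{
  unsigned char p = 0, q = 0, r = 0; // carry lead-byte information 1, 2 and 3 bytes back
  while (s < e)
  {
    unsigned char b = static_cast<unsigned char>(*s++);
    // step 1: byte must be ASCII w/o NUL (01..7f), a continuation (80..bf) or a lead (c2..f4)
    if (b == 0x00 || b == 0xc0 || b == 0xc1 || b >= 0xf5)
      return false;
    // step 2: a continuation byte must appear exactly where a preceding lead byte expects one
    bool is_cont  = (b & 0xc0) == 0x80;        // 10xxxxxx continuation byte
    bool expected = ((p | q | r) & 0x80) != 0; // a lead byte 1..3 positions back expects one
    if (is_cont != expected)
      return false;
    r = q & (q << 1); // bit 7 set: a 4-byte lead three bytes back
    q = p & (p << 1); // bit 7 set: a 3- or 4-byte lead two bytes back
    p = b & (b << 1); // bit 7 set: a multi-byte lead one byte back
  }
  return ((p | q | r) & 0x80) == 0; // fail if the buffer ends in the middle of a sequence
}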

The SSE2 implementation with commentary (note that SSE2 has neither palignr nor the _mm_shuffle_epi8 that the table lookup method uses):

bool isutf8(const char *s, const char *e)
{
  if (s <= e - 16)
  {
    const __m128i vxc0 = _mm_set1_epi8(0xc0);
    const __m128i vxc1 = _mm_set1_epi8(0xc1);
    const __m128i vxf5 = _mm_set1_epi8(0xf5);
    const __m128i v0 = _mm_setzero_si128();
    __m128i vp = v0;
    __m128i vq = v0;
    __m128i vr = v0;
    while (s <= e - 16)
    {
      // step 1: check valid signed byte ranges
      //   c = s[i]
      //   if (!(c > 0 || c < -64 || (c > -63 && c < -11)))
      //     return false
      //
      __m128i vc = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
      __m128i vt = _mm_and_si128(_mm_cmpgt_epi8(vc, vxc1), _mm_cmplt_epi8(vc, vxf5));
      vt = _mm_or_si128(vt, _mm_cmplt_epi8(vc, vxc0));
      vt = _mm_or_si128(vt, _mm_cmpgt_epi8(vc, v0));
      __m128i vm = vt;
      //   step 2: check UTF-8 multi-byte sequences of 2, 3 and 4 bytes long
      //     if (((-(c > -63) ^ (p | q | r)) & 0x80) != 0x80)
      //       return false
      //     r = (q & (q << 1))
      //     q = (p & (p << 1))
      //     p = (c & (c << 1))
      //
      //   possible values of c after step 1 and subsequent values of p, q, r:
      //     c at 1st byte   p at 2nd byte   q at 3rd byte   r at 4th byte
      //       0xxxxxxx        0xxxxxxx        0xxxxxxx        0xxxxxxx
      //       10xxxxxx        00xxxxxx        00xxxxxx        00xxxxxx
      //       110xxxxx        100xxxxx        000xxxxx        000xxxxx
      //       1110xxxx        1100xxxx        1000xxxx        0000xxxx
      //       11110xxx        11100xxx        11000xxx        10000xxx
      //
      //   byte vectors vc, vp, vq, vr and previous values:
      //                             | c | c | c | c | ... | c |
      //                     | old p | p | p | p | p | ... | p |
      //             | old q | old q | q | q | q | q | ... | q |
      //     | old r | old r | old r | r | r | r | r | ... | r |
      //
      //   shift vectors vp, vq, vr to align to compute bitwise-or vp | vq | vr -> vt:
      //                 |     c |     c |     c | c | ... | c | = vc
      //                 | old p |     p |     p | p | ... | p |
      //                 | old q | old q |     q | q | ... | q |
      //                 | old r | old r | old r | r | ... | r |
      //                   -----   -----   -----   -   ---   -   or
      //                 |     t |     t |     t | t | ... | t | = vt
      //
      //   SSE2 code to perform r = (q & (q << 1)); q = (p & (p << 1)); p = (c & (c << 1));
      //   shift parts of the old vp, vq, vr and new vp, vq, vr in vt using psrldq and por
      //   then check if ((-(c > -63) ^ (p | q | r))) bit 7 is 1
      //
      vt = _mm_bsrli_si128(vp, 15);
      vp = _mm_and_si128(vc, _mm_add_epi8(vc, vc));
      vt = _mm_or_si128(vt, _mm_bsrli_si128(vq, 14));
      vq = _mm_and_si128(vp, _mm_add_epi8(vp, vp));
      vt = _mm_or_si128(vt, _mm_bsrli_si128(vr, 13));
      vr = _mm_and_si128(vq, _mm_add_epi8(vq, vq));
      vt = _mm_or_si128(vt, _mm_bslli_si128(vp, 1));
      vt = _mm_or_si128(vt, _mm_bslli_si128(vq, 2));
      vt = _mm_or_si128(vt, _mm_bslli_si128(vr, 3));
      vt = _mm_xor_si128(vt, _mm_cmpgt_epi8(vc, vxc1));
      vm = _mm_and_si128(vm, vt);
      if (_mm_movemask_epi8(vm) != 0xffff)
        return false;
      s += 16;
    }
    // do not end in the middle of a UTF-8 multibyte sequence, backtrack when necessary (this will terminate)
    while ((*--s & 0xc0) == 0x80)
      continue;
  }
  // check remainder with scalar code
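  // (a simple sketch of such a scalar remainder check follows; not necessarily the exact ugrep code)
  while (s < e)
  {
    unsigned char b = static_cast<unsigned char>(*s++);
    if (b > 0x00 && b < 0x80)
      continue;                   // ASCII byte
    if (b < 0xc2 || b >= 0xf5)
      return false;               // NUL, stray continuation byte, or invalid lead byte
    int n = b >= 0xf0 ? 3 : b >= 0xe0 ? 2 : 1; // number of continuation bytes expected
    while (n-- > 0)
      if (s >= e || (static_cast<unsigned char>(*s++) & 0xc0) != 0x80)
        return false;             // truncated or missing continuation byte
  }
  return true;
}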

With SSE3/SSE4/AVX we can use _mm_alignr_epi8 and no longer need _mm_bsrli_si128 and _mm_bslli_si128:

    [...]
    const __m128i vxc0 = _mm_set1_epi8(0xc0);
    const __m128i vxc1 = _mm_set1_epi8(0xc1);
    const __m128i vxf5 = _mm_set1_epi8(0xf5);
    const __m128i v0 = _mm_setzero_si128();
    __m128i vp = v0;
    __m128i vq = v0;
    __m128i vr = v0;
    while (s <= e - 16)
    {
      __m128i vc = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
      __m128i vt = _mm_and_si128(_mm_cmpgt_epi8(vc, vxc1), _mm_cmplt_epi8(vc, vxf5));
      vt = _mm_or_si128(vt, _mm_cmplt_epi8(vc, vxc0));
      vt = _mm_or_si128(vt, _mm_cmpgt_epi8(vc, v0));
      __m128i vm = vt;
      __m128i vo = vp;
      vp = _mm_and_si128(vc, _mm_add_epi8(vc, vc));
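      // _mm_alignr_epi8(vp, vo, 15) concatenates vo:vp (vo in the low half) and shifts
      // right by 15 bytes, so each byte position sees the p value of the preceding byte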
      vt = _mm_alignr_epi8(vp, vo, 15);
      vo = vq;
      vq = _mm_and_si128(vp, _mm_add_epi8(vp, vp));
      vt = _mm_or_si128(vt, _mm_alignr_epi8(vq, vo, 14));
      vo = vr;
      vr = _mm_and_si128(vq, _mm_add_epi8(vq, vq));
      vt = _mm_or_si128(vt, _mm_alignr_epi8(vr, vo, 13));
      vt = _mm_xor_si128(vt, _mm_cmpgt_epi8(vc, vxc1));
      vm = _mm_and_si128(vm, vt);
      if (_mm_movemask_epi8(vm) != 0xffff)
        return false;
      s += 16;
    }
    while ((*--s & 0xc0) == 0x80)
      continue;

The "bitbash" UTF-8 validation in ARM NEON, tested with Apple M1 Pro (AArch64):

SIMD algorithm time throughput
bitbash method ARM NEON version compiled with -O2 116ms 8.6GB/s
  if (s <= e - 16)
  {
    const int8x16_t vxc0 = vdupq_n_s8(0xc0);
    const int8x16_t vxc1 = vdupq_n_s8(0xc1);
    const int8x16_t vxf5 = vdupq_n_s8(0xf5);
    const int8x16_t v0 = vdupq_n_s8(0);
    int8x16_t vp = v0;
    int8x16_t vq = v0;
    int8x16_t vr = v0;
    while (s <= e - 16)
    {
      int8x16_t vc = vld1q_s8(reinterpret_cast<const int8_t*>(s));
      int8x16_t vt = vandq_s8(vcgtq_s8(vc, vxc1), vcltq_s8(vc, vxf5));
      vt = vorrq_s8(vt, vcltq_s8(vc, vxc0));
      vt = vorrq_s8(vt, vcgtq_s8(vc, v0));
      int64x2_t vm = vreinterpretq_s64_s8(vt);
      int8x16_t vo = vp;
      vp = vandq_s8(vc, vshlq_n_s8(vc, 1));
      vt = vextq_s8(vo, vp, 15);
      vo = vq;
      vq = vandq_s8(vp, vshlq_n_s8(vp, 1));
      vt = vorrq_s8(vt, vextq_s8(vo, vq, 14));
      vo = vr;
      vr = vandq_s8(vq, vshlq_n_s8(vq, 1));
      vt = vorrq_s8(vt, vextq_s8(vo, vr, 13));
      vt = veorq_s8(vt, vcgtq_s8(vc, vxc1));
      vm = vandq_s64(vm, vreinterpretq_s64_s8(vt));
      if (((vgetq_lane_s64(vm, 0) & vgetq_lane_s64(vm, 1)) & 0x8080808080808080LL) != 0x8080808080808080LL)
        return false;
      s += 16;
    }
    while ((*--s & 0xc0) == 0x80)
      continue;
  }
genivia-inc added the enhancement label on Aug 21, 2024
@genivia-inc
Member Author

genivia-inc commented Aug 22, 2024

Adding results and code for my "bitbash" AVX2 256-bit vector UTF-8 validation. It's compact and spiffy:

SIMD algorithm                                          time     throughput
bitbash method AVX2 version, compiled with -mavx2 -O2   117ms    8.5GB/s

  if (s <= e - 32)
  {
    const __m256i vxc0 = _mm256_set1_epi8(0xc0);
    const __m256i vxc1 = _mm256_set1_epi8(0xc1);
    const __m256i vxf5 = _mm256_set1_epi8(0xf5);
    const __m256i v0 = _mm256_setzero_si256();
    __m256i vp = v0;
    __m256i vq = v0;
    __m256i vr = v0;
    while (s <= e - 32)
    {
      __m256i vc = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s));
      __m256i vt = _mm256_and_si256(_mm256_cmpgt_epi8(vc, vxc1), _mm256_cmpgt_epi8(vxf5, vc));
      vt = _mm256_or_si256(vt, _mm256_cmpgt_epi8(vxc0, vc));
      vt = _mm256_or_si256(vt, _mm256_cmpgt_epi8(vc, v0));
      __m256i vm = vt;
      __m256i vo = vp;
      vp = _mm256_and_si256(vc, _mm256_add_epi8(vc, vc));
      // vt = [vp,vo] >> 15*8 split in 128 bit lanes:
      // vthi = [vphi,vplo] >> 15*8
      // vtlo = [vplo,vohi] >> 15*8
      vt = _mm256_alignr_epi8(vp, _mm256_permute2x128_si256(vp, vo, 0x03), 15);
      vo = vq;
      vq = _mm256_and_si256(vp, _mm256_add_epi8(vp, vp));
      // vt = [vq,vo] >> 14*8 split in 128 bit lanes:
      // vthi |= [vqhi,vqlo] >> 14*8
      // vtlo |= [vqlo,vohi] >> 14*8
      vt = _mm256_or_si256(vt, _mm256_alignr_epi8(vq, _mm256_permute2x128_si256(vq, vo, 0x03), 14));
      vo = vr;
      vr = _mm256_and_si256(vq, _mm256_add_epi8(vq, vq));
      // vt = [vr,vo] >> 13*8 split in 128 bit lanes:
      // vthi |= [vrhi,vrlo] >> 13*8
      // vtlo |= [vrlo,vohi] >> 13*8
      vt = _mm256_or_si256(vt, _mm256_alignr_epi8(vr, _mm256_permute2x128_si256(vr, vo, 0x03), 13));
      vt = _mm256_xor_si256(vt, _mm256_cmpgt_epi8(vc, vxc1));
      vm = _mm256_and_si256(vm, vt);
      if (_mm256_movemask_epi8(vm) != 0xffffffff)
        return false;
      s += 32;
    }
    while ((*--s & 0xc0) == 0x80)
      continue;
  }

@genivia-inc
Member Author

Note that we can scan over ASCII text before running the UTF-8 validation. This is faster when the input file is all ASCII, and also when the initial part of the file is ASCII.

SSE2 code to add:

      // prep step: scan ASCII w/o NUL first for speed, then check remaining UTF-8
      const __m128i v0 = _mm_setzero_si128();
      while (s <= e - 16)
      {
        __m128i vc = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
        __m128i vm = _mm_cmpgt_epi8(vc, v0);
        if (_mm_movemask_epi8(vm) != 0xffff)
        {
          // non-ASCII, return false if a NUL was found
          vm = _mm_cmpeq_epi8(vc, v0);
          if (_mm_movemask_epi8(vm) != 0x0000)
            return false;
          break;
        }
        s += 16;
      }

AVX2 code to add:

  // prep step: scan ASCII w/o NUL first for speed, then check remaining UTF-8
  const __m256i v00 = _mm256_setzero_si256();
  while (s <= e - 32)
  {
    __m256i vc = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s));
    __m256i vm = _mm256_cmpgt_epi8(vc, v00);
    if (_mm256_movemask_epi8(vm) != 0xffffffff)
    {
      vm = _mm256_cmpeq_epi8(vc, v00);
      if (_mm256_movemask_epi8(vm) != 0x00000000)
        return false;
      break;
    }
    s += 32;
  }

ARM NEON code to add:

    // prep step: scan ASCII w/o NUL first for speed, then check remaining UTF-8
    const int8x16_t v0 = vdupq_n_s8(0);
    while (s <= e - 16)
    {
      int8x16_t vc = vld1q_s8(reinterpret_cast<const int8_t*>(s));
      int64x2_t vm = vreinterpretq_s64_s8(vcgtq_s8(vc, v0));
      if ((vgetq_lane_s64(vm, 0) & vgetq_lane_s64(vm, 1)) != -1LL)
      {
        // non-ASCII, return false if a NUL was found
        vm = vreinterpretq_s64_s8(vceqq_s8(vc, v0));
        if ((vgetq_lane_s64(vm, 0) | vgetq_lane_s64(vm, 1)) != 0LL)
          return false;
        break;
      }
      s += 16;
    }
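
A portable scalar fallback of the same prescan, for builds without SIMD (just a sketch of the idea, not code from ugrep):

  // prep step: scan ASCII w/o NUL first for speed, then check remaining UTF-8
  while (s < e)
  {
    unsigned char b = static_cast<unsigned char>(*s);
    if (b == 0x00)
      return false;  // NUL byte found: binary
    if (b >= 0x80)
      break;         // non-ASCII: leave s here for the UTF-8 validation
    ++s;
  }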

genivia-inc added a commit to Genivia/ugrep-benchmarks that referenced this issue Aug 23, 2024
genivia-inc added a commit that referenced this issue Aug 23, 2024
- faster binary file checking with SIMD (SSE2/AVX2/NEON) UTF-8 validation #421
- new option `--context-separator=SEP`
- options `-W` and `-X` now also apply to `--format` fields `%o` and `%O` to output hex
- ugrep-indexer option `-z` to index zip/7z/tar/cpio/pax archives no longer indexes hidden directories and files stored in the archive
- fix ugrep-indexer option `-z`, which was broken due to a code refactoring mistake in one line of code
@genivia-inc
Member Author

genivia-inc commented Aug 27, 2024

For completeness, I'm posting the lookup table version. I checked with the author of the method about the details of the final step, which weren't clear to me from their paper:
lemire/validateutf8-experiments#1 (comment)

Comments explain how this code works.

  if (s <= e - 16)
  {
    //  ranges      UTF-8 encoding                                  nibbles
    //  U+0000      00000000                                        00x
    //  U+007f      01111111                                        7fx
    //
    //  U+0080      110.00010 10.000000                             c28 0
    //  U+07ff      110.11111 10.111111                             dfb f
    //
    //  U+0800      1110.0000 10.100000 10.000000                   e0a 080
    //  U+d7ff      1110.1101 10.011111 10.111111                   ed9 fbf
    //  U+e000      1110.1110 10.000000 10.000000                   ee8 080
    //  U+ffff      1110.1111 10.111111 10.111111                   efb fbf
    //
    //  U+010000    11110.000 10.010000 10.000000 10.000000         f09 08080
    //  U+10ffff    11110.100 10.001111 10.111111 10.111111         f48 fbfbf
    //
    //  ten UTF-8 error patterns observed in the first three nibbles:
    //   1:  (0~7) (0~f)     (8~b)
    //   2:  c     (0~1)     (0~f)
    //   3:  c     (2~f)     (0~7|c~f)
    //   4:  d     (0~f)     (0~7|c~f)
    //   5:  e     0         (0~7|8~9|c~f)
    //   6:  e     (1~8|e~f) (0~7|c~f)
    //   7:  e     d         (0~7|a~f)
    //   8:  f     0         (0~7|8)
    //   9:  f     (1~3)     (0~7|c~f)
    //  10:  f     4         (0~7|9~f)
    //
    //  seven combined error patterns with bit error assignments (bit 0 to 6):
    //   1:        (0~7) (0~f) (8~b)          0
    //   2:        c     (0~1) (0~f)          1
    //   3+4+6+9:  (c~f) (0~f) (0~7|c~f)      2
    //   5:        e     0     (0~7|8~9|c~f)  3
    //   7:        e     d     (0~7|a~f)      4
    //   8:        f     0     (0~7|8)        5
    //  10:        f     4     (0~7|9~f)      6
    //  detect two continuation patterns with bit 7:
    //             (8~b) (0~f) (8~b)          7
    //
    //                 table index:   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
    //  previous high nibble table:  01 01 01 01 01 01 01 01 80 80 80 80 06 04 1c 64
    //  previous low nibble table:   ae 86 84 84 c4 84 84 84 85 85 85 85 85 95 85 85
    //  current high nibble table:   7e 7e 7e 7e 7e 7e 7e 7e ab cb d3 d3 5e 5e 5e 5e
    //
    const __m128i vpht = _mm_set_epi8(
    //  f    e    d    c    b    a    9    8    7    6    5    4    3    2    1    0
        0x64,0x1c,0x04,0x06,0x80,0x80,0x80,0x80,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01);
    const __m128i vplt = _mm_set_epi8(
    //  f    e    d    c    b    a    9    8    7    6    5    4    3    2    1    0
        0x85,0x85,0x95,0x85,0x85,0x85,0x85,0x85,0x84,0x84,0x84,0xc4,0x84,0x84,0x86,0xae);
    const __m128i vcht = _mm_set_epi8(
    //  f    e    d    c    b    a    9    8    7    6    5    4    3    2    1    0
        0x5e,0x5e,0x5e,0x5e,0xd3,0xd3,0xcb,0xab,0x7e,0x7e,0x7e,0x7e,0x7e,0x7e,0x7e,0x7e);
    const __m128i vx0f = _mm_set1_epi8(0x0f);
    const __m128i vx7f = _mm_set1_epi8(0x7f);
    const __m128i v0 = _mm_setzero_si128();
    __m128i vc = v0;
    while (s <= e - 16)
    {
      __m128i vo = vc;
      vc = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s)); // current bytes
      __m128i vp = _mm_alignr_epi8(vc, vo, 15); // vc with previous byte
      __m128i vq = _mm_alignr_epi8(vc, vo, 14); // vc with two previous bytes
      __m128i vr = _mm_alignr_epi8(vc, vo, 13); // vc with three previous bytes
      __m128i vph = _mm_and_si128(_mm_srli_epi16(vp, 4), vx0f); // vp high nibbles
      __m128i vpl = _mm_and_si128(vp, vx0f);                    // vp low nibbles
      __m128i vch = _mm_and_si128(_mm_srli_epi16(vc, 4), vx0f); // vc high nibbles
      __m128i vt = _mm_shuffle_epi8(vpht, vph);
      vt = _mm_and_si128(vt, _mm_shuffle_epi8(vplt, vpl));
      vt = _mm_and_si128(vt, _mm_shuffle_epi8(vcht, vch));
#if defined(HAVE_SSE3)
      __m128i vm = _mm_add_epi8(_mm_and_si128(vt, vx7f), vx7f);
      if (_mm_movemask_epi8(vm) != 0x0000)
        return false;
#else // SSE4.1/AVX2
      __m128i vm = _mm_and_si128(vt, vx7f);
      if (!_mm_testz_si128(vm, vm))
        return false;
#endif
      __m128i vb3 = _mm_subs_epu8(vq, _mm_set1_epi8(0xe0u-0x80));
      __m128i vb4 = _mm_subs_epu8(vr, _mm_set1_epi8(0xf0u-0x80));
      __m128i vb = _mm_or_si128(vb3, vb4);
      vm = _mm_xor_si128(vb, vt);
      if (_mm_movemask_epi8(vm) != 0x0000)
        return false;
      s += 16;
    }
    // do not end in the middle of a UTF-8 multibyte sequence, backtrack when necessary (this will terminate)
    while ((*--s & 0xc0) == 0x80)
      continue;
  }
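
To see how the three nibble tables work together, here is a scalar transliteration of the classification step for one pair of adjacent bytes (previous byte p, current byte c). It only illustrates the table logic described in the comments above; it is not part of the SIMD code:

#include <cstdint>

// nibble tables copied from the comments above
static const uint8_t pht[16] = { 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,
                                 0x80,0x80,0x80,0x80,0x06,0x04,0x1c,0x64 };
static const uint8_t plt[16] = { 0xae,0x86,0x84,0x84,0xc4,0x84,0x84,0x84,
                                 0x85,0x85,0x85,0x85,0x85,0x95,0x85,0x85 };
static const uint8_t cht[16] = { 0x7e,0x7e,0x7e,0x7e,0x7e,0x7e,0x7e,0x7e,
                                 0xab,0xcb,0xd3,0xd3,0x5e,0x5e,0x5e,0x5e };

// bits 0 to 6 of the result flag the seven combined error patterns,
// bit 7 flags a continuation byte that is preceded by another continuation byte
uint8_t classify(uint8_t p, uint8_t c)
{
  return pht[p >> 4] & plt[p & 0x0f] & cht[c >> 4];
}

In the SIMD loop, _mm_shuffle_epi8 performs these sixteen lookups per table in parallel, the low seven bits are tested for zero, and bit 7 is checked separately against the positions where a 3- or 4-byte lead two or three bytes back requires continuation bytes (the vb3/vb4 step).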
