[FR] Faster binary file checking with "ridiculously fast" SIMD UTF-8 validation #421
Adding results and code for my "bitbash" AVX2 256-bit vector UTF-8 validation. It's compact and spiffy:
if (s <= e - 32)
{
  const __m256i vxc0 = _mm256_set1_epi8(0xc0);
  const __m256i vxc1 = _mm256_set1_epi8(0xc1);
  const __m256i vxf5 = _mm256_set1_epi8(0xf5);
  const __m256i v0 = _mm256_setzero_si256();
  __m256i vp = v0;
  __m256i vq = v0;
  __m256i vr = v0;
  while (s <= e - 32)
  {
    __m256i vc = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s));
    // vm: valid byte values, i.e. ASCII 0x01..0x7f, continuation 0x80..0xbf or lead 0xc2..0xf4
    __m256i vt = _mm256_and_si256(_mm256_cmpgt_epi8(vc, vxc1), _mm256_cmpgt_epi8(vxf5, vc));
    vt = _mm256_or_si256(vt, _mm256_cmpgt_epi8(vxc0, vc));
    vt = _mm256_or_si256(vt, _mm256_cmpgt_epi8(vc, v0));
    __m256i vm = vt;
    __m256i vo = vp;
    // vp: bit 7 set iff vc >= 0xc0, the lead of a 2+ byte sequence (vc+vc is vc<<1)
    vp = _mm256_and_si256(vc, _mm256_add_epi8(vc, vc));
    // vt = [vp,vo] >> 15*8 split in 128 bit lanes:
    //   vthi = [vphi,vplo] >> 15*8
    //   vtlo = [vplo,vohi] >> 15*8
    vt = _mm256_alignr_epi8(vp, _mm256_permute2x128_si256(vp, vo, 0x03), 15);
    vo = vq;
    // vq: bit 7 set iff vc >= 0xe0, the lead of a 3+ byte sequence
    vq = _mm256_and_si256(vp, _mm256_add_epi8(vp, vp));
    // vt |= [vq,vo] >> 14*8 split in 128 bit lanes:
    //   vthi |= [vqhi,vqlo] >> 14*8
    //   vtlo |= [vqlo,vohi] >> 14*8
    vt = _mm256_or_si256(vt, _mm256_alignr_epi8(vq, _mm256_permute2x128_si256(vq, vo, 0x03), 14));
    vo = vr;
    // vr: bit 7 set iff vc >= 0xf0, the lead of a 4 byte sequence
    vr = _mm256_and_si256(vq, _mm256_add_epi8(vq, vq));
    // vt |= [vr,vo] >> 13*8 split in 128 bit lanes:
    //   vthi |= [vrhi,vrlo] >> 13*8
    //   vtlo |= [vrlo,vohi] >> 13*8
    vt = _mm256_or_si256(vt, _mm256_alignr_epi8(vr, _mm256_permute2x128_si256(vr, vo, 0x03), 13));
    // xor: bit 7 stays set only where expected continuations and actual continuation bytes agree
    vt = _mm256_xor_si256(vt, _mm256_cmpgt_epi8(vc, vxc1));
    vm = _mm256_and_si256(vm, vt);
    if (_mm256_movemask_epi8(vm) != 0xffffffff)
      return false;
    s += 32;
  }
  // do not end in the middle of a UTF-8 multibyte sequence, backtrack when necessary (this will terminate)
  while ((*--s & 0xc0) == 0x80)
    continue;
}
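Since AVX2 has no 8-bit shift, `_mm256_add_epi8(vc, vc)` computes `vc << 1` per byte. ANDing a byte with its own left shift keeps the top bit only when the top two bits are both set, so applying the step repeatedly classifies lead bytes by sequence length; the three `alignr` steps then move these masks forward one, two, and three byte positions to mark where continuation bytes are expected. A minimal scalar sketch of the bit trick (my illustration, not code from ugrep) that verifies the property for all byte values:

#include <cassert>
#include <cstdint>

int main()
{
  for (unsigned b = 0; b < 256; ++b)
  {
    uint8_t c = static_cast<uint8_t>(b);
    uint8_t p = c & static_cast<uint8_t>(c << 1); // bit 7 set iff c >= 0xc0: lead of a 2+ byte sequence
    uint8_t q = p & static_cast<uint8_t>(p << 1); // bit 7 set iff c >= 0xe0: lead of a 3+ byte sequence
    uint8_t r = q & static_cast<uint8_t>(q << 1); // bit 7 set iff c >= 0xf0: lead of a 4 byte sequence
    assert(((p & 0x80) != 0) == (c >= 0xc0));
    assert(((q & 0x80) != 0) == (c >= 0xe0));
    assert(((r & 0x80) != 0) == (c >= 0xf0));
  }
  return 0;
}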
Note that we can scan over ASCII text before executing the UTF-8 validation. This is faster when the input file is ASCII and also when the initial part of the file is ASCII. A sketch of how this prescan composes with the validation loop follows the NEON code below.

SSE2 code to add:

// prep step: scan ASCII w/o NUL first for speed, then check remaining UTF-8
const __m128i v0 = _mm_setzero_si128();
while (s <= e - 16)
{
  __m128i vc = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
  __m128i vm = _mm_cmpgt_epi8(vc, v0);
  if (_mm_movemask_epi8(vm) != 0xffff)
  {
    // non-ASCII, return false if a NUL was found
    vm = _mm_cmpeq_epi8(vc, v0);
    if (_mm_movemask_epi8(vm) != 0x0000)
      return false;
    break;
  }
  s += 16;
}

AVX2 code to add:

// prep step: scan ASCII w/o NUL first for speed, then check remaining UTF-8
const __m256i v00 = _mm256_setzero_si256();
while (s <= e - 32)
{
  __m256i vc = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s));
  __m256i vm = _mm256_cmpgt_epi8(vc, v00);
  if (_mm256_movemask_epi8(vm) != 0xffffffff)
  {
    // non-ASCII, return false if a NUL was found
    vm = _mm256_cmpeq_epi8(vc, v00);
    if (_mm256_movemask_epi8(vm) != 0x00000000)
      return false;
    break;
  }
  s += 32;
}

ARM NEON code to add:

// prep step: scan ASCII first for speed, then check remaining UTF-8
const int8x16_t v0 = vdupq_n_s8(0);
while (s <= e - 16)
{
  int8x16_t vc = vld1q_s8(reinterpret_cast<const int8_t*>(s));
  int64x2_t vm = vreinterpretq_s64_u8(vcgtq_s8(vc, v0));
  if ((vgetq_lane_s64(vm, 0) & vgetq_lane_s64(vm, 1)) != -1LL)
  {
    // non-ASCII, return false if a NUL was found
    vm = vreinterpretq_s64_u8(vceqq_s8(vc, v0));
    if ((vgetq_lane_s64(vm, 0) | vgetq_lane_s64(vm, 1)) != 0LL)
      return false;
    break;
  }
  s += 16;
}
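As referenced above, a minimal sketch of how the prescan composes with the validation loops; the function names `is_valid_text` and `validate_utf8` are illustrative, not ugrep's actual API:

bool validate_utf8(const char* s, const char* e); // one of the SIMD validation loops plus a scalar tail check

bool is_valid_text(const char* s, const char* e)
{
  // skip the leading ASCII-without-NUL prefix; bytes 0x01..0x7f pass
  while (s < e && static_cast<unsigned char>(*s) - 1U < 0x7fU)
    ++s;
  if (s >= e)
    return true; // pure ASCII without NUL: text
  return validate_utf8(s, e); // validate the remainder as UTF-8
}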
improvement Genivia/ugrep#421
- faster binary file checking with SIMD (SSE2/AVX2/NEON) UTF-8 validation #421
- new option `--context-separator=SEP`
- options `-W` and `-X` now also apply to `--format` fields `%o` and `%O` to output hex
- ugrep-indexer option `-z` to index zip/7z/tar/cpio/pax archives no longer indexes hidden directories and files stored in the archive
- fix ugrep-indexer option `-z`, which was broken due to a code refactoring mistake in one line of code
For completeness, I'm posting the lookup table version. I checked the details of the final step with the author of the method, since they weren't clear from their paper. Comments explain how this code works.

if (s <= e - 16)
{
  // ranges    UTF-8 encoding                            nibbles
  // U+0000    00000000                                  00x
  // U+007f    01111111                                  7fx
  //
  // U+0080    110.00010 10.000000                       c28 0
  // U+07ff    110.11111 10.111111                       dfb f
  //
  // U+0800    1110.0000 10.100000 10.000000             e0a 080
  // U+d7ff    1110.1101 10.011111 10.111111             ed9 fbf
  // U+e000    1110.1110 10.000000 10.000000             ee8 080
  // U+ffff    1110.1111 10.111111 10.111111             efb fbf
  //
  // U+010000  11110.000 10.010000 10.000000 10.000000   f09 08080
  // U+10ffff  11110.100 10.001111 10.111111 10.111111   f48 fbfbf
  //
  // ten UTF-8 error patterns observed in the first three nibbles:
  //  1: (0~7) (0~f)     (8~b)
  //  2: c     (0~1)     (0~f)
  //  3: c     (2~f)     (0~7|c~f)
  //  4: d     (0~f)     (0~7|c~f)
  //  5: e     0         (0~7|8~9|c~f)
  //  6: e     (1~8|e~f) (0~7|c~f)
  //  7: e     d         (0~7|a~f)
  //  8: f     0         (0~7|8)
  //  9: f     (1~3)     (0~7|c~f)
  // 10: f     4         (0~7|9~f)
  //
  // seven combined error patterns with bit error assignments (bit 0 to 6):
  //  1:       (0~7) (0~f) (8~b)         -> bit 0
  //  2:       c     (0~1) (0~f)         -> bit 1
  //  3+4+6+9: (c~f) (0~f) (0~7|c~f)     -> bit 2
  //  5:       e     0     (0~7|8~9|c~f) -> bit 3
  //  7:       e     d     (0~7|a~f)     -> bit 4
  //  8:       f     0     (0~7|8)       -> bit 5
  // 10:       f     4     (0~7|9~f)     -> bit 6
  // detect two continuation patterns with bit 7:
  //           (8~b) (0~f) (8~b)         -> bit 7
  //
  // table index:                 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
  // previous high nibble table: 01 01 01 01 01 01 01 01 80 80 80 80 06 04 1c 64
  // previous low nibble table:  ae 86 84 84 c4 84 84 84 85 85 85 85 85 95 85 85
  // current high nibble table:  7e 7e 7e 7e 7e 7e 7e 7e ab cb d3 d3 5e 5e 5e 5e
  //
  const __m128i vpht = _mm_set_epi8(
    //  f    e    d    c    b    a    9    8    7    6    5    4    3    2    1    0
    0x64,0x1c,0x04,0x06,0x80,0x80,0x80,0x80,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01);
  const __m128i vplt = _mm_set_epi8(
    //  f    e    d    c    b    a    9    8    7    6    5    4    3    2    1    0
    0x85,0x85,0x95,0x85,0x85,0x85,0x85,0x85,0x84,0x84,0x84,0xc4,0x84,0x84,0x86,0xae);
  const __m128i vcht = _mm_set_epi8(
    //  f    e    d    c    b    a    9    8    7    6    5    4    3    2    1    0
    0x5e,0x5e,0x5e,0x5e,0xd3,0xd3,0xcb,0xab,0x7e,0x7e,0x7e,0x7e,0x7e,0x7e,0x7e,0x7e);
  const __m128i vx0f = _mm_set1_epi8(0x0f);
  const __m128i vx7f = _mm_set1_epi8(0x7f);
  const __m128i v0 = _mm_setzero_si128();
  __m128i vc = v0;
  while (s <= e - 16)
  {
    __m128i vo = vc;
    vc = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s)); // current bytes
    __m128i vp = _mm_alignr_epi8(vc, vo, 15); // vc with previous byte
    __m128i vq = _mm_alignr_epi8(vc, vo, 14); // vc with two previous bytes
    __m128i vr = _mm_alignr_epi8(vc, vo, 13); // vc with three previous bytes
    __m128i vph = _mm_and_si128(_mm_srli_epi16(vp, 4), vx0f); // vp high nibbles
    __m128i vpl = _mm_and_si128(vp, vx0f); // vp low nibbles
    __m128i vch = _mm_and_si128(_mm_srli_epi16(vc, 4), vx0f); // vc high nibbles
    // AND the three nibble table lookups to classify each byte pair
    __m128i vt = _mm_shuffle_epi8(vpht, vph);
    vt = _mm_and_si128(vt, _mm_shuffle_epi8(vplt, vpl));
    vt = _mm_and_si128(vt, _mm_shuffle_epi8(vcht, vch));
#if defined(HAVE_SSE3)
    // no SSE4.1 ptest: (vt & 0x7f) + 0x7f sets bit 7 iff any error bit 0 to 6 is set
    __m128i vm = _mm_add_epi8(_mm_and_si128(vt, vx7f), vx7f);
    if (_mm_movemask_epi8(vm) != 0x0000)
      return false;
#else // SSE4.1/AVX2
    __m128i vm = _mm_and_si128(vt, vx7f);
    if (!_mm_testz_si128(vm, vm))
      return false;
#endif
    // final step: bit 7 marks positions whose byte two back is a 3/4-byte lead (>= 0xe0)
    // or three back is a 4-byte lead (>= 0xf0); these must match the bit 7 continuation
    // patterns detected in vt
    __m128i vb3 = _mm_subs_epu8(vq, _mm_set1_epi8(0xe0u-0x80));
    __m128i vb4 = _mm_subs_epu8(vr, _mm_set1_epi8(0xf0u-0x80));
    __m128i vb = _mm_or_si128(vb3, vb4);
    vm = _mm_xor_si128(vb, vt);
    if (_mm_movemask_epi8(vm) != 0x0000)
      return false;
    s += 16;
  }
  // do not end in the middle of a UTF-8 multibyte sequence, backtrack when necessary (this will terminate)
  while ((*--s & 0xc0) == 0x80)
    continue;
}
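To make the table mechanics concrete, here is a scalar restatement (my own sketch, reusing the three tables from the comments above) of the classification that the three `_mm_shuffle_epi8` lookups perform sixteen bytes at a time; per the comments, a nonzero result in the low seven bits flags one of the combined error patterns, and bit 7 flags a continuation byte that follows a continuation byte:

#include <cstdint>

// the three nibble-indexed tables copied from the comments above
static const uint8_t pht[16] = { 0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,
                                 0x80,0x80,0x80,0x80,0x06,0x04,0x1c,0x64 };
static const uint8_t plt[16] = { 0xae,0x86,0x84,0x84,0xc4,0x84,0x84,0x84,
                                 0x85,0x85,0x85,0x85,0x85,0x95,0x85,0x85 };
static const uint8_t cht[16] = { 0x7e,0x7e,0x7e,0x7e,0x7e,0x7e,0x7e,0x7e,
                                 0xab,0xcb,0xd3,0xd3,0x5e,0x5e,0x5e,0x5e };

// classify one byte pair by ANDing the three nibble lookups
static inline uint8_t classify(uint8_t prev, uint8_t cur)
{
  return pht[prev >> 4] & plt[prev & 0x0f] & cht[cur >> 4];
}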
Ugrep checks if an input file is binary by checking if it has a zero byte or if it contains invalid UTF-8. Binary file checking is important, because the `-I` (`--ignore-binary`) option uses it, and also every search that has a match reports "Binary file XYZ matches".

To speed this up, "Ridiculously fast unicode (UTF-8) validation" comes to mind (sorry, not my choice of title), with details in this article. It uses table lookups in SSE3/SSE4.1/AVX and NEON/AArch64.
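For reference, the check being sped up, as a conceptual scalar sketch; `is_binary` and `validate_utf8` are illustrative names, not ugrep's actual functions:

#include <cstddef>
#include <cstring>

bool validate_utf8(const char* s, const char* e); // e.g. one of the SIMD checks in this thread

bool is_binary(const char* s, size_t n)
{
  // binary = contains a NUL byte or an invalid UTF-8 sequence
  return memchr(s, '\0', n) != nullptr || !validate_utf8(s, s + n);
}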
However, my dilemma is that I can't rely on the availability of SSE3: the base ugrep version uses SSE2 for Intel CPUs, i.e. not SSE3 (or SSSE3 or higher). I don't want to suddenly require SSE3 when ugrep ran fast on SSE2, nor do I want additional configure checks just for SSE3 to enable faster binary file checking (i.e. UTF-8 validation checking).
So I recently wrote a faster SIMD UTF-8 validation check that uses "bit bashing". It works with SSE2 just fine and runs fast too. Unlike the "ridiculously fast" article's table lookup method, my proposed method runs on SSE2 (the table lookup does not, it requires SSE3). Also very nice is that the new method is faster than the table lookup method for SSE2, SSE3 and AVX2:
UPDATE: updated with a faster method that merges two tests into one to branch out of the loop.
Benchmark machine: macOS 12.7.4, Intel quad-core i7 2.9 GHz, 16 GB 2133 MHz LPDDR3
Benchmark file: enwik9, file size 1,000,000,000 bytes, mostly ASCII with some UTF-8
While the table lookup and my proposed "bitbash" method both validate UTF-8, there is a small difference: "bitbash" does not flag surrogates and accepts some overlongs in the longer 3- and 4-byte UTF-8 sequences. That is fine for quickly validating UTF-8 to check whether the input is binary or text. Surrogates and overlongs are practically never present in text files, and the ugrep tool does not choke on them anyway.
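For example (byte sequences of my own choosing to illustrate the difference), a strict validator rejects both of the following, while "bitbash" accepts them as text:

// U+D800: a UTF-16 surrogate, forbidden in well-formed UTF-8, encoded as 3 bytes
static const unsigned char surrogate[3] = { 0xed, 0xa0, 0x80 };
// overlong 3-byte encoding of U+002F '/', also forbidden
static const unsigned char overlong[3] = { 0xe0, 0x80, 0xaf };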
The SSE2 implementation with commentary (note that we can't use `palignr` with SSE2, nor can we use the `_mm_shuffle_epi8` that the table lookup method uses):

With SSE3/SSE4/AVX we can use `_mm_alignr_epi8` and no longer need the `_mm_bsrli_si128` and `_mm_bslli_si128`:

The "bitbash" UTF-8 validation in ARM NEON, tested with an Apple M1 Pro (AArch64):
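To illustrate the SSE2 workaround mentioned above, a sketch assuming `vo` holds the previous 16-byte block and `vp` the current one:

#include <emmintrin.h>

// SSE2 equivalent of _mm_alignr_epi8(vp, vo, 15), i.e. [vp,vo] >> 15*8:
// shift the previous block right by 15 bytes, the current block left by one byte,
// then OR the two halves together
static inline __m128i alignr15(__m128i vp, __m128i vo)
{
  return _mm_or_si128(_mm_bsrli_si128(vo, 15), _mm_bslli_si128(vp, 1));
}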