-
-
Notifications
You must be signed in to change notification settings - Fork 77
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Vectorization of find_next_host_delimiter and find_next_host_delimiter_special #548
Merged
Merged
Changes from 3 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
1ce99ff
manually vectorize find_next_host_delimiter_special (NEON)
lemire 896652d
some fixes
c6bb54b
vectorization (SSE2)
a0ba1b2
Update src/helpers.cpp
lemire 408f423
More simplification.
7fd2e99
Merge branch 'vectorize_find_next_host_delimiter_special' of github.c…
b9cecb7
Removing unnecessary code
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -197,9 +197,132 @@ ada_really_inline uint64_t swap_bytes_if_big_endian(uint64_t val) noexcept { | |
#endif | ||
} | ||
|
||
ada_really_inline int trailing_zeroes(uint32_t input_num) { | ||
#ifdef ADA_REGULAR_VISUAL_STUDIO | ||
unsigned long ret; | ||
// Search the mask data from least significant bit (LSB) | ||
// to the most significant bit (MSB) for a set bit (1). | ||
_BitScanForward(&ret, input_num); | ||
return (int)ret; | ||
#else // ADA_REGULAR_VISUAL_STUDIO | ||
return __builtin_ctzl(input_num); | ||
#endif // ADA_REGULAR_VISUAL_STUDIO | ||
} | ||
|
||
// starting at index location, this finds the next location of a character | ||
// :, /, \\, ? or [. If none is found, view.size() is returned. | ||
// For use within get_host_delimiter_location. | ||
// ['0x3a', '0x2f', '0x5c', '0x3f', '0x5b'] | ||
#if ADA_NEON | ||
ada_really_inline size_t find_next_host_delimiter_special( | ||
std::string_view view, size_t location) noexcept { | ||
// first check for short strings in which case we do it naively. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This if statement seems like repetitive code There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you elaborate? We handle the short string (< 16 characters) naively. What is repetitive? |
||
if (view.size() - location < 16) { // slow path | ||
for (size_t i = location; i < view.size(); i++) { | ||
if (view[i] == ':' || view[i] == '/' || view[i] == '\\' || | ||
view[i] == '?' || view[i] == '[') { | ||
return i; | ||
} | ||
} | ||
return size_t(view.size()); | ||
} | ||
auto to_bitmask = [](uint8x16_t input) -> uint16_t { | ||
uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, | ||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; | ||
uint8x16_t minput = vandq_u8(input, bit_mask); | ||
uint8x16_t tmp = vpaddq_u8(minput, minput); | ||
tmp = vpaddq_u8(tmp, tmp); | ||
tmp = vpaddq_u8(tmp, tmp); | ||
return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); | ||
}; | ||
|
||
// fast path for long strings (expected to be common) | ||
size_t i = location; | ||
uint8x16_t low_mask = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
0x00, 0x00, 0x01, 0x04, 0x04, 0x00, 0x00, 0x03}; | ||
uint8x16_t high_mask = {0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, | ||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; | ||
uint8x16_t fmask = vmovq_n_u8(0xf); | ||
uint8x16_t zero{0}; | ||
for (; i + 15 < view.size(); i += 16) { | ||
uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i); | ||
uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask)); | ||
uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4)); | ||
uint8x16_t classify = vandq_u8(lowpart, highpart); | ||
if (vmaxvq_u8(classify) != 0) { | ||
uint8x16_t is_zero = vceqq_u8(classify, zero); | ||
uint16_t is_non_zero = ~to_bitmask(is_zero); | ||
return i + trailing_zeroes(is_non_zero); | ||
} | ||
} | ||
|
||
if (i < view.size()) { | ||
uint8x16_t word = | ||
vld1q_u8((const uint8_t*)view.data() + view.length() - 16); | ||
uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask)); | ||
uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4)); | ||
uint8x16_t classify = vandq_u8(lowpart, highpart); | ||
if (vmaxvq_u8(classify) != 0) { | ||
uint8x16_t is_zero = vceqq_u8(classify, zero); | ||
uint16_t is_non_zero = ~to_bitmask(is_zero); | ||
return view.length() - 16 + trailing_zeroes(is_non_zero); | ||
} | ||
} | ||
return size_t(view.size()); | ||
} | ||
#elif ADA_SSE2 | ||
ada_really_inline size_t find_next_host_delimiter_special( | ||
std::string_view view, size_t location) noexcept { | ||
// first check for short strings in which case we do it naively. | ||
if (view.size() - location < 16) { // slow path | ||
for (size_t i = location; i < view.size(); i++) { | ||
if (view[i] == ':' || view[i] == '/' || view[i] == '\\' || | ||
view[i] == '?' || view[i] == '[') { | ||
return i; | ||
} | ||
} | ||
return size_t(view.size()); | ||
} | ||
// fast path for long strings (expected to be common) | ||
size_t i = location; | ||
const __m128i mask1 = _mm_set1_epi8(':'); | ||
const __m128i mask2 = _mm_set1_epi8('/'); | ||
const __m128i mask3 = _mm_set1_epi8('\\'); | ||
const __m128i mask4 = _mm_set1_epi8('?'); | ||
const __m128i mask5 = _mm_set1_epi8('['); | ||
|
||
for (; i + 15 < view.size(); i += 16) { | ||
__m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i)); | ||
__m128i m1 = _mm_cmpeq_epi8(word, mask1); | ||
__m128i m2 = _mm_cmpeq_epi8(word, mask2); | ||
__m128i m3 = _mm_cmpeq_epi8(word, mask3); | ||
__m128i m4 = _mm_cmpeq_epi8(word, mask4); | ||
__m128i m5 = _mm_cmpeq_epi8(word, mask5); | ||
__m128i m = _mm_or_si128( | ||
_mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5); | ||
int mask = _mm_movemask_epi8(m); | ||
if (mask != 0) { | ||
return i + trailing_zeroes(mask); | ||
} | ||
} | ||
if (i < view.size()) { | ||
__m128i word = | ||
_mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16)); | ||
__m128i m1 = _mm_cmpeq_epi8(word, mask1); | ||
__m128i m2 = _mm_cmpeq_epi8(word, mask2); | ||
__m128i m3 = _mm_cmpeq_epi8(word, mask3); | ||
__m128i m4 = _mm_cmpeq_epi8(word, mask4); | ||
__m128i m5 = _mm_cmpeq_epi8(word, mask5); | ||
__m128i m = _mm_or_si128( | ||
_mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5); | ||
int mask = _mm_movemask_epi8(m); | ||
if (mask != 0) { | ||
return view.length() - 16 + trailing_zeroes(mask); | ||
} | ||
} | ||
return size_t(view.length()); | ||
} | ||
#else | ||
ada_really_inline size_t find_next_host_delimiter_special( | ||
std::string_view view, size_t location) noexcept { | ||
// performance: if you plan to call find_next_host_delimiter more than once, | ||
|
@@ -261,10 +384,117 @@ ada_really_inline size_t find_next_host_delimiter_special( | |
} | ||
return view.size(); | ||
} | ||
#endif | ||
|
||
// starting at index location, this finds the next location of a character | ||
// :, /, ? or [. If none is found, view.size() is returned. | ||
// For use within get_host_delimiter_location. | ||
|
||
lemire marked this conversation as resolved.
Show resolved
Hide resolved
|
||
#if ADA_NEON | ||
ada_really_inline size_t find_next_host_delimiter(std::string_view view, | ||
size_t location) noexcept { | ||
// first check for short strings in which case we do it naively. | ||
if (view.size() - location < 16) { // slow path | ||
for (size_t i = location; i < view.size(); i++) { | ||
if (view[i] == ':' || view[i] == '/' || view[i] == '?' || | ||
view[i] == '[') { | ||
return i; | ||
} | ||
} | ||
return size_t(view.size()); | ||
} | ||
auto to_bitmask = [](uint8x16_t input) -> uint16_t { | ||
uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, | ||
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; | ||
uint8x16_t minput = vandq_u8(input, bit_mask); | ||
uint8x16_t tmp = vpaddq_u8(minput, minput); | ||
tmp = vpaddq_u8(tmp, tmp); | ||
tmp = vpaddq_u8(tmp, tmp); | ||
return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0); | ||
}; | ||
|
||
// fast path for long strings (expected to be common) | ||
size_t i = location; | ||
uint8x16_t low_mask = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | ||
0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x00, 0x03}; | ||
uint8x16_t high_mask = {0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, | ||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; | ||
uint8x16_t fmask = vmovq_n_u8(0xf); | ||
uint8x16_t zero{0}; | ||
for (; i + 15 < view.size(); i += 16) { | ||
uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i); | ||
uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask)); | ||
uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4)); | ||
uint8x16_t classify = vandq_u8(lowpart, highpart); | ||
if (vmaxvq_u8(classify) != 0) { | ||
uint8x16_t is_zero = vceqq_u8(classify, zero); | ||
uint16_t is_non_zero = ~to_bitmask(is_zero); | ||
return i + trailing_zeroes(is_non_zero); | ||
} | ||
} | ||
|
||
if (i < view.size()) { | ||
uint8x16_t word = | ||
vld1q_u8((const uint8_t*)view.data() + view.length() - 16); | ||
uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask)); | ||
uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4)); | ||
uint8x16_t classify = vandq_u8(lowpart, highpart); | ||
if (vmaxvq_u8(classify) != 0) { | ||
uint8x16_t is_zero = vceqq_u8(classify, zero); | ||
uint16_t is_non_zero = ~to_bitmask(is_zero); | ||
return view.length() - 16 + trailing_zeroes(is_non_zero); | ||
} | ||
} | ||
return size_t(view.size()); | ||
} | ||
#elif ADA_SSE2 | ||
ada_really_inline size_t find_next_host_delimiter(std::string_view view, | ||
size_t location) noexcept { | ||
// first check for short strings in which case we do it naively. | ||
if (view.size() - location < 16) { // slow path | ||
for (size_t i = location; i < view.size(); i++) { | ||
if (view[i] == ':' || view[i] == '/' || view[i] == '?' || | ||
view[i] == '[') { | ||
return i; | ||
} | ||
} | ||
return size_t(view.size()); | ||
} | ||
// fast path for long strings (expected to be common) | ||
size_t i = location; | ||
const __m128i mask1 = _mm_set1_epi8(':'); | ||
const __m128i mask2 = _mm_set1_epi8('/'); | ||
const __m128i mask4 = _mm_set1_epi8('?'); | ||
const __m128i mask5 = _mm_set1_epi8('['); | ||
|
||
for (; i + 15 < view.size(); i += 16) { | ||
__m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i)); | ||
__m128i m1 = _mm_cmpeq_epi8(word, mask1); | ||
__m128i m2 = _mm_cmpeq_epi8(word, mask2); | ||
__m128i m4 = _mm_cmpeq_epi8(word, mask4); | ||
__m128i m5 = _mm_cmpeq_epi8(word, mask5); | ||
__m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5)); | ||
int mask = _mm_movemask_epi8(m); | ||
if (mask != 0) { | ||
return i + trailing_zeroes(mask); | ||
} | ||
} | ||
if (i < view.size()) { | ||
__m128i word = | ||
_mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16)); | ||
__m128i m1 = _mm_cmpeq_epi8(word, mask1); | ||
__m128i m2 = _mm_cmpeq_epi8(word, mask2); | ||
__m128i m4 = _mm_cmpeq_epi8(word, mask4); | ||
__m128i m5 = _mm_cmpeq_epi8(word, mask5); | ||
__m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5)); | ||
int mask = _mm_movemask_epi8(m); | ||
if (mask != 0) { | ||
return view.length() - 16 + trailing_zeroes(mask); | ||
} | ||
} | ||
return size_t(view.length()); | ||
} | ||
#else | ||
ada_really_inline size_t find_next_host_delimiter(std::string_view view, | ||
size_t location) noexcept { | ||
// performance: if you plan to call find_next_host_delimiter more than once, | ||
|
@@ -323,6 +553,7 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view, | |
} | ||
return view.size(); | ||
} | ||
#endif | ||
|
||
ada_really_inline std::pair<size_t, bool> get_host_delimiter_location( | ||
const bool is_special, std::string_view& view) noexcept { | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a comment on top of this function? If we don't want to expose this, we should add private to comments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did. In my view, an inline function defined in a source file and just used locally does not have to be declared in the header file (and indeed, doing so just adds noise, isn't it?).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(To be clearer: I did add a comment in a later commit.)