-
Notifications
You must be signed in to change notification settings - Fork 351
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Popcnt vectorization #198
Open
diegohavenstein
wants to merge
11
commits into
simongog:master
Choose a base branch
from
diegohavenstein:popcnt_vectorization
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Popcnt vectorization #198
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
88a8a0c
AVX2 support for popcount function. To be tested.
a65f3f4
Added support for SSE based popcount
8983625
Bugfixes, now compiles and "make test" is running
40409bc
CheckAVX2.cmake
9748a34
Bug fixes
7f72dc4
Some of the fixes suggested on GitHub by Simon
bd7ad14
__AVX2__ -> __SSE4_2__
91362e6
Little fix
24ad2ce
bug in uint256_t fixed (wrong accessing pattern to array elements)
db534bb
values instead of ymm to access data in ymm_union
7fe66dc
count variable being used where it needs to be used now
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Check if the CPU provides fast operations | ||
# for popcount, leftmost and rightmost bit | ||
|
||
set(AVX2 0) | ||
# Check if we are on a Linux system | ||
if(CMAKE_SYSTEM_NAME STREQUAL "Linux") | ||
# Use /proc/cpuinfo to get the information | ||
file(STRINGS "/proc/cpuinfo" _cpuinfo) | ||
if(_cpuinfo MATCHES "(avx2)") | ||
set(AVX2 1) | ||
endif() | ||
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") | ||
# handle windows | ||
# get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) | ||
# get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) | ||
elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") | ||
# handle MacOs | ||
execute_process(COMMAND sysctl -n machdep.cpu.features | ||
OUTPUT_VARIABLE _cpuinfo OUTPUT_STRIP_TRAILING_WHITESPACE) | ||
if(_cpuinfo MATCHES "AVX2") | ||
set(AVX2 1) | ||
endif() | ||
endif() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,9 @@ | |
#include <stdint.h> // for uint64_t uint32_t declaration | ||
#include <iostream>// for cerr | ||
#include <cassert> | ||
#include <x86intrin.h> // SSE/AVX | ||
#include "ymm_union.hpp" // convenient YMM register wrapper | ||
#include "xmm_union.hpp" // convenient XMM register wrapper | ||
#ifdef __SSE4_2__ | ||
#include <xmmintrin.h> | ||
#endif | ||
|
@@ -102,6 +105,22 @@ struct bits { | |
*/ | ||
static uint64_t cnt(uint64_t x); | ||
|
||
//! Counts the number of set bits in YMM register x. | ||
/*! \param YMM register | ||
\return Number of set bits. | ||
*/ | ||
#ifdef __AVX2__ | ||
static uint64_t cnt256(__m256i x); | ||
#endif | ||
|
||
//! Counts the number of set bits in XMM register x. | ||
/*! \param XMM register | ||
\return Number of set bits. | ||
*/ | ||
#ifdef __SSE4_2__ | ||
static uint64_t cnt128(__m128i x); | ||
#endif | ||
|
||
//! Position of the most significant set bit the 64-bit word x | ||
/*! \param x 64-bit word | ||
\return The position (in 0..63) of the least significant set bit | ||
|
@@ -237,6 +256,54 @@ struct bits { | |
|
||
// ============= inline - implementations ================ | ||
|
||
#ifdef __AVX2__ | ||
inline uint64_t bits::cnt256(__m256i x){ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please define the methods further up: #ifdef __AVX2__
static uint64_t cnt256(__m256i x);
#endif same for cnt128: #ifdef __SSE4_2__
static uint64_t cnt128(__m128i x);
#endif Also note: sse4_2 -> SSE4_2 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in latest commit |
||
|
||
// 4-bit universal table, 4-bit mask | ||
static const __m256i MASK4_256 = _mm256_set1_epi8(0x0F); | ||
static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, | ||
1, 2, 2, 3, 2, 3, 3, 4, | ||
0, 1, 1, 2, 1, 2, 2, 3, | ||
1, 2, 2, 3, 2, 3, 3, 4); | ||
|
||
__m256i low, high, bwcount; | ||
|
||
// byte halves stored in separate YMM registers | ||
low = _mm256_and_si256(MASK4_256, x); | ||
high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(x, 4)); | ||
|
||
// bytewise population count | ||
bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low), | ||
_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high)); | ||
|
||
// Use union to access individual bytes (unsigned integers) | ||
sdsl::YMM_union<uint8_t> ymm_union; | ||
ymm_union.ymm = _mm256_sad_epu8(bwcount, _mm256_setzero_si256()); | ||
return ymm_union.values[0] + ymm_union.values[4] + ymm_union.values[8] + ymm_union.values[12]; | ||
} | ||
#endif | ||
|
||
#ifdef __SSE4_2__ | ||
inline uint64_t bits::cnt128(__m128i x){ | ||
|
||
// 4-bit universal table, 4-bit mask | ||
static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); | ||
static const __m128i MASK4 = _mm_set1_epi8(0x0F); | ||
|
||
__m128i low, high, count; | ||
|
||
low = _mm_and_si128(MASK4, x); | ||
high = _mm_and_si128(MASK4, _mm_srli_epi16(x, 4)); | ||
count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low), | ||
_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high)); | ||
|
||
// Use union to access individual bytes (unsigned integers) | ||
sdsl::XMM_union<uint8_t> xmm_union; | ||
xmm_union.xmm = _mm_sad_epu8(count, _mm_setzero_si128()); | ||
return xmm_union.values[0] + xmm_union.values[4]; | ||
} | ||
#endif | ||
|
||
// see page 11, Knuth TAOCP Vol 4 F1A | ||
inline uint64_t bits::cnt(uint64_t x) | ||
{ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/* sdsl - succinct data structures library | ||
Copyright (C) 2012 Simon Gog | ||
|
||
This program is free software: you can redistribute it and/or modify | ||
it under the terms of the GNU General Public License as published by | ||
the Free Software Foundation, either version 3 of the License, or | ||
(at your option) any later version. | ||
|
||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU General Public License for more details. | ||
|
||
You should have received a copy of the GNU General Public License | ||
along with this program. If not, see http://www.gnu.org/licenses/ . | ||
*/ | ||
/*! \file xmm_union.hpp | ||
\brief xmm_union.hpp contains a convenientunion for XMM registers (128-bits). | ||
\author Diego Havenstein | ||
*/ | ||
#ifndef INCLUDED_SDSL_XMMUNION | ||
#define INCLUDED_SDSL_XMMUNION | ||
|
||
namespace sdsl | ||
{ | ||
|
||
#ifdef __SSE4_2__ | ||
template<typename T> | ||
union XMM_union { | ||
__m128i xmm; | ||
T values[16/sizeof(T)]; | ||
}; | ||
#endif | ||
|
||
} // end namespace | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/* sdsl - succinct data structures library | ||
Copyright (C) 2012 Simon Gog | ||
|
||
This program is free software: you can redistribute it and/or modify | ||
it under the terms of the GNU General Public License as published by | ||
the Free Software Foundation, either version 3 of the License, or | ||
(at your option) any later version. | ||
|
||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU General Public License for more details. | ||
|
||
You should have received a copy of the GNU General Public License | ||
along with this program. If not, see http://www.gnu.org/licenses/ . | ||
*/ | ||
/*! \file ymm_union.hpp | ||
\brief ymm_union.hpp contains a convenientunion for YMM registers (256-bits). | ||
\author Diego Havenstein | ||
*/ | ||
#ifndef INCLUDED_SDSL_YMMUNION | ||
#define INCLUDED_SDSL_YMMUNION | ||
|
||
namespace sdsl | ||
{ | ||
|
||
#ifdef __AVX2__ | ||
template<typename T> | ||
union YMM_union { | ||
__m256i ymm; | ||
T values[32/sizeof(T)]; | ||
}; | ||
#endif | ||
|
||
} // end namespace | ||
|
||
#endif |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Diego, thanks for your contribution. I'm just testing the code on a Mac equipped with a CPU (i7-4850HQ) which supports AVX2. Surprisingly the command
sysctl -n machdep.cpu.features
does not list AVX2 as feature, but just:FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM PBE SSE3 PCLMULQDQ DTES64 MON DSCPL VMX SMX EST TM2 SSSE3 FMA CX16 TPR PDCM SSE4.1 SSE4.2 x2APIC MOVBE POPCNT AES PCID XSAVE OSXSAVE SEGLIM64 TSCTMR AVX1.0 RDRAND F16C
. However AVX2 is listed in the output ofsysctl -n machdep.cpu
:13 2147483656 GenuineIntel Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz 6 70 4 0 1 3219913727 2147154943 12219 739248384 33 263777 0 FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM PBE SSE3 PCLMULQDQ DTES64 MON DSCPL VMX SMX EST TM2 SSSE3 FMA CX16 TPR PDCM SSE4.1 SSE4.2 x2APIC MOVBE POPCNT AES PCID XSAVE OSXSAVE SEGLIM64 TSCTMR AVX1.0 RDRAND F16C SMEP ENFSTRG RDWRFSGS TSC_THREAD_OFFSET BMI1 HLE AVX2 BMI2 INVPCID RTM SYSCALL XD 1GBPAGE EM64T LAHF RDTSCP TSCI 16 8 15 5 64 64 3 270624 1 1 1 2 1 1 1 1 0 1 7 832 832 0 3 4 48 7 0 3 48 64 8 256 8 64 64 1024 39 48 4 8
So maybe just a match on the latter output?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds reasonable :) I have run my code on Linux machines only, so I did not face this problem