Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Popcnt vectorization #198

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
10 changes: 10 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,16 @@ if( BUILTIN_POPCNT )
endif()
endif()

include(CheckAVX2)
if( AVX2 )
if( CMAKE_COMPILER_IS_GNUCXX )
append_cxx_compiler_flags("-mavx2" "GCC" CMAKE_CXX_OPT_FLAGS)
elseif( CMAKE_COMPILER_IS_CLANGXX )
append_cxx_compiler_flags("-mavx2" "CLANG" CMAKE_CXX_OPT_FLAGS)
endif()
message(STATUS "Your compiler is not supported yet!")
endif()

add_subdirectory(external)
add_subdirectory(include)
add_subdirectory(lib)
Expand Down
24 changes: 24 additions & 0 deletions CMakeModules/CheckAVX2.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Check if the CPU provides fast operations
# for popcount, leftmost and rightmost bit

set(AVX2 0)
# Check if we are on a Linux system
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
# Use /proc/cpuinfo to get the information
file(STRINGS "/proc/cpuinfo" _cpuinfo)
if(_cpuinfo MATCHES "(avx2)")
set(AVX2 1)
endif()
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows")
# handle windows
# get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE)
# get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE)
elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
# handle MacOs
execute_process(COMMAND sysctl -n machdep.cpu.features
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Diego, thanks for your contribution. I'm just testing the code on a Mac equipped with a CPU (i7-4850HQ) which supports AVX2. Surprisingly the command sysctl -n machdep.cpu.features does not list AVX2 as feature, but just:
FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM PBE SSE3 PCLMULQDQ DTES64 MON DSCPL VMX SMX EST TM2 SSSE3 FMA CX16 TPR PDCM SSE4.1 SSE4.2 x2APIC MOVBE POPCNT AES PCID XSAVE OSXSAVE SEGLIM64 TSCTMR AVX1.0 RDRAND F16C. However AVX2 is listed in the output of sysctl -n machdep.cpu:

13 2147483656 GenuineIntel Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz 6 70 4 0 1 3219913727 2147154943 12219 739248384 33 263777 0 FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM PBE SSE3 PCLMULQDQ DTES64 MON DSCPL VMX SMX EST TM2 SSSE3 FMA CX16 TPR PDCM SSE4.1 SSE4.2 x2APIC MOVBE POPCNT AES PCID XSAVE OSXSAVE SEGLIM64 TSCTMR AVX1.0 RDRAND F16C SMEP ENFSTRG RDWRFSGS TSC_THREAD_OFFSET BMI1 HLE AVX2 BMI2 INVPCID RTM SYSCALL XD 1GBPAGE EM64T LAHF RDTSCP TSCI 16 8 15 5 64 64 3 270624 1 1 1 2 1 1 1 1 0 1 7 832 832 0 3 4 48 7 0 3 48 64 8 256 8 64 64 1024 39 48 4 8

So maybe just a match on the latter output?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds reasonable :) I have run my code on Linux machines only, so I did not face this problem

OUTPUT_VARIABLE _cpuinfo OUTPUT_STRIP_TRAILING_WHITESPACE)
if(_cpuinfo MATCHES "AVX2")
set(AVX2 1)
endif()
endif()

67 changes: 67 additions & 0 deletions include/sdsl/bits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
#include <stdint.h> // for uint64_t uint32_t declaration
#include <iostream>// for cerr
#include <cassert>
#include <x86intrin.h> // SSE/AVX
#include "ymm_union.hpp" // convenient YMM register wrapper
#include "xmm_union.hpp" // convenient XMM register wrapper
#ifdef __SSE4_2__
#include <xmmintrin.h>
#endif
Expand Down Expand Up @@ -102,6 +105,22 @@ struct bits {
*/
static uint64_t cnt(uint64_t x);

//! Counts the number of set bits in YMM register x.
/*! \param YMM register
\return Number of set bits.
*/
#ifdef __AVX2__
static uint64_t cnt256(__m256i x);
#endif

//! Counts the number of set bits in XMM register x.
/*! \param XMM register
\return Number of set bits.
*/
#ifdef __SSE4_2__
static uint64_t cnt128(__m128i x);
#endif

//! Position of the most significant set bit the 64-bit word x
/*! \param x 64-bit word
\return The position (in 0..63) of the least significant set bit
Expand Down Expand Up @@ -237,6 +256,54 @@ struct bits {

// ============= inline - implementations ================

#ifdef __AVX2__
inline uint64_t bits::cnt256(__m256i x){
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please define the methods further up:

#ifdef __AVX2__
    static uint64_t cnt256(__m256i x);
#endif

same for cnt128:

#ifdef __SSE4_2__
    static uint64_t cnt128(__m128i x);
#endif 

Also note: sse4_2 -> SSE4_2

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in latest commit


// 4-bit universal table, 4-bit mask
static const __m256i MASK4_256 = _mm256_set1_epi8(0x0F);
static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3,
1, 2, 2, 3, 2, 3, 3, 4);

__m256i low, high, bwcount;

// byte halves stored in separate YMM registers
low = _mm256_and_si256(MASK4_256, x);
high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(x, 4));

// bytewise population count
bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low),
_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high));

// Use union to access individual bytes (unsigned integers)
sdsl::YMM_union<uint8_t> ymm_union;
ymm_union.ymm = _mm256_sad_epu8(bwcount, _mm256_setzero_si256());
return ymm_union.values[0] + ymm_union.values[4] + ymm_union.values[8] + ymm_union.values[12];
}
#endif

#ifdef __SSE4_2__
inline uint64_t bits::cnt128(__m128i x){

// 4-bit universal table, 4-bit mask
static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
static const __m128i MASK4 = _mm_set1_epi8(0x0F);

__m128i low, high, count;

low = _mm_and_si128(MASK4, x);
high = _mm_and_si128(MASK4, _mm_srli_epi16(x, 4));
count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low),
_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high));

// Use union to access individual bytes (unsigned integers)
sdsl::XMM_union<uint8_t> xmm_union;
xmm_union.xmm = _mm_sad_epu8(count, _mm_setzero_si128());
return xmm_union.values[0] + xmm_union.values[4];
}
#endif

// see page 11, Knuth TAOCP Vol 4 F1A
inline uint64_t bits::cnt(uint64_t x)
{
Expand Down
22 changes: 22 additions & 0 deletions include/sdsl/uint256_t.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,30 @@ class uint256_t
}

inline uint16_t popcount() {
#ifdef __AVX2__ // Fastest method: 32 table lookups per clock cycle
sdsl::YMM_union<uint64_t> ymm_union;
ymm_union.values[0] = m_lo;
ymm_union.values[1] = m_mid;
ymm_union.values[2] = m_high >> 64;
ymm_union.values[3] = m_high;
return bits::cnt256(ymm_union.ymm);
#endif

#ifdef __SSE4_2__ // 16 table lookups per clock cycle
sdsl::XMM_union<uint64_t> xmm_union1;
sdsl::XMM_union<uint64_t> xmm_union2;
xmm_union1.values[0] = m_lo;
xmm_union1.values[1] = m_mid;
xmm_union2.values[0] = m_high >> 64;
xmm_union2.values[1] = m_high;

return bits::cnt128(xmm_union1.xmm) + bits::cnt128(xmm_union2.xmm);


#else // byte after byte
return ((uint16_t)bits::cnt(m_lo)) + bits::cnt(m_mid)
+ bits::cnt(m_high>>64) + bits::cnt(m_high);
#endif
}

inline uint16_t hi() {
Expand Down
37 changes: 37 additions & 0 deletions include/sdsl/xmm_union.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/* sdsl - succinct data structures library
Copyright (C) 2012 Simon Gog

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/ .
*/
/*! \file xmm_union.hpp
\brief xmm_union.hpp contains a convenientunion for XMM registers (128-bits).
\author Diego Havenstein
*/
#ifndef INCLUDED_SDSL_XMMUNION
#define INCLUDED_SDSL_XMMUNION

namespace sdsl
{

#ifdef __SSE4_2__
template<typename T>
union XMM_union {
__m128i xmm;
T values[16/sizeof(T)];
};
#endif

} // end namespace

#endif
37 changes: 37 additions & 0 deletions include/sdsl/ymm_union.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/* sdsl - succinct data structures library
Copyright (C) 2012 Simon Gog

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/ .
*/
/*! \file ymm_union.hpp
\brief ymm_union.hpp contains a convenientunion for YMM registers (256-bits).
\author Diego Havenstein
*/
#ifndef INCLUDED_SDSL_YMMUNION
#define INCLUDED_SDSL_YMMUNION

namespace sdsl
{

#ifdef __AVX2__
template<typename T>
union YMM_union {
__m256i ymm;
T values[32/sizeof(T)];
};
#endif

} // end namespace

#endif