simongog · diegohavenstein · Aug 29, 2014 · Aug 29, 2014 · Aug 29, 2014 · Aug 29, 2014
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -61,6 +61,16 @@ if( BUILTIN_POPCNT )
     endif()
 endif()
 
+include(CheckAVX2)
+if( AVX2 )
+    if( CMAKE_COMPILER_IS_GNUCXX )
+        append_cxx_compiler_flags("-mavx2" "GCC" CMAKE_CXX_OPT_FLAGS)
+    elseif( CMAKE_COMPILER_IS_CLANGXX )
+        append_cxx_compiler_flags("-mavx2" "CLANG" CMAKE_CXX_OPT_FLAGS)
+    endif()
+        message(STATUS "Your compiler is not supported yet!")
+endif()
+
 add_subdirectory(external)
 add_subdirectory(include)
 add_subdirectory(lib)

diff --git a/CMakeModules/CheckAVX2.cmake b/CMakeModules/CheckAVX2.cmake
@@ -0,0 +1,24 @@
+# Check if the CPU provides fast operations
+# for popcount, leftmost and rightmost bit
+
+set(AVX2 0)
+# Check if we are on a Linux system
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+	# Use /proc/cpuinfo to get the information
+	file(STRINGS "/proc/cpuinfo" _cpuinfo)
+	if(_cpuinfo MATCHES "(avx2)")
+    set(AVX2 1)
+	endif()
+elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows")
+#  handle windows
+#	get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE)
+#	get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE)	
+elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+#  handle MacOs
+execute_process(COMMAND sysctl -n machdep.cpu.features
+                OUTPUT_VARIABLE _cpuinfo OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(_cpuinfo MATCHES "AVX2")
+    set(AVX2 1)
+	endif()
+endif()	
+
diff --git a/include/sdsl/bits.hpp b/include/sdsl/bits.hpp
@@ -24,6 +24,9 @@
 #include <stdint.h> // for uint64_t uint32_t declaration
 #include <iostream>// for cerr
 #include <cassert>
+#include <x86intrin.h> // SSE/AVX
+#include "ymm_union.hpp" // convenient YMM register wrapper
+#include "xmm_union.hpp" // convenient XMM register wrapper
 #ifdef __SSE4_2__
 #include <xmmintrin.h>
 #endif
@@ -102,6 +105,22 @@ struct bits {
      */
     static uint64_t cnt(uint64_t x);
 
+    //! Counts the number of set bits in YMM register x.
+    /*! \param  YMM register
+        \return Number of set bits.
+     */
+#ifdef __AVX2__
+    static uint64_t cnt256(__m256i x);
+#endif
+
+    //! Counts the number of set bits in XMM register x.
+    /*! \param  XMM register
+        \return Number of set bits.
+     */ 
+#ifdef __SSE4_2__
+    static uint64_t cnt128(__m128i x);
+#endif
+
     //! Position of the most significant set bit the 64-bit word x
     /*! \param x 64-bit word
         \return The position (in 0..63) of the least significant set bit
@@ -237,6 +256,54 @@ struct bits {
 
 // ============= inline - implementations ================
 
+#ifdef __AVX2__
+inline uint64_t bits::cnt256(__m256i x){
+
+  // 4-bit universal table, 4-bit mask
+  static const __m256i MASK4_256 = _mm256_set1_epi8(0x0F);
+  static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
+                                                                    1, 2, 2, 3, 2, 3, 3, 4, 
+                                                                    0, 1, 1, 2, 1, 2, 2, 3, 
+                                                                    1, 2, 2, 3, 2, 3, 3, 4);
+
+  __m256i low, high, bwcount;
+
+  // byte halves stored in separate YMM registers
+  low = _mm256_and_si256(MASK4_256, x);
+  high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(x, 4));
+
+  // bytewise population count
+  bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low),
+                          _mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high));
+
+  // Use union to access individual bytes (unsigned integers)
+  sdsl::YMM_union<uint8_t> ymm_union;
+  ymm_union.ymm = _mm256_sad_epu8(bwcount, _mm256_setzero_si256());
+  return ymm_union.values[0] + ymm_union.values[4] + ymm_union.values[8] + ymm_union.values[12];
+}
+#endif
+
+#ifdef __SSE4_2__
+inline uint64_t bits::cnt128(__m128i x){
+
+  // 4-bit universal table, 4-bit mask
+  static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+  static const __m128i MASK4 = _mm_set1_epi8(0x0F);
+
+  __m128i low, high, count;
+
+  low = _mm_and_si128(MASK4, x);
+  high = _mm_and_si128(MASK4, _mm_srli_epi16(x, 4));
+  count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low),
+                       _mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high));
+
+  // Use union to access individual bytes (unsigned integers)
+  sdsl::XMM_union<uint8_t> xmm_union;
+  xmm_union.xmm = _mm_sad_epu8(count, _mm_setzero_si128());
+  return xmm_union.values[0] + xmm_union.values[4];
+}
+#endif
+
 // see page 11, Knuth TAOCP Vol 4 F1A
 inline uint64_t bits::cnt(uint64_t x)
 {

diff --git a/include/sdsl/uint256_t.hpp b/include/sdsl/uint256_t.hpp
@@ -62,8 +62,30 @@ class uint256_t
         }
 
         inline uint16_t popcount() {
+#ifdef __AVX2__ // Fastest method: 32 table lookups per clock cycle
+          sdsl::YMM_union<uint64_t> ymm_union;
+          ymm_union.values[0] = m_lo;
+          ymm_union.values[1] = m_mid;
+          ymm_union.values[2] = m_high >> 64;
+          ymm_union.values[3] = m_high;
+          return bits::cnt256(ymm_union.ymm);
+#endif
+
+#ifdef __SSE4_2__ // 16 table lookups per clock cycle
+          sdsl::XMM_union<uint64_t> xmm_union1;
+          sdsl::XMM_union<uint64_t> xmm_union2;
+          xmm_union1.values[0] = m_lo;
+          xmm_union1.values[1] = m_mid;
+          xmm_union2.values[0] = m_high >> 64;
+          xmm_union2.values[1] = m_high;
+
+          return bits::cnt128(xmm_union1.xmm) + bits::cnt128(xmm_union2.xmm);
+
+
+#else  // byte after byte
             return ((uint16_t)bits::cnt(m_lo)) + bits::cnt(m_mid)
                    + bits::cnt(m_high>>64) + bits::cnt(m_high);
+#endif
         }
 
         inline uint16_t hi() {

diff --git a/include/sdsl/xmm_union.hpp b/include/sdsl/xmm_union.hpp
@@ -0,0 +1,37 @@
+/* sdsl - succinct data structures library
+    Copyright (C) 2012 Simon Gog
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see http://www.gnu.org/licenses/ .
+*/
+/*! \file xmm_union.hpp
+   \brief xmm_union.hpp contains a convenientunion for XMM registers (128-bits).
+   \author Diego Havenstein
+*/
+#ifndef INCLUDED_SDSL_XMMUNION
+#define INCLUDED_SDSL_XMMUNION
+
+namespace sdsl
+{
+
+#ifdef __SSE4_2__
+template<typename T>
+union XMM_union {
+  __m128i xmm;
+  T values[16/sizeof(T)];
+};
+#endif
+
+} // end namespace
+
+#endif
diff --git a/include/sdsl/ymm_union.hpp b/include/sdsl/ymm_union.hpp
@@ -0,0 +1,37 @@
+/* sdsl - succinct data structures library
+    Copyright (C) 2012 Simon Gog
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see http://www.gnu.org/licenses/ .
+*/
+/*! \file ymm_union.hpp
+   \brief ymm_union.hpp contains a convenientunion for YMM registers (256-bits).
+   \author Diego Havenstein
+*/
+#ifndef INCLUDED_SDSL_YMMUNION
+#define INCLUDED_SDSL_YMMUNION
+
+namespace sdsl
+{
+
+#ifdef __AVX2__
+template<typename T>
+union YMM_union {
+  __m256i ymm;
+  T values[32/sizeof(T)];
+};
+#endif
+
+} // end namespace
+
+#endif