Skip to content

Commit

Permalink
cbits: Use __builtin_shuffle for generic code
Browse files Browse the repository at this point in the history
GCC has a built-in function, `__builtin_shuffle`, which shuffles vectors
the way we need. This commit changes the code to use the built-in in
the 'generic' case (i.e. when no host-specific SIMD implementation is
available).

One could argue this could be used to replace all host-specific
implementations, but it turns out the built-in generates slightly more
code than the hand-rolled intrinsics calls.

Use of the built-in depends on a `configure` test. Since this test
needs to execute some real code (to ensure the built-in does what it's
expected to), it will always fail when cross-compiling. In that
scenario, the built-in is not used and the 'old' byte-by-byte generic
shuffle code remains in place.
  • Loading branch information
NicolasT committed Oct 15, 2016
1 parent 54a3bcb commit 724dddb
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 2 deletions.
61 changes: 59 additions & 2 deletions cbits/configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,55 @@ AX_CHECK_COMPILE_FLAG([-mavx], [rs_avx=1], [rs_avx=0])
AX_CHECK_COMPILE_FLAG([-mavx2], [rs_avx2=1], [rs_avx2=0])

## Checks for compiler-specific features
# Check whether the C compiler provides a working two-operand
# __builtin_shuffle for 16-byte vectors of uint8_t.
#
# The test program is executed, not merely compiled, so that the shuffled
# vector can be compared element by element against the expected result.
# When cross-compiling the program cannot be run, so the built-in is then
# assumed to be unavailable.
#
# Result: the shell variable rs_have_builtin_shuffle is set to 1 or 0 and
# RS_HAVE_BUILTIN_SHUFFLE is defined to that same value.
AC_MSG_CHECKING([whether C compiler supports __builtin_shuffle])
AC_RUN_IFELSE(
[AC_LANG_PROGRAM(
[
#include <stdint.h>
typedef uint8_t v16u8v __attribute__((vector_size(16)));
],
[[
/* Mask indices 0-15 select from v1 and 16-31 select from v2, which is
 * all zeros. Index 32 is out of range and is expected to wrap around
 * to element 0 of v1. */
v16u8v v1 = { 0, 1, 2, 3, 4, 5, 6, 7
, 8, 9, 10, 11, 12, 13, 14, 15 },
v2 = { 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0 },
mask = { 0, 20, 4, 24, 8, 28, 12, 32
, 18, 2, 22, 6, 26, 10, 30, 16 },
result = __builtin_shuffle(v1, v2, mask),
expected = { 0, 0, 4, 0, 8, 0, 12, 0
, 0, 2, 0, 6, 0, 10, 0, 0 };
int equal = result[0] == expected[0]
&& result[1] == expected[1]
&& result[2] == expected[2]
&& result[3] == expected[3]
&& result[4] == expected[4]
&& result[5] == expected[5]
&& result[6] == expected[6]
&& result[7] == expected[7]
&& result[8] == expected[8]
&& result[9] == expected[9]
&& result[10] == expected[10]
&& result[11] == expected[11]
&& result[12] == expected[12]
&& result[13] == expected[13]
&& result[14] == expected[14]
&& result[15] == expected[15];
return equal ? 0 : 1;
]])],
[AC_MSG_RESULT([yes])
rs_have_builtin_shuffle=1],
[AC_MSG_RESULT([no])
rs_have_builtin_shuffle=0],
[AC_MSG_RESULT([no (assumed, cross-compiling)])
rs_have_builtin_shuffle=0])
AC_DEFINE_UNQUOTED(
[RS_HAVE_BUILTIN_SHUFFLE],
[$rs_have_builtin_shuffle],
[Define to 1 if C compiler supports __builtin_shuffle])

AC_MSG_CHECKING([whether Clang's `loop unroll` pragma works])
AC_COMPILE_IFELSE(
[AC_LANG_PROGRAM(
Expand Down Expand Up @@ -259,10 +308,18 @@ AC_OUTPUT

rs_backends=""
if test x$rs_generic = x1; then
rs_backends="$rs_backends, generic"
if test x$rs_have_builtin_shuffle = x1; then
rs_backends="$rs_backends, generic (__builtin_shuffle)"
else
rs_backends="$rs_backends, generic"
fi
fi
if test x$rs_sse2 = x1; then
rs_backends="$rs_backends, sse2"
if test x$rs_have_builtin_shuffle = x1; then
rs_backends="$rs_backends, sse2 (__builtin_shuffle)"
else
rs_backends="$rs_backends, sse2"
fi
fi
if test x$rs_ssse3 = x1; then
rs_backends="$rs_backends, ssse3"
Expand Down
4 changes: 4 additions & 0 deletions cbits/reedsolomon.c
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,10 @@ static ALWAYS_INLINE CONST_FUNCTION v shuffle_epi8_v(const v vec, const v mask)
#elif USE_ALTIVEC
const v128 zeros = set1_epi8_v(0),
result = { .uint8x16 = vec_perm(vec.uint8x16, zeros.uint8x16, mask.uint8x16) };
#elif defined(RS_HAVE_BUILTIN_SHUFFLE) && RS_HAVE_BUILTIN_SHUFFLE
const v16u8v zeros = { 0, 0, 0, 0, 0, 0, 0, 0
, 0, 0, 0, 0, 0, 0, 0, 0 };
const v128 result = { .v16u8 = __builtin_shuffle(vec.v16u8, zeros, mask.v16u8) };
#else
v128 result = { .u64 = { 0, 0 } };

Expand Down

0 comments on commit 724dddb

Please sign in to comment.