Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARCH/X86: Use UCS function to count leading zeros #10514

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions config/m4/compiler.m4
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,24 @@ if $CC --version 2>&1 | grep -q Intel; then
[AC_LANG_SOURCE([[int main(int argc, char **argv){return 0;}]])])
fi

#
# Check actual lzcnt support (at least nvc 24.9 fails to link, even if it defines __LZCNT__)
#
SAVE_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS -mlzcnt"
AC_MSG_CHECKING([if lzcnt is supported])
AC_LINK_IFELSE([AC_LANG_SOURCE([[
#include <x86intrin.h>
int main(void) {
return (int)_lzcnt_u32(1) | (int)_lzcnt_u64(2);
}
]])],
[AC_MSG_RESULT([yes])
BASE_CFLAGS="-mlzcnt $BASE_CFLAGS"
AC_DEFINE([HAVE_LZCNT], 1, [LZCNT Intrinsic support])],
[AC_MSG_RESULT([no])])
CFLAGS="$SAVE_CFLAGS"


#
# Set C++ optimization/debug flags to be the same as for C
Expand Down
9 changes: 3 additions & 6 deletions contrib/test_jenkins.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1128,12 +1128,9 @@ run_release_mode_tests() {
# Run nt_buffer_transfer tests
#
run_nt_buffer_transfer_tests() {
if lscpu | grep -q 'AuthenticAMD'
then
build release --enable-gtest --enable-optimizations
echo "==== Running nt_buffer_transfer tests ===="
./test/gtest/gtest --gtest_filter="test_arch.nt_buffer_transfer_*"
fi
build release --enable-gtest --enable-optimizations
echo "==== Running test_arch tests with optimizations ===="
./test/gtest/gtest --gtest_filter="test_arch.*"
}

set_ucx_common_test_env() {
Expand Down
15 changes: 13 additions & 2 deletions src/ucs/arch/bitops.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ BEGIN_C_DECLS
#endif


#if defined(HAVE_LZCNT)
# include <x86intrin.h>
#endif


#define ucs_ilog2(_n) \
( \
__builtin_constant_p(_n) ? ( \
Expand Down Expand Up @@ -121,10 +126,16 @@ BEGIN_C_DECLS
((sizeof(_n) <= 4) ? __builtin_ctz((uint32_t)(_n)) : __builtin_ctzl(_n))

/* Returns the number of leading 0-bits in _n.
* If _n is 0, the result is undefined
*/
#if defined(HAVE_LZCNT)
#define ucs_count_leading_zero_bits(_n) \
((sizeof(_n) <= 4) ? __builtin_clz((uint32_t)(_n)) : __builtin_clzl(_n))
((sizeof(_n) <= 4) ? _lzcnt_u32((uint32_t)(_n)) : _lzcnt_u64(_n))
#else
#define ucs_count_leading_zero_bits(_n) \
((_n) ? ((sizeof(_n) <= 4) ? __builtin_clz((uint32_t)(_n)) : \
__builtin_clzl(_n)) : \
(sizeof(_n) * 8))
#endif

/* Returns the number of bits lower than 'bit_index' that are set in 'mask'
* For example: ucs_bitmap2idx(mask=0xF0, idx=6) returns 2
Expand Down
8 changes: 2 additions & 6 deletions src/ucs/arch/x86_64/cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1061,12 +1061,11 @@ size_t ucs_x86_nt_src_buffer_transfer(void *dst, const void *src, size_t len)
}

static UCS_F_ALWAYS_INLINE
void ucs_x86_copy_bytes_le_128(void *dst, const void *src, size_t len)
void ucs_x86_copy_bytes_le_128(void *dst, const void *src, uint32_t len)
{
#if defined (__LZCNT__)
__m256i y0, y1, y2, y3;
/* Handle lengths that fall usually within eager short range */
switch (_lzcnt_u32(len)) {
switch (ucs_count_leading_zero_bits(len)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the len is zero the code expects '32' as output, is ucs_count_leading_zero_bits(0) return 32?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes you're right, should be fixed now

Copy link
Contributor

@arun-chandran-edarath arun-chandran-edarath Feb 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is one issue; _lzcnt_u32(len) is expected to count the leading zeros from a 32bit operand.

_lzcnt_u32(0) should produce 32.

But now ucs_count_leading_zero_bits(len) will use _lzcnt_u64() because 'size_t len' will be 8 byte and the outputs produced makes the switch_cases wrong. [_lzcnt_u64(0) gives 64]

It should be ucs_count_leading_zero_bits((uint32_t) len) right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, also it's catched by the test suite, just that it was not running in our CI. Fixed by changing function params, pls double check that it makes sense.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks ok to me.

/* 0 */
case 32:
break;
Expand Down Expand Up @@ -1121,9 +1120,6 @@ void ucs_x86_copy_bytes_le_128(void *dst, const void *src, size_t len)
_mm256_storeu_si256(UCS_PTR_BYTE_OFFSET(dst, len - 32), y3);
break;
}
#else
memcpy(dst, src, len);
#endif
}

/* This is an adaptation of the memcpy code from https://github.com/amd/aocl-libmem
Expand Down
17 changes: 15 additions & 2 deletions test/gtest/ucs/arch/test_x86_64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ class test_arch : public ucs::test {
test_window_size = 8 * 1024;
hole_size = 2 * align;

auto msg = [&]() {
std::stringstream ss;
ss << "using length=" << len << " src_align=" << i << " dst_align=" << j;
return ss.str();
};

/*
* Allocate a hole above and below the test_window_size
* to check for writes beyond the designated area.
Expand Down Expand Up @@ -113,14 +119,20 @@ class test_arch : public ucs::test {
/* Perform the transfer */
ucs_x86_nt_buffer_transfer(dst + i, src + j, len, hint, len);
result = memcmp(src + j, dst + i, len);
EXPECT_EQ(0, result);
EXPECT_EQ(0, result) << msg();
if (result) {
goto terminate;
}

/* reset the copied region back to zero */
memset(dst + i, 0x0, len);

/* check for any modifications in the holes */
result = memcmp(test_window_dst, dup, total_size);
EXPECT_EQ(0, result);
EXPECT_EQ(0, result) << msg();
if (result) {
goto terminate;
}
}
}
/* Check for each len for less than 1k sizes
Expand All @@ -133,6 +145,7 @@ class test_arch : public ucs::test {
}
}

terminate:
free(dup);

dup_fail:
Expand Down
20 changes: 20 additions & 0 deletions test/gtest/ucs/test_bitops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,26 @@ UCS_TEST_F(test_bitops, is_equal) {
test_bitops::check_bitwise_equality(buffer1, buffer2, indices, 0);
}

template <typename T> void test_clz()
{
constexpr int bits = sizeof(T) * 8;
T v = 1;

for (int i = bits - 1; v != 0; v <<= 1, --i) {
ASSERT_EQ(i, ucs_count_leading_zero_bits(v));
}

ASSERT_EQ(bits, ucs_count_leading_zero_bits(v));
}

UCS_TEST_F(test_bitops, clz)
{
test_clz<uint32_t>();
test_clz<uint64_t>();
test_clz<int32_t>();
test_clz<int64_t>();
}

template<typename Type> void test_mask()
{
Type expected = 0;
Expand Down
Loading