Skip to content

Commit

Permalink
Help the compiler vectorize adjacent_difference (#4958)
Browse files Browse the repository at this point in the history
Co-authored-by: Stephan T. Lavavej <[email protected]>
  • Loading branch information
AlexGuteniev and StephanTLavavej authored Oct 30, 2024
1 parent ca1553d commit 1990083
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 0 deletions.
1 change: 1 addition & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ function(add_benchmark name)
target_link_libraries(benchmark-${name} PRIVATE benchmark::benchmark)
endfunction()

# One add_benchmark() call per benchmark program. The first argument is the benchmark name (the helper above
# links a target called benchmark-<name>); the second is presumably that benchmark's source file.
add_benchmark(adjacent_difference src/adjacent_difference.cpp)
add_benchmark(bitset_from_string src/bitset_from_string.cpp)
add_benchmark(bitset_to_string src/bitset_to_string.cpp)
add_benchmark(efficient_nonlocking_print src/efficient_nonlocking_print.cpp)
Expand Down
57 changes: 57 additions & 0 deletions benchmarks/src/adjacent_difference.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <algorithm>
#include <benchmark/benchmark.h>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <numeric>
#include <random>
#include <type_traits>
#include <vector>

using namespace std;

// Benchmarks adjacent_difference() for element type T.
//
// The element count is taken from state.range(0). The input is filled once, before timing starts, from a
// fixed-seed engine, so every run sees the same data. Only the adjacent_difference() call sits in the timed loop.
template <class T>
void bm(benchmark::State& state) {
    mt19937 engine(96337);

    const auto count = static_cast<size_t>(state.range(0));

    vector<T> input(count);
    vector<T> output(count);

    if constexpr (!is_floating_point_v<T>) {
        static_assert(is_unsigned_v<T>, "This avoids signed integers to avoid UB; they shouldn't perform differently");

        // One-byte element types are drawn as unsigned int and narrowed afterwards
        // (presumably because uniform_int_distribution does not accept character-sized types).
        using drawn_type = conditional_t<sizeof(T) != 1, T, unsigned int>;
        uniform_int_distribution<drawn_type> dist(0, numeric_limits<T>::max());
        ranges::generate(input, [&] { return static_cast<T>(dist(engine)); });
    } else {
        // normal_distribution takes (mean, standard deviation).
        normal_distribution<T> dist(-100.0, 100.0);
        ranges::generate(input, [&] { return dist(engine); });
    }

    for (auto _ : state) {
        // Keep the optimizer from assuming anything about the input or discarding the output.
        benchmark::DoNotOptimize(input);
        adjacent_difference(input.begin(), input.end(), output.begin());
        benchmark::DoNotOptimize(output);
    }
}

// Argument set shared by every benchmark registered in this file: a single Arg value of 2255.
void common_args(auto bm) {
    constexpr int shared_arg = 2255;
    bm->Arg(shared_arg);
}

// Register one benchmark per element type; each one receives its arguments from common_args.

// The 8-bit and 16-bit instantiations are registered with C4244 suppressed. Presumably the warning fires
// inside adjacent_difference when the int-promoted difference is stored back into the narrow element type.
#pragma warning(push)
#pragma warning(disable : 4244) // warning C4244: '=': conversion from 'int' to 'unsigned char', possible loss of data
BENCHMARK(bm<uint8_t>)->Apply(common_args);
BENCHMARK(bm<uint16_t>)->Apply(common_args);
#pragma warning(pop)

// Wider unsigned integers.
BENCHMARK(bm<uint32_t>)->Apply(common_args);
BENCHMARK(bm<uint64_t>)->Apply(common_args);

// Floating-point element types.
BENCHMARK(bm<float>)->Apply(common_args);
BENCHMARK(bm<double>)->Apply(common_args);

// Expands to the program's main().
BENCHMARK_MAIN();
40 changes: 40 additions & 0 deletions stl/inc/numeric
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,20 @@ _FwdIt2 transform_inclusive_scan(_ExPo&& _Exec, _FwdIt1 _First, _FwdIt1 _Last, _
_UnaryOp _Transform_op) noexcept; // terminates
#endif // _HAS_CXX17

// Writes the adjacent differences of _Src[0, _Count) into _Dest[0, _Count):
//   _Dest[0]  = _Src[0]
//   _Dest[_I] = _Func(_Src[_I], _Src[_I - 1])   for every later _I
//
// Both pointers are __restrict, so the caller must guarantee that the two ranges do not overlap.
// _Dest[0] is written unconditionally and the loop starts at index 1, so _Count must be at least 1.
// Each iteration reads both operands straight from _Src; no value is carried from one iteration to the next.
template <class _TyDest, class _TySrc, class _BinOp>
void _Adjacent_difference_no_overlap(
    _TyDest* const __restrict _Dest, _TySrc* const __restrict _Src, const ptrdiff_t _Count, _BinOp _Func) {
    _Dest[0] = _Src[0];

    ptrdiff_t _Idx = 1;
    while (_Idx != _Count) {
#if _HAS_CXX20
        // In C++20 mode the previous element is copied and handed to _Func as an rvalue.
        _TySrc _Prev = _Src[_Idx - 1];
        _Dest[_Idx]  = _Func(_Src[_Idx], _STD move(_Prev));
#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv
        _Dest[_Idx] = _Func(_Src[_Idx], _Src[_Idx - 1]);
#endif // ^^^ !_HAS_CXX20 ^^^
        ++_Idx;
    }
}

_EXPORT_STD template <class _InIt, class _OutIt, class _BinOp>
_CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _OutIt _Dest, _BinOp _Func) {
// compute adjacent differences into _Dest
Expand All @@ -469,6 +483,32 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _
const auto _ULast = _STD _Get_unwrapped(_Last);
auto _UDest = _STD _Get_unwrapped_n(_Dest, _STD _Idl_distance<_InIt>(_UFirst, _ULast));
if (_UFirst != _ULast) {
if constexpr (_Iterators_are_contiguous<decltype(_UFirst), decltype(_UDest)> && !_Iterator_is_volatile<_InIt>
&& is_trivially_copyable_v<_Iter_value_t<_InIt>>) {
#if _HAS_CXX20
if (!_STD is_constant_evaluated())
#endif
{
// Go with pointers and without loop-carried dependency to enable vectorization
const auto _First_ptr = _STD _To_address(_UFirst);
const auto _Last_ptr = _STD _To_address(_ULast);
const auto _Dest_ptr = _STD _To_address(_UDest);
const auto _Count = _Last_ptr - _First_ptr;

// Need to perform aliasing analysis.
// The vectorizer is generally able to do that on its own, and would guard the vectorized code with
// that, but when we eliminate the loop-carried dependency we change the semantics of the unvectorized
// code too. So we need to perform this check manually, and after that we can tell the compiler that
// there's no aliasing, to avoid it checking for that again.
if (reinterpret_cast<uintptr_t>(_Dest_ptr + _Count) <= reinterpret_cast<uintptr_t>(_First_ptr)
|| reinterpret_cast<uintptr_t>(_Last_ptr) <= reinterpret_cast<uintptr_t>(_Dest_ptr)) {
_STD _Adjacent_difference_no_overlap(_Dest_ptr, _First_ptr, _Count, _STD _Pass_fn(_Func));
_STD _Seek_wrapped(_Dest, _UDest + _Count);
return _Dest;
}
}
}

_Iter_value_t<_InIt> _Val(*_UFirst);
*_UDest = _Val;
while (++_UFirst != _ULast) { // compute another difference
Expand Down
81 changes: 81 additions & 0 deletions tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <functional>
#include <limits>
#include <list>
#include <numeric>
#include <random>
#include <stdexcept>
#include <string>
Expand All @@ -34,6 +35,74 @@ using namespace std;
#pragma clang diagnostic ignored "-Wc++17-extensions" // constexpr if is a C++17 extension
#endif // __clang__

// Reference implementation of adjacent_difference, used as the oracle for the optimized version.
//
// Copies the first element to dest, then for every later element writes binop(current, previous).
// Returns the output iterator one past the last element written, or dest unchanged for an empty range.
template <class InIt, class OutIt, class BinOp>
OutIt last_known_good_adj_diff(InIt first, InIt last, OutIt dest, BinOp binop) {
    if (first != last) {
        auto previous = *first;
        *dest         = previous;
        ++first;
        ++dest;

        while (first != last) {
            auto current = *first;
            *dest        = binop(current, previous);
            previous     = current;
            ++first;
            ++dest;
        }
    }

    return dest;
}

// Runs the reference implementation and adjacent_difference() over the same input with the same functor.
// Asserts that both advanced their output iterator by the same distance and produced equal output.
template <class T>
void test_case_adj_diff(const vector<T>& input, vector<T>& output_expected, vector<T>& output_actual) {
    // Casting the difference back to T avoids truncation warnings.
    const auto narrowing_minus = [](const T& lhs, const T& rhs) { return static_cast<T>(lhs - rhs); };

    const auto expected_end =
        last_known_good_adj_diff(input.begin(), input.end(), output_expected.begin(), narrowing_minus);
    const auto actual_end = adjacent_difference(input.begin(), input.end(), output_actual.begin(), narrowing_minus);

    assert(actual_end - output_actual.begin() == expected_end - output_expected.begin());
    assert(output_actual == output_expected);
}

// Randomized check of adjacent_difference() for element type T.
//
// Starts with the empty range, then grows the input one random element at a time for dataCount rounds.
// After each push the whole input is re-checked against the reference implementation.
template <class T>
void test_adjacent_difference(mt19937_64& gen) {
    using Limits = numeric_limits<T>;
    // One-byte element types are drawn as int and narrowed afterwards.
    using Drawn = conditional_t<sizeof(T) == 1, int, T>;

    // Signed types draw from half of their range only
    // (presumably so that the difference of any two drawn values cannot overflow T).
    const T lowest  = is_signed_v<T> ? static_cast<T>(Limits::min() / 2) : Limits::min();
    const T highest = is_signed_v<T> ? static_cast<T>(Limits::max() / 2) : Limits::max();
    uniform_int_distribution<Drawn> dis(lowest, highest);

    vector<T> input;
    vector<T> output_expected;
    vector<T> output_actual;

    input.reserve(dataCount);
    output_expected.reserve(dataCount);
    output_actual.reserve(dataCount);

    // Empty input first.
    test_case_adj_diff(input, output_expected, output_actual);

    for (size_t round = 0; round < dataCount; ++round) {
        input.push_back(static_cast<T>(dis(gen)));

        // Reset both outputs to the current input length, zero-filled, so stale results cannot leak through.
        const size_t length = input.size();
        output_expected.assign(length, 0);
        output_actual.assign(length, 0);

        test_case_adj_diff(input, output_expected, output_actual);
    }
}

// Checks adjacent_difference() when the output element type (int) is wider than the input element type
// (unsigned char). The expected output contains a negative value, which the input type cannot represent.
void test_adjacent_difference_with_heterogeneous_types() {
    const vector<unsigned char> narrow_input{10, 70, 20, 90};
    const vector<int> wide_expected{10, 60, -50, 70};

    vector<int> wide_output(narrow_input.size());
    const auto past_written = adjacent_difference(narrow_input.begin(), narrow_input.end(), wide_output.begin());

    assert(past_written == wide_output.end());
    assert(wide_output == wide_expected);
}

template <class FwdIt, class T>
ptrdiff_t last_known_good_count(FwdIt first, FwdIt last, T v) {
ptrdiff_t result = 0;
Expand Down Expand Up @@ -895,6 +964,18 @@ void test_swap_arrays(mt19937_64& gen) {
}

void test_vector_algorithms(mt19937_64& gen) {
test_adjacent_difference<char>(gen);
test_adjacent_difference<signed char>(gen);
test_adjacent_difference<unsigned char>(gen);
test_adjacent_difference<short>(gen);
test_adjacent_difference<unsigned short>(gen);
test_adjacent_difference<int>(gen);
test_adjacent_difference<unsigned int>(gen);
test_adjacent_difference<long long>(gen);
test_adjacent_difference<unsigned long long>(gen);

test_adjacent_difference_with_heterogeneous_types();

test_count<char>(gen);
test_count<signed char>(gen);
test_count<unsigned char>(gen);
Expand Down

0 comments on commit 1990083

Please sign in to comment.