Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Help the compiler vectorize adjacent_difference #4958

Merged
merged 27 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
85a67ec
benchmark
AlexGuteniev Sep 14, 2024
b0bab69
test coverage
AlexGuteniev Sep 14, 2024
eb0cf5a
the optimization
AlexGuteniev Sep 14, 2024
eed039f
You shall pass!
AlexGuteniev Sep 14, 2024
bc92d54
constexpr
AlexGuteniev Sep 14, 2024
fd51080
ADL
AlexGuteniev Sep 14, 2024
3db07e7
rvalue
AlexGuteniev Sep 14, 2024
53a60d4
Review comments
AlexGuteniev Sep 14, 2024
51595a3
types
AlexGuteniev Sep 14, 2024
99e6058
Pointer math
AlexGuteniev Sep 19, 2024
869a0cc
Merge remote-tracking branch 'upstream/main' into adjacent
AlexGuteniev Oct 18, 2024
3cdbc88
std already
AlexGuteniev Oct 24, 2024
18f64ca
const
AlexGuteniev Oct 24, 2024
c68b402
oops loops
AlexGuteniev Oct 24, 2024
da4ef2b
Merge remote-tracking branch 'upstream/main' into adjacent
AlexGuteniev Oct 24, 2024
ba9a098
U
AlexGuteniev Oct 24, 2024
3ddaf9e
typos
AlexGuteniev Oct 24, 2024
944672b
includes
AlexGuteniev Oct 24, 2024
0042fe7
non-random filler
AlexGuteniev Oct 24, 2024
d06ea71
stray
AlexGuteniev Oct 25, 2024
df487ce
Merge branch 'main' into adjacent
StephanTLavavej Oct 26, 2024
f132d46
Test 8-bit and 16-bit, avoid truncation warnings.
StephanTLavavej Oct 26, 2024
cfb92a4
static_assert: The documentation that rewards poor reading comprehens…
StephanTLavavej Oct 26, 2024
c2a130f
Consistently order output_expected before output_actual.
StephanTLavavej Oct 26, 2024
7a110a8
Minor comment grammar improvements.
StephanTLavavej Oct 26, 2024
d86bf29
Fix perf bug: Inspect unwrapped iterators with `_Iterators_are_contig…
StephanTLavavej Oct 26, 2024
9b85627
Fix and test adjacent_difference with heterogeneous types.
StephanTLavavej Oct 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ function(add_benchmark name)
target_link_libraries(benchmark-${name} PRIVATE benchmark::benchmark)
endfunction()

add_benchmark(adjacent_difference src/adjacent_difference.cpp)
add_benchmark(bitset_from_string src/bitset_from_string.cpp)
add_benchmark(bitset_to_string src/bitset_to_string.cpp)
add_benchmark(efficient_nonlocking_print src/efficient_nonlocking_print.cpp)
Expand Down
58 changes: 58 additions & 0 deletions benchmarks/src/adjacent_difference.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <algorithm>
#include <benchmark/benchmark.h>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <numeric>
#include <random>
#include <type_traits>
#include <vector>

using namespace std;

// Benchmarks adjacent_difference on a vector<T> whose length is state.range(0).
//
// The input is filled once, before the timed loop, from a fixed-seed mt19937 so that every run
// sees the same data:
//   * floating-point T: normal_distribution<T>(-100.0, 100.0)
//   * integer T: uniform values in [0, numeric_limits<T>::max()]; one-byte T is sampled through
//     unsigned int and cast back to T.
template <class T>
void bm(benchmark::State& state) {
    mt19937 gen(96337);

    const size_t element_count = static_cast<size_t>(state.range(0));

    vector<T> input(element_count);
    vector<T> output(element_count);

    if constexpr (is_floating_point_v<T>) {
        normal_distribution<T> dis(-100.0, 100.0);
        for (T& elem : input) {
            elem = dis(gen);
        }
    } else {
        using Sample = conditional_t<sizeof(T) == 1, unsigned int, T>;
        uniform_int_distribution<Sample> dis(0, numeric_limits<T>::max());
        for (T& elem : input) {
            elem = static_cast<T>(dis(gen));
        }
    }

    for (auto _ : state) {
        // DoNotOptimize on both sides keeps the compiler from folding away the measured call.
        benchmark::DoNotOptimize(input);
        adjacent_difference(input.begin(), input.end(), output.begin());
        benchmark::DoNotOptimize(output);
    }
}

// Applies the argument set shared by every benchmark in this file: a single run whose range(0)
// argument (used by bm as the element count) is 2255.
void common_args(auto bm) {
    constexpr int element_count = 2255;
    bm->Arg(element_count);
}

// Avoid signed integers to avoid UB; they shouldn't perform differently from unsigned integers.
StephanTLavavej marked this conversation as resolved.
Show resolved Hide resolved

// Register bm for each element type; every registration takes its arguments from common_args.

// The 8-bit and 16-bit registrations sit inside a region where MSVC warning C4244 is disabled.
#pragma warning(push)
#pragma warning(disable : 4244) // warning C4244: '=': conversion from 'int' to 'unsigned char', possible loss of data
BENCHMARK(bm<uint8_t>)->Apply(common_args);
BENCHMARK(bm<uint16_t>)->Apply(common_args);
#pragma warning(pop)

// 32-bit and 64-bit unsigned integer element types.
BENCHMARK(bm<uint32_t>)->Apply(common_args);
BENCHMARK(bm<uint64_t>)->Apply(common_args);

// Floating-point element types.
BENCHMARK(bm<float>)->Apply(common_args);
BENCHMARK(bm<double>)->Apply(common_args);

// Google Benchmark macro that supplies the program's main().
BENCHMARK_MAIN();
38 changes: 38 additions & 0 deletions stl/inc/numeric
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,20 @@ _FwdIt2 transform_inclusive_scan(_ExPo&& _Exec, _FwdIt1 _First, _FwdIt1 _Last, _
_UnaryOp _Transform_op) noexcept; // terminates
#endif // _HAS_CXX17

// Pointer-based adjacent_difference kernel for source and destination ranges that do not overlap.
//
// Writes _Dest[0] = _Src[0], then _Dest[i] = _Func(_Src[i], _Src[i - 1]) for every i in [1, _Count).
// Each output is computed from two source loads rather than from a value carried over from the
// previous iteration. Both pointers are __restrict-qualified, so the caller must guarantee that the
// two ranges do not alias. _Dest[0] is written unconditionally, so _Count must be at least 1.
template <class _TyDest, class _TySrc, class _BinOp>
void _Adjacent_difference_no_overlap(
    _TyDest* const __restrict _Dest, _TySrc* const __restrict _Src, const size_t _Count, _BinOp _Func) {
    _Dest[0] = _Src[0];

    size_t _Pos = 1;
    while (_Pos != _Count) {
#if _HAS_CXX20
        // In C++20 mode the previous element is handed to _Func as an rvalue copy
        // (presumably to mirror the std::move of the accumulator in the C++20 specification).
        _TySrc _Prev = _Src[_Pos - 1];
        _Dest[_Pos]  = _Func(_Src[_Pos], _STD move(_Prev));
#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv
        _Dest[_Pos] = _Func(_Src[_Pos], _Src[_Pos - 1]);
#endif // ^^^ !_HAS_CXX20 ^^^
        ++_Pos;
    }
}

_EXPORT_STD template <class _InIt, class _OutIt, class _BinOp>
_CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _OutIt _Dest, _BinOp _Func) {
// compute adjacent differences into _Dest
Expand All @@ -469,6 +483,30 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _
const auto _ULast = _STD _Get_unwrapped(_Last);
auto _UDest = _STD _Get_unwrapped_n(_Dest, _STD _Idl_distance<_InIt>(_UFirst, _ULast));
if (_UFirst != _ULast) {
if constexpr (_Iterators_are_contiguous<_InIt, _OutIt> && !_Iterator_is_volatile<_InIt>
StephanTLavavej marked this conversation as resolved.
Show resolved Hide resolved
&& is_trivially_copyable_v<_Iter_value_t<_InIt>>) {
StephanTLavavej marked this conversation as resolved.
Show resolved Hide resolved
#if _HAS_CXX20
if (!_STD is_constant_evaluated())
#endif
{
// Go with pointers and without loop-carried dependency to enable vectorization
const auto _Count = _ULast - _UFirst;
const auto _Count_as_size = static_cast<size_t>(_Count);
const auto _Src_ptr = _STD _To_address(_UFirst);
const auto _Dest_ptr = _STD _To_address(_UDest);
// Need to perform aliasing analysis.
AlexGuteniev marked this conversation as resolved.
Show resolved Hide resolved
// The vectorizer is generally able to do that on its own, and would guard the vectorized code with
// that, but when we eliminate loop-carried dependency we change the semantics of the unvectorized code
// too. So we need to perform this check manually, and after that can tell the compiler that there's no
// aliasing, to avoid it checking for that again.
if (_Dest_ptr + _Count_as_size <= _Src_ptr || _Src_ptr + _Count_as_size <= _Dest_ptr) {
_STD _Adjacent_difference_no_overlap(_Dest_ptr, _Src_ptr, _Count_as_size, _STD _Pass_fn(_Func));
_STD _Seek_wrapped(_Dest, _UDest + _Count);
return _Dest;
}
}
}

_Iter_value_t<_InIt> _Val(*_UFirst);
*_UDest = _Val;
while (++_UFirst != _ULast) { // compute another difference
Expand Down
65 changes: 65 additions & 0 deletions tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <functional>
#include <limits>
#include <list>
#include <numeric>
#include <random>
#include <stdexcept>
#include <string>
Expand All @@ -34,6 +35,65 @@ using namespace std;
#pragma clang diagnostic ignored "-Wc++17-extensions" // constexpr if is a C++17 extension
#endif // __clang__

// Reference implementation of adjacent_difference (using operator-) that the test compares the
// library's result against.
//
// Copies the first element to dest unchanged, then writes (current - previous) for each later
// element. Returns the output iterator one past the last element written; for an empty input
// nothing is written and dest is returned as-is.
template <class InIt, class OutIt>
OutIt last_known_good_adj_diff(InIt first, InIt last, OutIt dest) {
    if (first != last) {
        auto previous = *first;
        *dest         = previous;
        ++first;
        ++dest;

        while (first != last) {
            auto current = *first;
            *dest        = current - previous;
            previous     = current;
            ++first;
            ++dest;
        }
    }

    return dest;
}

// Runs adjacent_difference and the reference implementation on the same input and asserts that
// both wrote the same number of elements and produced identical output.
// output_expected and output_actual must each have room for input.size() elements.
template <class T>
void test_case_adj_diff(const vector<T>& input, vector<T>& output_expected, vector<T>& output_actual) {
    const auto actual_end     = adjacent_difference(input.begin(), input.end(), output_actual.begin());
    const auto actual_written = actual_end - output_actual.begin();

    const auto expected_end     = last_known_good_adj_diff(input.begin(), input.end(), output_expected.begin());
    const auto expected_written = expected_end - output_expected.begin();

    assert(actual_written == expected_written);
    assert(output_actual == output_expected);
}

// Checks adjacent_difference against last_known_good_adj_diff for every input length from 0 up to
// dataCount, growing the input by exactly one random element per iteration so that the empty,
// single-element, odd-length and even-length cases are all covered.
template <class T>
void test_adjacent_difference(mt19937_64& gen) {
    using Limits = numeric_limits<T>;

    // Signed inputs are drawn from half of T's range so that subtracting any two of them stays
    // representable in T (signed overflow would be UB). One-byte types are sampled through int.
    uniform_int_distribution<conditional_t<sizeof(T) == 1, int, T>> dis(
        is_signed_v<T> ? static_cast<T>(Limits::min() / 2) : Limits::min(),
        is_signed_v<T> ? static_cast<T>(Limits::max() / 2) : Limits::max());

    vector<T> input;
    vector<T> output_actual;
    vector<T> output_expected;

    // No vector ever grows past dataCount elements, so this reservation covers the whole test.
    for (const auto& v : {&input, &output_actual, &output_expected}) {
        v->reserve(dataCount);
    }

    // Empty range first.
    test_case_adj_diff(input, output_expected, output_actual);

    for (size_t attempts = 0; attempts < dataCount; ++attempts) {
        input.push_back(static_cast<T>(dis(gen)));

        // Reset both outputs to zeros of the matching length before each comparison.
        for (const auto& v : {&output_actual, &output_expected}) {
            v->assign(input.size(), 0);
        }

        test_case_adj_diff(input, output_expected, output_actual);
    }
}

template <class FwdIt, class T>
ptrdiff_t last_known_good_count(FwdIt first, FwdIt last, T v) {
ptrdiff_t result = 0;
Expand Down Expand Up @@ -777,6 +837,11 @@ void test_swap_ranges(mt19937_64& gen) {
}

void test_vector_algorithms(mt19937_64& gen) {
test_adjacent_difference<int>(gen);
StephanTLavavej marked this conversation as resolved.
Show resolved Hide resolved
test_adjacent_difference<unsigned int>(gen);
test_adjacent_difference<long long>(gen);
test_adjacent_difference<unsigned long long>(gen);

test_count<char>(gen);
test_count<signed char>(gen);
test_count<unsigned char>(gen);
Expand Down