diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 74b694f0314..122d11879bc 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -106,6 +106,7 @@ function(add_benchmark name) target_link_libraries(benchmark-${name} PRIVATE benchmark::benchmark) endfunction() +add_benchmark(adjacent_difference src/adjacent_difference.cpp) add_benchmark(bitset_from_string src/bitset_from_string.cpp) add_benchmark(bitset_to_string src/bitset_to_string.cpp) add_benchmark(efficient_nonlocking_print src/efficient_nonlocking_print.cpp) diff --git a/benchmarks/src/adjacent_difference.cpp b/benchmarks/src/adjacent_difference.cpp new file mode 100644 index 00000000000..fdda4326059 --- /dev/null +++ b/benchmarks/src/adjacent_difference.cpp @@ -0,0 +1,57 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +template +void bm(benchmark::State& state) { + mt19937 gen(96337); + + const size_t size = static_cast(state.range(0)); + + vector input(size); + vector output(size); + + if constexpr (is_floating_point_v) { + normal_distribution dis(-100.0, 100.0); + ranges::generate(input, [&] { return dis(gen); }); + } else { + static_assert(is_unsigned_v, "This avoids signed integers to avoid UB; they shouldn't perform differently"); + uniform_int_distribution> dis(0, numeric_limits::max()); + ranges::generate(input, [&] { return static_cast(dis(gen)); }); + } + + for (auto _ : state) { + benchmark::DoNotOptimize(input); + adjacent_difference(input.begin(), input.end(), output.begin()); + benchmark::DoNotOptimize(output); + } +} + +void common_args(auto bm) { + bm->Arg(2255); +} + +#pragma warning(push) +#pragma warning(disable : 4244) // warning C4244: '=': conversion from 'int' to 'unsigned char', possible loss of data +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); +#pragma warning(pop) + +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); + +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); + +BENCHMARK_MAIN(); diff --git a/stl/inc/numeric b/stl/inc/numeric index 528969fdc63..b0d0e8a25c3 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -461,6 +461,20 @@ _FwdIt2 transform_inclusive_scan(_ExPo&& _Exec, _FwdIt1 _First, _FwdIt1 _Last, _ _UnaryOp _Transform_op) noexcept; // terminates #endif // _HAS_CXX17 +template +void _Adjacent_difference_no_overlap( + _TyDest* const __restrict _Dest, _TySrc* const __restrict _Src, const ptrdiff_t _Count, _BinOp _Func) { + _Dest[0] = _Src[0]; + for (ptrdiff_t _Ix = 1; _Ix != _Count; ++_Ix) { +#if _HAS_CXX20 + _TySrc _Tmp = _Src[_Ix - 1]; + _Dest[_Ix] = _Func(_Src[_Ix], _STD move(_Tmp)); +#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv + _Dest[_Ix] = _Func(_Src[_Ix], _Src[_Ix - 1]); +#endif // ^^^ !_HAS_CXX20 ^^^ + } +} + _EXPORT_STD template _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _OutIt _Dest, _BinOp _Func) { // compute adjacent differences into _Dest @@ -469,6 +483,32 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _ const auto _ULast = _STD _Get_unwrapped(_Last); auto _UDest = _STD _Get_unwrapped_n(_Dest, _STD _Idl_distance<_InIt>(_UFirst, _ULast)); if (_UFirst != _ULast) { + if constexpr (_Iterators_are_contiguous && !_Iterator_is_volatile<_InIt> + && is_trivially_copyable_v<_Iter_value_t<_InIt>>) { +#if _HAS_CXX20 + if (!_STD is_constant_evaluated()) +#endif + { + // Go with pointers and without loop-carried dependency to enable vectorization + const auto _First_ptr = _STD _To_address(_UFirst); + const auto _Last_ptr = _STD _To_address(_ULast); + const auto _Dest_ptr = _STD _To_address(_UDest); + const auto _Count = _Last_ptr - _First_ptr; + + // Need to perform aliasing analysis. + // The vectorizer is generally able to do that on its own, and would guard the vectorized code with + // that, but when we eliminate the loop-carried dependency we change the semantics of the unvectorized + // code too. So we need to perform this check manually, and after that we can tell the compiler that + // there's no aliasing, to avoid it checking for that again. + if (reinterpret_cast(_Dest_ptr + _Count) <= reinterpret_cast(_First_ptr) + || reinterpret_cast(_Last_ptr) <= reinterpret_cast(_Dest_ptr)) { + _STD _Adjacent_difference_no_overlap(_Dest_ptr, _First_ptr, _Count, _STD _Pass_fn(_Func)); + _STD _Seek_wrapped(_Dest, _UDest + _Count); + return _Dest; + } + } + } + _Iter_value_t<_InIt> _Val(*_UFirst); *_UDest = _Val; while (++_UFirst != _ULast) { // compute another difference diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 8498602479f..3e81ff63309 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,74 @@ using namespace std; #pragma clang diagnostic ignored "-Wc++17-extensions" // constexpr if is a C++17 extension #endif // __clang__ +template +OutIt last_known_good_adj_diff(InIt first, InIt last, OutIt dest, BinOp binop) { + if (first == last) { + return dest; + } + + auto val = *first; + *dest = val; + + for (++first, ++dest; first != last; ++first, ++dest) { + auto tmp = *first; + *dest = binop(tmp, val); + val = tmp; + } + + return dest; +} + +template +void test_case_adj_diff(const vector& input, vector& output_expected, vector& output_actual) { + // Avoid truncation warnings: + const auto subtract = [](const T& left, const T& right) { return static_cast(left - right); }; + const auto expected = last_known_good_adj_diff(input.begin(), input.end(), output_expected.begin(), subtract); + const auto actual = adjacent_difference(input.begin(), input.end(), output_actual.begin(), subtract); + + assert(actual - output_actual.begin() == expected - output_expected.begin()); + assert(output_actual == output_expected); +} + +template +void test_adjacent_difference(mt19937_64& gen) { + using Limits = numeric_limits; + + uniform_int_distribution> dis( + is_signed_v ? static_cast(Limits::min() / 2) : Limits::min(), + is_signed_v ? static_cast(Limits::max() / 2) : Limits::max()); + + vector input; + vector output_expected; + vector output_actual; + + for (const auto& v : {&input, &output_expected, &output_actual}) { + v->reserve(dataCount); + } + + test_case_adj_diff(input, output_expected, output_actual); + for (size_t attempts = 0; attempts < dataCount; ++attempts) { + input.push_back(static_cast(dis(gen))); + + for (const auto& v : {&output_expected, &output_actual}) { + v->assign(input.size(), 0); + } + + test_case_adj_diff(input, output_expected, output_actual); + } +} + +void test_adjacent_difference_with_heterogeneous_types() { + const vector input = {10, 70, 20, 90}; + vector output(4); + + const auto result = adjacent_difference(input.begin(), input.end(), output.begin()); + assert(result == output.end()); + + const vector expected = {10, 60, -50, 70}; + assert(output == expected); +} + template ptrdiff_t last_known_good_count(FwdIt first, FwdIt last, T v) { ptrdiff_t result = 0; @@ -895,6 +964,18 @@ void test_swap_arrays(mt19937_64& gen) { } void test_vector_algorithms(mt19937_64& gen) { + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_adjacent_difference(gen); + + test_adjacent_difference_with_heterogeneous_types(); + test_count(gen); test_count(gen); test_count(gen);