From 85a67ecfc13113c340d267f182ac33e363bc3b01 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 14 Sep 2024 09:50:29 +0300 Subject: [PATCH 01/24] benchmark --- benchmarks/CMakeLists.txt | 1 + benchmarks/src/adjacent_difference.cpp | 55 ++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 benchmarks/src/adjacent_difference.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 31572a968f..204f8ae410 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -106,6 +106,7 @@ function(add_benchmark name) target_link_libraries(benchmark-${name} PRIVATE benchmark::benchmark) endfunction() +add_benchmark(adjacent_difference src/adjacent_difference.cpp) add_benchmark(bitset_to_string src/bitset_to_string.cpp) add_benchmark(efficient_nonlocking_print src/efficient_nonlocking_print.cpp) add_benchmark(find_and_count src/find_and_count.cpp) diff --git a/benchmarks/src/adjacent_difference.cpp b/benchmarks/src/adjacent_difference.cpp new file mode 100644 index 0000000000..133ed4dead --- /dev/null +++ b/benchmarks/src/adjacent_difference.cpp @@ -0,0 +1,55 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include + +using namespace std; + +template +void bm(benchmark::State& state) { + mt19937 gen(96337); + + const size_t size = static_cast(state.range(0)); + + std::vector input(size); + std::vector output(size); + + if constexpr (is_floating_point_v) { + normal_distribution dis(-100.0, 100.0); + ranges::generate(input, [&] { return dis(gen); }); + } else { + uniform_int_distribution> dis(0, numeric_limits::max()); + ranges::generate(input, [&] { return static_cast(dis(gen)); }); + } + + for (auto _ : state) { + benchmark::DoNotOptimize(input); + adjacent_difference(input.begin(), input.end(), output.begin()); + benchmark::DoNotOptimize(output); + } +} + +void common_args(auto bm) { + bm->Arg(2255); +} + +// Avoid signed integers to avoid UB; they shouldn't perform differently from the unsigned + +#pragma warning(push) +#pragma warning(disable : 4244) // warning C4244: '=': conversion from 'int' to 'unsigned char', possible loss of data +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); +#pragma warning(pop) + +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); + +BENCHMARK(bm)->Apply(common_args); +BENCHMARK(bm)->Apply(common_args); + +BENCHMARK_MAIN(); From b0bab6952d8a8ba9aaa21fff3de6bcecc8ef95ed Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 14 Sep 2024 12:25:59 +0300 Subject: [PATCH 02/24] test coverage --- .../VSO_0000000_vector_algorithms/test.cpp | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 28f5c92d37..a01028f23c 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,66 @@ using namespace std; #pragma clang diagnostic ignored "-Wc++17-extensions" // constexpr if is a C++17 extension #endif // __clang__ +template +OutIt last_known_good_adj_diff(InIt first, InIt last, OutIt dest) { + if (first == last) { + return dest; + } + + auto val = *first; + *dest = val; + + for (++first, ++dest; first != last; ++first, ++dest) { + auto tmp = *first; + *dest = tmp - val; + val = tmp; + } + + return dest; +} + +template +void test_case_adj_diff(const vector& input, vector& output_expected, vector& output_actual) { + auto actual = adjacent_difference(input.begin(), input.end(), output_actual.begin()); + auto expected = last_known_good_adj_diff(input.begin(), input.end(), output_expected.begin()); + + assert(actual - output_actual.begin() == expected - output_expected.begin()); + assert(output_actual == output_expected); +} + +template +void test_adjacent_difference(mt19937_64& gen) { + using Limits = numeric_limits; + + uniform_int_distribution> dis( + is_signed_v ? static_cast(Limits::min() / 2) : Limits::min(), + is_signed_v ? static_cast(Limits::max() / 2) : Limits::max()); + + vector input; + vector output_actual; + vector output_expected; + + vector* const all__output_vectors[] = {&output_actual, &output_expected}; + vector* const all_vectors[] = {&input, &output_actual, &output_expected}; + + for (auto v : all_vectors) { + v->reserve(dataCount); + } + + test_case_adj_diff(input, output_expected, output_actual); + for (size_t attempts = 0; attempts < dataCount; ++attempts) { + for (auto v : all__output_vectors) { + generate(v->begin(), v->end(), [&] { return static_cast(dis(gen)); }); + } + + for (auto v : all_vectors) { + v->push_back(static_cast(dis(gen))); + } + + test_case_adj_diff(input, output_expected, output_actual); + } +} + template ptrdiff_t last_known_good_count(FwdIt first, FwdIt last, T v) { ptrdiff_t result = 0; @@ -776,6 +837,11 @@ void test_swap_ranges(mt19937_64& gen) { } void test_vector_algorithms(mt19937_64& gen) { + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_count(gen); test_count(gen); test_count(gen); From eb0cf5a3e982f85304e1f01282814c13e9ab76fa Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 14 Sep 2024 12:40:46 +0300 Subject: [PATCH 03/24] the optimization --- stl/inc/numeric | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/stl/inc/numeric b/stl/inc/numeric index 528969fdc6..2b2180861a 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -461,6 +461,23 @@ _FwdIt2 transform_inclusive_scan(_ExPo&& _Exec, _FwdIt1 _First, _FwdIt1 _Last, _ _UnaryOp _Transform_op) noexcept; // terminates #endif // _HAS_CXX17 +template +_CONSTEXPR20 bool _Arrays_overlap(const _Ty* const _First, const _Ty* const _Second, const size_t _Common_count) { + const uintptr_t _First_val = reinterpret_cast(_First); + const uintptr_t _Second_val = reinterpret_cast(_Second); + const size_t Size_bytes = _Common_count * sizeof(_Ty); + return _First_val + Size_bytes > _Second_val && _Second_val + Size_bytes > _First_val; +} + +template +_CONSTEXPR20 void _Adjacent_difference_no_overlap( + _Ty* __restrict _Dest, const _Ty* __restrict _Src, const size_t _Count, _BinOp _Func) { + _Dest[0] = _Src[0]; + for (size_t _Ix = 1; _Ix != _Count; ++_Ix) { + _Dest[_Ix] = _Func(_Src[_Ix], _Src[_Ix - 1]); + } +} + _EXPORT_STD template _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _OutIt _Dest, _BinOp _Func) { // compute adjacent differences into _Dest @@ -469,6 +486,24 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _ const auto _ULast = _STD _Get_unwrapped(_Last); auto _UDest = _STD _Get_unwrapped_n(_Dest, _STD _Idl_distance<_InIt>(_UFirst, _ULast)); if (_UFirst != _ULast) { + if constexpr (_Iterators_are_contiguous<_InIt, _OutIt> && !_Iterator_is_volatile<_InIt> + && is_trivially_copyable_v<_Iter_value_t<_InIt>>) { + // Go with pointers and without loop-carried dependency to enable vectorization + const auto _Count = _ULast - _UFirst; + const auto _Src_ptr = _To_address(_UFirst); + const auto _Dest_ptr = _To_address(_Dest); + // Need to perform aliasing analysis. + // The vectorizer is generally able to do that on its own, and would guard the vectorized code with that, + // but when we elimniate loop-carried dependency we change the semantic of the unvectorized code too. + // So we need to perform this check manually, and after that can tell the compiler that there's no aliasing, + // to avoid it checking for that again. + if (!_Arrays_overlap(_Dest_ptr, _Src_ptr, static_cast(_Count))) { + _Adjacent_difference_no_overlap(_Dest_ptr, _Src_ptr, static_cast(_Count), _Func); + _STD _Seek_wrapped(_Dest, _UDest + _Count); + return _Dest; + } + } + _Iter_value_t<_InIt> _Val(*_UFirst); *_UDest = _Val; while (++_UFirst != _ULast) { // compute another difference From eed039f798b1c35ed1df2c9426e9f333208fd98e Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 14 Sep 2024 14:02:45 +0300 Subject: [PATCH 04/24] You shall pass! --- stl/inc/numeric | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/numeric b/stl/inc/numeric index 2b2180861a..a6e1791e70 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -498,7 +498,7 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _ // So we need to perform this check manually, and after that can tell the compiler that there's no aliasing, // to avoid it checking for that again. if (!_Arrays_overlap(_Dest_ptr, _Src_ptr, static_cast(_Count))) { - _Adjacent_difference_no_overlap(_Dest_ptr, _Src_ptr, static_cast(_Count), _Func); + _Adjacent_difference_no_overlap(_Dest_ptr, _Src_ptr, static_cast(_Count), _STD _Pass_fn(_Func)); _STD _Seek_wrapped(_Dest, _UDest + _Count); return _Dest; } From bc92d5495e48e4e804a5b8488d4647aa034ef5f1 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 14 Sep 2024 14:27:04 +0300 Subject: [PATCH 05/24] constexpr --- stl/inc/numeric | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/stl/inc/numeric b/stl/inc/numeric index a6e1791e70..65e9ef8c5d 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -462,7 +462,7 @@ _FwdIt2 transform_inclusive_scan(_ExPo&& _Exec, _FwdIt1 _First, _FwdIt1 _Last, _ #endif // _HAS_CXX17 template -_CONSTEXPR20 bool _Arrays_overlap(const _Ty* const _First, const _Ty* const _Second, const size_t _Common_count) { +bool _Arrays_overlap(const _Ty* const _First, const _Ty* const _Second, const size_t _Common_count) { const uintptr_t _First_val = reinterpret_cast(_First); const uintptr_t _Second_val = reinterpret_cast(_Second); const size_t Size_bytes = _Common_count * sizeof(_Ty); @@ -470,7 +470,7 @@ _CONSTEXPR20 bool _Arrays_overlap(const _Ty* const _First, const _Ty* const _Sec } template -_CONSTEXPR20 void _Adjacent_difference_no_overlap( +void _Adjacent_difference_no_overlap( _Ty* __restrict _Dest, const _Ty* __restrict _Src, const size_t _Count, _BinOp _Func) { _Dest[0] = _Src[0]; for (size_t _Ix = 1; _Ix != _Count; ++_Ix) { @@ -488,19 +488,25 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _ if (_UFirst != _ULast) { if constexpr (_Iterators_are_contiguous<_InIt, _OutIt> && !_Iterator_is_volatile<_InIt> && is_trivially_copyable_v<_Iter_value_t<_InIt>>) { - // Go with pointers and without loop-carried dependency to enable vectorization - const auto _Count = _ULast - _UFirst; - const auto _Src_ptr = _To_address(_UFirst); - const auto _Dest_ptr = _To_address(_Dest); - // Need to perform aliasing analysis. - // The vectorizer is generally able to do that on its own, and would guard the vectorized code with that, - // but when we elimniate loop-carried dependency we change the semantic of the unvectorized code too. - // So we need to perform this check manually, and after that can tell the compiler that there's no aliasing, - // to avoid it checking for that again. - if (!_Arrays_overlap(_Dest_ptr, _Src_ptr, static_cast(_Count))) { - _Adjacent_difference_no_overlap(_Dest_ptr, _Src_ptr, static_cast(_Count), _STD _Pass_fn(_Func)); - _STD _Seek_wrapped(_Dest, _UDest + _Count); - return _Dest; +#if _HAS_CXX20 + if (!_STD is_constant_evaluated()) +#endif + { + // Go with pointers and without loop-carried dependency to enable vectorization + const auto _Count = _ULast - _UFirst; + const auto _Src_ptr = _To_address(_UFirst); + const auto _Dest_ptr = _To_address(_Dest); + // Need to perform aliasing analysis. + // The vectorizer is generally able to do that on its own, and would guard the vectorized code with + // that, but when we elimniate loop-carried dependency we change the semantic of the unvectorized code + // too. So we need to perform this check manually, and after that can tell the compiler that there's no + // aliasing, to avoid it checking for that again. + if (!_Arrays_overlap(_Dest_ptr, _Src_ptr, static_cast(_Count))) { + _Adjacent_difference_no_overlap( + _Dest_ptr, _Src_ptr, static_cast(_Count), _STD _Pass_fn(_Func)); + _STD _Seek_wrapped(_Dest, _UDest + _Count); + return _Dest; + } } } From fd5108047d8356c71d896daad46b978536131956 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 14 Sep 2024 14:30:30 +0300 Subject: [PATCH 06/24] ADL --- stl/inc/numeric | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stl/inc/numeric b/stl/inc/numeric index 65e9ef8c5d..a8a494032c 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -494,15 +494,15 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _ { // Go with pointers and without loop-carried dependency to enable vectorization const auto _Count = _ULast - _UFirst; - const auto _Src_ptr = _To_address(_UFirst); - const auto _Dest_ptr = _To_address(_Dest); + const auto _Src_ptr = _STD _To_address(_UFirst); + const auto _Dest_ptr = _STD _To_address(_Dest); // Need to perform aliasing analysis. // The vectorizer is generally able to do that on its own, and would guard the vectorized code with // that, but when we elimniate loop-carried dependency we change the semantic of the unvectorized code // too. So we need to perform this check manually, and after that can tell the compiler that there's no // aliasing, to avoid it checking for that again. - if (!_Arrays_overlap(_Dest_ptr, _Src_ptr, static_cast(_Count))) { - _Adjacent_difference_no_overlap( + if (!_STD _Arrays_overlap(_Dest_ptr, _Src_ptr, static_cast(_Count))) { + _STD _Adjacent_difference_no_overlap( _Dest_ptr, _Src_ptr, static_cast(_Count), _STD _Pass_fn(_Func)); _STD _Seek_wrapped(_Dest, _UDest + _Count); return _Dest; From 3db07e7485ddd15930dd1186a5d3d4c32b7c5585 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 14 Sep 2024 16:02:53 +0300 Subject: [PATCH 07/24] rvalue --- stl/inc/numeric | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stl/inc/numeric b/stl/inc/numeric index a8a494032c..998a9e40d9 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -474,7 +474,8 @@ void _Adjacent_difference_no_overlap( _Ty* __restrict _Dest, const _Ty* __restrict _Src, const size_t _Count, _BinOp _Func) { _Dest[0] = _Src[0]; for (size_t _Ix = 1; _Ix != _Count; ++_Ix) { - _Dest[_Ix] = _Func(_Src[_Ix], _Src[_Ix - 1]); + _Ty _Tmp = _Src[_Ix - 1]; + _Dest[_Ix] = _Func(_Src[_Ix], _STD move(_Tmp)); } } From 53a60d4b203a4b772f1d664333a0da4cc32cbe7d Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 14 Sep 2024 16:22:09 +0300 Subject: [PATCH 08/24] Review comments --- stl/inc/numeric | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/stl/inc/numeric b/stl/inc/numeric index 998a9e40d9..0e39f9fd38 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -471,11 +471,15 @@ bool _Arrays_overlap(const _Ty* const _First, const _Ty* const _Second, const si template void _Adjacent_difference_no_overlap( - _Ty* __restrict _Dest, const _Ty* __restrict _Src, const size_t _Count, _BinOp _Func) { + _Ty* const __restrict _Dest, _Ty* const __restrict _Src, const size_t _Count, _BinOp _Func) { _Dest[0] = _Src[0]; for (size_t _Ix = 1; _Ix != _Count; ++_Ix) { +#if _HAS_CXX20 _Ty _Tmp = _Src[_Ix - 1]; _Dest[_Ix] = _Func(_Src[_Ix], _STD move(_Tmp)); +#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv + _Dest[_Ix] = _Func(_Src[_Ix], _Src[_Ix - 1]); +#endif // ^^^ !_HAS_CXX20 ^^^ } } From 51595a3d0b2ead1ed192301553a674cc158edf35 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 14 Sep 2024 16:50:56 +0300 Subject: [PATCH 09/24] types --- stl/inc/numeric | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/stl/inc/numeric b/stl/inc/numeric index 0e39f9fd38..9b861d74a6 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -461,22 +461,22 @@ _FwdIt2 transform_inclusive_scan(_ExPo&& _Exec, _FwdIt1 _First, _FwdIt1 _Last, _ _UnaryOp _Transform_op) noexcept; // terminates #endif // _HAS_CXX17 -template -bool _Arrays_overlap(const _Ty* const _First, const _Ty* const _Second, const size_t _Common_count) { +template +bool _Arrays_overlap(const _Ty1* const _First, const _Ty2* const _Second, const size_t _Common_count) { const uintptr_t _First_val = reinterpret_cast(_First); const uintptr_t _Second_val = reinterpret_cast(_Second); - const size_t Size_bytes = _Common_count * sizeof(_Ty); - return _First_val + Size_bytes > _Second_val && _Second_val + Size_bytes > _First_val; + return _First_val + _Common_count * sizeof(_Ty1) > _Second_val + && _Second_val + _Common_count * sizeof(_Ty2) > _First_val; } -template +template void _Adjacent_difference_no_overlap( - _Ty* const __restrict _Dest, _Ty* const __restrict _Src, const size_t _Count, _BinOp _Func) { + _TyDest* const __restrict _Dest, _TySrc* const __restrict _Src, const size_t _Count, _BinOp _Func) { _Dest[0] = _Src[0]; for (size_t _Ix = 1; _Ix != _Count; ++_Ix) { #if _HAS_CXX20 - _Ty _Tmp = _Src[_Ix - 1]; - _Dest[_Ix] = _Func(_Src[_Ix], _STD move(_Tmp)); + _TySrc _Tmp = _Src[_Ix - 1]; + _Dest[_Ix] = _Func(_Src[_Ix], _STD move(_Tmp)); #else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv _Dest[_Ix] = _Func(_Src[_Ix], _Src[_Ix - 1]); #endif // ^^^ !_HAS_CXX20 ^^^ From 99e6058b07d191c4a6b0850f87f70cb417b904b1 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 19 Sep 2024 18:28:01 +0300 Subject: [PATCH 10/24] Pointer math --- stl/inc/numeric | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/stl/inc/numeric b/stl/inc/numeric index 9b861d74a6..4e598cf590 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -461,14 +461,6 @@ _FwdIt2 transform_inclusive_scan(_ExPo&& _Exec, _FwdIt1 _First, _FwdIt1 _Last, _ _UnaryOp _Transform_op) noexcept; // terminates #endif // _HAS_CXX17 -template -bool _Arrays_overlap(const _Ty1* const _First, const _Ty2* const _Second, const size_t _Common_count) { - const uintptr_t _First_val = reinterpret_cast(_First); - const uintptr_t _Second_val = reinterpret_cast(_Second); - return _First_val + _Common_count * sizeof(_Ty1) > _Second_val - && _Second_val + _Common_count * sizeof(_Ty2) > _First_val; -} - template void _Adjacent_difference_no_overlap( _TyDest* const __restrict _Dest, _TySrc* const __restrict _Src, const size_t _Count, _BinOp _Func) { @@ -498,17 +490,17 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _ #endif { // Go with pointers and without loop-carried dependency to enable vectorization - const auto _Count = _ULast - _UFirst; - const auto _Src_ptr = _STD _To_address(_UFirst); - const auto _Dest_ptr = _STD _To_address(_Dest); + const auto _Count = _ULast - _UFirst; + const auto _Count_as_size = static_cast(_Count); + const auto _Src_ptr = _STD _To_address(_UFirst); + const auto _Dest_ptr = _STD _To_address(_Dest); // Need to perform aliasing analysis. // The vectorizer is generally able to do that on its own, and would guard the vectorized code with // that, but when we elimniate loop-carried dependency we change the semantic of the unvectorized code // too. So we need to perform this check manually, and after that can tell the compiler that there's no // aliasing, to avoid it checking for that again. - if (!_STD _Arrays_overlap(_Dest_ptr, _Src_ptr, static_cast(_Count))) { - _STD _Adjacent_difference_no_overlap( - _Dest_ptr, _Src_ptr, static_cast(_Count), _STD _Pass_fn(_Func)); + if (_Dest_ptr + _Count_as_size <= _Src_ptr || _Src_ptr + _Count_as_size <= _Dest_ptr) { + _STD _Adjacent_difference_no_overlap(_Dest_ptr, _Src_ptr, _Count_as_size, _STD _Pass_fn(_Func)); _STD _Seek_wrapped(_Dest, _UDest + _Count); return _Dest; } From 3cdbc88525aad6a35d147fbf325a38a0a35a0462 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 24 Oct 2024 20:22:45 +0300 Subject: [PATCH 11/24] std already --- benchmarks/src/adjacent_difference.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/src/adjacent_difference.cpp b/benchmarks/src/adjacent_difference.cpp index 133ed4dead..96a0454bc4 100644 --- a/benchmarks/src/adjacent_difference.cpp +++ b/benchmarks/src/adjacent_difference.cpp @@ -16,8 +16,8 @@ void bm(benchmark::State& state) { const size_t size = static_cast(state.range(0)); - std::vector input(size); - std::vector output(size); + vector input(size); + vector output(size); if constexpr (is_floating_point_v) { normal_distribution dis(-100.0, 100.0); From 18f64cabaa75fe8ae446c5f4166f3fbf9adb1d7a Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 24 Oct 2024 20:23:06 +0300 Subject: [PATCH 12/24] const --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 3f73605ef5..0fefe6cc25 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -55,8 +55,8 @@ OutIt last_known_good_adj_diff(InIt first, InIt last, OutIt dest) { template void test_case_adj_diff(const vector& input, vector& output_expected, vector& output_actual) { - auto actual = adjacent_difference(input.begin(), input.end(), output_actual.begin()); - auto expected = last_known_good_adj_diff(input.begin(), input.end(), output_expected.begin()); + const auto actual = adjacent_difference(input.begin(), input.end(), output_actual.begin()); + const auto expected = last_known_good_adj_diff(input.begin(), input.end(), output_expected.begin()); assert(actual - output_actual.begin() == expected - output_expected.begin()); assert(output_actual == output_expected); From c68b402f717edde2b6fc0fe5a0a9682d7aedbc2d Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 24 Oct 2024 20:26:39 +0300 Subject: [PATCH 13/24] oops loops --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 0fefe6cc25..7fafdded0a 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -74,20 +74,17 @@ void test_adjacent_difference(mt19937_64& gen) { vector output_actual; vector output_expected; - vector* const all__output_vectors[] = {&output_actual, &output_expected}; - vector* const all_vectors[] = {&input, &output_actual, &output_expected}; - - for (auto v : all_vectors) { + for (const auto& v : {&input, &output_actual, &output_expected}) { v->reserve(dataCount); } test_case_adj_diff(input, output_expected, output_actual); for (size_t attempts = 0; attempts < dataCount; ++attempts) { - for (auto v : all__output_vectors) { + for (const auto& v : {&output_actual, &output_expected}) { generate(v->begin(), v->end(), [&] { return static_cast(dis(gen)); }); } - for (auto v : all_vectors) { + for (const auto& v : {&input, &output_actual, &output_expected}) { v->push_back(static_cast(dis(gen))); } From ba9a0989d02965d80b1e293292ebf57c2a7153e5 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 24 Oct 2024 20:34:34 +0300 Subject: [PATCH 14/24] U --- stl/inc/numeric | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/numeric b/stl/inc/numeric index 4e598cf590..c0d8687d64 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -493,7 +493,7 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _ const auto _Count = _ULast - _UFirst; const auto _Count_as_size = static_cast(_Count); const auto _Src_ptr = _STD _To_address(_UFirst); - const auto _Dest_ptr = _STD _To_address(_Dest); + const auto _Dest_ptr = _STD _To_address(_UDest); // Need to perform aliasing analysis. // The vectorizer is generally able to do that on its own, and would guard the vectorized code with // that, but when we elimniate loop-carried dependency we change the semantic of the unvectorized code From 3ddaf9e7c43f0b17684751e18126c046934e66d8 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 24 Oct 2024 20:35:27 +0300 Subject: [PATCH 15/24] typos --- stl/inc/numeric | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/numeric b/stl/inc/numeric index c0d8687d64..4a270f15a3 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -496,7 +496,7 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _ const auto _Dest_ptr = _STD _To_address(_UDest); // Need to perform aliasing analysis. // The vectorizer is generally able to do that on its own, and would guard the vectorized code with - // that, but when we elimniate loop-carried dependency we change the semantic of the unvectorized code + // that, but when we eliminate loop-carried dependency we change the semantics of the unvectorized code // too. So we need to perform this check manually, and after that can tell the compiler that there's no // aliasing, to avoid it checking for that again. if (_Dest_ptr + _Count_as_size <= _Src_ptr || _Src_ptr + _Count_as_size <= _Dest_ptr) { From 944672bfd1df08bbaba96563a52c8a2ad892d862 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 24 Oct 2024 20:36:44 +0300 Subject: [PATCH 16/24] includes --- benchmarks/src/adjacent_difference.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/src/adjacent_difference.cpp b/benchmarks/src/adjacent_difference.cpp index 96a0454bc4..de6c924e09 100644 --- a/benchmarks/src/adjacent_difference.cpp +++ b/benchmarks/src/adjacent_difference.cpp @@ -1,11 +1,14 @@ // Copyright (c) Microsoft Corporation. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include #include +#include #include #include #include #include +#include #include using namespace std; From 0042fe79d659aa52a95e53d6bb30349578fd33a1 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 24 Oct 2024 20:37:48 +0300 Subject: [PATCH 17/24] non-random filler --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 7fafdded0a..27945f2f99 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -80,8 +80,10 @@ void test_adjacent_difference(mt19937_64& gen) { test_case_adj_diff(input, output_expected, output_actual); for (size_t attempts = 0; attempts < dataCount; ++attempts) { + input.push_back(static_cast(dis(gen))); + for (const auto& v : {&output_actual, &output_expected}) { - generate(v->begin(), v->end(), [&] { return static_cast(dis(gen)); }); + v->assign(input.size(), 0); } for (const auto& v : {&input, &output_actual, &output_expected}) { From d06ea71a325d072eff85c840f500f4f008746cdf Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 25 Oct 2024 06:36:15 +0300 Subject: [PATCH 18/24] stray --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 27945f2f99..ab8b4dcb9f 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -86,10 +86,6 @@ void test_adjacent_difference(mt19937_64& gen) { v->assign(input.size(), 0); } - for (const auto& v : {&input, &output_actual, &output_expected}) { - v->push_back(static_cast(dis(gen))); - } - test_case_adj_diff(input, output_expected, output_actual); } } From f132d46893b55761cf7a80083aad2d628cf08c86 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 26 Oct 2024 06:18:45 -0700 Subject: [PATCH 19/24] Test 8-bit and 16-bit, avoid truncation warnings. --- .../VSO_0000000_vector_algorithms/test.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index a0a2193794..a6a211d0fb 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -35,8 +35,8 @@ using namespace std; #pragma clang diagnostic ignored "-Wc++17-extensions" // constexpr if is a C++17 extension #endif // __clang__ -template -OutIt last_known_good_adj_diff(InIt first, InIt last, OutIt dest) { +template +OutIt last_known_good_adj_diff(InIt first, InIt last, OutIt dest, BinOp binop) { if (first == last) { return dest; } @@ -46,7 +46,7 @@ OutIt last_known_good_adj_diff(InIt first, InIt last, OutIt dest) { for (++first, ++dest; first != last; ++first, ++dest) { auto tmp = *first; - *dest = tmp - val; + *dest = binop(tmp, val); val = tmp; } @@ -55,8 +55,10 @@ OutIt last_known_good_adj_diff(InIt first, InIt last, OutIt dest) { template void test_case_adj_diff(const vector& input, vector& output_expected, vector& output_actual) { - const auto actual = adjacent_difference(input.begin(), input.end(), output_actual.begin()); - const auto expected = last_known_good_adj_diff(input.begin(), input.end(), output_expected.begin()); + // Avoid truncation warnings: + const auto subtract = [](const T& left, const T& right) { return static_cast(left - right); }; + const auto actual = adjacent_difference(input.begin(), input.end(), output_actual.begin(), subtract); + const auto expected = last_known_good_adj_diff(input.begin(), input.end(), output_expected.begin(), subtract); assert(actual - output_actual.begin() == expected - output_expected.begin()); assert(output_actual == output_expected); @@ -951,6 +953,11 @@ void test_swap_arrays(mt19937_64& gen) { } void test_vector_algorithms(mt19937_64& gen) { + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_adjacent_difference(gen); + test_adjacent_difference(gen); test_adjacent_difference(gen); test_adjacent_difference(gen); test_adjacent_difference(gen); From cfb92a42d938a5523683699613396b806d5698f3 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 26 Oct 2024 06:40:13 -0700 Subject: [PATCH 20/24] static_assert: The documentation that rewards poor reading comprehension with DOOM! --- benchmarks/src/adjacent_difference.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/src/adjacent_difference.cpp b/benchmarks/src/adjacent_difference.cpp index de6c924e09..fdda432605 100644 --- a/benchmarks/src/adjacent_difference.cpp +++ b/benchmarks/src/adjacent_difference.cpp @@ -26,6 +26,7 @@ void bm(benchmark::State& state) { normal_distribution dis(-100.0, 100.0); ranges::generate(input, [&] { return dis(gen); }); } else { + static_assert(is_unsigned_v, "This avoids signed integers to avoid UB; they shouldn't perform differently"); uniform_int_distribution> dis(0, numeric_limits::max()); ranges::generate(input, [&] { return static_cast(dis(gen)); }); } @@ -41,8 +42,6 @@ void common_args(auto bm) { bm->Arg(2255); } -// Avoid signed integers to avoid UB; they shouldn't perform differently from the unsigned - #pragma warning(push) #pragma warning(disable : 4244) // warning C4244: '=': conversion from 'int' to 'unsigned char', possible loss of data BENCHMARK(bm)->Apply(common_args); From c2a130fa9e1ea0ae0d98d238cbe33dc38a8b151a Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 26 Oct 2024 06:59:26 -0700 Subject: [PATCH 21/24] Consistently order output_expected before output_actual. --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index a6a211d0fb..8ededcd750 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -57,8 +57,8 @@ template void test_case_adj_diff(const vector& input, vector& output_expected, vector& output_actual) { // Avoid truncation warnings: const auto subtract = [](const T& left, const T& right) { return static_cast(left - right); }; - const auto actual = adjacent_difference(input.begin(), input.end(), output_actual.begin(), subtract); const auto expected = last_known_good_adj_diff(input.begin(), input.end(), output_expected.begin(), subtract); + const auto actual = adjacent_difference(input.begin(), input.end(), output_actual.begin(), subtract); assert(actual - output_actual.begin() == expected - output_expected.begin()); assert(output_actual == output_expected); @@ -73,10 +73,10 @@ void test_adjacent_difference(mt19937_64& gen) { is_signed_v ? static_cast(Limits::max() / 2) : Limits::max()); vector input; - vector output_actual; vector output_expected; + vector output_actual; - for (const auto& v : {&input, &output_actual, &output_expected}) { + for (const auto& v : {&input, &output_expected, &output_actual}) { v->reserve(dataCount); } @@ -84,7 +84,7 @@ void test_adjacent_difference(mt19937_64& gen) { for (size_t attempts = 0; attempts < dataCount; ++attempts) { input.push_back(static_cast(dis(gen))); - for (const auto& v : {&output_actual, &output_expected}) { + for (const auto& v : {&output_expected, &output_actual}) { v->assign(input.size(), 0); } From 7a110a8e6f302b2aa6c4b9afc17077684ee1c9f5 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 26 Oct 2024 07:27:44 -0700 Subject: [PATCH 22/24] Minor comment grammar improvements. --- stl/inc/numeric | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stl/inc/numeric b/stl/inc/numeric index 4a270f15a3..f081e13e7b 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -496,9 +496,9 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _ const auto _Dest_ptr = _STD _To_address(_UDest); // Need to perform aliasing analysis. // The vectorizer is generally able to do that on its own, and would guard the vectorized code with - // that, but when we eliminate loop-carried dependency we change the semantics of the unvectorized code - // too. So we need to perform this check manually, and after that can tell the compiler that there's no - // aliasing, to avoid it checking for that again. + // that, but when we eliminate the loop-carried dependency we change the semantics of the unvectorized + // code too. So we need to perform this check manually, and after that we can tell the compiler that + // there's no aliasing, to avoid it checking for that again. if (_Dest_ptr + _Count_as_size <= _Src_ptr || _Src_ptr + _Count_as_size <= _Dest_ptr) { _STD _Adjacent_difference_no_overlap(_Dest_ptr, _Src_ptr, _Count_as_size, _STD _Pass_fn(_Func)); _STD _Seek_wrapped(_Dest, _UDest + _Count); From d86bf297ff6792f966046d40811c35b423978958 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 26 Oct 2024 07:36:23 -0700 Subject: [PATCH 23/24] Fix perf bug: Inspect unwrapped iterators with `_Iterators_are_contiguous`. --- stl/inc/numeric | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/numeric b/stl/inc/numeric index f081e13e7b..e8935bdc0b 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -483,7 +483,7 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _ const auto _ULast = _STD _Get_unwrapped(_Last); auto _UDest = _STD _Get_unwrapped_n(_Dest, _STD _Idl_distance<_InIt>(_UFirst, _ULast)); if (_UFirst != _ULast) { - if constexpr (_Iterators_are_contiguous<_InIt, _OutIt> && !_Iterator_is_volatile<_InIt> + if constexpr (_Iterators_are_contiguous && !_Iterator_is_volatile<_InIt> && is_trivially_copyable_v<_Iter_value_t<_InIt>>) { #if _HAS_CXX20 if (!_STD is_constant_evaluated()) From 9b85627d848ad56e0afa081d453e1b995414525f Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Tue, 29 Oct 2024 15:50:30 -0700 Subject: [PATCH 24/24] Fix and test adjacent_difference with heterogeneous types. --- stl/inc/numeric | 18 ++++++++++-------- .../VSO_0000000_vector_algorithms/test.cpp | 13 +++++++++++++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/stl/inc/numeric b/stl/inc/numeric index e8935bdc0b..b0d0e8a25c 100644 --- a/stl/inc/numeric +++ b/stl/inc/numeric @@ -463,9 +463,9 @@ _FwdIt2 transform_inclusive_scan(_ExPo&& _Exec, _FwdIt1 _First, _FwdIt1 _Last, _ template void _Adjacent_difference_no_overlap( - _TyDest* const __restrict _Dest, _TySrc* const __restrict _Src, const size_t _Count, _BinOp _Func) { + _TyDest* const __restrict _Dest, _TySrc* const __restrict _Src, const ptrdiff_t _Count, _BinOp _Func) { _Dest[0] = _Src[0]; - for (size_t _Ix = 1; _Ix != _Count; ++_Ix) { + for (ptrdiff_t _Ix = 1; _Ix != _Count; ++_Ix) { #if _HAS_CXX20 _TySrc _Tmp = _Src[_Ix - 1]; _Dest[_Ix] = _Func(_Src[_Ix], _STD move(_Tmp)); @@ -490,17 +490,19 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _ #endif { // Go with pointers and without loop-carried dependency to enable vectorization - const auto _Count = _ULast - _UFirst; - const auto _Count_as_size = static_cast(_Count); - const auto _Src_ptr = _STD _To_address(_UFirst); - const auto _Dest_ptr = _STD _To_address(_UDest); + const auto _First_ptr = _STD _To_address(_UFirst); + const auto _Last_ptr = _STD _To_address(_ULast); + const auto _Dest_ptr = _STD _To_address(_UDest); + const auto _Count = _Last_ptr - _First_ptr; + // Need to perform aliasing analysis. // The vectorizer is generally able to do that on its own, and would guard the vectorized code with // that, but when we eliminate the loop-carried dependency we change the semantics of the unvectorized // code too. So we need to perform this check manually, and after that we can tell the compiler that // there's no aliasing, to avoid it checking for that again. - if (_Dest_ptr + _Count_as_size <= _Src_ptr || _Src_ptr + _Count_as_size <= _Dest_ptr) { - _STD _Adjacent_difference_no_overlap(_Dest_ptr, _Src_ptr, _Count_as_size, _STD _Pass_fn(_Func)); + if (reinterpret_cast(_Dest_ptr + _Count) <= reinterpret_cast(_First_ptr) + || reinterpret_cast(_Last_ptr) <= reinterpret_cast(_Dest_ptr)) { + _STD _Adjacent_difference_no_overlap(_Dest_ptr, _First_ptr, _Count, _STD _Pass_fn(_Func)); _STD _Seek_wrapped(_Dest, _UDest + _Count); return _Dest; } diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 8ededcd750..3e81ff6330 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -92,6 +92,17 @@ void test_adjacent_difference(mt19937_64& gen) { } } +void test_adjacent_difference_with_heterogeneous_types() { + const vector input = {10, 70, 20, 90}; + vector output(4); + + const auto result = adjacent_difference(input.begin(), input.end(), output.begin()); + assert(result == output.end()); + + const vector expected = {10, 60, -50, 70}; + assert(output == expected); +} + template ptrdiff_t last_known_good_count(FwdIt first, FwdIt last, T v) { ptrdiff_t result = 0; @@ -963,6 +974,8 @@ void test_vector_algorithms(mt19937_64& gen) { test_adjacent_difference(gen); test_adjacent_difference(gen); + test_adjacent_difference_with_heterogeneous_types(); + test_count(gen); test_count(gen); test_count(gen);