diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 22503345a4a..f363c012e82 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -107,6 +107,7 @@ function(add_benchmark name)
 endfunction()
 
 add_benchmark(adjacent_difference src/adjacent_difference.cpp)
+add_benchmark(adjacent_find src/adjacent_find.cpp)
 add_benchmark(bitset_from_string src/bitset_from_string.cpp)
 add_benchmark(bitset_to_string src/bitset_to_string.cpp)
 add_benchmark(efficient_nonlocking_print src/efficient_nonlocking_print.cpp)
diff --git a/benchmarks/src/adjacent_find.cpp b/benchmarks/src/adjacent_find.cpp
new file mode 100644
index 00000000000..67036be67bf
--- /dev/null
+++ b/benchmarks/src/adjacent_find.cpp
@@ -0,0 +1,56 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+
+using namespace std;
+
+enum class AlgType { Std, Rng };
+
+template <class T, AlgType Alg>
+void bm(benchmark::State& state) {
+    const size_t size = static_cast<size_t>(state.range(0));
+    const size_t pos  = static_cast<size_t>(state.range(1));
+
+    vector<T> v(size);
+
+    for (size_t i = 0; i != size; ++i) {
+        v[i] = static_cast<T>(i & 3);
+    }
+
+    if (pos == 0 || pos >= size) {
+        abort();
+    }
+
+    v[pos] = v[pos - 1];
+
+    for (auto _ : state) {
+        benchmark::DoNotOptimize(v);
+        if constexpr (Alg == AlgType::Std) {
+            benchmark::DoNotOptimize(adjacent_find(v.begin(), v.end()));
+        } else {
+            benchmark::DoNotOptimize(ranges::adjacent_find(v));
+        }
+    }
+}
+
+void common_args(auto bm) {
+    bm->ArgPair(2525, 1142);
+}
+
+BENCHMARK(bm<uint8_t, AlgType::Std>)->Apply(common_args);
+BENCHMARK(bm<uint16_t, AlgType::Std>)->Apply(common_args);
+BENCHMARK(bm<uint32_t, AlgType::Std>)->Apply(common_args);
+BENCHMARK(bm<uint64_t, AlgType::Std>)->Apply(common_args);
+
+BENCHMARK(bm<uint8_t, AlgType::Rng>)->Apply(common_args);
+BENCHMARK(bm<uint16_t, AlgType::Rng>)->Apply(common_args);
+BENCHMARK(bm<uint32_t, AlgType::Rng>)->Apply(common_args);
+BENCHMARK(bm<uint64_t, AlgType::Rng>)->Apply(common_args);
+
+BENCHMARK_MAIN();
diff --git a/stl/inc/algorithm b/stl/inc/algorithm
index 8109baf3a16..f4f7b097c88 100644
--- a/stl/inc/algorithm
+++ b/stl/inc/algorithm
@@ -542,6 +542,24 @@ _NODISCARD _CONSTEXPR20 _FwdIt adjacent_find(const _FwdIt _First, _FwdIt _Last,
     auto _UFirst = _STD _Get_unwrapped(_First);
     auto _ULast  = _STD _Get_unwrapped(_Last);
     if (_UFirst != _ULast) {
+#if _USE_STD_VECTOR_ALGORITHMS
+        if constexpr (_Equal_memcmp_is_safe<decltype(_UFirst), decltype(_UFirst), _Pr>) {
+            if (!_STD _Is_constant_evaluated()) {
+                const auto _First_ptr = _STD _To_address(_UFirst);
+                const auto _Result    = _STD _Adjacent_find_vectorized(_First_ptr, _STD _To_address(_ULast));
+
+                if constexpr (is_pointer_v<decltype(_UFirst)>) {
+                    _ULast = _Result;
+                } else {
+                    _ULast = _UFirst + (_Result - _First_ptr);
+                }
+
+                _STD _Seek_wrapped(_Last, _ULast);
+                return _Last;
+            }
+        }
+#endif // _USE_STD_VECTOR_ALGORITHMS
+
         for (auto _UNext = _UFirst; ++_UNext != _ULast; _UFirst = _UNext) {
             if (_Pred(*_UFirst, *_UNext)) {
                 _ULast = _UFirst;
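Note on the header-side guard above: the vectorized call is only taken when `_Equal_memcmp_is_safe` holds for the unwrapped iterators and predicate (and the call is not constant-evaluated), i.e. when the predicate on the element type agrees with bytewise equality. The standalone sketch below is an illustration of why that guard matters, not part of the patch: for `float`, `operator==` and bytewise comparison disagree on signed zeros, so such element types stay on the scalar loop.

// Illustration only (not part of the patch): operator== and bytewise equality
// can disagree, which is exactly what _Equal_memcmp_is_safe rules out.
#include <cassert>
#include <cstring>

int main() {
    const float pos_zero = 0.0f;
    const float neg_zero = -0.0f;
    assert(pos_zero == neg_zero);                                  // equal as values
    assert(std::memcmp(&pos_zero, &neg_zero, sizeof(float)) != 0); // different bytes
}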
diff --git a/stl/inc/xutility b/stl/inc/xutility
index 4abd43c637c..222054fefaf 100644
--- a/stl/inc/xutility
+++ b/stl/inc/xutility
@@ -98,6 +98,11 @@ const void* __stdcall __std_find_last_trivial_2(const void* _First, const void*
 const void* __stdcall __std_find_last_trivial_4(const void* _First, const void* _Last, uint32_t _Val) noexcept;
 const void* __stdcall __std_find_last_trivial_8(const void* _First, const void* _Last, uint64_t _Val) noexcept;
 
+const void* __stdcall __std_adjacent_find_1(const void* _First, const void* _Last) noexcept;
+const void* __stdcall __std_adjacent_find_2(const void* _First, const void* _Last) noexcept;
+const void* __stdcall __std_adjacent_find_4(const void* _First, const void* _Last) noexcept;
+const void* __stdcall __std_adjacent_find_8(const void* _First, const void* _Last) noexcept;
+
 const void* __stdcall __std_search_1(
     const void* _First1, const void* _Last1, const void* _First2, size_t _Count2) noexcept;
 const void* __stdcall __std_search_2(
@@ -240,6 +245,21 @@ _Ty* _Find_last_vectorized(_Ty* const _First, _Ty* const _Last, const _TVal _Val
     }
 }
 
+template <class _Ty>
+_Ty* _Adjacent_find_vectorized(_Ty* const _First, _Ty* const _Last) noexcept {
+    if constexpr (sizeof(_Ty) == 1) {
+        return const_cast<_Ty*>(static_cast<const _Ty*>(::__std_adjacent_find_1(_First, _Last)));
+    } else if constexpr (sizeof(_Ty) == 2) {
+        return const_cast<_Ty*>(static_cast<const _Ty*>(::__std_adjacent_find_2(_First, _Last)));
+    } else if constexpr (sizeof(_Ty) == 4) {
+        return const_cast<_Ty*>(static_cast<const _Ty*>(::__std_adjacent_find_4(_First, _Last)));
+    } else if constexpr (sizeof(_Ty) == 8) {
+        return const_cast<_Ty*>(static_cast<const _Ty*>(::__std_adjacent_find_8(_First, _Last)));
+    } else {
+        _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
+    }
+}
+
 // find_first_of vectorization is likely to be a win after this size (in elements)
 _INLINE_VAR constexpr ptrdiff_t _Threshold_find_first_of = 16;
 
@@ -6786,6 +6806,23 @@ namespace ranges {
                return _First;
            }

+#if _USE_STD_VECTOR_ALGORITHMS
+            if constexpr (_Equal_memcmp_is_safe<_It, _It, _Pr> && sized_sentinel_for<_Se, _It>
+                          && is_same_v<_Pj, identity>) {
+                if (!_STD is_constant_evaluated()) {
+                    const auto _First_ptr = _STD _To_address(_First);
+                    const auto _Last_ptr  = _First_ptr + (_Last - _First);
+
+                    const auto _Result = _STD _Adjacent_find_vectorized(_First_ptr, _Last_ptr);
+                    if constexpr (is_pointer_v<_It>) {
+                        return _Result;
+                    } else {
+                        return _First + (_Result - _First_ptr);
+                    }
+                }
+            }
+#endif // _USE_STD_VECTOR_ALGORITHMS
+
            for (auto _Next = _First;; ++_First) {
                if (++_Next == _Last) {
                    return _Next;
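The four `__std_adjacent_find_N` exports declared above share one contract, which `_Adjacent_find_vectorized` selects by element size: return a pointer to the first element of the first adjacent equal pair, or the end pointer when no such pair exists. A minimal scalar reference model of that contract (illustration only; the function name is made up for this sketch) looks like this:

// Reference model (illustration only) of what __std_adjacent_find_N returns.
template <class T>
const T* adjacent_find_reference(const T* first, const T* last) {
    if (first == last) {
        return last;
    }
    for (const T* next = first + 1; next != last; ++first, ++next) {
        if (*first == *next) {
            return first; // first element of the first equal pair
        }
    }
    return last; // no adjacent equal pair
}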
diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp
index 559de319996..aa7348263ee 100644
--- a/stl/src/vector_algorithms.cpp
+++ b/stl/src/vector_algorithms.cpp
@@ -2751,6 +2751,96 @@
         }
     }
 
+    template <class _Traits, class _Ty>
+    const void* __stdcall __std_adjacent_find_impl(const void* _First, const void* const _Last) noexcept {
+        if (_First == _Last) {
+            return _Last;
+        }
+
+#ifndef _M_ARM64EC
+        const size_t _Size_bytes = _Byte_length(_First, _Last) - sizeof(_Ty);
+
+        if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) {
+            _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414
+
+            const void* _Stop_at = _First;
+            _Advance_bytes(_Stop_at, _Avx_size);
+
+            do {
+                const void* _Next = _First;
+                _Advance_bytes(_Next, sizeof(_Ty));
+
+                const __m256i _Data      = _mm256_loadu_si256(static_cast<const __m256i*>(_First));
+                const __m256i _Comparand = _mm256_loadu_si256(static_cast<const __m256i*>(_Next));
+                const int _Bingo         = _mm256_movemask_epi8(_Traits::_Cmp_avx(_Data, _Comparand));
+
+                if (_Bingo != 0) {
+                    const unsigned long _Offset = _tzcnt_u32(_Bingo);
+                    _Advance_bytes(_First, _Offset);
+                    return _First;
+                }
+
+                _Advance_bytes(_First, 32);
+            } while (_First != _Stop_at);
+
+            if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) {
+                const void* _Next = _First;
+                _Advance_bytes(_Next, sizeof(_Ty));
+
+                const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size >> 2);
+                const __m256i _Data      = _mm256_maskload_epi32(static_cast<const int*>(_First), _Tail_mask);
+                const __m256i _Comparand = _mm256_maskload_epi32(static_cast<const int*>(_Next), _Tail_mask);
+                const int _Bingo =
+                    _mm256_movemask_epi8(_mm256_and_si256(_Traits::_Cmp_avx(_Data, _Comparand), _Tail_mask));
+
+                if (_Bingo != 0) {
+                    const unsigned long _Offset = _tzcnt_u32(_Bingo);
+                    _Advance_bytes(_First, _Offset);
+                    return _First;
+                }
+
+                _Advance_bytes(_First, _Avx_tail_size);
+            }
+
+            if constexpr (sizeof(_Ty) >= 4) {
+                return _Last;
+            }
+        } else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) {
+            const void* _Stop_at = _First;
+            _Advance_bytes(_Stop_at, _Sse_size);
+
+            do {
+                const void* _Next = _First;
+                _Advance_bytes(_Next, sizeof(_Ty));
+
+                const __m128i _Data      = _mm_loadu_si128(static_cast<const __m128i*>(_First));
+                const __m128i _Comparand = _mm_loadu_si128(static_cast<const __m128i*>(_Next));
+                const int _Bingo         = _mm_movemask_epi8(_Traits::_Cmp_sse(_Data, _Comparand));
+
+                if (_Bingo != 0) {
+                    unsigned long _Offset;
+                    // CodeQL [SM02313] _Offset is always initialized: we just tested `if (_Bingo != 0)`.
+                    _BitScanForward(&_Offset, _Bingo);
+                    _Advance_bytes(_First, _Offset);
+                    return _First;
+                }
+
+                _Advance_bytes(_First, 16);
+            } while (_First != _Stop_at);
+        }
+#endif // !_M_ARM64EC
+
+        auto _Ptr  = static_cast<const _Ty*>(_First);
+        auto _Next = _Ptr + 1;
+        for (; _Next != _Last; ++_Ptr, ++_Next) {
+            if (*_Ptr == *_Next) {
+                return _Ptr;
+            }
+        }
+
+        return _Last;
+    }
+
     struct _Count_traits_8 : _Find_traits_8 {
 #ifndef _M_ARM64EC
         static __m256i _Sub_avx(const __m256i _Lhs, const __m256i _Rhs) noexcept {
@@ -4788,6 +4878,22 @@ __declspec(noalias) size_t __stdcall __std_find_last_not_ch_pos_8(
     return __std_find_last_pos<_Find_traits_8, _Find_one_predicate::_Not_equal>(_First, _Last, _Val);
 }
 
+const void* __stdcall __std_adjacent_find_1(const void* const _First, const void* const _Last) noexcept {
+    return __std_adjacent_find_impl<_Find_traits_1, uint8_t>(_First, _Last);
+}
+
+const void* __stdcall __std_adjacent_find_2(const void* const _First, const void* const _Last) noexcept {
+    return __std_adjacent_find_impl<_Find_traits_2, uint16_t>(_First, _Last);
+}
+
+const void* __stdcall __std_adjacent_find_4(const void* const _First, const void* const _Last) noexcept {
+    return __std_adjacent_find_impl<_Find_traits_4, uint32_t>(_First, _Last);
+}
+
+const void* __stdcall __std_adjacent_find_8(const void* const _First, const void* const _Last) noexcept {
+    return __std_adjacent_find_impl<_Find_traits_8, uint64_t>(_First, _Last);
+}
+
 __declspec(noalias) size_t __stdcall __std_count_trivial_1(
     const void* const _First, const void* const _Last, const uint8_t _Val) noexcept {
     return __std_count_trivial_impl<_Count_traits_1>(_First, _Last, _Val);
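The core of `__std_adjacent_find_impl` above is an overlapping-load trick: load a vector at `_First` and another at `_First + sizeof(_Ty)`, compare them element-wise via `_Traits::_Cmp_avx`/`_Cmp_sse`, turn the comparison into a bitmask with `movemask`, and the lowest set bit gives the byte offset of the first equal pair; tails are handled with masked loads (AVX2) or the scalar loop. The sketch below is a simplified, SSE2-only model of that idea for the 1-byte case (illustration only; the function name is invented here, and it assumes C++20 for `std::countr_zero`, while the real code dispatches AVX2/SSE4.2 through `_Traits` and advances by raw bytes):

#include <bit>
#include <cstddef>
#include <cstdint>
#include <emmintrin.h> // SSE2

// Simplified model (illustration only): find the first i with first[i] == first[i + 1].
const std::uint8_t* adjacent_find_u8_sse2(const std::uint8_t* first, const std::uint8_t* last) {
    const std::size_t n = static_cast<std::size_t>(last - first);
    if (n < 2) {
        return last;
    }

    const std::size_t pairs = n - 1; // number of adjacent pairs
    std::size_t i = 0;

    // Compare 16 overlapping pairs per iteration while both 16-byte loads stay in range.
    for (; pairs - i >= 16; i += 16) {
        const __m128i lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(first + i));
        const __m128i hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(first + i + 1));
        const int mask   = _mm_movemask_epi8(_mm_cmpeq_epi8(lo, hi));
        if (mask != 0) { // bit k set <=> first[i + k] == first[i + k + 1]
            return first + i + std::countr_zero(static_cast<unsigned int>(mask));
        }
    }

    // Scalar tail for the remaining pairs.
    for (; i < pairs; ++i) {
        if (first[i] == first[i + 1]) {
            return first + i;
        }
    }
    return last;
}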
diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
index 2e5a86f1a35..ffb9261f1a2 100644
--- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
+++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
@@ -103,6 +103,67 @@ void test_adjacent_difference_with_heterogeneous_types() {
     assert(output == expected);
 }
 
+template <class FwdIt>
+FwdIt last_known_good_adj_find(FwdIt first, FwdIt last) {
+    if (first == last) {
+        return last;
+    }
+
+    auto next = first;
+    for (++next; next != last; ++first, ++next) {
+        if (*first == *next) {
+            return first;
+        }
+    }
+
+    return last;
+}
+
+template <class T>
+void test_case_adj_find(const vector<T>& input) {
+    const auto actual   = adjacent_find(input.begin(), input.end());
+    const auto expected = last_known_good_adj_find(input.begin(), input.end());
+    assert(actual == expected);
+
+#if _HAS_CXX20
+    const auto actual_r = ranges::adjacent_find(input);
+    assert(actual_r == expected);
+#endif // _HAS_CXX20
+}
+
+template <class T>
+void test_adjacent_find(mt19937_64& gen) {
+    constexpr size_t replicaCount = 4;
+
+    using Limits = numeric_limits<T>;
+
+    uniform_int_distribution<conditional_t<sizeof(T) == 1, int, T>> dis(Limits::min(), Limits::max());
+
+    vector<T> original_input;
+    vector<T> input;
+
+    original_input.reserve(dataCount);
+    input.reserve(dataCount);
+
+    test_case_adj_find(input);
+    for (size_t attempts = 0; attempts < dataCount; ++attempts) {
+        original_input.push_back(static_cast<T>(dis(gen)));
+        input = original_input;
+
+        test_case_adj_find(input);
+
+        if (original_input.size() > 2) {
+            uniform_int_distribution<size_t> pos_dis(0, original_input.size() - 2);
+
+            for (size_t replicas = 0; replicas < replicaCount; ++replicas) {
+                const size_t replica_pos = pos_dis(gen);
+                input[replica_pos]       = input[replica_pos + 1];
+                test_case_adj_find(input);
+            }
+        }
+    }
+}
+
 template <class FwdIt, class T>
 ptrdiff_t last_known_good_count(FwdIt first, FwdIt last, T v) {
     ptrdiff_t result = 0;
@@ -763,6 +824,16 @@ void test_vector_algorithms(mt19937_64& gen) {
 
     test_adjacent_difference_with_heterogeneous_types();
 
+    test_adjacent_find<char>(gen);
+    test_adjacent_find<signed char>(gen);
+    test_adjacent_find<unsigned char>(gen);
+    test_adjacent_find<short>(gen);
+    test_adjacent_find<unsigned short>(gen);
+    test_adjacent_find<int>(gen);
+    test_adjacent_find<unsigned int>(gen);
+    test_adjacent_find<long long>(gen);
+    test_adjacent_find<unsigned long long>(gen);
+
     test_count<char>(gen);
     test_count<signed char>(gen);
     test_count<unsigned char>(gen);
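For context on the new benchmark's data: `v[i] = i & 3` makes the values cycle 0, 1, 2, 3, so adjacent elements never compare equal, and `v[pos] = v[pos - 1]` plants exactly one adjacent equal pair, which `adjacent_find` reports at index `pos - 1`. A standalone sanity check of that setup (illustration only, reusing the same 2525/1142 arguments the benchmark passes):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
    const std::size_t size = 2525;
    const std::size_t pos  = 1142;

    std::vector<std::uint8_t> v(size);
    for (std::size_t i = 0; i != size; ++i) {
        v[i] = static_cast<std::uint8_t>(i & 3); // 0, 1, 2, 3, 0, 1, ... : no equal neighbors
    }
    assert(std::adjacent_find(v.begin(), v.end()) == v.end());

    v[pos] = v[pos - 1]; // plant exactly one adjacent equal pair
    assert(std::adjacent_find(v.begin(), v.end()) == v.begin() + static_cast<std::ptrdiff_t>(pos - 1));
}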