microsoft · StephanTLavavej · Aug 25, 2024 · Jun 18, 2024 · Jun 19, 2024 · Jun 20, 2024
@@ -10205,7 +10205,7 @@ _NODISCARD constexpr pair<_Ty, _Ty> minmax(initializer_list<_Ty> _Ilist, _Pr _Pr
     _STL_ASSERT(
         _Ilist.size() != 0, "An initializer_list passed to std::minmax must not be empty. (N4971 [alg.min.max]/21)");
 #if _USE_STD_VECTOR_ALGORITHMS
-    if constexpr (_Is_min_max_optimization_safe<const _Ty*, _Pr>) {
+    if constexpr (_Is_min_max_value_optimization_safe<const _Ty*, _Pr>) {
         if (!_STD _Is_constant_evaluated()) {
             const auto _Result = _STD _Minmax_vectorized(_Ilist.begin(), _Ilist.end());
             return {static_cast<_Ty>(_Result._Min), static_cast<_Ty>(_Result._Max)};
@@ -10334,13 +10334,21 @@ namespace ranges {
 
             using _Vty = iter_value_t<_It>;
 #if _USE_STD_VECTOR_ALGORITHMS
-            if constexpr (is_same_v<_Pj, identity> && _Is_min_max_optimization_safe<_It, _Pr>
-                          && sized_sentinel_for<_Se, _It>) {
-                if (!_STD is_constant_evaluated()) {
-                    const auto _First_ptr = _STD to_address(_First);
-                    const auto _Last_ptr  = _First_ptr + (_Last - _First);
-                    const auto _Result    = _STD _Minmax_vectorized(_First_ptr, _Last_ptr);
-                    return {static_cast<_Vty>(_Result._Min), static_cast<_Vty>(_Result._Max)};
+            if constexpr (is_same_v<_Pj, identity> && sized_sentinel_for<_Se, _It>) {
+                if constexpr (_Is_min_max_value_optimization_safe<_It, _Pr>) {
+                    if (!_STD is_constant_evaluated()) {
+                        const auto _First_ptr = _STD to_address(_First);
+                        const auto _Last_ptr  = _First_ptr + (_Last - _First);
+                        const auto _Result    = _STD _Minmax_vectorized(_First_ptr, _Last_ptr);
+                        return {static_cast<_Vty>(_Result._Min), static_cast<_Vty>(_Result._Max)};
+                    }
+                } else if constexpr (_Is_min_max_optimization_safe<_It, _Pr>) {
+                    if (!_STD is_constant_evaluated()) {
+                        const auto _First_ptr = _STD to_address(_First);
+                        const auto _Last_ptr  = _First_ptr + (_Last - _First);
+                        const auto _Result    = _STD _Minmax_element_vectorized(_First_ptr, _Last_ptr);
+                        return {*static_cast<const _Vty*>(_Result.first), *static_cast<const _Vty*>(_Result.second)};
+                    }
                 }
             }
 #endif // _USE_STD_VECTOR_ALGORITHMS

@@ -6783,6 +6783,24 @@ constexpr bool _Is_min_max_optimization_safe = // Activate the vector algorithms
 #endif // _HAS_CXX20
             is_same<_Pr, less<>>, is_same<_Pr, less<_Elem>>>>; // predicate is less
 
+// The value-based vectorized rather than the position-based one does not always produce
+// the expected results for floatting point types.
+//
+// Efficient vectorization needs to find vertical minmax first, and then the horizontal one.
+// This alters order of comparison: index zero element is first compared against
+// vector size equal index element and only in the end against index one element.
+// With equivalent but distinguishable +0.0 and -0.0 values, the altered comparison order
+// will not produce the expected result in some cases (will return +0.0 instead of -0.0 or the reverse)
+//
+// The result is still acceptable for /fp:fast when +0.0 / -0.0 are not expected to be properly distinguished,
+// and the compiler itself takes advantage of it.
+template <class _Iter, class _Pr, class _Elem = _Iter_value_t<_Iter>>
+constexpr bool _Is_min_max_value_optimization_safe = // Activate the vector algorithms for ranges::min/max?
+#ifndef _M_FP_FAST
+    !is_floating_point_v<_Elem> &&
+#endif // !_M_FP_FAST
+    _Is_min_max_optimization_safe<_Iter, _Pr, _Elem>;
+
 template <class _FwdIt, class _Pr>
 constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find largest element
 #if _USE_STD_VECTOR_ALGORITHMS
@@ -6913,7 +6931,7 @@ _NODISCARD constexpr _Ty(max)(initializer_list<_Ty> _Ilist, _Pr _Pred) {
     _STL_ASSERT(
         _Ilist.size() != 0, "An initializer_list passed to std::max must not be empty. (N4971 [alg.min.max]/13)");
 #if _USE_STD_VECTOR_ALGORITHMS
-    if constexpr (_Is_min_max_optimization_safe<const _Ty*, _Pr>) {
+    if constexpr (_Is_min_max_value_optimization_safe<const _Ty*, _Pr>) {
         if (!_Is_constant_evaluated()) {
             return static_cast<_Ty>(_STD _Max_vectorized(_Ilist.begin(), _Ilist.end()));
         }
@@ -6959,7 +6977,7 @@ namespace ranges {
             _STL_ASSERT(_First != _Last,
                 "An initializer_list passed to std::ranges::max must not be empty. (N4971 [alg.min.max]/13)");
 #if _USE_STD_VECTOR_ALGORITHMS
-            if constexpr (is_same_v<_Pj, identity> && _Is_min_max_optimization_safe<const _Ty*, _Pr>) {
+            if constexpr (is_same_v<_Pj, identity> && _Is_min_max_value_optimization_safe<const _Ty*, _Pr>) {
                 if (!_STD is_constant_evaluated()) {
                     return static_cast<_Ty>(_STD _Max_vectorized(_First, _Last));
                 }
@@ -6978,7 +6996,7 @@ namespace ranges {
             _STL_ASSERT(
                 _UFirst != _ULast, "A range passed to std::ranges::max must not be empty. (N4971 [alg.min.max]/13)");
 #if _USE_STD_VECTOR_ALGORITHMS
-            if constexpr (is_same_v<_Pj, identity> && _Is_min_max_optimization_safe<decltype(_UFirst), _Pr>
+            if constexpr (is_same_v<_Pj, identity> && _Is_min_max_value_optimization_safe<decltype(_UFirst), _Pr>
                           && sized_sentinel_for<decltype(_ULast), decltype(_UFirst)>) {
                 if (!_STD is_constant_evaluated()) {
                     const auto _First_ptr = _STD to_address(_UFirst);
@@ -7137,7 +7155,7 @@ _NODISCARD constexpr _Ty(min)(initializer_list<_Ty> _Ilist, _Pr _Pred) {
     _STL_ASSERT(
         _Ilist.size() != 0, "An initializer_list passed to std::min must not be empty. (N4971 [alg.min.max]/5)");
 #if _USE_STD_VECTOR_ALGORITHMS
-    if constexpr (_Is_min_max_optimization_safe<const _Ty*, _Pr>) {
+    if constexpr (_Is_min_max_value_optimization_safe<const _Ty*, _Pr>) {
         if (!_Is_constant_evaluated()) {
             return static_cast<_Ty>(_STD _Min_vectorized(_Ilist.begin(), _Ilist.end()));
         }
@@ -7177,7 +7195,7 @@ namespace ranges {
             _STL_ASSERT(_First != _Last,
                 "An initializer_list passed to std::ranges::min must not be empty. (N4971 [alg.min.max]/5)");
 #if _USE_STD_VECTOR_ALGORITHMS
-            if constexpr (is_same_v<_Pj, identity> && _Is_min_max_optimization_safe<const _Ty*, _Pr>) {
+            if constexpr (is_same_v<_Pj, identity> && _Is_min_max_value_optimization_safe<const _Ty*, _Pr>) {
                 if (!_STD is_constant_evaluated()) {
                     return static_cast<_Ty>(_STD _Min_vectorized(_First, _Last));
                 }
@@ -7196,7 +7214,7 @@ namespace ranges {
             _STL_ASSERT(
                 _UFirst != _ULast, "A range passed to std::ranges::min must not be empty. (N4971 [alg.min.max]/5)");
 #if _USE_STD_VECTOR_ALGORITHMS
-            if constexpr (is_same_v<_Pj, identity> && _Is_min_max_optimization_safe<decltype(_UFirst), _Pr>
+            if constexpr (is_same_v<_Pj, identity> && _Is_min_max_value_optimization_safe<decltype(_UFirst), _Pr>
                           && sized_sentinel_for<decltype(_ULast), decltype(_UFirst)>) {
                 if (!_STD is_constant_evaluated()) {
                     const auto _First_ptr = _STD to_address(_UFirst);

@@ -1415,8 +1415,8 @@ namespace {
         template <class _Fn>
         static __m128 _H_func(const __m128 _Cur, _Fn _Funct) noexcept {
             __m128 _H_min_val = _Cur;
-            _H_min_val        = _Funct(_H_min_val, _mm_shuffle_ps(_H_min_val, _H_min_val, _MM_SHUFFLE(1, 0, 3, 2)));
-            _H_min_val        = _Funct(_H_min_val, _mm_shuffle_ps(_H_min_val, _H_min_val, _MM_SHUFFLE(2, 3, 0, 1)));
+            _H_min_val        = _Funct(_mm_shuffle_ps(_H_min_val, _H_min_val, _MM_SHUFFLE(2, 3, 0, 1)), _H_min_val);
+            _H_min_val        = _Funct(_mm_shuffle_ps(_H_min_val, _H_min_val, _MM_SHUFFLE(1, 0, 3, 2)), _H_min_val);
             return _H_min_val;
         }
 
@@ -1457,11 +1457,11 @@ namespace {
         }
 
         static __m128 _Min(const __m128 _First, const __m128 _Second, __m128 = _mm_undefined_ps()) noexcept {
-            return _mm_min_ps(_First, _Second);
+            return _mm_min_ps(_Second, _First);
         }
 
         static __m128 _Max(const __m128 _First, const __m128 _Second, __m128 = _mm_undefined_ps()) noexcept {
-            return _mm_max_ps(_First, _Second);
+            return _mm_max_ps(_Second, _First);
         }
 
         static __m128i _Mask_cast(const __m128 _Mask) noexcept {
@@ -1485,9 +1485,9 @@ namespace {
         template <class _Fn>
         static __m256 _H_func(const __m256 _Cur, _Fn _Funct) noexcept {
             __m256 _H_min_val = _Cur;
-            _H_min_val        = _Funct(_H_min_val, _mm256_permute2f128_ps(_H_min_val, _mm256_undefined_ps(), 0x01));
-            _H_min_val        = _Funct(_H_min_val, _mm256_shuffle_ps(_H_min_val, _H_min_val, _MM_SHUFFLE(1, 0, 3, 2)));
-            _H_min_val        = _Funct(_H_min_val, _mm256_shuffle_ps(_H_min_val, _H_min_val, _MM_SHUFFLE(2, 3, 0, 1)));
+            _H_min_val        = _Funct(_mm256_shuffle_ps(_H_min_val, _H_min_val, _MM_SHUFFLE(2, 3, 0, 1)), _H_min_val);
+            _H_min_val        = _Funct(_mm256_shuffle_ps(_H_min_val, _H_min_val, _MM_SHUFFLE(1, 0, 3, 2)), _H_min_val);
+            _H_min_val        = _Funct(_mm256_permute2f128_ps(_H_min_val, _mm256_undefined_ps(), 0x01), _H_min_val);
             return _H_min_val;
         }
 
@@ -1528,11 +1528,11 @@ namespace {
         }
 
         static __m256 _Min(const __m256 _First, const __m256 _Second, __m256 = _mm256_undefined_ps()) noexcept {
-            return _mm256_min_ps(_First, _Second);
+            return _mm256_min_ps(_Second, _First);
         }
 
         static __m256 _Max(const __m256 _First, const __m256 _Second, __m256 = _mm256_undefined_ps()) noexcept {
-            return _mm256_max_ps(_First, _Second);
+            return _mm256_max_ps(_Second, _First);
         }
 
         static __m256i _Mask_cast(const __m256 _Mask) noexcept {
@@ -1575,7 +1575,7 @@ namespace {
         template <class _Fn>
         static __m128d _H_func(const __m128d _Cur, _Fn _Funct) noexcept {
             __m128d _H_min_val = _Cur;
-            _H_min_val         = _Funct(_H_min_val, _mm_shuffle_pd(_H_min_val, _H_min_val, 1));
+            _H_min_val         = _Funct(_mm_shuffle_pd(_H_min_val, _H_min_val, 1), _H_min_val);
             return _H_min_val;
         }
 
@@ -1615,11 +1615,11 @@ namespace {
         }
 
         static __m128d _Min(const __m128d _First, const __m128d _Second, __m128d = _mm_undefined_pd()) noexcept {
-            return _mm_min_pd(_First, _Second);
+            return _mm_min_pd(_Second, _First);
         }
 
         static __m128d _Max(const __m128d _First, const __m128d _Second, __m128d = _mm_undefined_pd()) noexcept {
-            return _mm_max_pd(_First, _Second);
+            return _mm_max_pd(_Second, _First);
         }
 
         static __m128i _Mask_cast(const __m128d _Mask) noexcept {
@@ -1643,8 +1643,8 @@ namespace {
         template <class _Fn>
         static __m256d _H_func(const __m256d _Cur, _Fn _Funct) noexcept {
             __m256d _H_min_val = _Cur;
-            _H_min_val         = _Funct(_H_min_val, _mm256_permute4x64_pd(_H_min_val, _MM_SHUFFLE(1, 0, 3, 2)));
-            _H_min_val         = _Funct(_H_min_val, _mm256_shuffle_pd(_H_min_val, _H_min_val, 0b0101));
+            _H_min_val         = _Funct(_mm256_shuffle_pd(_H_min_val, _H_min_val, 0b0101), _H_min_val);
+            _H_min_val         = _Funct(_mm256_permute4x64_pd(_H_min_val, _MM_SHUFFLE(1, 0, 3, 2)), _H_min_val);
             return _H_min_val;
         }
 
@@ -1685,11 +1685,11 @@ namespace {
         }
 
         static __m256d _Min(const __m256d _First, const __m256d _Second, __m256d = _mm256_undefined_pd()) noexcept {
-            return _mm256_min_pd(_First, _Second);
+            return _mm256_min_pd(_Second, _First);
         }
 
         static __m256d _Max(const __m256d _First, const __m256d _Second, __m256d = _mm256_undefined_pd()) noexcept {
-            return _mm256_max_pd(_First, _Second);
+            return _mm256_max_pd(_Second, _First);
         }
 
         static __m256i _Mask_cast(const __m256d _Mask) noexcept {
@@ -1982,6 +1982,18 @@ namespace {
 
     template <_Min_max_mode _Mode, class _Traits, bool _Sign>
     auto __std_minmax_impl(const void* _First, const void* const _Last) noexcept {
+        // The value-based vectorized rather than the position-based one does not always produce
+        // the expected results for floatting point types.
+        //
+        // Efficient vectorization needs to find vertical minmax first, and then the horizontal one.
+        // This alters order of comparison: index zero element is first compared against
+        // vector size equal index element and only in the end against index one element.
+        // With equivalent but distinguishable +0.0 and -0.0 values, the altered comparison order
+        // will not produce the expected result in some cases (will return +0.0 instead of -0.0 or the reverse)
+        //
+        // The result is still acceptable for /fp:fast when +0.0 / -0.0 are not expected to be properly distinguished,
+        // and the compiler itself takes advantage of it.
+
         using _Ty = std::conditional_t<_Sign, typename _Traits::_Signed_t, typename _Traits::_Unsigned_t>;
 
         _Ty _Cur_min_val; // initialized in both of the branches below

@@ -6,6 +6,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -116,6 +117,17 @@ void test_case_min_max_element(const std::vector<T>& input) {
         assert(*expected_max == actual_max_value);
         assert(*expected_minmax.first == actual_minmax_value.min);
         assert(*expected_minmax.second == actual_minmax_value.max);
+
+#ifndef _M_FP_FAST
+        // With /fp:fast mode the compiler does not try to produce the code that correctly
+        // distincts +0.0 and -0.0, so the algorithms are not expected to either.
+        if constexpr (std::is_floating_point_v<T>) {
+            assert(signbit(*expected_min) == signbit(actual_min_value));
+            assert(signbit(*expected_max) == signbit(actual_max_value));
+            assert(signbit(*expected_minmax.first) == signbit(actual_minmax_value.min));
+            assert(signbit(*expected_minmax.second) == signbit(actual_minmax_value.max));
+        }
+#endif // !defined(_M_FP_FAST)
     }
 #endif // _HAS_CXX20
 }
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <isa_availability.h>
+#include <random>
+#include <vector>
+
+#pragma warning(disable : 4984) // 'if constexpr' is a C++17 language extension
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wc++17-extensions" // constexpr if is a C++17 extension
+#endif // __clang__
+
+void initialize_randomness(std::mt19937_64& gen) {
+    constexpr size_t n = std::mt19937_64::state_size;
+    constexpr size_t w = std::mt19937_64::word_size;
+    static_assert(w % 32 == 0, "w should be evenly divisible by 32");
+    constexpr size_t k = w / 32;
+
+    std::vector<uint32_t> vec(n * k);
+
+    std::random_device rd;
+    std::generate(vec.begin(), vec.end(), ref(rd));
+
+    printf("This is a randomized test.\n");
+    printf("DO NOT IGNORE/RERUN ANY FAILURES.\n");
+    printf("You must report them to the STL maintainers.\n\n");
+
+    printf("Seed vector: ");
+    for (const auto& e : vec) {
+        printf("%u,", e);
+    }
+    printf("\n");
+
+    std::seed_seq seq(vec.cbegin(), vec.cend());
+    gen.seed(seq);
+}
+
+#if (defined(_M_IX86) || defined(_M_X64)) && !defined(_M_CEE_PURE)
+extern "C" long __isa_enabled;
+
+void disable_instructions(ISA_AVAILABILITY isa) {
+    __isa_enabled &= ~(1UL << static_cast<unsigned long>(isa));
+}
+#endif // (defined(_M_IX86) || defined(_M_X64)) && !defined(_M_CEE_PURE)
+
+constexpr size_t dataCount = 1024;
+
+void run_randomized_tests_with_different_isa_levels(void tests(std::mt19937_64& gen)) {
+    std::mt19937_64 gen;
+    initialize_randomness(gen);
+
+    tests(gen);
+#ifndef _M_CEE_PURE
+#if defined(_M_IX86) || defined(_M_X64)
+    disable_instructions(__ISA_AVAILABLE_AVX2);
+    tests(gen);
+
+    disable_instructions(__ISA_AVAILABLE_SSE42);
+    tests(gen);
+#endif // defined(_M_IX86) || defined(_M_X64)
+#endif // _M_CEE_PURE
+}