microsoft · StephanTLavavej · Jun 27, 2024 · Jun 20, 2024 · Jun 20, 2024 · Jun 20, 2024
@@ -261,30 +261,37 @@ private:
     vector<result_type> _Myvec;
 };
 
-_NODISCARD constexpr int _Generate_canonical_iterations(const int _Bits, const uint64_t _Gmin, const uint64_t _Gmax) {
-    // For a URBG type `G` with range == `(G::max() - G::min()) + 1`, returns the number of calls to generate at least
-    // _Bits bits of entropy. Specifically, max(1, ceil(_Bits / log2(range))).
-
-    _STL_INTERNAL_CHECK(0 <= _Bits && _Bits <= 64);
+struct _Urbg_gen_can_params {
+    int _Kx          = 0; // $k$ in N4981 [rand.util.canonical]/1.4
+    uint64_t _Xx     = 0; // $x$ in N4981 [rand.util.canonical]/1.5
+    int _Smax_bits   = 0; // number of bits in $R^k$
+    float _Scale     = 0.0f; // $r^{-d}$, always representable as a float
+    bool _Rx_is_pow2 = false;
+};
 
-    if (_Bits == 0 || (_Gmax == UINT64_MAX && _Gmin == 0)) {
-        return 1;
+_NODISCARD constexpr _Urbg_gen_can_params _Generate_canonical_params(size_t _Bits, uint64_t _Rm1) {
+    const auto _Rx        = _Unsigned128{_Rm1} + 1; // $R$ in N4981 [rand.util.canonical]/1.2
+    const auto _Target    = (_Unsigned128{1} << _Bits); // $r^d$
+    _Unsigned128 _Product = 1;
+    _Urbg_gen_can_params _Result;
+    _Result._Rx_is_pow2 = (_Rx & (_Rx - 1)) == 0;
+    _Result._Kx         = 0;
+    while (_Product < _Target) {
+        _Product *= _Rx;
+        ++_Result._Kx;
     }
 
-    const auto _Range  = (_Gmax - _Gmin) + 1;
-    const auto _Target = ~uint64_t{0} >> (64 - _Bits);
-    uint64_t _Prod     = 1;
-    int _Ceil          = 0;
-    while (_Prod <= _Target) {
-        ++_Ceil;
-        if (_Prod > UINT64_MAX / _Range) {
-            break;
-        }
-
-        _Prod *= _Range;
+    _Result._Xx        = static_cast<uint64_t>(_Product / _Target);
+    _Result._Scale     = 1.0f / static_cast<float>(1ULL << _Bits);
+    _Result._Smax_bits = 0;
+    // $S \in [0, _Product)$
+    --_Product;
+    while (_Product > 0) {
+        ++_Result._Smax_bits;
+        _Product >>= 1;
     }
 
-    return _Ceil;
+    return _Result;
 }
 
 _EXPORT_STD template <class _Real, size_t _Bits, class _Gen>
@@ -294,27 +301,110 @@ _NODISCARD _Real generate_canonical(_Gen& _Gx) { // build a floating-point value
     constexpr auto _Digits  = static_cast<size_t>(numeric_limits<_Real>::digits);
     constexpr auto _Minbits = static_cast<int>(_Digits < _Bits ? _Digits : _Bits);
 
-    constexpr auto _Gxmin = static_cast<_Real>((_Gen::min)());
-    constexpr auto _Gxmax = static_cast<_Real>((_Gen::max)());
-    constexpr auto _Rx    = (_Gxmax - _Gxmin) + _Real{1};
+    if constexpr (_Minbits == 0) {
+        return _Real{0};
+    }
+
+    using _Result_int_type = std::conditional_t<_Minbits <= 32, uint32_t, uint64_t>;
+
+    constexpr auto _Gxmin                  = (_Gen::min)();
+    constexpr auto _Gxmax                  = (_Gen::max)();
+    constexpr _Urbg_gen_can_params _Params = _Generate_canonical_params(_Minbits, _Gxmax - _Gxmin);
+
+    using _Sx_type = std::conditional_t<_Params._Smax_bits <= 32, uint32_t,
+        std::conditional_t<_Params._Smax_bits <= 64, uint64_t, _Unsigned128>>;
+    _Sx_type _Sx;
+
+    if constexpr (_Params._Rx_is_pow2) {
+        // Always needs only one attempt. Multiplications can be replaced with shift/add. Optimize k=1 case.
+        if constexpr (_Params._Kx == 1) {
+            _Sx = static_cast<_Sx_type>(_Gx() - _Gxmin);
+        } else if constexpr (_Params._Kx > 1) {
+            constexpr int _Rx_bits = _Params._Smax_bits / _Params._Kx;
+            _Sx                    = 0;
+            int _Shift             = 0;
+            for (int _Idx = 0; _Idx < _Params._Kx; ++_Idx) {
+                _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) << _Shift;
+                _Shift += _Rx_bits;
+            }
+        }
+
+        _Sx >>= (_Params._Smax_bits - _Minbits);
+        return static_cast<_Real>(static_cast<_Result_int_type>(_Sx)) * static_cast<_Real>(_Params._Scale);
+    }
+
+    constexpr auto _Rx        = _Sx_type{_Gxmax - _Gxmin} + 1u;
+    constexpr _Sx_type _Limit = static_cast<_Sx_type>(_Params._Xx) * (static_cast<_Sx_type>(1) << _Minbits);
+
+    do {
+        // unroll first two iterations to avoid unnecessary multiplications
+        _Sx = static_cast<_Sx_type>(_Gx() - _Gxmin);
+        if constexpr (_Params._Kx == 2) {
+            _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Rx;
+        } else if constexpr (_Params._Kx > 2) {
+            _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Rx;
+            _Sx_type _Factor = _Rx;
+            for (int _Idx = 2; _Idx < _Params._Kx; ++_Idx) {
+                _Factor *= _Rx;
+                _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Factor;
+            }
+        }
+    } while (_Sx >= _Limit);
+
+    return static_cast<_Real>(static_cast<_Result_int_type>(_Sx / _Params._Xx)) * static_cast<_Real>(_Params._Scale);
+}
+
+#pragma warning(push)
+#pragma warning(disable : 4701) //  potentially uninitialized local variable [_Xx and _Limit]
+template <class _Uint_type, class _Gen, class _Real>
+bool _Nrand_for_tr1(
+    const uint64_t _Rd, const uint64_t _Rx, _Gen& _Gx, _Real& _Val, uint64_t& _Sx_init, uint64_t& _Factor_init) {
+    // Minimally-constexpr generate_canonical algorithm. Will save work and exit if _Factor would overflow.
+    _Uint_type _Sx             = _Sx_init;
+    _Uint_type _Factor         = _Factor_init;
+    const auto _Gxmin          = (_Gx.min)();
+    const auto _Overflow_limit = UINT64_MAX / _Rx;
+
+    _Uint_type _Xx;
+    _Uint_type _Limit;
+    bool _Init = false;
+    while (true) {
+        while (_Factor < _Rd) {
+            if constexpr (is_same_v<_Uint_type, uint64_t>) {
+                if (_Factor > _Overflow_limit) {
+                    _Sx_init     = _Sx;
+                    _Factor_init = _Factor;
+                    return true;
+                }
+            }
 
-    constexpr int _Kx = _Generate_canonical_iterations(_Minbits, (_Gen::min)(), (_Gen::max)());
+            _Sx += (_Gx() - _Gxmin) * _Factor;
+            _Factor *= _Rx;
+        }
 
-    _Real _Ans{0};
-    _Real _Factor{1};
+        if (!_Init) {
+            _Init  = true;
+            _Xx    = _Factor / _Rd;
+            _Limit = _Xx * _Rd;
+        }
 
-    for (int _Idx = 0; _Idx < _Kx; ++_Idx) { // add in another set of bits
-        _Ans += (static_cast<_Real>(_Gx()) - _Gxmin) * _Factor;
-        _Factor *= _Rx;
+        if (_Sx < _Limit) {
+            break;
+        } else {
+            _Sx     = 0;
+            _Factor = 1;
+        }
     }
 
-    return _Ans / _Factor;
+    _Val = static_cast<_Real>(static_cast<uint64_t>(_Sx / _Xx));
+    return false;
 }
+#pragma warning(pop)
 
 template <class _Gen, class = void>
 struct _Has_static_min_max : false_type {};
 
-// This checks a requirement of N4950 [rand.req.urng] `concept uniform_random_bit_generator` but doesn't attempt
+// This checks a requirement of N4981 [rand.req.urng] `concept uniform_random_bit_generator` but doesn't attempt
 // to implement the whole concept - we just need to distinguish Standard machinery from tr1 machinery.
 template <class _Gen>
 struct _Has_static_min_max<_Gen, void_t<decltype(bool_constant<(_Gen::min)() < (_Gen::max)()>::value)>> : true_type {};
@@ -329,7 +419,8 @@ _NODISCARD _Real _Nrand_impl(_Gen& _Gx) { // build a floating-point value from r
 
     if constexpr (_Has_static_min_max<_Gen>::value && _Minbits <= 64) {
         return _STD generate_canonical<_Real, _Minbits>(_Gx);
-    } else { // TRANSITION, for tr1 machinery only; Standard machinery can call generate_canonical directly
+    } else if constexpr (!is_integral_v<typename _Gen::result_type>) {
+        // the pre-P0952R2 algorithm
         const _Real _Gxmin = static_cast<_Real>((_Gx.min)());
         const _Real _Gxmax = static_cast<_Real>((_Gx.max)());
         const _Real _Rx    = (_Gxmax - _Gxmin) + _Real{1};
@@ -346,6 +437,32 @@ _NODISCARD _Real _Nrand_impl(_Gen& _Gx) { // build a floating-point value from r
         }
 
         return _Ans / _Factor;
+    } else {
+        // TRANSITION, for integral tr1 machinery only; Standard machinery can call generate_canonical directly
+        const auto _Gxmax = (_Gx.max)();
+        const auto _Gxmin = (_Gx.min)();
+        _Real _Val;
+
+        if (_Gxmax == UINT64_MAX && _Gxmin == 0) {
+            // special case when uint64_t can't hold the full URBG range
+#pragma warning(push)
+#pragma warning(disable : 4293) // shift count negative or too big
+            _Val = static_cast<_Real>(static_cast<uint64_t>(_Gx()) >> (64 - _Minbits));
+#pragma warning(pop)
+        } else {
+            constexpr auto _Rd = 1ULL << _Minbits;
+            const auto _Rx     = static_cast<uint64_t>(_Gxmax - _Gxmin) + 1;
+            uint64_t _Sx       = 0;
+            uint64_t _Factor   = 1;
+
+            // Try with 64 bits first, upgrade to 128 if necessary.
+            const bool _Would_overflow = _Nrand_for_tr1<uint64_t>(_Rd, _Rx, _Gx, _Val, _Sx, _Factor);
+            if (_Would_overflow) {
+                _Nrand_for_tr1<_Unsigned128>(_Rd, _Rx, _Gx, _Val, _Sx, _Factor);
+            }
+        }
+
+        return _STD ldexp(_Val, -static_cast<int>(_Minbits));
     }
 }
 
@@ -1968,8 +2085,8 @@ public:
     explicit _Rng_from_urng_v2(_Urng& _Func) noexcept : _Ref(_Func) {}
 
     _Diff operator()(_Diff _Index) { // adapt _Urng closed range to [0, _Index)
-        // From Daniel Lemire, "Fast Random Integer Generation in an Interval", ACM Trans. Model. Comput. Simul. 29 (1),
-        // 2019.
+        // From Daniel Lemire, "Fast Random Integer Generation in an Interval", ACM Trans. Model. Comput. Simul. 29
+        // (1), 2019.
         //
         // Algorithm 5 <-> This Code:
         //   m <-> _Product
@@ -2925,8 +3042,8 @@ public:
 
 private:
     template <class _Engine>
-    result_type _Eval(
-        _Engine& _Eng, const param_type& _Par0) const { // Press et al., Numerical Recipes in C, 2nd ed., pp 295-296.
+    result_type _Eval(_Engine& _Eng,
+        const param_type& _Par0) const { // Press et al., Numerical Recipes in C, 2nd ed., pp 295-296.
         _Ty _Res;
         if (_Par0._Tx < 25) { // generate directly
             _Res = 0;
@@ -2938,7 +3055,8 @@ private:
 
             return _Res;
         } else if (_Par0._Mean < 1.0) {
-            // Events are rare, use waiting time method (Luc Devroye, Non-Uniform Random Variate Generation, p. 525).
+            // Events are rare, use waiting time method (Luc Devroye, Non-Uniform Random Variate Generation, p.
+            // 525).
             const _Ty1 _Rand = _NRAND(_Eng, _Ty1);
 
             // The exit condition is log(1 - _Rand)/t < log(1-p), which is equivalent to _Rand > 1 - (1-p)^t. If

@@ -62,6 +62,7 @@
 // P0883R2 Fixing Atomic Initialization
 // P0935R0 Eradicating Unnecessarily Explicit Default Constructors
 // P0941R2 Feature-Test Macros
+// P0952R2 A New Specification For generate_canonical()
 // P0972R0 noexcept For <chrono> zero(), min(), max()
 // P1065R2 constexpr INVOKE
 //     (the std::invoke function only; other components like bind and reference_wrapper are C++20 only)

@@ -582,6 +582,9 @@ std/numerics/rand/rand.dist/rand.dist.uni/rand.dist.uni.real/param_ctor.pass.cpp
 # test4() constructs a negative_binomial_distribution from (40, 1); [rand.dist.bern.negbin] says p == 1 generates undefined probabilities.
 std/numerics/rand/rand.dist/rand.dist.bern/rand.dist.bern.negbin/eval.pass.cpp FAIL
 
+# libc++ hasn't implemented P0952, which changes the generate_canonical algorithm.
+std/numerics/rand/rand.util/rand.util.canonical/generate_canonical.pass.cpp FAIL
+
 # Assertion failed: (std::lerp(T(2.3), T(2.3), inf) == T(2.3))
 # Asserts `(std::lerp(T(2.3), T(2.3), inf) == T(2.3))` and `std::isnan(std::lerp(T( 0), T( 0), inf))`
 # They shouldn't behave differently. Both of them should probably return NaN.

@@ -198,7 +198,6 @@ tests\GH_001850_clog_tied_to_cout
 tests\GH_001858_iostream_exception
 tests\GH_001912_random_distribution_operator_const
 tests\GH_001914_cached_position
-tests\GH_001964_constexpr_generate_canonical
 tests\GH_002030_asan_annotate_string
 tests\GH_002030_asan_annotate_vector
 tests\GH_002039_byte_is_not_trivially_swappable
@@ -511,6 +510,7 @@ tests\P0898R3_identity
 tests\P0912R5_coroutine
 tests\P0919R3_heterogeneous_unordered_lookup
 tests\P0943R6_stdatomic_h
+tests\P0952R2_new_generate_canonical
 tests\P0966R1_string_reserve_should_not_shrink
 tests\P0980R1_constexpr_strings
 tests\P1004R2_constexpr_vector