From 0f42c17d85f26bbbd6a30d71ce8306d371426bfc Mon Sep 17 00:00:00 2001 From: jk-jeon <33922675+jk-jeon@users.noreply.github.com> Date: Sat, 14 Jan 2023 11:30:20 -0800 Subject: [PATCH] Implement a new formatting algorithm for small given precision (#3269) Implement the formatting algorithm for small given precision discussed in https://github.com/fmtlib/fmt/issues/3262 and https://github.com/fmtlib/fmt/pull/2750 --- include/fmt/format-inl.h | 88 +++------- include/fmt/format.h | 342 ++++++++++++++++++++++++++++++++++++++- test/format-impl-test.cc | 2 +- 3 files changed, 363 insertions(+), 69 deletions(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index fa7b1e385307..523fd514f972 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -174,58 +174,10 @@ FMT_CONSTEXPR inline uint64_t rotr(uint64_t n, uint32_t r) noexcept { return (n >> r) | (n << (64 - r)); } -// Computes 128-bit result of multiplication of two 64-bit unsigned integers. -inline uint128_fallback umul128(uint64_t x, uint64_t y) noexcept { -#if FMT_USE_INT128 - auto p = static_cast(x) * static_cast(y); - return {static_cast(p >> 64), static_cast(p)}; -#elif defined(_MSC_VER) && defined(_M_X64) - auto result = uint128_fallback(); - result.lo_ = _umul128(x, y, &result.hi_); - return result; -#else - const uint64_t mask = static_cast(max_value()); - - uint64_t a = x >> 32; - uint64_t b = x & mask; - uint64_t c = y >> 32; - uint64_t d = y & mask; - uint64_t ac = a * c; - uint64_t bc = b * c; - uint64_t ad = a * d; - uint64_t bd = b * d; - - uint64_t intermediate = (bd >> 32) + (ad & mask) + (bc & mask); - - return {ac + (intermediate >> 32) + (ad >> 32) + (bc >> 32), - (intermediate << 32) + (bd & mask)}; -#endif -} // Implementation of Dragonbox algorithm: https://github.com/jk-jeon/dragonbox. namespace dragonbox { -// Computes upper 64 bits of multiplication of two 64-bit unsigned integers. -inline uint64_t umul128_upper64(uint64_t x, uint64_t y) noexcept { -#if FMT_USE_INT128 - auto p = static_cast(x) * static_cast(y); - return static_cast(p >> 64); -#elif defined(_MSC_VER) && defined(_M_X64) - return __umulh(x, y); -#else - return umul128(x, y).high(); -#endif -} - -// Computes upper 128 bits of multiplication of a 64-bit unsigned integer and a -// 128-bit unsigned integer. -inline uint128_fallback umul192_upper128(uint64_t x, - uint128_fallback y) noexcept { - uint128_fallback r = umul128(x, y.high()); - r += umul128_upper64(x, y.low()); - return r; -} - // Computes upper 64 bits of multiplication of a 32-bit unsigned integer and a // 64-bit unsigned integer. inline uint64_t umul96_upper64(uint32_t x, uint64_t y) noexcept { @@ -247,19 +199,7 @@ inline uint64_t umul96_lower64(uint32_t x, uint64_t y) noexcept { return x * y; } -// Computes floor(log10(pow(2, e))) for e in [-2620, 2620] using the method from -// https://fmt.dev/papers/Dragonbox.pdf#page=28, section 6.1. -inline int floor_log10_pow2(int e) noexcept { - FMT_ASSERT(e <= 2620 && e >= -2620, "too large exponent"); - static_assert((-1 >> 1) == -1, "right shift is not arithmetic"); - return (e * 315653) >> 20; -} - // Various fast log computations. -inline int floor_log2_pow10(int e) noexcept { - FMT_ASSERT(e <= 1233 && e >= -1233, "too large exponent"); - return (e * 1741647) >> 19; -} inline int floor_log10_pow2_minus_log10_4_over_3(int e) noexcept { FMT_ASSERT(e <= 2936 && e >= -2985, "too large exponent"); return (e * 631305 - 261663) >> 21; @@ -1040,8 +980,23 @@ template <> struct cache_accessor { {0xfcf62c1dee382c42, 0x46729e03dd9ed7b6}, {0x9e19db92b4e31ba9, 0x6c07a2c26a8346d2}, {0xc5a05277621be293, 0xc7098b7305241886}, - { 0xf70867153aa2db38, - 0xb8cbee4fc66d1ea8 } + {0xf70867153aa2db38, 0xb8cbee4fc66d1ea8}, + {0x9a65406d44a5c903, 0x737f74f1dc043329}, + {0xc0fe908895cf3b44, 0x505f522e53053ff3}, + {0xf13e34aabb430a15, 0x647726b9e7c68ff0}, + {0x96c6e0eab509e64d, 0x5eca783430dc19f6}, + {0xbc789925624c5fe0, 0xb67d16413d132073}, + {0xeb96bf6ebadf77d8, 0xe41c5bd18c57e890}, + {0x933e37a534cbaae7, 0x8e91b962f7b6f15a}, + {0xb80dc58e81fe95a1, 0x723627bbb5a4adb1}, + {0xe61136f2227e3b09, 0xcec3b1aaa30dd91d}, + {0x8fcac257558ee4e6, 0x213a4f0aa5e8a7b2}, + {0xb3bd72ed2af29e1f, 0xa988e2cd4f62d19e}, + {0xe0accfa875af45a7, 0x93eb1b80a33b8606}, + {0x8c6c01c9498d8b88, 0xbc72f130660533c4}, + {0xaf87023b9bf0ee6a, 0xeb8fad7c7f8680b5}, + { 0xdb68c2ca82ed2a05, + 0xa67398db9f6820e2 } #else {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b}, {0xce5d73ff402d98e3, 0xfb0a3d212dc81290}, @@ -1065,8 +1020,9 @@ template <> struct cache_accessor { {0x8da471a9de737e24, 0x5ceaecfed289e5d3}, {0xe4d5e82392a40515, 0x0fabaf3feaa5334b}, {0xb8da1662e7b00a17, 0x3d6a751f3b936244}, - { 0x95527a5202df0ccb, - 0x0f37801e0c43ebc9 } + {0x95527a5202df0ccb, 0x0f37801e0c43ebc9}, + { 0xf13e34aabb430a15, + 0x647726b9e7c68ff0 } #endif }; @@ -1169,6 +1125,10 @@ template <> struct cache_accessor { } }; +FMT_FUNC uint128_fallback get_cached_power(int k) noexcept { + return cache_accessor::get_cached_power(k); +} + // Various integer checks template bool is_left_endpoint_integer_shorter_interval(int exponent) noexcept { diff --git a/include/fmt/format.h b/include/fmt/format.h index 8f05c7d92cf9..fef477a8d602 100644 --- a/include/fmt/format.h +++ b/include/fmt/format.h @@ -495,6 +495,16 @@ FMT_CONSTEXPR20 inline auto countl_zero(uint32_t n) -> int { return lz; } +FMT_CONSTEXPR20 inline auto countl_zero(uint64_t n) -> int { +#ifdef FMT_BUILTIN_CLZLL + if (!is_constant_evaluated()) return FMT_BUILTIN_CLZLL(n); +#endif + int lz = 0; + constexpr uint64_t msb_mask = 1ull << (num_bits() - 1); + for (; (n & msb_mask) == 0; n <<= 1) lz++; + return lz; +} + FMT_INLINE void assume(bool condition) { (void)condition; #if FMT_HAS_BUILTIN(__builtin_assume) && !FMT_ICC_VERSION @@ -1204,7 +1214,7 @@ FMT_CONSTEXPR auto count_digits(UInt n) -> int { FMT_INLINE auto do_count_digits(uint32_t n) -> int { // An optimization by Kendall Willets from https://bit.ly/3uOIQrB. // This increments the upper 32 bits (log10(T) - 1) when >= T is added. -# define FMT_INC(T) (((sizeof(# T) - 1ull) << 32) - T) +# define FMT_INC(T) (((sizeof(#T) - 1ull) << 32) - T) static constexpr uint64_t table[] = { FMT_INC(0), FMT_INC(0), FMT_INC(0), // 8 FMT_INC(10), FMT_INC(10), FMT_INC(10), // 64 @@ -1365,7 +1375,71 @@ class utf8_to_utf16 { auto str() const -> std::wstring { return {&buffer_[0], size()}; } }; +// Computes 128-bit result of multiplication of two 64-bit unsigned integers. +inline uint128_fallback umul128(uint64_t x, uint64_t y) noexcept { +#if FMT_USE_INT128 + auto p = static_cast(x) * static_cast(y); + return {static_cast(p >> 64), static_cast(p)}; +#elif defined(_MSC_VER) && defined(_M_X64) + auto result = uint128_fallback(); + result.lo_ = _umul128(x, y, &result.hi_); + return result; +#else + const uint64_t mask = static_cast(max_value()); + + uint64_t a = x >> 32; + uint64_t b = x & mask; + uint64_t c = y >> 32; + uint64_t d = y & mask; + + uint64_t ac = a * c; + uint64_t bc = b * c; + uint64_t ad = a * d; + uint64_t bd = b * d; + + uint64_t intermediate = (bd >> 32) + (ad & mask) + (bc & mask); + + return {ac + (intermediate >> 32) + (ad >> 32) + (bc >> 32), + (intermediate << 32) + (bd & mask)}; +#endif +} + namespace dragonbox { +// Computes floor(log10(pow(2, e))) for e in [-2620, 2620] using the method from +// https://fmt.dev/papers/Dragonbox.pdf#page=28, section 6.1. +inline int floor_log10_pow2(int e) noexcept { + FMT_ASSERT(e <= 2620 && e >= -2620, "too large exponent"); + static_assert((-1 >> 1) == -1, "right shift is not arithmetic"); + return (e * 315653) >> 20; +} + +inline int floor_log2_pow10(int e) noexcept { + FMT_ASSERT(e <= 1233 && e >= -1233, "too large exponent"); + return (e * 1741647) >> 19; +} + +// Computes upper 64 bits of multiplication of two 64-bit unsigned integers. +inline uint64_t umul128_upper64(uint64_t x, uint64_t y) noexcept { +#if FMT_USE_INT128 + auto p = static_cast(x) * static_cast(y); + return static_cast(p >> 64); +#elif defined(_MSC_VER) && defined(_M_X64) + return __umulh(x, y); +#else + return umul128(x, y).high(); +#endif +} + +// Computes upper 128 bits of multiplication of a 64-bit unsigned integer and a +// 128-bit unsigned integer. +inline uint128_fallback umul192_upper128(uint64_t x, + uint128_fallback y) noexcept { + uint128_fallback r = umul128(x, y.high()); + r += umul128_upper64(x, y.low()); + return r; +} + +FMT_API uint128_fallback get_cached_power(int k) noexcept; // Type-specific information that Dragonbox uses. template struct float_info; @@ -1389,7 +1463,7 @@ template <> struct float_info { static const int big_divisor = 1000; static const int small_divisor = 100; static const int min_k = -292; - static const int max_k = 326; + static const int max_k = 341; static const int shorter_interval_tie_lower_threshold = -77; static const int shorter_interval_tie_upper_threshold = -77; }; @@ -1614,12 +1688,28 @@ template struct basic_data { static constexpr uint64_t power_of_10_64[20] = { 1, FMT_POWERS_OF_10(1ULL), FMT_POWERS_OF_10(1000000000ULL), 10000000000000000000ULL}; + + // For checking rounding thresholds. + // The kth entry is chosen to be the smallest integer such that the + // upper 32-bits of 10^(k+1) times it is strictly bigger than 5 * 10^k. + static constexpr uint32_t fractional_part_rounding_thresholds[8] = { + 2576980378, // ceil(2^31 + 2^32/10^1) + 2190433321, // ceil(2^31 + 2^32/10^2) + 2151778616, // ceil(2^31 + 2^32/10^3) + 2147913145, // ceil(2^31 + 2^32/10^4) + 2147526598, // ceil(2^31 + 2^32/10^5) + 2147487943, // ceil(2^31 + 2^32/10^6) + 2147484078, // ceil(2^31 + 2^32/10^7) + 2147483691 // ceil(2^31 + 2^32/10^8) + }; }; #if FMT_CPLUSPLUS < 201703L template constexpr uint64_t basic_data::pow10_significands[]; template constexpr int16_t basic_data::pow10_exponents[]; template constexpr uint64_t basic_data::power_of_10_64[]; +template +constexpr uint32_t basic_data::fractional_part_rounding_thresholds[]; #endif // This is a struct rather than an alias to avoid shadowing warnings in gcc. @@ -3330,7 +3420,7 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs, auto dec = dragonbox::to_decimal(static_cast(value)); write(buffer_appender(buf), dec.significand); return dec.exponent; - } else { + } else if (is_constant_evaluated()) { // Use Grisu + Dragon4 for the given precision: // https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf. const int min_exp = -60; // alpha in Grisu. @@ -3349,6 +3439,248 @@ FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs, exp += handler.size - cached_exp10 - 1; precision = handler.precision; } + } else { + // Extract significand bits and exponent bits. + using info = dragonbox::float_info; + auto br = bit_cast(static_cast(value)); + + const uint64_t significand_mask = + (static_cast(1) << num_significand_bits()) - 1; + uint64_t significand = (br & significand_mask); + int exponent = static_cast((br & exponent_mask()) >> + num_significand_bits()); + + if (exponent != 0) { // Check if normal. + exponent -= exponent_bias() + num_significand_bits(); + significand |= + (static_cast(1) << num_significand_bits()); + significand <<= 1; + } else { + // Normalize subnormal inputs. + FMT_ASSERT(significand != 0, "zeros should not appear hear"); + int shift = countl_zero(significand); + FMT_ASSERT(shift >= num_bits() - num_significand_bits(), + ""); + shift -= (num_bits() - num_significand_bits() - 2); + exponent = (std::numeric_limits::min_exponent - + num_significand_bits()) - + shift; + significand <<= shift; + } + + // Compute the first several nonzero decimal significand digits. + // We call the number we get the first segment. + const int k = info::kappa - dragonbox::floor_log10_pow2(exponent); + exp = -k; + const int beta = exponent + dragonbox::floor_log2_pow10(k); + uint64_t first_segment; + bool has_more_segments; + int digits_in_the_first_segment; + { + const auto r = dragonbox::umul192_upper128( + significand << beta, dragonbox::get_cached_power(k)); + first_segment = r.high(); + has_more_segments = r.low() != 0; + + // The first segment can have 18 ~ 19 digits. + if (first_segment >= 1000000000000000000ULL) { + digits_in_the_first_segment = 19; + } else { + // When it is of 18-digits, we align it to 19-digits by adding a bogus + // zero at the end. + digits_in_the_first_segment = 18; + first_segment *= 10; + } + } + + // Compute the actual number of decimal digits to print + if (fixed) { + adjust_precision(precision, exp + digits_in_the_first_segment); + } + + // Use Dragon4 only when there might be not enough digits in the first + // segment. + if (digits_in_the_first_segment > precision) { + use_dragon = false; + + if (precision <= 0) { + exp += digits_in_the_first_segment; + + if (precision < 0) { + // Nothing to do, since all we have are just leading zeros. + buf.try_resize(0); + } else { + // We may need to round-up. + buf.try_resize(1); + if ((first_segment | static_cast(has_more_segments)) > + 5000000000000000000ULL) { + buf[0] = '1'; + } else { + buf[0] = '0'; + } + } + } // precision <= 0 + else { + exp += digits_in_the_first_segment - precision; + + // When precision > 0, we divide the first segment into three + // subsegments, each with 9, 9, and 0 ~ 1 digits so that each fits + // in 32-bits which usually allows faster calculation than in + // 64-bits. Since some compiler (e.g. MSVC) doesn't know how to optimize + // division-by-constant for large 64-bit divisors, we do it here + // manually. The magic number 7922816251426433760 below is equal to + // ceil(2^(64+32) / 10^10). + const uint32_t first_subsegment = static_cast( + dragonbox::umul128_upper64(first_segment, 7922816251426433760ULL) >> + 32); + const uint64_t second_third_subsegments = + first_segment - first_subsegment * 10000000000ULL; + + uint64_t prod; + uint32_t digits; + bool should_round_up; + int number_of_digits_to_print = precision > 9 ? 9 : precision; + + // Print a 9-digits subsegment, either the first or the second. + auto print_subsegment = [&](uint32_t subsegment, char* buffer) { + int number_of_digits_printed = 0; + + // If we want to print an odd number of digits from the subsegment, + if ((number_of_digits_to_print & 1) != 0) { + // Convert to 64-bit fixed-point fractional form with 1-digit + // integer part. The magic number 720575941 is a good enough + // approximation of 2^(32 + 24) / 10^8; see + // https://jk-jeon.github.io/posts/2022/12/fixed-precision-formatting/#fixed-length-case + // for details. + prod = ((subsegment * static_cast(720575941)) >> 24) + 1; + digits = static_cast(prod >> 32); + *buffer = static_cast('0' + digits); + number_of_digits_printed++; + } + // If we want to print an even number of digits from the + // first_subsegment, + else { + // Convert to 64-bit fixed-point fractional form with 2-digits + // integer part. The magic number 450359963 is a good enough + // approximation of 2^(32 + 20) / 10^7; see + // https://jk-jeon.github.io/posts/2022/12/fixed-precision-formatting/#fixed-length-case + // for details. + prod = ((subsegment * static_cast(450359963)) >> 20) + 1; + digits = static_cast(prod >> 32); + copy2(buffer, digits2(digits)); + number_of_digits_printed += 2; + } + + // Print all digit pairs. + while (number_of_digits_printed < number_of_digits_to_print) { + prod = static_cast(prod) * static_cast(100); + digits = static_cast(prod >> 32); + copy2(buffer + number_of_digits_printed, digits2(digits)); + number_of_digits_printed += 2; + } + }; + + // Print first subsegment. + print_subsegment(first_subsegment, buf.data()); + + // Perform rounding if the first subsegment is the last subsegment to + // print. + if (precision <= 9) { + // Rounding inside the subsegment. + // We round-up if: + // - either the fractional part is strictly larger than 1/2, or + // - the fractional part is exactly 1/2 and the last digit is odd. + // We rely on the following observations: + // - If fractional_part >= threshold, then the fractional part is + // strictly larger than 1/2. + // - If the MSB of fractional_part is set, then the fractional part + // must be at least 1/2. + // - When the MSB of fractional_part is set, either + // second_third_subsegments being nonzero or has_more_segments + // being true means there are further digits not printed, so the + // fractional part is strictly larger than 1/2. + if (precision < 9) { + uint32_t fractional_part = static_cast(prod); + should_round_up = fractional_part >= + data::fractional_part_rounding_thresholds + [8 - number_of_digits_to_print] || + ((fractional_part >> 31) & + ((digits & 1) | (second_third_subsegments != 0) | + has_more_segments)) != 0; + } + // Rounding at the subsegment boundary. + // In this case, the fractional part is at least 1/2 if and only if + // second_third_subsegments >= 5000000000ULL, and is strictly larger + // than 1/2 if we further have either second_third_subsegments > + // 5000000000ULL or has_more_segments == true. + else { + should_round_up = second_third_subsegments > 5000000000ULL || + (second_third_subsegments == 5000000000ULL && + ((digits & 1) != 0 || has_more_segments)); + } + } + // Otherwise, print the second subsegment. + else { + // Compilers are not aware of how to leverage the maximum value of + // second_third_subsegments to find out a better magic number which + // allows us to eliminate an additional shift. 1844674407370955162 = + // ceil(2^64/10) < ceil(2^64*(10^9/(10^10 - 1))). + const uint32_t second_subsegment = + static_cast(dragonbox::umul128_upper64( + second_third_subsegments, 1844674407370955162ULL)); + const uint32_t third_subsegment = + static_cast(second_third_subsegments) - + second_subsegment * 10; + + number_of_digits_to_print = precision - 9; + print_subsegment(second_subsegment, buf.data() + 9); + + // Rounding inside the subsegment. + if (precision < 18) { + // The condition third_subsegment != 0 implies that the segment was + // of 19 digits, so in this case the third segment should be + // consisting of a genuine digit from the input. + uint32_t fractional_part = static_cast(prod); + should_round_up = fractional_part >= + data::fractional_part_rounding_thresholds + [8 - number_of_digits_to_print] || + ((fractional_part >> 31) & + ((digits & 1) | (third_subsegment != 0) | + has_more_segments)) != 0; + } + // Rounding at the subsegment boundary. + else { + // In this case, the segment must be of 19 digits, thus + // the third subsegment should be consisting of a genuine digit from + // the input. + should_round_up = third_subsegment > 5 || + (third_subsegment == 5 && + ((digits & 1) != 0 || has_more_segments)); + } + } + + // Round-up if necessary. + if (should_round_up) { + ++buf[precision - 1]; + for (int i = precision - 1; i > 0 && buf[i] > '9'; --i) { + buf[i] = '0'; + ++buf[i - 1]; + } + if (buf[0] > '9') { + buf[0] = '1'; + if (fixed) + buf[precision++] = '0'; + else + ++exp; + } + } + buf.try_resize(to_unsigned(precision)); + } + } // if (digits_in_the_first_segment > precision) + else { + // Adjust the exponent for its use in Dragon4. + exp += digits_in_the_first_segment - 1; + } } if (use_dragon) { auto f = basic_fp(); @@ -3964,7 +4296,9 @@ template <> struct formatter { }; // group_digits_view is not derived from view because it copies the argument. -template struct group_digits_view { T value; }; +template struct group_digits_view { + T value; +}; /** \rst diff --git a/test/format-impl-test.cc b/test/format-impl-test.cc index 7876ba7868bf..e39bde740680 100644 --- a/test/format-impl-test.cc +++ b/test/format-impl-test.cc @@ -246,7 +246,7 @@ TEST(fp_test, dragonbox_max_k) { fmt::detail::const_check(double_info::max_k), double_info::kappa - floor_log10_pow2(std::numeric_limits::min_exponent - - fmt::detail::num_significand_bits() - 1)); + 2 * fmt::detail::num_significand_bits() - 1)); } TEST(fp_test, get_round_direction) {