From b9c69ec3c3a318b8279ec69b3548e0dfdd462038 Mon Sep 17 00:00:00 2001 From: Eduard-Mihai Burtescu Date: Wed, 23 Aug 2017 02:44:41 +0300 Subject: [PATCH] Speed up APFloat division by using short division for small divisors. --- src/librustc_apfloat/ieee.rs | 126 ++++++++++++++++++++++++++--------- 1 file changed, 96 insertions(+), 30 deletions(-) diff --git a/src/librustc_apfloat/ieee.rs b/src/librustc_apfloat/ieee.rs index 3545a77c75de6..124c840cc56d6 100644 --- a/src/librustc_apfloat/ieee.rs +++ b/src/librustc_apfloat/ieee.rs @@ -460,18 +460,15 @@ impl fmt::Display for IeeeFloat { // rem <- sig % 10 // sig <- sig / 10 let mut rem = 0; - for limb in sig.iter_mut().rev() { - // We don't have an integer doubly wide than Limb, - // so we have to split the divrem on two halves. - const HALF_BITS: usize = LIMB_BITS / 2; - let mut halves = [*limb & ((1 << HALF_BITS) - 1), *limb >> HALF_BITS]; - for half in halves.iter_mut().rev() { - *half |= rem << HALF_BITS; - rem = *half % 10; - *half /= 10; - } - *limb = halves[0] | (halves[1] << HALF_BITS); - } + + // Use 64-bit division and remainder, with 32-bit chunks from sig. + sig::each_chunk(&mut sig, 32, |chunk| { + let chunk = chunk as u32; + let combined = ((rem as u64) << 32) | (chunk as u64); + rem = (combined % 10) as u8; + (combined / 10) as u32 as Limb + }); + // Reduce the sigificand to avoid wasting time dividing 0's. while sig.last() == Some(&0) { sig.pop(); @@ -491,7 +488,7 @@ impl fmt::Display for IeeeFloat { exp += 1; } else { in_trail = false; - buffer.push(b'0' + digit as u8); + buffer.push(b'0' + digit); } } @@ -2065,7 +2062,7 @@ impl IeeeFloat { }; // Attempt dec_sig * 10^dec_exp with increasing precision. - let mut attempt = 1; + let mut attempt = 0; loop { let calc_precision = (LIMB_BITS << attempt) - 1; attempt += 1; @@ -2310,6 +2307,17 @@ mod sig { limbs.iter().all(|&l| l == 0) } + /// One, not zero, based LSB. That is, returns 0 for a zeroed significand. + pub(super) fn olsb(limbs: &[Limb]) -> usize { + for i in 0..limbs.len() { + if limbs[i] != 0 { + return i * LIMB_BITS + limbs[i].trailing_zeros() as usize + 1; + } + } + + 0 + } + /// One, not zero, based MSB. That is, returns 0 for a zeroed significand. pub(super) fn omsb(limbs: &[Limb]) -> usize { for i in (0..limbs.len()).rev() { @@ -2468,6 +2476,20 @@ mod sig { } } + /// For every consecutive chunk of `bits` bits from `limbs`, + /// going from most significant to the least significant bits, + /// call `f` to transform those bits and store the result back. + pub(super) fn each_chunk Limb>(limbs: &mut [Limb], bits: usize, mut f: F) { + assert_eq!(LIMB_BITS % bits, 0); + for limb in limbs.iter_mut().rev() { + let mut r = 0; + for i in (0..LIMB_BITS / bits).rev() { + r |= f((*limb >> (i * bits)) & ((1 << bits) - 1)) << (i * bits); + } + *limb = r; + } + } + /// Increment in-place, return the carry flag. pub(super) fn increment(dst: &mut [Limb]) -> Limb { for x in dst { @@ -2686,10 +2708,6 @@ mod sig { divisor: &mut [Limb], precision: usize, ) -> Loss { - // Zero the quotient before setting bits in it. - for x in &mut quotient[..limbs_for_bits(precision)] { - *x = 0; - } // Normalize the divisor. let bits = precision - omsb(divisor); @@ -2700,6 +2718,13 @@ mod sig { let bits = precision - omsb(dividend); shift_left(dividend, exp, bits); + // Division by 1. + let olsb_divisor = olsb(divisor); + if olsb_divisor == precision { + quotient.copy_from_slice(dividend); + return Loss::ExactlyZero; + } + // Ensure the dividend >= divisor initially for the loop below. // Incidentally, this means that the division loop below is // guaranteed to set the integer bit to one. @@ -2708,6 +2733,58 @@ mod sig { assert_ne!(cmp(dividend, divisor), Ordering::Less) } + // Helper for figuring out the lost fraction. + let lost_fraction = |dividend: &[Limb], divisor: &[Limb]| { + match cmp(dividend, divisor) { + Ordering::Greater => Loss::MoreThanHalf, + Ordering::Equal => Loss::ExactlyHalf, + Ordering::Less => { + if is_all_zeros(dividend) { + Loss::ExactlyZero + } else { + Loss::LessThanHalf + } + } + } + }; + + // Try to perform a (much faster) short division for small divisors. + let divisor_bits = precision - (olsb_divisor - 1); + macro_rules! try_short_div { + ($W:ty, $H:ty, $half:expr) => { + if divisor_bits * 2 <= $half { + // Extract the small divisor. + let _: Loss = shift_right(divisor, &mut 0, olsb_divisor - 1); + let divisor = divisor[0] as $H as $W; + + // Shift the dividend to produce a quotient with the unit bit set. + let top_limb = *dividend.last().unwrap(); + let mut rem = (top_limb >> (LIMB_BITS - (divisor_bits - 1))) as $H; + shift_left(dividend, &mut 0, divisor_bits - 1); + + // Apply short division in place on $H (of $half bits) chunks. + each_chunk(dividend, $half, |chunk| { + let chunk = chunk as $H; + let combined = ((rem as $W) << $half) | (chunk as $W); + rem = (combined % divisor) as $H; + (combined / divisor) as $H as Limb + }); + quotient.copy_from_slice(dividend); + + return lost_fraction(&[(rem as Limb) << 1], &[divisor as Limb]); + } + } + } + + try_short_div!(u32, u16, 16); + try_short_div!(u64, u32, 32); + try_short_div!(u128, u64, 64); + + // Zero the quotient before setting bits in it. + for x in &mut quotient[..limbs_for_bits(precision)] { + *x = 0; + } + // Long division. for bit in (0..precision).rev() { if cmp(dividend, divisor) != Ordering::Less { @@ -2717,17 +2794,6 @@ mod sig { shift_left(dividend, &mut 0, 1); } - // Figure out the lost fraction. - match cmp(dividend, divisor) { - Ordering::Greater => Loss::MoreThanHalf, - Ordering::Equal => Loss::ExactlyHalf, - Ordering::Less => { - if is_all_zeros(dividend) { - Loss::ExactlyZero - } else { - Loss::LessThanHalf - } - } - } + lost_fraction(dividend, divisor) } }