Auto merge of #44051 - eddyb:apfloat-faster-div, r=nagisa
Speed up APFloat division by using short division for small divisors.

Fixes #43828 (hopefully), by not doing long division bit-by-bit for small divisors.

When parsing the ~200,000 decimal float literals in the `tuple-stress` benchmark, this change brings roughly a 5x speed increase (from `0.6s` to `0.12s`), and the hottest instructions are native `div`s.
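The idea, as a standalone sketch in Rust (illustrative only, not the commit's code, which works on the crate's wider `Limb` type): when the divisor fits in half a machine word, each chunk of the dividend can be divided with one native `div`/`rem`, carrying the remainder down, instead of one compare-subtract-shift step per bit of precision. The name `short_div_rem` is made up for this example.

```rust
/// Standalone sketch: divide a little-endian multi-limb integer in place by a
/// divisor that fits in 32 bits, returning the remainder. Each 32-bit chunk
/// costs one native 64-bit division instead of a loop over every dividend bit.
fn short_div_rem(limbs: &mut [u64], divisor: u32) -> u32 {
    assert!(divisor != 0);
    let mut rem: u32 = 0;
    // Walk the limbs from most significant to least significant.
    for limb in limbs.iter_mut().rev() {
        let mut new_limb = 0u64;
        // Two 32-bit chunks per 64-bit limb, high chunk first.
        for i in (0..2).rev() {
            let chunk = (*limb >> (i * 32)) as u32 as u64;
            let combined = ((rem as u64) << 32) | chunk;
            rem = (combined % divisor as u64) as u32;
            new_limb |= (combined / divisor as u64) << (i * 32);
        }
        *limb = new_limb;
    }
    rem
}

fn main() {
    // value = 2^64 + 5, stored as little-endian 64-bit limbs.
    let mut n = [5u64, 1u64];
    let r = short_div_rem(&mut n, 10);
    // (2^64 + 5) = 10 * 1_844_674_407_370_955_162 + 1
    assert_eq!((n, r), ([1_844_674_407_370_955_162, 0], 1));
    println!("quotient limbs = {:?}, remainder = {}", n, r);
}
```

The bit-by-bit long division kept as the fallback costs `precision` iterations no matter how small the divisor is, which is what made parsing many decimal literals slow (#43828).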
bors committed Aug 24, 2017
2 parents 2c0558f + b9c69ec commit c0771f2
Showing 1 changed file with 96 additions and 30 deletions.
126 changes: 96 additions & 30 deletions src/librustc_apfloat/ieee.rs
@@ -460,18 +460,15 @@ impl<S: Semantics> fmt::Display for IeeeFloat<S> {
// rem <- sig % 10
// sig <- sig / 10
let mut rem = 0;
for limb in sig.iter_mut().rev() {
// We don't have an integer twice as wide as Limb,
// so we have to split the divrem into two halves.
const HALF_BITS: usize = LIMB_BITS / 2;
let mut halves = [*limb & ((1 << HALF_BITS) - 1), *limb >> HALF_BITS];
for half in halves.iter_mut().rev() {
*half |= rem << HALF_BITS;
rem = *half % 10;
*half /= 10;
}
*limb = halves[0] | (halves[1] << HALF_BITS);
}

// Use 64-bit division and remainder, with 32-bit chunks from sig.
sig::each_chunk(&mut sig, 32, |chunk| {
let chunk = chunk as u32;
let combined = ((rem as u64) << 32) | (chunk as u64);
rem = (combined % 10) as u8;
(combined / 10) as u32 as Limb
});

// Reduce the significand to avoid wasting time dividing 0's.
while sig.last() == Some(&0) {
sig.pop();
@@ -491,7 +488,7 @@ impl<S: Semantics> fmt::Display for IeeeFloat<S> {
exp += 1;
} else {
in_trail = false;
buffer.push(b'0' + digit as u8);
buffer.push(b'0' + digit);
}
}

@@ -2065,7 +2062,7 @@ impl<S: Semantics> IeeeFloat<S> {
};

// Attempt dec_sig * 10^dec_exp with increasing precision.
let mut attempt = 1;
let mut attempt = 0;
loop {
let calc_precision = (LIMB_BITS << attempt) - 1;
attempt += 1;
@@ -2310,6 +2307,17 @@ mod sig {
limbs.iter().all(|&l| l == 0)
}

/// One, not zero, based LSB. That is, returns 0 for a zeroed significand.
pub(super) fn olsb(limbs: &[Limb]) -> usize {
for i in 0..limbs.len() {
if limbs[i] != 0 {
return i * LIMB_BITS + limbs[i].trailing_zeros() as usize + 1;
}
}

0
}

/// One, not zero, based MSB. That is, returns 0 for a zeroed significand.
pub(super) fn omsb(limbs: &[Limb]) -> usize {
for i in (0..limbs.len()).rev() {
Expand Down Expand Up @@ -2468,6 +2476,20 @@ mod sig {
}
}

/// For every consecutive chunk of `bits` bits from `limbs`,
/// going from most significant to the least significant bits,
/// call `f` to transform those bits and store the result back.
pub(super) fn each_chunk<F: FnMut(Limb) -> Limb>(limbs: &mut [Limb], bits: usize, mut f: F) {
assert_eq!(LIMB_BITS % bits, 0);
for limb in limbs.iter_mut().rev() {
let mut r = 0;
for i in (0..LIMB_BITS / bits).rev() {
r |= f((*limb >> (i * bits)) & ((1 << bits) - 1)) << (i * bits);
}
*limb = r;
}
}

/// Increment in-place, return the carry flag.
pub(super) fn increment(dst: &mut [Limb]) -> Limb {
for x in dst {
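To illustrate the new `each_chunk` helper added above, here is a standalone copy, assuming 64-bit limbs for brevity (the crate's actual `Limb` type may be wider): `f` sees the chunks from most significant to least significant, which is the order a carried remainder needs to flow in for short division.

```rust
type Limb = u64;             // assumption for this sketch only
const LIMB_BITS: usize = 64; // likewise

// Same shape as the helper added in this commit.
fn each_chunk<F: FnMut(Limb) -> Limb>(limbs: &mut [Limb], bits: usize, mut f: F) {
    assert_eq!(LIMB_BITS % bits, 0);
    for limb in limbs.iter_mut().rev() {
        let mut r = 0;
        for i in (0..LIMB_BITS / bits).rev() {
            r |= f((*limb >> (i * bits)) & ((1 << bits) - 1)) << (i * bits);
        }
        *limb = r;
    }
}

fn main() {
    // Two limbs, visited as 32-bit chunks from the top down:
    // 0xDEAD_BEEF, 0x0000_0002, 0x0000_0000, 0x0000_000A.
    let mut limbs: [Limb; 2] = [0x0000_0000_0000_000A, 0xDEAD_BEEF_0000_0002];
    each_chunk(&mut limbs, 32, |chunk| {
        println!("chunk = {:#010x}", chunk);
        chunk // identity transform: leave the significand unchanged
    });
    assert_eq!(limbs, [0x0000_0000_0000_000A, 0xDEAD_BEEF_0000_0002]);
}
```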
@@ -2686,10 +2708,6 @@ mod sig {
divisor: &mut [Limb],
precision: usize,
) -> Loss {
// Zero the quotient before setting bits in it.
for x in &mut quotient[..limbs_for_bits(precision)] {
*x = 0;
}

// Normalize the divisor.
let bits = precision - omsb(divisor);
@@ -2700,6 +2718,13 @@ mod sig {
let bits = precision - omsb(dividend);
shift_left(dividend, exp, bits);

// Division by 1.
let olsb_divisor = olsb(divisor);
if olsb_divisor == precision {
quotient.copy_from_slice(dividend);
return Loss::ExactlyZero;
}

// Ensure the dividend >= divisor initially for the loop below.
// Incidentally, this means that the division loop below is
// guaranteed to set the integer bit to one.
@@ -2708,6 +2733,58 @@ mod sig {
assert_ne!(cmp(dividend, divisor), Ordering::Less)
}

// Helper for figuring out the lost fraction.
let lost_fraction = |dividend: &[Limb], divisor: &[Limb]| {
match cmp(dividend, divisor) {
Ordering::Greater => Loss::MoreThanHalf,
Ordering::Equal => Loss::ExactlyHalf,
Ordering::Less => {
if is_all_zeros(dividend) {
Loss::ExactlyZero
} else {
Loss::LessThanHalf
}
}
}
};

// Try to perform a (much faster) short division for small divisors.
let divisor_bits = precision - (olsb_divisor - 1);
macro_rules! try_short_div {
($W:ty, $H:ty, $half:expr) => {
if divisor_bits * 2 <= $half {
// Extract the small divisor.
let _: Loss = shift_right(divisor, &mut 0, olsb_divisor - 1);
let divisor = divisor[0] as $H as $W;

// Shift the dividend to produce a quotient with the unit bit set.
let top_limb = *dividend.last().unwrap();
let mut rem = (top_limb >> (LIMB_BITS - (divisor_bits - 1))) as $H;
shift_left(dividend, &mut 0, divisor_bits - 1);

// Apply short division in place on $H (of $half bits) chunks.
each_chunk(dividend, $half, |chunk| {
let chunk = chunk as $H;
let combined = ((rem as $W) << $half) | (chunk as $W);
rem = (combined % divisor) as $H;
(combined / divisor) as $H as Limb
});
quotient.copy_from_slice(dividend);

return lost_fraction(&[(rem as Limb) << 1], &[divisor as Limb]);
}
}
}

try_short_div!(u32, u16, 16);
try_short_div!(u64, u32, 32);
try_short_div!(u128, u64, 64);

// Zero the quotient before setting bits in it.
for x in &mut quotient[..limbs_for_bits(precision)] {
*x = 0;
}

// Long division.
for bit in (0..precision).rev() {
if cmp(dividend, divisor) != Ordering::Less {
Expand All @@ -2717,17 +2794,6 @@ mod sig {
shift_left(dividend, &mut 0, 1);
}

// Figure out the lost fraction.
match cmp(dividend, divisor) {
Ordering::Greater => Loss::MoreThanHalf,
Ordering::Equal => Loss::ExactlyHalf,
Ordering::Less => {
if is_all_zeros(dividend) {
Loss::ExactlyZero
} else {
Loss::LessThanHalf
}
}
}
lost_fraction(dividend, divisor)
}
}
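For reference, the `lost_fraction` helper classifies what was rounded away by comparing a scaled remainder against the divisor: greater means more than half an ulp was lost, equal means exactly half, and a nonzero smaller value means less than half. A single-word sketch of the same rule (the `Loss` variants are the ones referenced in the diff; the free function is illustrative):

```rust
use std::cmp::Ordering;

// Mirrors the Loss variants referenced in the diff above.
#[derive(Debug, PartialEq)]
enum Loss {
    ExactlyZero,
    LessThanHalf,
    ExactlyHalf,
    MoreThanHalf,
}

/// Classify the lost fraction relative to half an ulp, given a scaled
/// remainder. Standalone single-word version of the multi-limb helper.
fn lost_fraction(rem_times_two: u64, divisor: u64) -> Loss {
    match rem_times_two.cmp(&divisor) {
        Ordering::Greater => Loss::MoreThanHalf,
        Ordering::Equal => Loss::ExactlyHalf,
        Ordering::Less if rem_times_two == 0 => Loss::ExactlyZero,
        Ordering::Less => Loss::LessThanHalf,
    }
}

fn main() {
    // Remainder 7 when dividing by 10: 2 * 7 = 14 > 10, more than half lost.
    assert_eq!(lost_fraction(14, 10), Loss::MoreThanHalf);
    // Remainder 5 when dividing by 10: 2 * 5 == 10, exactly half.
    assert_eq!(lost_fraction(10, 10), Loss::ExactlyHalf);
    // Exact division loses nothing.
    assert_eq!(lost_fraction(0, 10), Loss::ExactlyZero);
}
```

The short-division path feeds it twice the final remainder against the divisor, matching the `(rem as Limb) << 1` call in the diff.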
