@@ -279,7 +279,7 @@ macro_rules! impl_Display {
279279 // Format per two digits from the lookup table.
280280 if remain > 9 {
281281 // SAFETY: All of the decimals fit in buf due to MAX_DEC_N
282- // and the while condition ensures at least 2 more decimals.
282+ // and the if condition ensures at least 2 more decimals.
283283 unsafe { core:: hint:: assert_unchecked( offset >= 2 ) }
284284 // SAFETY: The offset counts down from its initial buf.len()
285285 // without underflow due to the previous precondition.
@@ -565,93 +565,6 @@ mod imp {
565565}
566566impl_Exp ! ( i128 , u128 as u128 via to_u128 named exp_u128) ;
567567
568- /// Helper function for writing a u64 into `buf` going from last to first, with `curr`.
569- fn parse_u64_into < const N : usize > ( mut n : u64 , buf : & mut [ MaybeUninit < u8 > ; N ] , curr : & mut usize ) {
570- let buf_ptr = MaybeUninit :: slice_as_mut_ptr ( buf) ;
571- let lut_ptr = DEC_DIGITS_LUT . as_ptr ( ) ;
572- assert ! ( * curr > 19 ) ;
573-
574- // SAFETY:
575- // Writes at most 19 characters into the buffer. Guaranteed that any ptr into LUT is at most
576- // 198, so will never OOB. There is a check above that there are at least 19 characters
577- // remaining.
578- unsafe {
579- if n >= 1e16 as u64 {
580- let to_parse = n % 1e16 as u64 ;
581- n /= 1e16 as u64 ;
582-
583- // Some of these are nops but it looks more elegant this way.
584- let d1 = ( ( to_parse / 1e14 as u64 ) % 100 ) << 1 ;
585- let d2 = ( ( to_parse / 1e12 as u64 ) % 100 ) << 1 ;
586- let d3 = ( ( to_parse / 1e10 as u64 ) % 100 ) << 1 ;
587- let d4 = ( ( to_parse / 1e8 as u64 ) % 100 ) << 1 ;
588- let d5 = ( ( to_parse / 1e6 as u64 ) % 100 ) << 1 ;
589- let d6 = ( ( to_parse / 1e4 as u64 ) % 100 ) << 1 ;
590- let d7 = ( ( to_parse / 1e2 as u64 ) % 100 ) << 1 ;
591- let d8 = ( ( to_parse / 1e0 as u64 ) % 100 ) << 1 ;
592-
593- * curr -= 16 ;
594-
595- ptr:: copy_nonoverlapping ( lut_ptr. add ( d1 as usize ) , buf_ptr. add ( * curr + 0 ) , 2 ) ;
596- ptr:: copy_nonoverlapping ( lut_ptr. add ( d2 as usize ) , buf_ptr. add ( * curr + 2 ) , 2 ) ;
597- ptr:: copy_nonoverlapping ( lut_ptr. add ( d3 as usize ) , buf_ptr. add ( * curr + 4 ) , 2 ) ;
598- ptr:: copy_nonoverlapping ( lut_ptr. add ( d4 as usize ) , buf_ptr. add ( * curr + 6 ) , 2 ) ;
599- ptr:: copy_nonoverlapping ( lut_ptr. add ( d5 as usize ) , buf_ptr. add ( * curr + 8 ) , 2 ) ;
600- ptr:: copy_nonoverlapping ( lut_ptr. add ( d6 as usize ) , buf_ptr. add ( * curr + 10 ) , 2 ) ;
601- ptr:: copy_nonoverlapping ( lut_ptr. add ( d7 as usize ) , buf_ptr. add ( * curr + 12 ) , 2 ) ;
602- ptr:: copy_nonoverlapping ( lut_ptr. add ( d8 as usize ) , buf_ptr. add ( * curr + 14 ) , 2 ) ;
603- }
604- if n >= 1e8 as u64 {
605- let to_parse = n % 1e8 as u64 ;
606- n /= 1e8 as u64 ;
607-
608- // Some of these are nops but it looks more elegant this way.
609- let d1 = ( ( to_parse / 1e6 as u64 ) % 100 ) << 1 ;
610- let d2 = ( ( to_parse / 1e4 as u64 ) % 100 ) << 1 ;
611- let d3 = ( ( to_parse / 1e2 as u64 ) % 100 ) << 1 ;
612- let d4 = ( ( to_parse / 1e0 as u64 ) % 100 ) << 1 ;
613- * curr -= 8 ;
614-
615- ptr:: copy_nonoverlapping ( lut_ptr. add ( d1 as usize ) , buf_ptr. add ( * curr + 0 ) , 2 ) ;
616- ptr:: copy_nonoverlapping ( lut_ptr. add ( d2 as usize ) , buf_ptr. add ( * curr + 2 ) , 2 ) ;
617- ptr:: copy_nonoverlapping ( lut_ptr. add ( d3 as usize ) , buf_ptr. add ( * curr + 4 ) , 2 ) ;
618- ptr:: copy_nonoverlapping ( lut_ptr. add ( d4 as usize ) , buf_ptr. add ( * curr + 6 ) , 2 ) ;
619- }
620- // `n` < 1e8 < (1 << 32)
621- let mut n = n as u32 ;
622- if n >= 1e4 as u32 {
623- let to_parse = n % 1e4 as u32 ;
624- n /= 1e4 as u32 ;
625-
626- let d1 = ( to_parse / 100 ) << 1 ;
627- let d2 = ( to_parse % 100 ) << 1 ;
628- * curr -= 4 ;
629-
630- ptr:: copy_nonoverlapping ( lut_ptr. add ( d1 as usize ) , buf_ptr. add ( * curr + 0 ) , 2 ) ;
631- ptr:: copy_nonoverlapping ( lut_ptr. add ( d2 as usize ) , buf_ptr. add ( * curr + 2 ) , 2 ) ;
632- }
633-
634- // `n` < 1e4 < (1 << 16)
635- let mut n = n as u16 ;
636- if n >= 100 {
637- let d1 = ( n % 100 ) << 1 ;
638- n /= 100 ;
639- * curr -= 2 ;
640- ptr:: copy_nonoverlapping ( lut_ptr. add ( d1 as usize ) , buf_ptr. add ( * curr) , 2 ) ;
641- }
642-
643- // decode last 1 or 2 chars
644- if n < 10 {
645- * curr -= 1 ;
646- * buf_ptr. add ( * curr) = ( n as u8 ) + b'0' ;
647- } else {
648- let d1 = n << 1 ;
649- * curr -= 2 ;
650- ptr:: copy_nonoverlapping ( lut_ptr. add ( d1 as usize ) , buf_ptr. add ( * curr) , 2 ) ;
651- }
652- }
653- }
654-
655568#[ stable( feature = "rust1" , since = "1.0.0" ) ]
656569impl fmt:: Display for u128 {
657570 fn fmt ( & self , f : & mut fmt:: Formatter < ' _ > ) -> fmt:: Result {
@@ -662,90 +575,153 @@ impl fmt::Display for u128 {
662575#[ stable( feature = "rust1" , since = "1.0.0" ) ]
663576impl fmt:: Display for i128 {
664577 fn fmt ( & self , f : & mut fmt:: Formatter < ' _ > ) -> fmt:: Result {
665- let is_nonnegative = * self >= 0 ;
666- let n = if is_nonnegative {
667- self . to_u128 ( )
668- } else {
669- // convert the negative num to positive by summing 1 to its 2s complement
670- ( !self . to_u128 ( ) ) . wrapping_add ( 1 )
671- } ;
672- fmt_u128 ( n, is_nonnegative, f)
578+ fmt_u128 ( self . unsigned_abs ( ) , * self >= 0 , f)
673579 }
674580}
675581
676- /// Specialized optimization for u128. Instead of taking two items at a time, it splits
677- /// into at most 2 u64s, and then chunks by 10e16, 10e8, 10e4, 10e2, and then 10e1.
678- /// It also has to handle 1 last item, as 10^40 > 2^128 > 10^39, whereas
679- /// 10^20 > 2^64 > 10^19.
582+ /// Formatting optimized for u128. The 128-bit arithmetic is kept to a minimum by
583+ /// processing in batches of 16 decimals at a time, each of which fits in a u64.
680584fn fmt_u128 ( n : u128 , is_nonnegative : bool , f : & mut fmt:: Formatter < ' _ > ) -> fmt:: Result {
585+ // Optimize common-case zero, which would also need special treatment due to
586+ // its "leading" zero.
587+ if n == 0 {
588+ return f. pad_integral ( true , "" , "0" ) ;
589+ }
590+
591+ // u128::MAX has 39 significant decimals.
681592 const MAX_DEC_N : usize = u128:: MAX . ilog ( 10 ) as usize + 1 ;
593+ // Buffer decimals with right alignment.
682594 let mut buf = [ MaybeUninit :: < u8 > :: uninit ( ) ; MAX_DEC_N ] ;
683- let mut curr = buf. len ( ) ;
684-
685- let ( n, rem) = udiv_1e19 ( n) ;
686- parse_u64_into ( rem, & mut buf, & mut curr) ;
687-
688- if n != 0 {
689- // 0 pad up to point
690- let target = buf. len ( ) - 19 ;
691- // SAFETY: Guaranteed that we wrote at most 19 bytes, and there must be space
692- // remaining since it has length 39
693- unsafe {
694- ptr:: write_bytes (
695- MaybeUninit :: slice_as_mut_ptr ( & mut buf) . add ( target) ,
696- b'0' ,
697- curr - target,
698- ) ;
699- }
700- curr = target;
701-
702- let ( n, rem) = udiv_1e19 ( n) ;
703- parse_u64_into ( rem, & mut buf, & mut curr) ;
704- // Should this following branch be annotated with unlikely?
705- if n != 0 {
706- let target = buf. len ( ) - 38 ;
707- // The raw `buf_ptr` pointer is only valid until `buf` is used the next time,
708- // buf `buf` is not used in this scope so we are good.
709- let buf_ptr = MaybeUninit :: slice_as_mut_ptr ( & mut buf) ;
710- // SAFETY: At this point we wrote at most 38 bytes, pad up to that point,
711- // There can only be at most 1 digit remaining.
712- unsafe {
713- ptr:: write_bytes ( buf_ptr. add ( target) , b'0' , curr - target) ;
714- curr = target - 1 ;
715- * buf_ptr. add ( curr) = ( n as u8 ) + b'0' ;
716- }
595+
596+ // Take the 16 least-significant decimals.
597+ let ( quot_1e16, mod_1e16) = div_rem_1e16 ( n) ;
598+ let ( mut remain, mut offset) = if quot_1e16 == 0 {
599+ ( mod_1e16, MAX_DEC_N )
600+ } else {
601+ // Write digits at buf[23..39].
602+ enc_16lsd :: < { MAX_DEC_N - 16 } > ( & mut buf, mod_1e16) ;
603+
604+ // Take another 16 decimals.
605+ let ( quot2, mod2) = div_rem_1e16 ( quot_1e16) ;
606+ if quot2 == 0 {
607+ ( mod2, MAX_DEC_N - 16 )
608+ } else {
609+ // Write digits at buf[7..23].
610+ enc_16lsd :: < { MAX_DEC_N - 32 } > ( & mut buf, mod2) ;
611+ // Quot2 has at most 7 decimals remaining after two 1e16 divisions.
612+ ( quot2 as u64 , MAX_DEC_N - 32 )
717613 }
614+ } ;
615+
616+ // Format per four digits from the lookup table.
617+ while remain > 999 {
618+ // SAFETY: All of the decimals fit in buf due to MAX_DEC_N
619+ // and the while condition ensures at least 4 more decimals.
620+ unsafe { core:: hint:: assert_unchecked ( offset >= 4 ) }
621+ // SAFETY: The offset counts down from its initial buf.len()
622+ // without underflow due to the previous precondition.
623+ unsafe { core:: hint:: assert_unchecked ( offset <= buf. len ( ) ) }
624+ offset -= 4 ;
625+
626+ // pull two pairs
627+ let quad = remain % 1_00_00 ;
628+ remain /= 1_00_00 ;
629+ let pair1 = ( quad / 100 ) as usize ;
630+ let pair2 = ( quad % 100 ) as usize ;
631+ buf[ offset + 0 ] . write ( DEC_DIGITS_LUT [ pair1 * 2 + 0 ] ) ;
632+ buf[ offset + 1 ] . write ( DEC_DIGITS_LUT [ pair1 * 2 + 1 ] ) ;
633+ buf[ offset + 2 ] . write ( DEC_DIGITS_LUT [ pair2 * 2 + 0 ] ) ;
634+ buf[ offset + 3 ] . write ( DEC_DIGITS_LUT [ pair2 * 2 + 1 ] ) ;
635+ }
636+
637+ // Format per two digits from the lookup table.
638+ if remain > 9 {
639+ // SAFETY: All of the decimals fit in buf due to MAX_DEC_N
640+ // and the if condition ensures at least 2 more decimals.
641+ unsafe { core:: hint:: assert_unchecked ( offset >= 2 ) }
642+ // SAFETY: The offset counts down from its initial buf.len()
643+ // without underflow due to the previous precondition.
644+ unsafe { core:: hint:: assert_unchecked ( offset <= buf. len ( ) ) }
645+ offset -= 2 ;
646+
647+ let pair = ( remain % 100 ) as usize ;
648+ remain /= 100 ;
649+ buf[ offset + 0 ] . write ( DEC_DIGITS_LUT [ pair * 2 + 0 ] ) ;
650+ buf[ offset + 1 ] . write ( DEC_DIGITS_LUT [ pair * 2 + 1 ] ) ;
651+ }
652+
653+ // Format the last remaining digit, if any.
654+ if remain != 0 {
655+ // SAFETY: All of the decimals fit in buf due to MAX_DEC_N
656+ // and the if condition ensures at least 1 more decimal.
657+ unsafe { core:: hint:: assert_unchecked ( offset >= 1 ) }
658+ // SAFETY: The offset counts down from its initial buf.len()
659+ // without underflow due to the previous precondition.
660+ unsafe { core:: hint:: assert_unchecked ( offset <= buf. len ( ) ) }
661+ offset -= 1 ;
662+
663+ // Masking with 15 bounds the lookup index: either the compiler already sees
664+ // that remain < 10, or the mask lets it omit the bounds check up next.
665+ let last = ( remain & 15 ) as usize ;
666+ buf[ offset] . write ( DEC_DIGITS_LUT [ last * 2 + 1 ] ) ;
667+ // not used: remain = 0;
718668 }
719669
720- // SAFETY: `curr` > 0 (since we made `buf` large enough), and all the chars are valid
721- // UTF-8 since `DEC_DIGITS_LUT` is
722- let buf_slice = unsafe {
670+ // SAFETY: Offset never exceeds buf.len(), and all buf content from offset onward is written.
671+ let written = unsafe { buf. get_unchecked ( offset..) } ;
672+ // SAFETY: Writes use ASCII from the lookup table exclusively.
673+ let as_str = unsafe {
723674 str:: from_utf8_unchecked ( slice:: from_raw_parts (
724- MaybeUninit :: slice_as_mut_ptr ( & mut buf ) . add ( curr ) ,
725- buf . len ( ) - curr ,
675+ MaybeUninit :: slice_as_ptr ( written ) ,
676+ written . len ( ) ,
726677 ) )
727678 } ;
728- f. pad_integral ( is_nonnegative, "" , buf_slice )
679+ f. pad_integral ( is_nonnegative, "" , as_str )
729680}
730681
731- /// Partition of `n` into n > 1e19 and rem <= 1e19
682+ /// Encodes the 16 least-significant decimals of `n`, zero-padded, into
683+ /// `buf[OFFSET..OFFSET + 16]`.
684+ fn enc_16lsd < const OFFSET : usize > ( buf : & mut [ MaybeUninit < u8 > ; 39 ] , n : u64 ) {
685+ // Consume the least-significant decimals from a working copy.
686+ let mut remain = n;
687+
688+ // Format per four digits from the lookup table.
689+ for quad_index in ( 0 ..4 ) . rev ( ) {
690+ // pull two pairs
691+ let quad = remain % 1_00_00 ;
692+ remain /= 1_00_00 ;
693+ let pair1 = ( quad / 100 ) as usize ;
694+ let pair2 = ( quad % 100 ) as usize ;
695+ buf[ quad_index * 4 + OFFSET + 0 ] . write ( DEC_DIGITS_LUT [ pair1 * 2 + 0 ] ) ;
696+ buf[ quad_index * 4 + OFFSET + 1 ] . write ( DEC_DIGITS_LUT [ pair1 * 2 + 1 ] ) ;
697+ buf[ quad_index * 4 + OFFSET + 2 ] . write ( DEC_DIGITS_LUT [ pair2 * 2 + 0 ] ) ;
698+ buf[ quad_index * 4 + OFFSET + 3 ] . write ( DEC_DIGITS_LUT [ pair2 * 2 + 1 ] ) ;
699+ }
700+ }
701+
702+ /// Euclidean division by the constant 1E16: returns the quotient and the remainder
703+ /// of `n`, which effectively splits off the 16 least-significant decimals of `n`.
732704///
733- /// Integer division algorithm is based on the following paper:
705+ /// The integer division algorithm is based on the following paper:
734706///
735707/// T. Granlund and P. Montgomery, “Division by Invariant Integers Using Multiplication”
736708/// in Proc. of the SIGPLAN94 Conference on Programming Language Design and
737709/// Implementation, 1994, pp. 61–72
738710///
739- fn udiv_1e19 ( n : u128 ) -> ( u128 , u64 ) {
740- const DIV : u64 = 1e19 as u64 ;
741- const FACTOR : u128 = 156927543384667019095894735580191660403 ;
711+ #[ inline]
712+ fn div_rem_1e16 ( n : u128 ) -> ( u128 , u64 ) {
713+ const D : u128 = 1_0000_0000_0000_0000 ;
714+ // The check inlines well with the caller flow.
715+ if n < D {
716+ return ( 0 , n as u64 ) ;
717+ }
742718
743- let quot = if n < 1 << 83 {
744- ( ( n >> 19 ) as u64 / ( DIV >> 19 ) ) as u128
745- } else {
746- n. widening_mul ( FACTOR ) . 1 >> 62
747- } ;
719+ // These constant values are computed with the CHOOSE_MULTIPLIER procedure
720+ // from the Granlund & Montgomery paper, using N=128, prec=128 and d=1E16.
721+ const M_HIGH : u128 = 76624777043294442917917351357515459181 ;
722+ const SH_POST : u8 = 51 ;
748723
749- let rem = ( n - quot * DIV as u128 ) as u64 ;
750- ( quot, rem)
724+ let quot = n. widening_mul ( M_HIGH ) . 1 >> SH_POST ;
725+ let rem = n - quot * D ;
726+ ( quot, rem as u64 )
751727}
0 commit comments