From ebbc8a799020243f703c9a318239af467d1d1597 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Mon, 11 Sep 2023 14:21:23 -0400
Subject: [PATCH] make "dec" and ryu functions faster and simpler

We had some common code in `Ryu.append_c_digits` that can be combined with
Base logic for the same thing. But it turns out all of this duplicated
code in Ryu seems to just make it run slightly slower in most cases. The
old version had many more branches to check, even though often numbers
are small, so only the last check is meaningful. But the assumption that
it would be faster even if all of them were used also seems to not hold
up in practice. Particularly for a function like `append_nine_digits`
which unrolls completely, but the complicated version has slightly more
data dependencies because of the way it is written.

Similarly, we replace `unsafe_copyto!` with `@inbounds[]`, since this is
better for the optimizer, which doesn't need to treat this operation as
an unknown reference escape.

Lastly, we use the append_nine_digits trick from Ryu to make printing
of arbitrary big numbers much faster.

```
julia> @btime string(typemax(Int128))
  402.345 ns (2 allocations: 120 bytes) # before
  151.139 ns (2 allocations: 120 bytes) # after
```
---
 base/intfuncs.jl     |  86 ++++++++++++++----
 base/ryu/exp.jl      |  70 ++++++++-------
 base/ryu/fixed.jl    |   2 +-
 base/ryu/shortest.jl | 209 ++++++++++++++++++-------------------------
 base/ryu/utils.jl    | 118 +++---------------------
 5 files changed, 200 insertions(+), 285 deletions(-)

diff --git a/base/intfuncs.jl b/base/intfuncs.jl
index 1b007700f4331..90dc393a0e9b4 100644
--- a/base/intfuncs.jl
+++ b/base/intfuncs.jl
@@ -558,7 +558,7 @@ function bit_ndigits0z(x::Base.BitUnsigned64)
 end
 function bit_ndigits0z(x::UInt128)
     n = 0
-    while x > 0x8ac7230489e80000
+    while x > 0x8ac7230489e80000 # 10^19
         x = div(x,0x8ac7230489e80000)
         n += 19
     end
@@ -724,7 +724,7 @@ function bin(x::Unsigned, pad::Int, neg::Bool)
         x >>= 0x1
         i -= 1
     end
-    if neg; @inbounds a[1]=0x2d; end
+    neg && (@inbounds a[1] = 0x2d) # UInt8('-')
     String(a)
 end
 
@@ -738,29 +738,77 @@ function oct(x::Unsigned, pad::Int, neg::Bool)
         x >>= 0x3
         i -= 1
     end
-    if neg; @inbounds a[1]=0x2d; end
+    neg && (@inbounds a[1] = 0x2d) # UInt8('-')
     String(a)
 end
 
 # 2-digit decimal characters ("00":"99")
-const _dec_d100 = UInt16[(0x30 + i % 10) << 0x8 + (0x30 + i ÷ 10) for i = 0:99]
+const _dec_d100 = UInt16[
+# generating expression: UInt16[(0x30 + i % 10) << 0x8 + (0x30 + i ÷ 10) for i = 0:99]
+#    0 0,    0 1,    0 2,    0 3, and so on in little-endian
+  0x3030, 0x3130, 0x3230, 0x3330, 0x3430, 0x3530, 0x3630, 0x3730, 0x3830, 0x3930,
+  0x3031, 0x3131, 0x3231, 0x3331, 0x3431, 0x3531, 0x3631, 0x3731, 0x3831, 0x3931,
+  0x3032, 0x3132, 0x3232, 0x3332, 0x3432, 0x3532, 0x3632, 0x3732, 0x3832, 0x3932,
+  0x3033, 0x3133, 0x3233, 0x3333, 0x3433, 0x3533, 0x3633, 0x3733, 0x3833, 0x3933,
+  0x3034, 0x3134, 0x3234, 0x3334, 0x3434, 0x3534, 0x3634, 0x3734, 0x3834, 0x3934,
+  0x3035, 0x3135, 0x3235, 0x3335, 0x3435, 0x3535, 0x3635, 0x3735, 0x3835, 0x3935,
+  0x3036, 0x3136, 0x3236, 0x3336, 0x3436, 0x3536, 0x3636, 0x3736, 0x3836, 0x3936,
+  0x3037, 0x3137, 0x3237, 0x3337, 0x3437, 0x3537, 0x3637, 0x3737, 0x3837, 0x3937,
+  0x3038, 0x3138, 0x3238, 0x3338, 0x3438, 0x3538, 0x3638, 0x3738, 0x3838, 0x3938,
+  0x3039, 0x3139, 0x3239, 0x3339, 0x3439, 0x3539, 0x3639, 0x3739, 0x3839, 0x3939
+]
 
-function dec(x::Unsigned, pad::Int, neg::Bool)
-    n = neg + ndigits(x, pad=pad)
-    a = StringVector(n)
-    i = n
-    @inbounds while i >= 2
-        d, r = divrem(x, 0x64)
-        d100 = _dec_d100[(r % Int)::Int + 1]
-        a[i-1] = d100 % UInt8
-        a[i] = (d100 >> 0x8) % UInt8
-        x = oftype(x, d)
+function append_c_digits(olength::Int, digits::Unsigned, buf, pos::Int) # write `olength` decimal digits of `digits` into buf[pos : pos+olength-1]
+    i = olength # number of digit slots still to fill; they are filled right-to-left
+    while i >= 2
+        d, c = divrem(digits, 0x64) # split off the two lowest decimal digits (0x64 == 100)
+        digits = oftype(digits, d)
+        @inbounds d100 = _dec_d100[(c % Int) + 1] # ASCII pair for `c`: tens digit in the low byte, ones digit in the high byte
+        @inbounds buf[pos + i - 2] = d100 % UInt8 # unchecked store: the caller must provide a large enough buffer
+        @inbounds buf[pos + i - 1] = (d100 >> 0x8) % UInt8
         i -= 2
     end
-    if i > neg
-        @inbounds a[i] = 0x30 + (rem(x, 0xa) % UInt8)::UInt8
+    if i == 1 # odd `olength`: exactly one leftmost slot remains
+        @inbounds buf[pos] = UInt8('0') + rem(digits, 0xa) % UInt8
+        i -= 1
     end
-    if neg; @inbounds a[1]=0x2d; end
+    return pos + olength # index just past the last digit written
+end
+
+function append_nine_digits(digits::Unsigned, buf, pos::Int) # write exactly nine decimal digits into buf[pos : pos+8] and return pos + 9
+    if digits == 0 # all-zero chunk: store nine '0' bytes directly, no division needed
+        for _ = 1:9
+            @inbounds buf[pos] = UInt8('0')
+            pos += 1
+        end
+        return pos
+    end
+    return @inline append_c_digits(9, digits, buf, pos) # force loop-unrolling on the length
+end
+
+function append_c_digits_fast(olength::Int, digits::Unsigned, buf, pos::Int) # like append_c_digits, but first peels nine-digit chunks off wide integers
+    i = olength
+    # n.b. olength may be larger than required to print all of `digits` (and will be padded
+    # with zeros), but the printed number will be undefined if it is smaller, and may include
+    # bits of both the high and low bytes.
+    maxpow10 = 0x3b9aca00 # 10^9 as UInt32
+    while i > 9 && digits > typemax(UInt)
+        # do everything in cheap math chunks, using the processor's native math size
+        d, c = divrem(digits, maxpow10)
+        digits = oftype(digits, d)
+        append_nine_digits(c % UInt32, buf, pos + i - 9) # c < 10^9, so the narrowing is lossless; fills the rightmost nine open slots
+        i -= 9
+    end
+    append_c_digits(i, digits % UInt, buf, pos) # write the remaining `i` leading digits with native-width arithmetic
+    return pos + olength
+end
+
+
+function dec(x::Unsigned, pad::Int, neg::Bool) # decimal string for `x`, with a leading '-' when `neg` is set
+    n = neg + ndigits(x, pad=pad) # output length: optional sign slot plus the (padded) digit count
+    a = StringVector(n)
+    append_c_digits_fast(n, x, a, 1) # fills all n bytes; the surplus leading slot (the sign slot) receives a '0'
+    neg && (@inbounds a[1] = 0x2d) # UInt8('-')
     String(a)
 end
 
@@ -781,7 +829,7 @@ function hex(x::Unsigned, pad::Int, neg::Bool)
         d = (x % UInt8)::UInt8 & 0xf
         @inbounds a[i] = d + ifelse(d > 0x9, 0x57, 0x30)
     end
-    if neg; @inbounds a[1]=0x2d; end
+    neg && (@inbounds a[1] = 0x2d) # UInt8('-')
     String(a)
 end
 
@@ -806,7 +854,7 @@ function _base(base::Integer, x::Integer, pad::Int, neg::Bool)
         end
         i -= 1
     end
-    if neg; @inbounds a[1]=0x2d; end
+    neg && (@inbounds a[1] = 0x2d) # UInt8('-')
     String(a)
 end
 
diff --git a/base/ryu/exp.jl b/base/ryu/exp.jl
index 30291212d014d..4249a9ea1b519 100644
--- a/base/ryu/exp.jl
+++ b/base/ryu/exp.jl
@@ -8,33 +8,33 @@ function writeexp(buf, pos, v::T,
 
     # special cases
     if x == 0
-        buf[pos] = UInt8('0')
+        @inbounds buf[pos] = UInt8('0')
         pos += 1
         if precision > 0 && !trimtrailingzeros
-            buf[pos] = decchar
+            @inbounds buf[pos] = decchar
             pos += 1
             for _ = 1:precision
-                buf[pos] = UInt8('0')
+                @inbounds buf[pos] = UInt8('0')
                 pos += 1
             end
         elseif hash
-            buf[pos] = decchar
+            @inbounds buf[pos] = decchar
             pos += 1
         end
-        buf[pos] = expchar
-        buf[pos + 1] = UInt8('+')
-        buf[pos + 2] = UInt8('0')
-        buf[pos + 3] = UInt8('0')
+        @inbounds buf[pos] = expchar
+        @inbounds buf[pos + 1] = UInt8('+')
+        @inbounds buf[pos + 2] = UInt8('0')
+        @inbounds buf[pos + 3] = UInt8('0')
         return pos + 4
     elseif isnan(x)
-        buf[pos] = UInt8('N')
-        buf[pos + 1] = UInt8('a')
-        buf[pos + 2] = UInt8('N')
+        @inbounds buf[pos] = UInt8('N')
+        @inbounds buf[pos + 1] = UInt8('a')
+        @inbounds buf[pos + 2] = UInt8('N')
         return pos + 3
     elseif !isfinite(x)
-        buf[pos] = UInt8('I')
-        buf[pos + 1] = UInt8('n')
-        buf[pos + 2] = UInt8('f')
+        @inbounds buf[pos] = UInt8('I')
+        @inbounds buf[pos + 1] = UInt8('n')
+        @inbounds buf[pos + 2] = UInt8('f')
         return pos + 3
     end
 
@@ -80,10 +80,10 @@ function writeexp(buf, pos, v::T,
                 if precision > 1
                     pos = append_d_digits(availableDigits, digits, buf, pos, decchar)
                 else
-                    buf[pos] = UInt8('0') + digits
+                    @inbounds buf[pos] = UInt8('0') + digits
                     pos += 1
                     if hash
-                        buf[pos] = decchar
+                        @inbounds buf[pos] = decchar
                         pos += 1
                     end
                 end
@@ -121,10 +121,10 @@ function writeexp(buf, pos, v::T,
                 if precision > 1
                     pos = append_d_digits(availableDigits, digits, buf, pos, decchar)
                 else
-                    buf[pos] = UInt8('0') + digits
+                    @inbounds buf[pos] = UInt8('0') + digits
                     pos += 1
                     if hash
-                        buf[pos] = decchar
+                        @inbounds buf[pos] = decchar
                         pos += 1
                     end
                 end
@@ -162,7 +162,7 @@ function writeexp(buf, pos, v::T,
     if printedDigits != 0
         if digits == 0
             for _ = 1:maximum
-                buf[pos] = UInt8('0')
+                @inbounds buf[pos] = UInt8('0')
                 pos += 1
             end
         else
@@ -172,10 +172,10 @@ function writeexp(buf, pos, v::T,
         if precision > 1
             pos = append_d_digits(maximum, digits, buf, pos, decchar)
         else
-            buf[pos] = UInt8('0') + digits
+            @inbounds buf[pos] = UInt8('0') + digits
             pos += 1
             if hash
-                buf[pos] = decchar
+                @inbounds buf[pos] = decchar
                 pos += 1
             end
         end
@@ -184,52 +184,56 @@ function writeexp(buf, pos, v::T,
         roundPos = pos
         while true
             roundPos -= 1
-            if roundPos == (startpos - 1) || buf[roundPos] == UInt8('-') || (plus && buf[roundPos] == UInt8('+')) || (space && buf[roundPos] == UInt8(' '))
-                buf[roundPos + 1] = UInt8('1')
+            if roundPos == (startpos - 1) || (@inbounds buf[roundPos]) == UInt8('-') || (plus && (@inbounds buf[roundPos]) == UInt8('+')) || (space && (@inbounds buf[roundPos]) == UInt8(' '))
+                @inbounds buf[roundPos + 1] = UInt8('1')
                 e += 1
                 break
             end
-            c = roundPos > 0 ? buf[roundPos] : 0x00
+            c = roundPos > 0 ? (@inbounds buf[roundPos]) : 0x00
             if c == decchar
                 continue
             elseif c == UInt8('9')
-                buf[roundPos] = UInt8('0')
+                @inbounds buf[roundPos] = UInt8('0')
                 roundUp = 1
                 continue
             else
                 if roundUp == 2 && UInt8(c) % 2 == 0
                     break
                 end
-                buf[roundPos] = c + 1
+                @inbounds buf[roundPos] = c + 1
                 break
             end
         end
     end
     if trimtrailingzeros
-        while buf[pos - 1] == UInt8('0')
+        while @inbounds buf[pos - 1] == UInt8('0')
             pos -= 1
         end
-        if buf[pos - 1] == decchar && !hash
+        if @inbounds buf[pos - 1] == decchar && !hash
             pos -= 1
         end
     end
     buf[pos] = expchar
     pos += 1
     if e < 0
-        buf[pos] = UInt8('-')
+        @inbounds buf[pos] = UInt8('-')
         pos += 1
         e = -e
     else
-        buf[pos] = UInt8('+')
+        @inbounds buf[pos] = UInt8('+')
         pos += 1
     end
     if e >= 100
         c = e % 10
-        unsafe_copyto!(buf, pos, DIGIT_TABLE, 2 * div(e, 10) + 1, 2)
-        buf[pos + 2] = UInt8('0') + c
+        @inbounds d100 = DIGIT_TABLE16[div(e, 10) + 1]
+        @inbounds buf[pos] = d100 % UInt8
+        @inbounds buf[pos + 1] = (d100 >> 0x8) % UInt8
+        @inbounds buf[pos + 2] = UInt8('0') + c
         pos += 3
     else
-        unsafe_copyto!(buf, pos, DIGIT_TABLE, 2 * e + 1, 2)
+        @inbounds d100 = DIGIT_TABLE16[e + 1]
+        @inbounds buf[pos] = d100 % UInt8
+        @inbounds buf[pos + 1] = (d100 >> 0x8) % UInt8
         pos += 2
     end
     return pos
diff --git a/base/ryu/fixed.jl b/base/ryu/fixed.jl
index e0085f5c66dab..969dd70665a7e 100644
--- a/base/ryu/fixed.jl
+++ b/base/ryu/fixed.jl
@@ -59,7 +59,7 @@ function writefixed(buf, pos, v::T,
                 pos = append_nine_digits(digits, buf, pos)
             elseif digits != 0
                 olength = decimallength(digits)
-                pos = append_n_digits(olength, digits, buf, pos)
+                pos = append_c_digits(olength, digits, buf, pos)
                 nonzero = true
             end
             i -= 1
diff --git a/base/ryu/shortest.jl b/base/ryu/shortest.jl
index aaa62ba33c703..32aa993467e7a 100644
--- a/base/ryu/shortest.jl
+++ b/base/ryu/shortest.jl
@@ -232,79 +232,79 @@ function writeshortest(buf::Vector{UInt8}, pos, x::T,
     # special cases
     if x == 0
         if typed && x isa Float16
-            buf[pos] = UInt8('F')
-            buf[pos + 1] = UInt8('l')
-            buf[pos + 2] = UInt8('o')
-            buf[pos + 3] = UInt8('a')
-            buf[pos + 4] = UInt8('t')
-            buf[pos + 5] = UInt8('1')
-            buf[pos + 6] = UInt8('6')
-            buf[pos + 7] = UInt8('(')
+            @inbounds buf[pos] = UInt8('F')
+            @inbounds buf[pos + 1] = UInt8('l')
+            @inbounds buf[pos + 2] = UInt8('o')
+            @inbounds buf[pos + 3] = UInt8('a')
+            @inbounds buf[pos + 4] = UInt8('t')
+            @inbounds buf[pos + 5] = UInt8('1')
+            @inbounds buf[pos + 6] = UInt8('6')
+            @inbounds buf[pos + 7] = UInt8('(')
             pos += 8
         end
         pos = append_sign(x, plus, space, buf, pos)
-        buf[pos] = UInt8('0')
+        @inbounds buf[pos] = UInt8('0')
         pos += 1
         if hash
-            buf[pos] = decchar
+            @inbounds buf[pos] = decchar
             pos += 1
         end
         if precision == -1
-            buf[pos] = UInt8('0')
+            @inbounds buf[pos] = UInt8('0')
             pos += 1
             if typed && x isa Float32
-                buf[pos] = UInt8('f')
-                buf[pos + 1] = UInt8('0')
+                @inbounds buf[pos] = UInt8('f')
+                @inbounds buf[pos + 1] = UInt8('0')
                 pos += 2
             end
             if typed && x isa Float16
-                buf[pos] = UInt8(')')
+                @inbounds buf[pos] = UInt8(')')
                 pos += 1
             end
             return pos
         end
         while hash && precision > 1
-            buf[pos] = UInt8('0')
+            @inbounds buf[pos] = UInt8('0')
             pos += 1
             precision -= 1
         end
         if typed && x isa Float32
-            buf[pos] = UInt8('f')
-            buf[pos + 1] = UInt8('0')
+            @inbounds buf[pos] = UInt8('f')
+            @inbounds buf[pos + 1] = UInt8('0')
             pos += 2
         end
         if typed && x isa Float16
-            buf[pos] = UInt8(')')
+            @inbounds buf[pos] = UInt8(')')
             pos += 1
         end
         return pos
     elseif isnan(x)
         pos = append_sign(x, plus, space, buf, pos)
-        buf[pos] = UInt8('N')
-        buf[pos + 1] = UInt8('a')
-        buf[pos + 2] = UInt8('N')
+        @inbounds buf[pos] = UInt8('N')
+        @inbounds buf[pos + 1] = UInt8('a')
+        @inbounds buf[pos + 2] = UInt8('N')
         if typed
             if x isa Float32
-                buf[pos + 3] = UInt8('3')
-                buf[pos + 4] = UInt8('2')
+                @inbounds buf[pos + 3] = UInt8('3')
+                @inbounds buf[pos + 4] = UInt8('2')
             elseif x isa Float16
-                buf[pos + 3] = UInt8('1')
-                buf[pos + 4] = UInt8('6')
+                @inbounds buf[pos + 3] = UInt8('1')
+                @inbounds buf[pos + 4] = UInt8('6')
             end
         end
         return pos + 3 + (typed && x isa Union{Float32, Float16} ? 2 : 0)
     elseif !isfinite(x)
         pos = append_sign(x, plus, space, buf, pos)
-        buf[pos] = UInt8('I')
-        buf[pos + 1] = UInt8('n')
-        buf[pos + 2] = UInt8('f')
+        @inbounds buf[pos] = UInt8('I')
+        @inbounds buf[pos + 1] = UInt8('n')
+        @inbounds buf[pos + 2] = UInt8('f')
         if typed
             if x isa Float32
-                buf[pos + 3] = UInt8('3')
-                buf[pos + 4] = UInt8('2')
+                @inbounds buf[pos + 3] = UInt8('3')
+                @inbounds buf[pos + 4] = UInt8('2')
             elseif x isa Float16
-                buf[pos + 3] = UInt8('1')
-                buf[pos + 4] = UInt8('6')
+                @inbounds buf[pos + 3] = UInt8('1')
+                @inbounds buf[pos + 4] = UInt8('6')
             end
         end
         return pos + 3 + (typed && x isa Union{Float32, Float16} ? 2 : 0)
@@ -313,14 +313,14 @@ function writeshortest(buf::Vector{UInt8}, pos, x::T,
     output, nexp = reduce_shortest(x, compact ? 999_999 : nothing)
 
     if typed && x isa Float16
-        buf[pos] = UInt8('F')
-        buf[pos + 1] = UInt8('l')
-        buf[pos + 2] = UInt8('o')
-        buf[pos + 3] = UInt8('a')
-        buf[pos + 4] = UInt8('t')
-        buf[pos + 5] = UInt8('1')
-        buf[pos + 6] = UInt8('6')
-        buf[pos + 7] = UInt8('(')
+        @inbounds buf[pos] = UInt8('F')
+        @inbounds buf[pos + 1] = UInt8('l')
+        @inbounds buf[pos + 2] = UInt8('o')
+        @inbounds buf[pos + 3] = UInt8('a')
+        @inbounds buf[pos + 4] = UInt8('t')
+        @inbounds buf[pos + 5] = UInt8('1')
+        @inbounds buf[pos + 6] = UInt8('6')
+        @inbounds buf[pos + 7] = UInt8('(')
         pos += 8
     end
     pos = append_sign(x, plus, space, buf, pos)
@@ -332,161 +332,122 @@ function writeshortest(buf::Vector{UInt8}, pos, x::T,
         !(pt >= olength && abs(mod(x + 0.05, 10^(pt - olength)) - 0.05) > 0.05)
         exp_form = false
         if pt <= 0
-            buf[pos] = UInt8('0')
+            @inbounds buf[pos] = UInt8('0')
             pos += 1
-            buf[pos] = decchar
+            @inbounds buf[pos] = decchar
             pos += 1
             for _ = 1:abs(pt)
-                buf[pos] = UInt8('0')
+                @inbounds buf[pos] = UInt8('0')
                 pos += 1
             end
-            # elseif pt >= olength
+        # elseif pt >= olength
             # nothing to do at this point
-            # else
+        # else
             # nothing to do at this point
         end
     else
+        # make space for decchar
         pos += 1
     end
-    i = 0
-    ptr = pointer(buf)
-    ptr2 = pointer(DIGIT_TABLE)
-    if (output >> 32) != 0
-        q = output ÷ 100000000
-        output2 = (output % UInt32) - UInt32(100000000) * (q % UInt32)
-        output = q
 
-        c = output2 % UInt32(10000)
-        output2 = div(output2, UInt32(10000))
-        d = output2 % UInt32(10000)
-        c0 = (c % 100) << 1
-        c1 = (c ÷ 100) << 1
-        d0 = (d % 100) << 1
-        d1 = (d ÷ 100) << 1
-        memcpy(ptr + pos + olength - 3, ptr2 + c0, 2)
-        memcpy(ptr + pos + olength - 5, ptr2 + c1, 2)
-        memcpy(ptr + pos + olength - 7, ptr2 + d0, 2)
-        memcpy(ptr + pos + olength - 9, ptr2 + d1, 2)
-        i += 8
-    end
-    output2 = output % UInt32
-    while output2 >= 10000
-        c = output2 % UInt32(10000)
-        output2 = div(output2, UInt32(10000))
-        c0 = (c % 100) << 1
-        c1 = (c ÷ 100) << 1
-        memcpy(ptr + pos + olength - i - 3, ptr2 + c0, 2)
-        memcpy(ptr + pos + olength - i - 5, ptr2 + c1, 2)
-        i += 4
-    end
-    if output2 >= 100
-        c = (output2 % UInt32(100)) << 1
-        output2 = div(output2, UInt32(100))
-        memcpy(ptr + pos + olength - i - 3, ptr2 + c, 2)
-        i += 2
-    end
-    if output2 >= 10
-        c = output2 << 1
-        buf[pos + 1] = DIGIT_TABLE[c + 2]
-        buf[pos - exp_form] = DIGIT_TABLE[c + 1]
-    else
-        buf[pos - exp_form] = UInt8('0') + (output2 % UInt8)
-    end
+    append_c_digits(olength, output, buf, pos)
 
     if !exp_form
         if pt <= 0
             pos += olength
             precision -= olength
-            while hash && precision > 0
-                buf[pos] = UInt8('0')
-                pos += 1
-                precision -= 1
-            end
         elseif pt >= olength
             pos += olength
             precision -= olength
             for _ = 1:nexp
-                buf[pos] = UInt8('0')
+                @inbounds buf[pos] = UInt8('0')
                 pos += 1
                 precision -= 1
             end
             if hash
-                buf[pos] = decchar
+                @inbounds buf[pos] = decchar
                 pos += 1
                 if precision < 0
-                    buf[pos] = UInt8('0')
+                    @inbounds buf[pos] = UInt8('0')
                     pos += 1
                 end
-                while precision > 0
-                    buf[pos] = UInt8('0')
-                    pos += 1
-                    precision -= 1
-                end
             end
         else
             pointoff = olength - abs(nexp)
+            # shift bytes after pointoff to make room for decchar
+            ptr = pointer(buf)
             memmove(ptr + pos + pointoff, ptr + pos + pointoff - 1, olength - pointoff + 1)
-            buf[pos + pointoff] = decchar
+            @inbounds buf[pos + pointoff] = decchar
             pos += olength + 1
             precision -= olength
-            while hash && precision > 0
-                buf[pos] = UInt8('0')
+        end
+        if hash
+            while precision > 0
+                @inbounds buf[pos] = UInt8('0')
                 pos += 1
                 precision -= 1
             end
         end
         if typed && x isa Float32
-            buf[pos] = UInt8('f')
-            buf[pos + 1] = UInt8('0')
+            @inbounds buf[pos] = UInt8('f')
+            @inbounds buf[pos + 1] = UInt8('0')
             pos += 2
         end
     else
+        # move leading digit into place
+        @inbounds buf[pos - 1] = buf[pos]
         if olength > 1 || hash
-            buf[pos] = decchar
+            @inbounds buf[pos] = decchar
             pos += olength
             precision -= olength
         end
-        if hash && olength == 1
-            buf[pos] = UInt8('0')
-            pos += 1
-        end
-        while hash && precision > 0
-            buf[pos] = UInt8('0')
-            pos += 1
-            precision -= 1
+        if hash
+            if olength == 1
+                @inbounds buf[pos] = UInt8('0')
+                pos += 1
+            end
+            while precision > 0
+                @inbounds buf[pos] = UInt8('0')
+                pos += 1
+                precision -= 1
+            end
         end
 
-        buf[pos] = expchar
+        @inbounds buf[pos] = expchar
         pos += 1
         exp2 = nexp + olength - 1
         if exp2 < 0
-            buf[pos] = UInt8('-')
+            @inbounds buf[pos] = UInt8('-')
             pos += 1
             exp2 = -exp2
         elseif padexp
-            buf[pos] = UInt8('+')
+            @inbounds buf[pos] = UInt8('+')
             pos += 1
         end
 
         if exp2 >= 100
             c = exp2 % 10
-            memcpy(ptr + pos - 1, ptr2 + 2 * div(exp2, 10), 2)
-            buf[pos + 2] = UInt8('0') + (c % UInt8)
+            @inbounds d100 = DIGIT_TABLE16[(div(exp2, 10) % Int) + 1]
+            @inbounds buf[pos] = d100 % UInt8
+            @inbounds buf[pos + 1] = (d100 >> 0x8) % UInt8
+            @inbounds buf[pos + 2] = UInt8('0') + (c % UInt8)
             pos += 3
         elseif exp2 >= 10
-            memcpy(ptr + pos - 1, ptr2 + 2 * exp2, 2)
+            @inbounds d100 = DIGIT_TABLE16[(exp2 % Int) + 1]
+            @inbounds buf[pos] = d100 % UInt8
+            @inbounds buf[pos + 1] = (d100 >> 0x8) % UInt8
             pos += 2
         else
             if padexp
-                buf[pos] = UInt8('0')
+                @inbounds buf[pos] = UInt8('0')
                 pos += 1
             end
-            buf[pos] = UInt8('0') + (exp2 % UInt8)
+            @inbounds buf[pos] = UInt8('0') + (exp2 % UInt8)
             pos += 1
         end
     end
     if typed && x isa Float16
-        buf[pos] = UInt8(')')
+        @inbounds buf[pos] = UInt8(')')
         pos += 1
     end
 
diff --git a/base/ryu/utils.jl b/base/ryu/utils.jl
index f5a88c057e2b3..2064dfbefcecd 100644
--- a/base/ryu/utils.jl
+++ b/base/ryu/utils.jl
@@ -134,7 +134,7 @@ end
 
 Compute `p = a*b` where `b = bLo + bHi<<64`, returning the result as `pLo, pHi` where `p = pLo + pHi<<128`.
 """
-function umul256(a, bHi, bLo)
+function umul256(a::UInt128, bHi::UInt64, bLo::UInt64)
     aLo = a % UInt64
     aHi = (a >> 64) % UInt64
 
@@ -164,7 +164,7 @@ end
 
 Compute `pHi = (a*b)>>128` where `b = bLo + bHi<<64`.
 """
-umul256_hi(a, bHi, bLo) = umul256(a, bHi, bLo)[2]
+umul256_hi(a::UInt128, bHi::UInt64, bLo::UInt64) = umul256(a, bHi, bLo)[2]
 
 """
     Ryu.mulshiftmod1e9(m, mula, mulb, mulc, j)::UInt32
@@ -183,7 +183,7 @@ function mulshiftmod1e9(m, mula, mulb, mulc, j)
     return (v % UInt32) - UInt32(1000000000) * shifted
 end
 
-function append_sign(x, plus, space, buf, pos)
+function append_sign(x, plus::Bool, space::Bool, buf, pos::Int)
     if signbit(x) && !isnan(x)  # suppress minus sign for signaling NaNs
         buf[pos] = UInt8('-')
         pos += 1
@@ -197,101 +197,14 @@ function append_sign(x, plus, space, buf, pos)
     return pos
 end
 
-function append_n_digits(olength, digits, buf, pos)
-    i = 0
-    while digits >= 10000
-        c = digits % 10000
-        digits = div(digits, 10000)
-        c0 = (c % 100) << 1
-        c1 = div(c, 100) << 1
-        unsafe_copyto!(buf, pos + olength - i - 2, DIGIT_TABLE, c0 + 1, 2)
-        unsafe_copyto!(buf, pos + olength - i - 4, DIGIT_TABLE, c1 + 1, 2)
-        i += 4
-    end
-    if digits >= 100
-        c = (digits % 100) << 1
-        digits = div(digits, 100)
-        unsafe_copyto!(buf, pos + olength - i - 2, DIGIT_TABLE, c + 1, 2)
-        i += 2
-    end
-    if digits >= 10
-        c = digits << 1
-        unsafe_copyto!(buf, pos + olength - i - 2, DIGIT_TABLE, c + 1, 2)
-        i += 2
-    else
-        buf[pos] = UInt8('0') + digits
-        i += 1
-    end
-    return pos + i
-end
-
-function append_d_digits(olength, digits, buf, pos, decchar)
-    i = 0
-    while digits >= 10000
-        c = digits % 10000
-        digits = div(digits, 10000)
-        c0 = (c % 100) << 1
-        c1 = div(c, 100) << 1
-        unsafe_copyto!(buf, pos + olength + 1 - i - 2, DIGIT_TABLE, c0 + 1, 2)
-        unsafe_copyto!(buf, pos + olength + 1 - i - 4, DIGIT_TABLE, c1 + 1, 2)
-        i += 4
-    end
-    if digits >= 100
-        c = (digits % 100) << 1
-        digits = div(digits, 100)
-        unsafe_copyto!(buf, pos + olength + 1 - i - 2, DIGIT_TABLE, c + 1, 2)
-        i += 2
-    end
-    if digits >= 10
-        c = digits << 1
-        buf[pos] = DIGIT_TABLE[c + 1]
-        buf[pos + 1] = decchar
-        buf[pos + 2] = DIGIT_TABLE[c + 2]
-        i += 3
-    else
-        buf[pos] = UInt8('0') + digits
-        buf[pos + 1] = decchar
-        i += 2
-    end
-    return pos + i
-end
 
-function append_c_digits(count, digits, buf, pos)
-    i = 0
-    while i < count - 1
-        c = (digits % 100) << 1
-        digits = div(digits, 100)
-        unsafe_copyto!(buf, pos + count - i - 2, DIGIT_TABLE, c + 1, 2)
-        i += 2
-    end
-    if i < count
-        buf[pos + count - i - 1] = UInt8('0') + (digits % 10)
-        i += 1
-    end
-    return pos + i
-end
+import Base: append_c_digits_fast as append_c_digits, append_nine_digits
 
-function append_nine_digits(digits, buf, pos)
-    if digits == 0
-        for _ = 1:9
-            buf[pos] = UInt8('0')
-            pos += 1
-        end
-        return pos
-    end
-    i = 0
-    while i < 5
-        c = digits % 10000
-        digits = div(digits, 10000)
-        c0 = (c % 100) << 1
-        c1 = div(c, 100) << 1
-        unsafe_copyto!(buf, pos + 7 - i, DIGIT_TABLE, c0 + 1, 2)
-        unsafe_copyto!(buf, pos + 5 - i, DIGIT_TABLE, c1 + 1, 2)
-        i += 4
-    end
-    buf[pos] = UInt8('0') + digits
-    i += 1
-    return pos + i
+function append_d_digits(olength::Int, digits::Unsigned, buf, pos::Int, decchar) # write `digits` as leading digit, `decchar`, then the remaining digits, starting at buf[pos]
+    newpos = append_c_digits(olength, digits, buf, pos + 1) # write all digits one slot to the right, leaving buf[pos] free
+    @inbounds buf[pos] = buf[pos + 1] # move the leading digit into the free slot
+    @inbounds buf[pos + 1] = decchar # its former slot now holds the decimal separator
+    return newpos # == pos + olength + 1
+end
 
 const BIG_MASK = (big(1) << 64) - 1
@@ -390,18 +303,7 @@ for T in (Float64, Float32, Float16)
     @eval pow5split_lookup(::Type{$T}, i) = @inbounds($table_sym[i+1])
 end
 
-const DIGIT_TABLE = UInt8[
-  '0','0','0','1','0','2','0','3','0','4','0','5','0','6','0','7','0','8','0','9',
-  '1','0','1','1','1','2','1','3','1','4','1','5','1','6','1','7','1','8','1','9',
-  '2','0','2','1','2','2','2','3','2','4','2','5','2','6','2','7','2','8','2','9',
-  '3','0','3','1','3','2','3','3','3','4','3','5','3','6','3','7','3','8','3','9',
-  '4','0','4','1','4','2','4','3','4','4','4','5','4','6','4','7','4','8','4','9',
-  '5','0','5','1','5','2','5','3','5','4','5','5','5','6','5','7','5','8','5','9',
-  '6','0','6','1','6','2','6','3','6','4','6','5','6','6','6','7','6','8','6','9',
-  '7','0','7','1','7','2','7','3','7','4','7','5','7','6','7','7','7','8','7','9',
-  '8','0','8','1','8','2','8','3','8','4','8','5','8','6','8','7','8','8','8','9',
-  '9','0','9','1','9','2','9','3','9','4','9','5','9','6','9','7','9','8','9','9'
-]
+const DIGIT_TABLE16 = Base._dec_d100
 
 const POW10_OFFSET = UInt16[
   0, 2, 5, 8, 12, 16, 21, 26, 32, 39,