diff --git a/base/fastmath.jl b/base/fastmath.jl index 33c6b9884a7ac..8dd929ce4b441 100644 --- a/base/fastmath.jl +++ b/base/fastmath.jl @@ -92,6 +92,9 @@ const rewrite_op = function make_fastmath(expr::Expr) if expr.head === :quote return expr + elseif expr.head == :call && expr.args[1] == :^ && expr.args[3] isa Integer + # literal integer powers can be inlined with @fastmath + return Expr(:call, :pow_fast, expr.args[2], :(Val{$(expr.args[3])})) end op = get(rewrite_op, expr.head, :nothing) if op !== :nothing @@ -245,6 +248,8 @@ end pow_fast(x::Float32, y::Integer) = ccall("llvm.powi.f32", llvmcall, Float32, (Float32, Int32), x, y) pow_fast(x::Float64, y::Integer) = ccall("llvm.powi.f64", llvmcall, Float64, (Float64, Int32), x, y) +@inline @generated pow_fast{p}(x::Base.HWNumber, ::Type{Val{p}}) = Base.inlined_pow(:x, p) +pow_fast{p}(x::FloatTypes, ::Type{Val{p}}) = pow_fast(x, p) # inlines already via llvm.powi # TODO: Change sqrt_llvm intrinsic to avoid nan checking; add nan # checking to sqrt in math.jl; remove sqrt_llvm_fast intrinsic diff --git a/base/gmp.jl b/base/gmp.jl index 5e339929b88cc..8c4a14102689e 100644 --- a/base/gmp.jl +++ b/base/gmp.jl @@ -443,9 +443,6 @@ end ^(x::Integer, y::BigInt ) = bigint_pow(BigInt(x), y) ^(x::Bool , y::BigInt ) = Base.power_by_squaring(x, y) -# override default inlining of x^2 and x^3 etc. -^{p}(x::BigInt, ::Type{Val{p}}) = x^p - function powermod(x::BigInt, p::BigInt, m::BigInt) r = BigInt() ccall((:__gmpz_powm, :libgmp), Void, diff --git a/base/intfuncs.jl b/base/intfuncs.jl index bab711aa566c5..0e2be3f5f9b6d 100644 --- a/base/intfuncs.jl +++ b/base/intfuncs.jl @@ -219,6 +219,113 @@ const HWNumber = Union{HWReal, Complex{<:HWReal}, Rational{<:HWReal}} @inline internal_pow(x::HWNumber, ::Type{Val{2}}) = x*x @inline internal_pow(x::HWNumber, ::Type{Val{3}}) = x*x*x +# This table represents the optimal "power tree" +# based on Knuth's "TAOCP vol 2: Seminumerical Algorithms", +# third edition, section 4.6.3, Fig. 15. It was +# transcribed into this array form in the gcc source +# code (tree-ssa-math-opts.c), but since this is just +# a table of mathematical facts it should not be copyrightable. +# +# To compute x^q in a minimal number of multiplications +# for 1 ≤ q ≤ 255, you compute x^r * x^s for +# r = power_tree[i] and s = i-q, recursively +# (memoizing each power computation), and we implicitly +# define power_tree[0] = 0. +const power_tree = [ + 1, 1, 2, 2, 3, 3, 4, # 1 - 7 + 4, 6, 5, 6, 6, 10, 7, 9, # 8 - 15 + 8, 16, 9, 16, 10, 12, 11, 13, # 16 - 23 + 12, 17, 13, 18, 14, 24, 15, 26, # 24 - 31 + 16, 17, 17, 19, 18, 33, 19, 26, # 32 - 39 + 20, 25, 21, 40, 22, 27, 23, 44, # 40 - 47 + 24, 32, 25, 34, 26, 29, 27, 44, # 48 - 55 + 28, 31, 29, 34, 30, 60, 31, 36, # 56 - 63 + 32, 64, 33, 34, 34, 46, 35, 37, # 64 - 71 + 36, 65, 37, 50, 38, 48, 39, 69, # 72 - 79 + 40, 49, 41, 43, 42, 51, 43, 58, # 80 - 87 + 44, 64, 45, 47, 46, 59, 47, 76, # 88 - 95 + 48, 65, 49, 66, 50, 67, 51, 66, # 96 - 103 + 52, 70, 53, 74, 54, 104, 55, 74, # 104 - 111 + 56, 64, 57, 69, 58, 78, 59, 68, # 112 - 119 + 60, 61, 61, 80, 62, 75, 63, 68, # 120 - 127 + 64, 65, 65, 128, 66, 129, 67, 90, # 128 - 135 + 68, 73, 69, 131, 70, 94, 71, 88, # 136 - 143 + 72, 128, 73, 98, 74, 132, 75, 121, # 144 - 151 + 76, 102, 77, 124, 78, 132, 79, 106, # 152 - 159 + 80, 97, 81, 160, 82, 99, 83, 134, # 160 - 167 + 84, 86, 85, 95, 86, 160, 87, 100, # 168 - 175 + 88, 113, 89, 98, 90, 107, 91, 122, # 176 - 183 + 92, 111, 93, 102, 94, 126, 95, 150, # 184 - 191 + 96, 128, 97, 130, 98, 133, 99, 195, # 192 - 199 + 100, 128, 101, 123, 102, 164, 103, 138, # 200 - 207 + 104, 145, 105, 146, 106, 109, 107, 149, # 208 - 215 + 108, 200, 109, 146, 110, 170, 111, 157, # 216 - 223 + 112, 128, 113, 130, 114, 182, 115, 132, # 224 - 231 + 116, 200, 117, 132, 118, 158, 119, 206, # 232 - 239 + 120, 240, 121, 162, 122, 147, 123, 152, # 240 - 247 + 124, 166, 125, 214, 126, 138, 127, 153, # 248 - 255 +] + +# return the inlined/unrolled expression to compute x^p +# by repeated multiplications (using an optimal addition +# chain for |p|<256 and power-by-squaring thereafter. +function inlined_pow(x::Symbol, p::Int) + p == 0 && return :(one($x)) + ex = Union{Expr,Symbol}[] # expressions to compute intermediate powers + if p < 0 + x′ = gensym() + push!(ex, :($x′ = inv($x))) + x = x′ + p = -p + end + pows_computed = Dict(1 => x) # powers q => variable storing x^q + pows_to_compute = [p] + while !isempty(pows_to_compute) + q = pop!(pows_to_compute) + if !haskey(pows_computed, q) + if q ≤ length(power_tree) # use optimal addition chain + q1 = power_tree[q] + q2 = q-q1 + assert(q1 > 0 && q2 > 0) + has1 = haskey(pows_computed, q1) + has2 = haskey(pows_computed, q2) + if has1 && has2 + xq = gensym() + push!(ex, :($xq = $(pows_computed[q1]) * $(pows_computed[q2]))) + pows_computed[q] = xq + else + push!(pows_to_compute, q) # try again after computing q1, q2 + has1 || push!(pows_to_compute, q1) + has2 || push!(pows_to_compute, q2) + end + else # q too big, use power-by-squaring algorithm + q1 = q >> 1 + if haskey(pows_computed, q1) + xq = gensym() + xq1 = pows_computed[q1] + push!(ex, iseven(q) ? :($xq = $xq1 * $xq1) : :($xq = $xq1 * $xq1 * $x)) + pows_computed[q] = xq + else + push!(pows_to_compute, q) # try again after computing q1 + push!(pows_to_compute, q1) + end + end + end + end + push!(ex, pows_computed[p]) # the variable for the final result x^p + return Expr(:block, ex...) +end +inlined_pow(x::Symbol, p::Integer) = inlined_pow(x, Int(p)) + +# for cases where we use power_by_squaring, inline +# the unrolled expression for literal p +# (this also allows integer^-p to give a floating-point result +# in a type-stable fashion). +@inline @generated internal_pow{p}(x::HWNumber, ::Type{Val{p}}) = inlined_pow(:x, p) + +# Float32/Float64 may not inline without @fastmath, for accuracy +internal_pow{p}(x::Union{Float32,Float64}, ::Type{Val{p}}) = x^p + # b^p mod m """