diff --git a/base/fastmath.jl b/base/fastmath.jl
index 33c6b9884a7ac..8dd929ce4b441 100644
--- a/base/fastmath.jl
+++ b/base/fastmath.jl
@@ -92,6 +92,9 @@ const rewrite_op =
 function make_fastmath(expr::Expr)
     if expr.head === :quote
         return expr
+    elseif expr.head == :call && expr.args[1] == :^ && expr.args[3] isa Integer
+        # literal integer powers can be inlined with @fastmath
+        return Expr(:call, :pow_fast, expr.args[2], :(Val{$(expr.args[3])}))
     end
     op = get(rewrite_op, expr.head, :nothing)
     if op !== :nothing
@@ -245,6 +248,8 @@ end
 
 pow_fast(x::Float32, y::Integer) = ccall("llvm.powi.f32", llvmcall, Float32, (Float32, Int32), x, y)
 pow_fast(x::Float64, y::Integer) = ccall("llvm.powi.f64", llvmcall, Float64, (Float64, Int32), x, y)
+@inline @generated pow_fast{p}(x::Base.HWNumber, ::Type{Val{p}}) = Base.inlined_pow(:x, p)
+pow_fast{p}(x::FloatTypes, ::Type{Val{p}}) = pow_fast(x, p) # inlines already via llvm.powi
 
 # TODO: Change sqrt_llvm intrinsic to avoid nan checking; add nan
 # checking to sqrt in math.jl; remove sqrt_llvm_fast intrinsic
diff --git a/base/gmp.jl b/base/gmp.jl
index 5e339929b88cc..8c4a14102689e 100644
--- a/base/gmp.jl
+++ b/base/gmp.jl
@@ -443,9 +443,6 @@ end
 ^(x::Integer, y::BigInt ) = bigint_pow(BigInt(x), y)
 ^(x::Bool   , y::BigInt ) = Base.power_by_squaring(x, y)
 
-# override default inlining of x^2 and x^3 etc.
-^{p}(x::BigInt, ::Type{Val{p}}) = x^p
-
 function powermod(x::BigInt, p::BigInt, m::BigInt)
     r = BigInt()
     ccall((:__gmpz_powm, :libgmp), Void,
diff --git a/base/intfuncs.jl b/base/intfuncs.jl
index bab711aa566c5..0e2be3f5f9b6d 100644
--- a/base/intfuncs.jl
+++ b/base/intfuncs.jl
@@ -219,6 +219,113 @@ const HWNumber = Union{HWReal, Complex{<:HWReal}, Rational{<:HWReal}}
 @inline internal_pow(x::HWNumber, ::Type{Val{2}}) = x*x
 @inline internal_pow(x::HWNumber, ::Type{Val{3}}) = x*x*x
 
+# This table represents the optimal "power tree"
+# based on Knuth's "TAOCP vol 2: Seminumerical Algorithms",
+# third edition, section 4.6.3, Fig. 15.  It was
+# transcribed into this array form in the gcc source
+# code (tree-ssa-math-opts.c), but since this is just
+# a table of mathematical facts it should not be copyrightable.
+#
+# To compute x^q in a minimal number of multiplications
+# for 1 ≤ q ≤ 255, you compute x^r * x^s for
+# r = power_tree[i] and s = i-q, recursively
+# (memoizing each power computation), and we implicitly
+# define power_tree[0] = 0.
+const power_tree = [
+      1,   1,   2,   2,   3,   3,   4,       #   1 -   7
+      4,   6,   5,   6,   6,  10,   7,   9,  #   8 -  15
+      8,  16,   9,  16,  10,  12,  11,  13,  #  16 -  23
+     12,  17,  13,  18,  14,  24,  15,  26,  #  24 -  31
+     16,  17,  17,  19,  18,  33,  19,  26,  #  32 -  39
+     20,  25,  21,  40,  22,  27,  23,  44,  #  40 -  47
+     24,  32,  25,  34,  26,  29,  27,  44,  #  48 -  55
+     28,  31,  29,  34,  30,  60,  31,  36,  #  56 -  63
+     32,  64,  33,  34,  34,  46,  35,  37,  #  64 -  71
+     36,  65,  37,  50,  38,  48,  39,  69,  #  72 -  79
+     40,  49,  41,  43,  42,  51,  43,  58,  #  80 -  87
+     44,  64,  45,  47,  46,  59,  47,  76,  #  88 -  95
+     48,  65,  49,  66,  50,  67,  51,  66,  #  96 - 103
+     52,  70,  53,  74,  54, 104,  55,  74,  # 104 - 111
+     56,  64,  57,  69,  58,  78,  59,  68,  # 112 - 119
+     60,  61,  61,  80,  62,  75,  63,  68,  # 120 - 127
+     64,  65,  65, 128,  66, 129,  67,  90,  # 128 - 135
+     68,  73,  69, 131,  70,  94,  71,  88,  # 136 - 143
+     72, 128,  73,  98,  74, 132,  75, 121,  # 144 - 151
+     76, 102,  77, 124,  78, 132,  79, 106,  # 152 - 159
+     80,  97,  81, 160,  82,  99,  83, 134,  # 160 - 167
+     84,  86,  85,  95,  86, 160,  87, 100,  # 168 - 175
+     88, 113,  89,  98,  90, 107,  91, 122,  # 176 - 183
+     92, 111,  93, 102,  94, 126,  95, 150,  # 184 - 191
+     96, 128,  97, 130,  98, 133,  99, 195,  # 192 - 199
+    100, 128, 101, 123, 102, 164, 103, 138,  # 200 - 207
+    104, 145, 105, 146, 106, 109, 107, 149,  # 208 - 215
+    108, 200, 109, 146, 110, 170, 111, 157,  # 216 - 223
+    112, 128, 113, 130, 114, 182, 115, 132,  # 224 - 231
+    116, 200, 117, 132, 118, 158, 119, 206,  # 232 - 239
+    120, 240, 121, 162, 122, 147, 123, 152,  # 240 - 247
+    124, 166, 125, 214, 126, 138, 127, 153,  # 248 - 255
+]
+
+# return the inlined/unrolled expression to compute x^p
+# by repeated multiplications (using an optimal addition
+# chain for |p|<256 and power-by-squaring thereafter.
+function inlined_pow(x::Symbol, p::Int)
+    p == 0 && return :(one($x))
+    ex = Union{Expr,Symbol}[] # expressions to compute intermediate powers
+    if p < 0
+        x′ = gensym()
+        push!(ex, :($x′ = inv($x)))
+        x = x′
+        p = -p
+    end
+    pows_computed = Dict(1 => x) # powers q => variable storing x^q
+    pows_to_compute = [p]
+    while !isempty(pows_to_compute)
+        q = pop!(pows_to_compute)
+        if !haskey(pows_computed, q)
+            if q ≤ length(power_tree) # use optimal addition chain
+                q1 = power_tree[q]
+                q2 = q-q1
+                assert(q1 > 0 && q2 > 0)
+                has1 = haskey(pows_computed, q1)
+                has2 = haskey(pows_computed, q2)
+                if has1 && has2
+                    xq = gensym()
+                    push!(ex, :($xq = $(pows_computed[q1]) * $(pows_computed[q2])))
+                    pows_computed[q] = xq
+                else
+                    push!(pows_to_compute, q) # try again after computing q1, q2
+                    has1 || push!(pows_to_compute, q1)
+                    has2 || push!(pows_to_compute, q2)
+                end
+            else # q too big, use power-by-squaring algorithm
+                q1 = q >> 1
+                if haskey(pows_computed, q1)
+                    xq = gensym()
+                    xq1 = pows_computed[q1]
+                    push!(ex, iseven(q) ? :($xq = $xq1 * $xq1) : :($xq = $xq1 * $xq1 * $x))
+                    pows_computed[q] = xq
+                else
+                    push!(pows_to_compute, q) # try again after computing q1
+                    push!(pows_to_compute, q1)
+                end
+            end
+        end
+    end
+    push!(ex, pows_computed[p]) # the variable for the final result x^p
+    return Expr(:block, ex...)
+end
+inlined_pow(x::Symbol, p::Integer) = inlined_pow(x, Int(p))
+
+# for cases where we use power_by_squaring, inline
+# the unrolled expression for literal p
+# (this also allows integer^-p to give a floating-point result
+#  in a type-stable fashion).
+@inline @generated internal_pow{p}(x::HWNumber, ::Type{Val{p}}) = inlined_pow(:x, p)
+
+# Float32/Float64 may not inline without @fastmath, for accuracy
+internal_pow{p}(x::Union{Float32,Float64}, ::Type{Val{p}}) = x^p
+
 # b^p mod m
 
 """