diff --git a/base/abstractarray.jl b/base/abstractarray.jl
index 4ffab9dd51bf3..4ef885e249451 100644
--- a/base/abstractarray.jl
+++ b/base/abstractarray.jl
@@ -162,13 +162,6 @@ function squeeze(A::AbstractArray, dims)
     reshape(A, d)
 end
 
-function fill!(A::AbstractArray, x)
-    for i = 1:length(A)
-        A[i] = x
-    end
-    return A
-end
-
 function copy!(dest::AbstractArray, src)
     i = 1
     for x in src
@@ -337,207 +330,12 @@ imag{T<:Real}(x::AbstractArray{T}) = zero(x)
 \(A::Number, B::AbstractArray) = B ./ A
 \(A::AbstractArray, B::Number) = B ./ A
 
-./(x::AbstractArray, y::AbstractArray ) = throw(MethodError(./, (x,y)))
 ./(x::Number,y::AbstractArray )         = throw(MethodError(./, (x,y)))
 ./(x::AbstractArray, y::Number)         = throw(MethodError(./, (x,y)))
 
-.^(x::AbstractArray, y::AbstractArray ) = throw(MethodError(.^, (x,y)))
 .^(x::Number,y::AbstractArray )         = throw(MethodError(.^, (x,y)))
 .^(x::AbstractArray, y::Number)         = throw(MethodError(.^, (x,y)))
 
-## code generator for specializing on the number of dimensions ##
-
-#otherbodies are the bodies that reside between loops, if its a 2 dimension array.
-function make_loop_nest(vars, ranges, body)
-    otherbodies = cell(length(vars),2)
-    #println(vars)
-    for i = 1:2*length(vars)
-        otherbodies[i] = nothing
-    end
-    make_loop_nest(vars, ranges, body, otherbodies)
-end
-
-function make_loop_nest(vars, ranges, body, otherbodies)
-    expr = body
-    len = size(otherbodies)[1]
-    for i=1:length(vars)
-        v = vars[i]
-        r = ranges[i]
-        l = otherbodies[i]
-        j = otherbodies[i+len]
-        expr = quote
-            $l
-            for ($v) = ($r)
-                $expr
-            end
-            $j
-        end
-    end
-    expr
-end
-
-
-## genbodies() is a function that creates an array (potentially 2d),
-## where the first element is inside the inner most array, and the last
-## element is outside most loop, and all the other arguments are
-## between each loop. If it creates a 2d array, it just means that it
-## specifies what it wants to do before and after each loop.
-## If genbodies creates an array it must of length N.
-function gen_cartesian_map(cache, genbodies, ranges, exargnames, exargs...)
-    if ranges === ()
-        ranges = (1,)
-    end
-    N = length(ranges)
-    if !haskey(cache,N)
-        if isdefined(genbodies,:code)
-            mod = genbodies.code.module
-        else
-            mod = Main
-        end
-        dimargnames = { symbol(string("_d",i)) for i=1:N }
-        ivars = { symbol(string("_i",i)) for i=1:N }
-        bodies = genbodies(ivars)
-
-        ## creating a 2d array, to pass as bodies
-        if isa(bodies,Array)
-            if (ndims(bodies)==2)
-                #println("2d array noticed")
-	        body = bodies[1]
-	        bodies = bodies[2:end,:]
-            elseif (ndims(bodies)==1)
-                #println("1d array noticed")
-                body = bodies[1]
-                bodies_tmp = cell(N,2)
-                for i = 1:N
-                    bodies_tmp[i] = bodies[i+1]
-                    bodies_tmp[i+N] = nothing
-                end
-                bodies = bodies_tmp
-            end
-        else
-            #println("no array noticed")
-	    body = bodies
-            bodies = cell(N,2)
-            for i=1:2*N
-                bodies[i] = nothing
-            end
-        end
-        fexpr =
-        quote
-            local _F_
-            function _F_($(dimargnames...), $(exargnames...))
-                $(make_loop_nest(ivars, dimargnames, body, bodies))
-            end
-            _F_
-        end
-        f = eval(mod,fexpr)
-        cache[N] = f
-    else
-        f = cache[N]
-    end
-    return f(ranges..., exargs...)
-end
-
-
-# Generate function bodies which look like this (example for a 3d array):
-#    offset3 = 0
-#    stride1 = 1
-#    stride2 = stride1 * size(A,1)
-#    stride3 = stride2 * size(A,2)
-#    for i3 = ind3
-#        offset2 = offset3 + (i3-1)*stride3
-#        for i2 = ind2
-#            offset1 = offset2 + (i2-1)*stride2
-#            for i1 = ind1
-#                linearind = offset1 + i1
-#                <A function, "body", of linearind>
-#            end
-#        end
-#    end
-function make_arrayind_loop_nest(loopvars, offsetvars, stridevars, linearind, ranges, body, arrayname)
-    # Initialize: calculate the strides
-    offset = offsetvars[end]
-    s = stridevars[1]
-    exinit = quote
-        $offset = 0
-        $s = 1
-    end
-    for i = 2:length(ranges)
-        sprev = s
-        s = stridevars[i]
-        exinit = quote
-            $exinit
-            $s = $sprev * size($arrayname, $i-1)
-        end
-    end
-    # Build the innermost loop (iterating over the first index)
-    v = loopvars[1]
-    r = ranges[1]
-    offset = offsetvars[1]
-    exloop = quote
-        for ($v) = ($r)
-            $linearind = $offset + $v
-            $body
-        end
-    end
-    # Build the remaining loops
-    for i = 2:length(ranges)
-        v = loopvars[i]
-        r = ranges[i]
-        offset = offsetvars[i-1]
-        offsetprev = offsetvars[i]
-        s = stridevars[i]
-        exloop = quote
-            for ($v) = ($r)
-                $offset = $offsetprev + ($v - 1) * $s
-                $exloop
-            end
-        end
-    end
-    # Return the combined result
-    return quote
-        $exinit
-        $exloop
-    end
-end
-
-# Like gen_cartesian_map, except it builds a function creating a
-# loop nest that computes a single linear index (instead of a
-# multidimensional index).
-# Important differences:
-#   - genbody is a scalar-valued function of a single scalar argument,
-#     the linear index. In gen_cartesian_map, this function can return
-#     an array to specify "pre-loop" and "post-loop" operations, but
-#     here those are handled explicitly in make_arrayind_loop_nest.
-#   - exargnames[1] must be the array for which the linear index is
-#     being created (it is used to calculate the strides, which in
-#     turn are used for computing the linear index)
-function gen_array_index_map(cache, genbody, ranges, exargnames, exargs...)
-    N = length(ranges)
-    if !haskey(cache,N)
-        dimargnames = { symbol(string("_d",i)) for i=1:N }
-        loopvars = { symbol(string("_l",i)) for i=1:N }
-        offsetvars = { symbol(string("_offs",i)) for i=1:N }
-        stridevars = { symbol(string("_stri",i)) for i=1:N }
-        linearind = :_li
-        body = genbody(linearind)
-        fexpr = quote
-            local _F_
-            function _F_($(dimargnames...), $(exargnames...))
-                $(make_arrayind_loop_nest(loopvars, offsetvars, stridevars, linearind, dimargnames, body, exargnames[1]))
-            end
-            return _F_
-        end
-        f = eval(fexpr)
-        cache[N] = f
-    else
-        f = cache[N]
-    end
-    return f(ranges..., exargs...)
-end
-
-
-
 
 ## Indexing: getindex ##
 
diff --git a/base/array.jl b/base/array.jl
index fb34506ee7718..7f65cf865e378 100644
--- a/base/array.jl
+++ b/base/array.jl
@@ -305,77 +305,6 @@ function getindex{T<:Real}(A::Ranges, I::AbstractVector{T})
     return [ A[i] for i in to_index(I) ]
 end
 
-# 2d indexing
-function getindex(A::Array, I::Range1{Int}, j::Real)
-    j = to_index(j)
-    checkbounds(A, I, j)
-    X = similar(A,length(I))
-    unsafe_copy!(X, 1, A, (j-1)*size(A,1) + first(I), length(I))
-    return X
-end
-function getindex(A::Array, I::Range1{Int}, J::Range1{Int})
-    checkbounds(A, I, J)
-    X = similar(A, index_shape(I, J))
-    if length(I) == size(A,1)
-        unsafe_copy!(X, 1, A, (first(J)-1)*size(A,1) + 1, size(A,1)*length(J))
-    else
-        storeoffset = 1
-        for j = J
-            unsafe_copy!(X, storeoffset, A, (j-1)*size(A,1) + first(I), length(I))
-            storeoffset += length(I)
-        end
-    end
-    return X
-end
-function getindex(A::Array, I::Range1{Int}, J::AbstractVector{Int})
-    checkbounds(A, I, J)
-    X = similar(A, index_shape(I, J))
-    storeoffset = 1
-    for j = J
-        unsafe_copy!(X, storeoffset, A, (j-1)*size(A,1) + first(I), length(I))
-        storeoffset += length(I)
-    end
-    return X
-end
-
-getindex{T<:Real}(A::Array, I::AbstractVector{T}, j::Real) = [ A[i,j] for i=to_index(I) ]
-getindex{T<:Real}(A::Array, I::Real, J::AbstractVector{T}) = [ A[i,j] for i=I,j=to_index(J) ]
-
-# This next is a 2d specialization of the algorithm used for general
-# multidimensional indexing
-function getindex{T<:Real}(A::Array, I::AbstractVector{T}, J::AbstractVector{T})
-    checkbounds(A, I, J)
-    I = to_index(I); J = to_index(J)
-    X = similar(A, index_shape(I, J))
-    storeind = 1
-    for j = J
-        offset = (convert(Int,j)-1)*size(A,1)
-        for i = I
-            X[storeind] = A[convert(Int,i)+offset]
-            storeind += 1
-        end
-    end
-    return X
-end
-# Multidimensional indexing
-let getindex_cache = nothing
-global getindex
-function getindex(A::Array, I::Union(Real,AbstractVector)...)
-    checkbounds(A, I...)
-    I = to_index(I)
-    X = similar(A, eltype(A), index_shape(I...))
-
-    if is(getindex_cache,nothing)
-        getindex_cache = Dict()
-    end
-    gen_array_index_map(getindex_cache, refind -> quote
-            X[storeind] = A[$refind]
-            storeind += 1
-        end, I, (:A, :X, :storeind), A, X, 1)
-    return X
-end
-end
-
 
 # logical indexing
 
@@ -453,145 +382,6 @@ function setindex!{T<:Real}(A::Array, X::AbstractArray, I::AbstractVector{T})
     return A
 end
 
-function setindex!{T<:Real}(A::Array, x, i::Real, J::AbstractVector{T})
-    i = to_index(i)
-    checkbounds(A, i, J)
-    m = size(A, 1)
-    if !isa(x,AbstractArray)
-        for j in J
-            A[(j-1)*m + i] = x
-        end
-    else
-        X = x
-        if length(X) != length(J)
-            throw_setindex_mismatch(X, (i,J))
-        end
-        count = 1
-        for j in J
-            A[(j-1)*m + i] = X[count]
-            count += 1
-        end
-    end
-    return A
-end
-
-function setindex!{T<:Real}(A::Array, x, I::AbstractVector{T}, j::Real)
-    j = to_index(j)
-    checkbounds(A, I, j)
-    m = size(A, 1)
-    offset = (j-1)*m
-
-    if !isa(x,AbstractArray)
-        for i in I
-            A[offset + i] = x
-        end
-    else
-        X = x
-        if length(X) != length(I)
-            throw_setindex_mismatch(X, (I,j))
-        end
-        count = 1
-        for i in I
-            A[offset + i] = X[count]
-            count += 1
-        end
-    end
-    return A
-end
-
-function setindex!{T}(A::Array{T}, X::Array{T}, I::Range1{Int}, j::Real)
-    j = to_index(j)
-    checkbounds(A, I, j)
-    if length(X) != length(I)
-        throw_setindex_mismatch(X, (I,j))
-    end
-    unsafe_copy!(A, first(I) + (j-1)*size(A,1), X, 1, length(I))
-    return A
-end
-
-function setindex!{T}(A::Array{T}, X::Array{T}, I::Range1{Int}, J::Range1{Int})
-    checkbounds(A, I, J)
-    setindex_shape_check(X, I, J)
-    if length(I) == size(A,1)
-        unsafe_copy!(A, first(I) + (first(J)-1)*size(A,1), X, 1, size(A,1)*length(J))
-    else
-        refoffset = 1
-        for j = J
-            unsafe_copy!(A, first(I) + (j-1)*size(A,1), X, refoffset, length(I))
-            refoffset += length(I)
-        end
-    end
-    return A
-end
-
-function setindex!{T}(A::Array{T}, X::Array{T}, I::Range1{Int}, J::AbstractVector{Int})
-    checkbounds(A, I, J)
-    setindex_shape_check(X, I, J)
-    refoffset = 1
-    for j = J
-        unsafe_copy!(A, first(I) + (j-1)*size(A,1), X, refoffset, length(I))
-        refoffset += length(I)
-    end
-    return A
-end
-
-function setindex!{T<:Real}(A::Array, x, I::AbstractVector{T}, J::AbstractVector{T})
-    checkbounds(A, I, J)
-    m = size(A, 1)
-    if !isa(x,AbstractArray)
-        for j in J
-            offset = (j-1)*m
-            for i in I
-                A[offset + i] = x
-            end
-        end
-    else
-        X = x
-        setindex_shape_check(X, I, J)
-        count = 1
-        for j in J
-            offset = (j-1)*m
-            for i in I
-                A[offset + i] = X[count]
-                count += 1
-            end
-        end
-    end
-    return A
-end
-
-let assign_cache = nothing, assign_scalar_cache = nothing
-global setindex!
-function setindex!(A::Array, x, I::Union(Real,AbstractArray)...)
-    checkbounds(A, I...)
-    I = to_index(I)
-    if !isa(x,AbstractArray)
-        if is(assign_scalar_cache,nothing)
-            assign_scalar_cache = Dict()
-        end
-        gen_array_index_map(assign_scalar_cache, storeind -> quote
-                              A[$storeind] = x
-                            end,
-                            I,
-                            (:A, :x),
-                            A, x)
-    else
-        if is(assign_cache,nothing)
-            assign_cache = Dict()
-        end
-        X = x
-        setindex_shape_check(X, I...)
-        gen_array_index_map(assign_cache, storeind -> quote
-                              A[$storeind] = X[refind]
-                              refind += 1
-                            end,
-                            I,
-                            (:A, :X, :refind),
-                            A, X, 1)
-    end
-    return A
-end
-end
 
 # logical indexing
 
@@ -1242,37 +1032,6 @@ function findn(A::AbstractMatrix)
     return (I, J)
 end
 
-let findn_cache = nothing
-function findn_one(ivars)
-    s = { quote I[$i][count] = $(ivars[i]) end for i = 1:length(ivars)}
-    quote
-    	Aind = A[$(ivars...)]
-    	if Aind != z
-    	    $(s...)
-    	    count +=1
-    	end
-    end
-end
-
-global findn
-function findn{T}(A::AbstractArray{T})
-    ndimsA = ndims(A)
-    nnzA = nnz(A)
-    I = ntuple(ndimsA, x->Array(Int, nnzA))
-    if nnzA > 0
-        ranges = ntuple(ndims(A), d->(1:size(A,d)))
-
-        if is(findn_cache,nothing)
-            findn_cache = Dict()
-        end
-
-        gen_cartesian_map(findn_cache, findn_one, ranges,
-                          (:A, :I, :count, :z), A,I,1, zero(T))
-    end
-    return I
-end
-end
-
 function findnz{T}(A::AbstractMatrix{T})
     nnzA = nnz(A)
     I = zeros(Int, nnzA)
diff --git a/base/bitarray.jl b/base/bitarray.jl
index 38c2b7e9e07c2..733fb3b2000bf 100644
--- a/base/bitarray.jl
+++ b/base/bitarray.jl
@@ -422,74 +422,6 @@ function getindex(B::BitArray, I::Real...)
     return B[index]
 end
 
-# note: we can gain some performance if the first dimension is a range;
-# TODO: extend to I:Union(Real,AbstractArray)... (i.e. not necessarily contiguous)
-let getindex_cache = nothing
-    global getindex
-    function getindex(B::BitArray, I0::Range1{Int}, I::Union(Real,Range1{Int})...)
-        # the < should become a != once
-        # the stricter indexing behaviour is enforced
-        if ndims(B) < 1 + length(I)
-            error("wrong number of dimensions")
-        end
-        checkbounds(B, I0, I...)
-        X = BitArray(index_shape(I0, I...))
-        nI = 1 + length(I)
-
-        I = map(x->(isa(x,Real) ? (to_index(x):to_index(x)) : to_index(x)), I[1:nI-1])
-
-        f0 = first(I0)
-        l0 = length(I0)
-
-        gap_lst = Int[last(r)-first(r)+1 for r in I]
-        stride_lst = Array(Int, nI)
-        stride = 1
-        ind = f0
-        for k = 1 : nI - 1
-            stride *= size(B, k)
-            stride_lst[k] = stride
-            ind += stride * (first(I[k]) - 1)
-            gap_lst[k] *= stride
-        end
-        # we only need nI-1 elements, the last one
-        # is dummy (used in bodies[k,2] below)
-        stride_lst[nI] = 0
-
-        if ndims(X) == 1
-            copy_chunks(X.chunks, 1, B.chunks, ind, l0)
-            return X
-        end
-
-        if is(getindex_cache,nothing)
-            getindex_cache = Dict()
-        end
-
-        gen_cartesian_map(getindex_cache,
-            ivars->begin
-                bodies = cell(nI, 2)
-                bodies[1] = quote
-                        copy_chunks(X.chunks, storeind, B.chunks, ind, l0)
-                        storeind += l0
-                        ind += stride_lst[loop_ind]
-                    end
-                for k = 2 : nI
-                    bodies[k, 1] = quote
-                        loop_ind -= 1
-                    end
-                    bodies[k, 2] = quote
-                        ind -= gap_lst[loop_ind]
-                        loop_ind += 1
-                        ind += stride_lst[loop_ind]
-                    end
-                end
-                return bodies
-            end,
-            I, (:B, :X, :storeind, :ind, :l0, :stride_lst, :gap_lst, :loop_ind),
-            B, X, 1, ind, l0, stride_lst, gap_lst, nI)
-        return X
-    end
-end
-
 # note: the Range1{Int} case is still handled by the version above
 #       (which is fine)
 function getindex{T<:Real}(B::BitArray, I::AbstractVector{T})
@@ -508,26 +440,6 @@ function getindex{T<:Real}(B::BitArray, I::AbstractVector{T})
     return X
 end
 
-let getindex_cache = nothing
-    global getindex
-    function getindex(B::BitArray, I::Union(Real,AbstractVector)...)
-        checkbounds(B, I...)
-        I = to_index(I)
-        X = BitArray(index_shape(I...))
-        Xc = X.chunks
-
-        if is(getindex_cache,nothing)
-            getindex_cache = Dict()
-        end
-        gen_cartesian_map(getindex_cache, ivars -> quote
-                #faster X[storeind] = B[$(ivars...)]
-                setindex_unchecked(Xc, B[$(ivars...)], ind)
-                ind += 1
-            end, I, (:B, :Xc, :ind), B, Xc, 1)
-        return X
-    end
-end
-
 # logical indexing
 
 function getindex_bool_1d(B::BitArray, I::AbstractArray{Bool})
@@ -637,72 +549,6 @@ function setindex!(B::BitArray, x, i::Real, I::Real...)
     return B
 end
 
-let setindex_cache = nothing
-    global setindex_array2bitarray_ranges
-    function setindex_array2bitarray_ranges(B::BitArray, X::BitArray, I0::Range1{Int}, I::Range1{Int}...)
-        nI = 1 + length(I)
-        if ndims(B) != nI
-            error("wrong number of dimensions in assigment")
-        end
-        lI = length(I0)
-        for r in I
-            lI *= length(r)
-        end
-        if length(X) != lI
-            error("array assignment dimensions mismatch")
-        end
-        if lI == 0
-            return B
-        end
-        f0 = first(I0)
-        l0 = length(I0)
-        if nI == 1
-            copy_chunks(B.chunks, f0, X.chunks, 1, l0)
-            return B
-        end
-        if is(setindex_cache,nothing)
-            setindex_cache = Dict()
-        end
-        gap_lst = [last(r)-first(r)+1 for r in I]
-        stride_lst = Array(Int, nI)
-        stride = 1
-        ind = f0
-        @inbounds for k = 1 : nI - 1
-            stride *= size(B, k)
-            stride_lst[k] = stride
-            ind += stride * (first(I[k]) - 1)
-            gap_lst[k] *= stride
-        end
-        # we only need nI-1 elements, the last one
-        # is dummy (used in bodies[k,2] below)
-        stride_lst[nI] = 0
-
-        gen_cartesian_map(setindex_cache,
-            ivars->begin
-                bodies = cell(nI, 2)
-                bodies[1] = quote
-                        copy_chunks(B.chunks, ind, X.chunks, refind, l0)
-                        refind += l0
-                        ind += stride_lst[loop_ind]
-                    end
-                for k = 2 : nI
-                    bodies[k, 1] = quote
-                        loop_ind -= 1
-                    end
-                    bodies[k, 2] = quote
-                        ind -= gap_lst[loop_ind]
-                        loop_ind += 1
-                        ind += stride_lst[loop_ind]
-                    end
-                end
-                return bodies
-            end,
-            I, (:B, :X, :refind, :ind, :l0, :stride_lst, :gap_lst, :loop_ind),
-            B, X, 1, ind, l0, stride_lst, gap_lst, nI)
-        return B
-    end
-end
-
 # note: we can gain some performance if the first dimension is a range;
 #       currently this is mainly indended for the general cat case
 # TODO: extend to I:Indices... (i.e. not necessarily contiguous)
@@ -743,36 +589,6 @@ function setindex!(B::BitArray, X::AbstractArray, I0::Real, I::Real...)
     return setindex!(B, X[1], i0, I...)
 end
 
-let setindex_cache = nothing
-    global setindex!
-    function setindex!(B::BitArray, X::AbstractArray, I::Union(Real,AbstractArray)...)
-        I = to_index(I)
-        nel = 1
-        for idx in I
-            nel *= length(idx)
-        end
-        if length(X) != nel
-            error("argument dimensions must match")
-        end
-        if ndims(X) > 1
-            for i = 1:length(I)
-                if size(X,i) != length(I[i])
-                    error("argument dimensions must match")
-                end
-            end
-        end
-        if is(setindex_cache,nothing)
-            setindex_cache = Dict()
-        end
-        gen_cartesian_map(setindex_cache,
-            ivars->:(B[$(ivars...)] = X[refind]; refind += 1),
-            I,
-            (:B, :X, :refind),
-            B, X, 1)
-        return B
-    end
-end
-
 function setindex!{T<:Real}(B::BitArray, x, I::AbstractVector{T})
     x = convert(Bool, x)
     for i in I
@@ -781,23 +597,6 @@ function setindex!{T<:Real}(B::BitArray, x, I::AbstractVector{T})
     return B
 end
 
-let setindex_cache = nothing
-    global setindex!
-    function setindex!(B::BitArray, x, I::Union(Real,AbstractArray)...)
-        x = convert(Bool, x)
-        checkbounds(B, I...)
-        I = to_index(I)
-        if is(setindex_cache,nothing)
-            setindex_cache = Dict()
-        end
-        gen_cartesian_map(setindex_cache, ivars->:(B[$(ivars...)] = x),
-            I,
-            (:B, :x),
-            B, x)
-        return B
-    end
-end
-
 # logical indexing
 
 function setindex_bool_scalar_1d(A::BitArray, x, I::AbstractArray{Bool})
@@ -1896,7 +1695,6 @@ function findn(B::BitMatrix)
     return (I, J)
 end
 
-let findn_cache = nothing
 function findn_one(ivars)
     s = { quote I[$i][count] = $(ivars[i]) end for i = 1:length(ivars)}
     quote
@@ -1908,25 +1706,6 @@ function findn_one(ivars)
     end
 end
 
-global findn
-function findn(B::BitArray)
-    ndimsB = ndims(B)
-    nnzB = nnz(B)
-    I = ntuple(ndimsB, x->Array(Int, nnzB))
-    if nnzB > 0
-        ranges = ntuple(ndims(B), d->(1:size(B,d)))
-
-        if is(findn_cache,nothing)
-            findn_cache = Dict()
-        end
-
-        gen_cartesian_map(findn_cache, findn_one, ranges,
-                          (:B, :I, :count), B, I, 1)
-    end
-    return I
-end
-end
-
 function findnz(B::BitMatrix)
     I, J = findn(B)
     return (I, J, trues(length(I)))
@@ -2160,88 +1939,6 @@ function permute_one_dim(ivars, stridenames)
     toReturn
 end
 
-let permutedims_cache = nothing, stridenames::Array{Any,1} = {}
-global permutedims!
-function permutedims!(P::BitArray,B::BitArray, perm)
-    dimsB = size(B)
-    ndimsB = length(dimsB)
-    (ndimsB == length(perm) && isperm(perm)) || error("no valid permutation of dimensions")
-	dimsP = size(P)
-    for i = 1:length(perm)
-        dimsP[i] == dimsB[perm[i]] || error("destination tensor of incorrect size")
-    end
-	
-    ranges = ntuple(ndimsB, i->(1:dimsP[i]))
-    while length(stridenames) < ndimsB
-        push!(stridenames, gensym())
-    end
-
-    #calculates all the strides
-    strides = [ prod(dimsB[1:(perm[dim]-1)])::Int for dim = 1:length(perm) ]
-
-    #Creates offset, because indexing starts at 1
-    offset = 0
-    for i in strides
-        offset+=i
-    end
-    offset = 1-offset
-
-    if isa(B,SubArray)
-        offset += (B.first_index-1)
-        B = B.parent
-    end
-
-    if is(permutedims_cache,nothing)
-        permutedims_cache = Dict()
-    end
-
-    gen_cartesian_map(permutedims_cache, iv->permute_one_dim(iv,stridenames), ranges,
-                      tuple(:B, :P, :perm, :offset, stridenames[1:ndimsB]...),
-                      B, P, perm, offset, strides...)
-
-    return P
-end
-function permutedims!(P::Array,B::StridedArray, perm)
-    dimsB = size(B)
-    ndimsB = length(dimsB)
-    (ndimsB == length(perm) && isperm(perm)) || error("no valid permutation of dimensions")
-	dimsP = size(P)
-    for i = 1:length(perm)
-        dimsP[i] == dimsB[perm[i]] || error("destination tensor of incorrect size")
-    end
-	
-    ranges = ntuple(ndimsB, i->(1:dimsP[i]))
-    while length(stridenames) < ndimsB
-        push!(stridenames, gensym())
-    end
-
-    #calculates all the strides
-    strides = [ stride(B, perm[dim]) for dim = 1:length(perm) ]
-
-    #Creates offset, because indexing starts at 1
-    offset = 0
-    for i in strides
-        offset+=i
-    end
-    offset = 1-offset
-
-    if isa(B,SubArray)
-        offset += (B.first_index-1)
-        B = B.parent
-    end
-
-    if is(permutedims_cache,nothing)
-        permutedims_cache = Dict()
-    end
-
-    gen_cartesian_map(permutedims_cache, iv->permute_one_dim(iv,stridenames), ranges,
-                      tuple(:B, :P, :perm, :offset, stridenames[1:ndimsB]...),
-                      B, P, perm, offset, strides...)
-
-    return P
-end
-end # let
-
 function permutedims(B::Union(BitArray,StridedArray), perm)
     dimsB = size(B)
     ndimsB = length(dimsB)
diff --git a/base/broadcast.jl b/base/broadcast.jl
index d8d753ea8c899..2fe23688e1ea0 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -1,11 +1,11 @@
 module Broadcast
 
-using ..Meta.quot
+using ..Cartesian
+import Base.promote_eltype
 import Base.(.+), Base.(.-), Base.(.*), Base.(./), Base.(.\)
 export broadcast, broadcast!, broadcast_function, broadcast!_function
 export broadcast_getindex, broadcast_setindex!
 
-
 ## Broadcasting utilities ##
 
 droparg1(a, args...) = args
@@ -57,226 +57,98 @@ function check_broadcast_shape(shape::Dims, As::AbstractArray...)
     end
 end
 
-# Calculate strides as will be used by the generated inner loops
-function calc_loop_strides(shape::Dims, As::AbstractArray...)
-    # squeeze out singleton dimensions in shape
-    dims = Array(Int, 0)
-    loopshape = Array(Int, 0)
-    nd = length(shape)
-    sizehint(dims, nd)
-    sizehint(loopshape, nd)
-    for i = 1:nd
-        s = shape[i]
-        if s != 1
-            push!(dims, i)
-            push!(loopshape, s)
+## Broadcasting core
+# Generate the body for a broadcasting function f_broadcast!(B, A1, A2, ..., A$narrays),
+# using function f, output B, and inputs As...
+# B must have already been set to the appropriate size.
+function gen_broadcast_body(nd::Int, narrays::Int, f::Function)
+    checkshape = Expr(:call, check_broadcast_shape, :(size(B)), [symbol("A_"*string(i)) for i = 1:narrays]...)
+    F = Expr(:quote, f)
+    quote
+        @assert ndims(B) == $nd
+        $checkshape
+        @nloops $nd i B d->(@nexprs $narrays k->(j_d_k = size(A_k, d) == 1 ? 1 : i_d)) begin
+            @nexprs $narrays k->(@inbounds v_k = @nref $nd A_k d->j_d_k)
+            @inbounds (@nref $nd B i) = (@ncall $narrays $F v)
         end
+        B
     end
-    nd = length(loopshape)
-
-    strides = Int[(size(A, d) > 1 ? stride(A, d) : 0) for A in As, d in dims]
-    # convert from regular strides to loop strides
-    for k=(nd-1):-1:1, a=1:length(As)
-        strides[a, k+1] -= strides[a, k]*loopshape[k]
-    end
-
-    tuple(loopshape...), strides
 end
 
-function broadcast_args(shape::Dims, As::(Array...))
-    loopshape, strides = calc_loop_strides(shape, As...)
-    (loopshape, As, ones(Int, length(As)), strides)
-end
-function broadcast_args(shape::Dims, As::(StridedArray...))
-    loopshape, strides = calc_loop_strides(shape, As...)
-    nA = length(As)
-    offs = Array(Int, nA)
-    baseAs = Array(Array, nA)
-    for (k, A) in enumerate(As)
-        offs[k],baseAs[k] = isa(A,SubArray) ? (A.first_index,A.parent) : (1,A)
+function broadcast!_function(nd::Int, narrays::Int, f::Function)
+    As = [symbol("A_"*string(i)) for i = 1:narrays]
+    body = gen_broadcast_body(nd, narrays, f)
+    @eval begin
+        local _F_
+        function _F_(B, $(As...))
+            $body
+        end
+        _F_
     end
-    (loopshape, tuple(baseAs...), offs, strides)
 end
 
-
-## Generation of inner loop instances ##
-
-function code_inner_loop(fname::Symbol, extra_args::Vector, initial,
-                         innermost::Function, narrays::Int, nd::Int)
-    Asyms      = [gensym("A$a") for a=1:narrays]
-    indsyms    = [gensym("k$a") for a=1:narrays]
-    axissyms   = [gensym("i$d") for d=1:nd]
-    sizesyms   = [gensym("n$d") for d=1:nd]
-    stridesyms = [gensym("s$(a)_$d") for a=1:narrays, d=1:nd]
-        
-    loop = innermost([:($arr[$ind]) for (arr, ind) in zip(Asyms, indsyms)]...)
-    for (d, (axis, n)) in enumerate(zip(axissyms, sizesyms))
-        loop = :(
-            for $axis=1:$n
-                $loop
-                $([:($ind += $(stridesyms[a, d]))
-                   for (a, ind) in enumerate(indsyms)]...)
-            end
-        )
-    end
-    
-    @gensym shape arrays offsets strides
-    quote
-        function $fname($shape::NTuple{$nd, Int},
-                        $arrays::NTuple{$narrays, StridedArray},
-                        $offsets::Vector{Int},
-                        $strides::Matrix{Int}, $(extra_args...))
-            @assert size($strides) == ($narrays, $nd)
-            ($(sizesyms...),)   = $shape
-            $([:(if $n==0; return; end) for n in sizesyms]...)
-            ($(Asyms...), )     = $arrays
-            ($(stridesyms...),) = $strides
-            ($(indsyms...), )   = $offsets
-            $initial
-            $loop
-        end
+let broadcast_cache = Dict()
+global broadcast!
+function broadcast!(f::Function, B, As...)
+    nd = ndims(B)
+    narrays = length(As)
+    key = (f, nd, narrays)
+    if !haskey(broadcast_cache,key)
+        func = broadcast!_function(nd, narrays, f)
+        broadcast_cache[key] = func
+    else
+        func = broadcast_cache[key]
     end
+    func(B, As...)
 end
+end  # let broadcast_cache
 
+broadcast(f::Function, As...) = broadcast!(f, Array(promote_eltype(As...), broadcast_shape(As...)), As...)
 
-## Generation of inner loop staged functions ##
+broadcast!_function(f::Function) = (B, As...) -> broadcast!(f, B, As...)
+broadcast_function(f::Function) = (As...) -> broadcast(f, As...)
 
-function code_inner(fname::Symbol, extra_args::Vector, initial,
-                    innermost::Function)
-    quote
-        function $fname(shape::(Int...), arrays::(StridedArray...), 
-                        offsets::Vector{Int}, strides::Matrix{Int},
-                        $(extra_args...))
-            f = eval(code_inner_loop($(quot(fname)), $(quot(extra_args)),
-                                     $(quot(initial)), $(quot(innermost)),
-                                     length(arrays), length(shape)))
-            f(shape, arrays, offsets, strides, $(extra_args...))
-        end
+broadcast_getindex(src::AbstractArray, I::AbstractArray...) = broadcast_getindex!(Array(eltype(src), broadcast_shape(I...)), src, I...)
+@ngenerate N function broadcast_getindex!(dest::AbstractArray, src::AbstractArray, I::NTuple{N, AbstractArray}...)
+    check_broadcast_shape(size(dest), I...)  # unnecessary if this function is never called directly
+    checkbounds(src, I...)
+    @nloops N i dest d->(@nexprs N k->(j_d_k = size(I_k, d) == 1 ? 1 : i_d)) begin
+        @nexprs N k->(@inbounds J_k = @nref N I_k d->j_d_k)
+        @inbounds (@nref N dest i) = (@nref N src J)
     end
+    dest
 end
 
-code_foreach_inner(fname::Symbol, extra_args::Vector, innermost::Function) =
-    code_inner(fname, extra_args, quote end, innermost)
-
-function code_map!_inner(fname::Symbol, dest, extra_args::Vector, 
-                         innermost::Function)
-    @gensym k
-    code_inner(fname, {dest, extra_args...}, :($k=1),
-        (els...)->quote
-            @inbounds $dest[$k] = $(innermost(:($dest[$k]), els...))
-            $k += 1
-        end)
-end
-
-
-## (Generation of) complete broadcast functions ##
-
-function code_broadcasts(name::String, op)
-    fname, fname_T, fname! = [gensym("broadcast$(infix)_$name")
-                              for infix in ("", "_T", "!")]
-
-    inner!, inner!! = gensym("$(name)_inner!"), gensym("$(name)!_inner!")
-    innerdef = code_map!_inner(inner!, :(result::Array), [],
-                               (dest, els...) -> :( $op($(els...)) ))
-    innerdef! = code_foreach_inner(inner!!, [],
-                                   (dest, els...) -> :( $dest=$op($(els...)) ))
-    quote
-        $innerdef
-        $fname_T{T}(::Type{T}) = $op()
-        function $fname_T{T}(::Type{T}, As::StridedArray...)
-            shape = broadcast_shape(As...)
-            result = Array(T, shape)
-            $inner!(broadcast_args(shape, As)..., result)
-            result
-        end
-
-        function $fname(As::StridedArray...)
-            $fname_T(Base.promote_eltype(As...), As...)
-        end
-
-        function $fname{T}(As::StridedArray{T}...)
-            $fname_T(T, As...)
+@ngenerate N function broadcast_setindex!(A::AbstractArray, x, I::NTuple{N, AbstractArray}...)
+    checkbounds(A, I...)
+    shape = broadcast_shape(I...)
+    @nextract N shape d->(length(shape) < d ? 1 : shape[d])
+    if !isa(x, AbstractArray)
+        @nloops N i d->(1:shape_d) d->(@nexprs N k->(j_d_k = size(I_k, d) == 1 ? 1 : i_d)) begin
+            @nexprs N k->(@inbounds J_k = @nref N I_k d->j_d_k)
+            @inbounds (@nref N A J) = x
         end
-
-        function $fname(As::StridedArray{Bool}...)
-            $fname_T(typeof($op(true,true)), As...)
-        end
-
-        $innerdef!
-        function $fname!(dest::StridedArray, As::StridedArray...)
-            shape = size(dest)
-            check_broadcast_shape(shape, As...)
-            $inner!!(broadcast_args(shape, tuple(dest, As...))...)
-            dest
+    else
+        X = x
+        # To call setindex_shape_check, we need to create fake 1-d indexes of the proper size
+        @nexprs N d->(fakeI_d = 1:shape_d)
+        Base.setindex_shape_check(X, (@ntuple N fakeI)...)
+        k = 1
+        @nloops N i d->(1:shape_d) d->(@nexprs N k->(j_d_k = size(I_k, d) == 1 ? 1 : i_d)) begin
+            @nexprs N k->(@inbounds J_k = @nref N I_k d->j_d_k)
+            @inbounds (@nref N A J) = X[k]
+            k += 1
         end
-
-        ($fname, $fname_T, $fname!)
     end
+    A
 end
 
-eval(code_map!_inner(:broadcast_getindex_inner!,
-                     :(result::Array), [:(A::AbstractArray)],
-                     (dest, inds...) -> :( A[$(inds...)] )))
-function broadcast_getindex(A::AbstractArray,
-                            ind1::StridedArray{Int},
-                            inds::StridedArray{Int}...)
-    inds = tuple(ind1, inds...)
-    shape = broadcast_shape(inds...)
-    result = Array(eltype(A), shape)
-    broadcast_getindex_inner!(broadcast_args(shape, inds)..., result, A)
-    result
-end
-
-eval(code_foreach_inner(:broadcast_setindex!_inner!, [:(A::AbstractArray)],
-                        (x, inds...)->:( A[$(inds...)] = $x )))
-function broadcast_setindex!(A::AbstractArray, X::StridedArray,
-                             ind1::StridedArray{Int}, 
-                             inds::StridedArray{Int}...)
-    Xinds = tuple(X, ind1, inds...)
-    shape = broadcast_shape(Xinds...)
-    broadcast_setindex!_inner!(broadcast_args(shape, Xinds)..., A)
-    Xinds[1]
-end
-
-
-## actual functions for broadcast and broadcast! ##
-
-broadcastfuns = ObjectIdDict()
-function broadcast_functions(op::Function)
-    (haskey(broadcastfuns, op) ? broadcastfuns[op] :
-     (broadcastfuns[op] = eval(code_broadcasts(string(op), quot(op)))))::NTuple{3,Function}
-end
-
-broadcast_function(op::Function)   = broadcast_functions(op)[1]
-broadcast_T_function(op::Function) = broadcast_functions(op)[2]
-broadcast!_function(op::Function)  = broadcast_functions(op)[3]
-
-broadcast(op::Function) = op()
-broadcast(op::Function, As::StridedArray...) = broadcast_function(op)(As...)
-
-function broadcast_T{T}(op::Function, ::Type{T}, As::StridedArray...)
-    broadcast_T_function(op)(T, As...)
-end
-
-function broadcast!(op::Function, dest::StridedArray, As::StridedArray...)
-    broadcast!_function(op)(dest, As...)
-end
-
-
 ## elementwise operators ##
 
-const broadcast_add = broadcast_function(+)
-const broadcast_sub = broadcast_function(-)
-const broadcast_mul = broadcast_function(*)
-const broadcast_rem = broadcast_function(%)
-const broadcast_div_T  = broadcast_T_function(/)
-const broadcast_rdiv_T = broadcast_T_function(\)
-const broadcast_pow_T  = broadcast_T_function(^)
-
-.+(As::StridedArray...) = broadcast_add(As...)
-.*(As::StridedArray...) = broadcast_mul(As...)
-.-(A::StridedArray, B::StridedArray) = broadcast_sub(A, B)
-.%(A::StridedArray, B::StridedArray) = broadcast_rem(A, B)
+.+(As::AbstractArray...) = broadcast(+, As...)
+.*(As::AbstractArray...) = broadcast(*, As...)
+.-(A::AbstractArray, B::AbstractArray) = broadcast(-, A, B)
+.%(A::AbstractArray, B::AbstractArray) = broadcast(%, A, B)
 
 type_div(T,S) = promote_type(T,S)
 type_div{T<:Integer,S<:Integer}(::Type{T},::Type{S}) = typeof(one(T)/one(S))
@@ -284,20 +156,20 @@ type_div{T,S}(::Type{Complex{T}},::Type{Complex{S}}) = Complex{type_div(T,S)}
 type_div{T,S}(::Type{Complex{T}},::Type{S})          = Complex{type_div(T,S)}
 type_div{T,S}(::Type{T},::Type{Complex{S}})          = Complex{type_div(T,S)}
 
-function ./(A::StridedArray, B::StridedArray) 
-    broadcast_div_T(type_div(eltype(A), eltype(B)), A, B)
+function ./(A::AbstractArray, B::AbstractArray)
+    broadcast!(/, Array(type_div(eltype(A), eltype(B)), broadcast_shape(A, B)), A, B)
 end
 
-function .\(A::StridedArray, B::StridedArray) 
-    broadcast_rdiv_T(type_div(eltype(B), eltype(A)), A, B)
+function .\(A::AbstractArray, B::AbstractArray)
+    broadcast!(\, Array(type_div(eltype(A), eltype(B)), broadcast_shape(A, B)), A, B)
 end
 
 type_pow(T,S) = promote_type(T,S)
 type_pow{S<:Integer}(::Type{Bool},::Type{S}) = Bool
 type_pow{S}(T,::Type{Rational{S}}) = type_pow(T, type_div(S, S))
 
-function .^(A::StridedArray, B::StridedArray) 
-    broadcast_pow_T(type_pow(eltype(A), eltype(B)), A, B)
+function .^(A::AbstractArray, B::AbstractArray) 
+    broadcast!(^, Array(type_pow(eltype(A), eltype(B)), broadcast_shape(A, B)), A, B)
 end
 
 
diff --git a/base/cartesian.jl b/base/cartesian.jl
new file mode 100644
index 0000000000000..98ed64ec95fe9
--- /dev/null
+++ b/base/cartesian.jl
@@ -0,0 +1,415 @@
+module Cartesian
+
+export @ngenerate, @nloops, @nref, @ncall, @nexprs, @nextract, @nall, @ntuple, ngenerate
+
+### @ngenerate, for auto-generation of separate versions of functions for different dimensionalities
+# Examples (deliberately trivial):
+#     @ngenerate N myndims{T,N}(A::Array{T,N}) = N
+# or alternatively
+#     function gen_body(N::Int)
+#         quote
+#             return $N
+#         end
+#     end
+#     eval(ngenerate(:N, :(myndims{T,N}(A::Array{T,N})), gen_body))
+# The latter allows you to use a single gen_body function for both ngenerate and
+# when your function maintains its own method cache (e.g., reduction or broadcasting).
+#
+# Special syntax for function prototypes:
+#   @ngenerate N function myfunction(A::AbstractArray, I::NTuple{N, Int}...)
+# for N = 3 translates to
+#   function myfunction(A::AbstractArray, I_1::Int, I_2::Int, I_3::Int)
+# and for the generic (cached) case as
+#   function myfunction(A::AbstractArray, I::Int...)
+#     @nextract N I I
+# with N = length(I). N should _not_ be listed as a parameter of the function unless
+# earlier arguments use it that way.
+# To avoid ambiguity, it would be preferable to have some specific syntax for this, such as
+#   myfunction(A::AbstractArray, I::Int...N)
+# where N can be an integer or symbol. Currently T...N generates a parser error.
+
+const CARTESIAN_DIMS = 2   # FIXME: increase after testing is complete
+
+macro ngenerate(itersym, funcexpr)
+    isfuncexpr(funcexpr) || error("Requires a function expression")
+    esc(ngenerate(itersym, funcexpr.args[1], N->sreplace!(copy(funcexpr.args[2]), itersym, N)))
+end
+
+generate1(itersym, prototype, bodyfunc, N::Int, varname, T) =
+    Expr(:function, spliceint!(sreplace!(resolvesplat!(copy(prototype), varname, T, N), itersym, N)),
+         resolvesplats!(bodyfunc(N), varname, N))
+
+function ngenerate(itersym, prototype, bodyfunc, dims=1:CARTESIAN_DIMS, makecached::Bool = true)
+    varname, T = get_splatinfo(prototype, itersym)
+    # Generate versions for specific dimensions
+    fdim = [generate1(itersym, prototype, bodyfunc, N, varname, T) for N in dims]
+    if !makecached
+        return Expr(:block, fdim...)
+    end
+    # Generate the generic cache-based version
+    if isempty(varname)
+        setitersym, extractvarargs = :(), N -> nothing
+    else
+        s = symbol(varname)
+        setitersym = hasparameter(prototype, itersym) ? (:(@assert $itersym == length($s))) : (:($itersym = length($s)))
+        extractvarargs = N -> Expr(:block, map(popescape, _nextract(N, s, s).args)...)
+    end
+    fsym = funcsym(prototype)
+    dictname = symbol(string(fsym)*"_cache")
+    fargs = funcargs(prototype)
+    if !isempty(varname)
+        fargs[end] = Expr(:..., fargs[end].args[1])
+    end
+    flocal = funcrename(copy(prototype), :_F_)
+    F = Expr(:function, resolvesplat!(prototype, varname, T), quote
+             $setitersym
+             if !haskey($dictname, $itersym)
+                 gen1 = Base.Cartesian.generate1($(symbol(itersym)), $(Expr(:quote, flocal)), $bodyfunc, $itersym, $varname, $T)
+                 $(dictname)[$itersym] = eval(quote
+                     local _F_
+                     $gen1
+                     _F_
+                 end)
+             end
+             $(dictname)[$itersym]($(fargs...))
+         end)
+    Expr(:block, fdim..., quote
+            let $dictname = Dict{Int,Function}()
+            $F
+            end
+        end)
+end
+
+isfuncexpr(ex::Expr) =
+    ex.head == :function || (ex.head == :(=) && typeof(ex.args[1]) == Expr && ex.args[1].head == :call)
+isfuncexpr(arg) = false
+
+sreplace!(arg, sym, val) = arg
+function sreplace!(ex::Expr, sym, val)
+    for i = 1:length(ex.args)
+        ex.args[i] = sreplace!(ex.args[i], sym, val)
+    end
+    ex
+end
+sreplace!(s::Symbol, sym, val) = s == sym ? val : s
+
+# If using the syntax that will need "desplatting",
+#     myfunction(A::AbstractArray, I::NTuple{N, Int}...)
+# return the variable name (as a string) and type
+function get_splatinfo(ex::Expr, itersym::Symbol)
+    if ex.head == :call
+        a = ex.args[end]
+        if  isa(a, Expr) && a.head == :... && length(a.args) == 1
+            b = a.args[1]
+            if isa(b, Expr) && b.head == :(::)
+                varname = string(b.args[1])
+                c = b.args[2]
+                if isa(c, Expr) && c.head == :curly && c.args[1] == :NTuple && c.args[2] == itersym
+                    T = c.args[3]
+                    return varname, T
+                end
+            end
+        end
+    end
+    "", Nothing
+end
+
+# Replace splatted with desplatted for a specific number of arguments
+function resolvesplat!(prototype, varname, T::Union(Type,Symbol,Expr), N::Int)
+    if !isempty(varname)
+        prototype.args[end] = Expr(:(::), symbol(string(varname, "_1")), T)
+        for i = 2:N
+            push!(prototype.args, Expr(:(::), symbol(string(varname, "_", i)), T))
+        end
+    end
+    prototype
+end
+
+# Return the generic splatting form, e.g.,
+#     myfunction(A::AbstractArray, I::Int...)
+function resolvesplat!(prototype, varname, T::Union(Type,Symbol,Expr))
+    if !isempty(varname)
+        svarname = symbol(varname)
+        prototype.args[end] = Expr(:..., :($svarname::$T))
+    end
+    prototype
+end
+
+# Desplatting function calls: replace func(a, b, I...) with func(a, b, I_1, I_2, I_3)
+resolvesplats!(arg, varname, N) = arg
+function resolvesplats!(ex::Expr, varname, N::Int)
+    if ex.head == :call
+        for i = 2:length(ex.args)-1
+            resolvesplats!(ex.args[i], varname, N)
+        end
+        a = ex.args[end]
+        if isa(a, Expr) && a.head == :... && a.args[1] == symbol(varname)
+            ex.args[end] = symbol(string(varname, "_1"))
+            for i = 2:N
+                push!(ex.args, symbol(string(varname, "_", i)))
+            end
+        else
+            resolvesplats!(a, varname, N)
+        end
+    else
+        for i = 1:length(ex.args)
+            resolvesplats!(ex.args[i], varname, N)
+        end
+    end
+    ex
+end
+
+# Remove any function parameters that are integers
+function spliceint!(ex::Expr)
+    if ex.head == :escape
+        return esc(spliceint!(ex.args[1]))
+    end
+    ex.head == :call || error(string(ex, " must be a call"))
+    if isa(ex.args[1], Expr) && ex.args[1].head == :curly
+        args = ex.args[1].args
+        for i = length(args):-1:1
+            if isa(args[i], Int)
+                splice!(args, i)
+            end
+        end
+    end
+    ex
+end
+
+function popescape(ex::Expr)
+    while ex.head == :escape
+        ex = ex.args[1]
+    end
+    ex
+end
+
+# Extract the "function name"
+function funcsym(prototype::Expr)
+    prototype = popescape(prototype)
+    prototype.head == :call || error(string(prototype, " must be a call"))
+    tmp = prototype.args[1]
+    if isa(tmp, Expr) && tmp.head == :curly
+        tmp = tmp.args[1]
+    end
+    return tmp
+end
+
+function funcrename(prototype::Expr, name::Symbol)
+    prototype = popescape(prototype)
+    prototype.head == :call || error(string(prototype, " must be a call"))
+    tmp = prototype.args[1]
+    if isa(tmp, Expr) && tmp.head == :curly
+        tmp.args[1] = name
+    else
+        prototype.args[1] = name
+    end
+    return prototype
+end
+
+function hasparameter(prototype::Expr, sym::Symbol)
+    prototype = popescape(prototype)
+    prototype.head == :call || error(string(prototype, " must be a call"))
+    tmp = prototype.args[1]
+    if isa(tmp, Expr) && tmp.head == :curly
+        for i = 2:length(tmp.args)
+            if tmp.args[i] == sym
+                return true
+            end
+        end
+    end
+    false
+end
+
+# Extract the symbols of the function arguments
+funcarg(s::Symbol) = s
+funcarg(ex::Expr) = ex.args[1]
+function funcargs(prototype::Expr)
+    prototype = popescape(prototype)
+    prototype.head == :call || error(string(prototype, " must be a call"))
+    map(a->funcarg(a), prototype.args[2:end])
+end
+
+### Cartesian-specific macros
+
+# Generate nested loops
+macro nloops(N, itersym, rangeexpr, args...)
+    _nloops(N, itersym, rangeexpr, args...)
+end
+
+_nloops(N::Int, itersym::Symbol, arraysym::Symbol, args::Expr...) = _nloops(N, itersym, :(d->1:size($arraysym,d)), args...)
+
+function _nloops(N::Int, itersym::Symbol, rangeexpr::Expr, args::Expr...)
+    if rangeexpr.head != :->
+        error("Second argument must be an anonymous function expression to compute the range")
+    end
+    if !(1 <= length(args) <= 3)
+        error("Too many arguments")
+    end
+    body = args[end]
+    ex = Expr(:escape, body)
+    for dim = 1:N
+        itervar = inlineanonymous(itersym, dim)
+        rng = inlineanonymous(rangeexpr, dim)
+        preexpr = length(args) > 1 ? inlineanonymous(args[1], dim) : (:(nothing))
+        postexpr = length(args) > 2 ? inlineanonymous(args[2], dim) : (:(nothing))
+        ex = quote
+            for $(esc(itervar)) = $(esc(rng))
+                $(esc(preexpr))
+                $ex
+                $(esc(postexpr))
+            end
+        end
+    end
+    ex
+end
+
+# Generate expression A[i1, i2, ...]
+macro nref(N, A, sym)
+    _nref(N, A, sym)
+end
+
+function _nref(N::Int, A::Symbol, ex)
+    vars = [ inlineanonymous(ex,i) for i = 1:N ]
+    Expr(:escape, Expr(:ref, A, vars...))
+end
+
+# Generate f(arg1, arg2, ...)
+macro ncall(N, f, sym)
+    _ncall(N, f, sym)
+end
+
+function _ncall(N::Int, f, ex)
+    vars = [ inlineanonymous(ex,i) for i = 1:N ]
+    Expr(:escape, Expr(:call, f, vars...))
+end
+
+# Generate N expressions
+macro nexprs(N, ex)
+    _nexprs(N, ex)
+end
+
+function _nexprs(N::Int, ex::Expr)
+    exs = [ inlineanonymous(ex,i) for i = 1:N ]
+    Expr(:escape, Expr(:block, exs...))
+end
+
+# Make variables esym1, esym2, ... = isym
+macro nextract(N, esym, isym)
+    _nextract(N, esym, isym)
+end
+
+function _nextract(N::Int, esym::Symbol, isym::Symbol)
+    aexprs = [Expr(:escape, Expr(:(=), inlineanonymous(esym, i), :(($isym)[$i]))) for i = 1:N]
+    Expr(:block, aexprs...)
+end
+
+function _nextract(N::Int, esym::Symbol, ex::Expr)
+    aexprs = [Expr(:escape, Expr(:(=), inlineanonymous(esym, i), inlineanonymous(ex,i))) for i = 1:N]
+    Expr(:block, aexprs...)
+end
+
+# Check whether variables i1, i2, ... all satisfy criterion
+macro nall(N, criterion)
+    _nall(N, criterion)
+end
+
+function _nall(N::Int, criterion::Expr)
+    if criterion.head != :->
+        error("Second argument must be an anonymous function expression yielding the criterion")
+    end
+    conds = [Expr(:escape, inlineanonymous(criterion, i)) for i = 1:N]
+    Expr(:&&, conds...)
+end
+
+macro ntuple(N, ex)
+    _ntuple(N, ex)
+end
+
+function _ntuple(N::Int, ex)
+    vars = [ inlineanonymous(ex,i) for i = 1:N ]
+    Expr(:escape, Expr(:tuple, vars...))
+end
+
+## Utilities
+
+# Simplify expressions like :(d->3:size(A,d)-3) given an explicit value for d
+function inlineanonymous(ex::Expr, val)
+    if ex.head != :->
+        error("Not an anonymous function")
+    end
+    if !isa(ex.args[1], Symbol)
+        error("Not a single-argument anonymous function")
+    end
+    sym = ex.args[1]
+    ex = ex.args[2]
+    exout = lreplace(ex, sym, val)
+    exout = poplinenum(exout)
+    exout = poparithmetic(exout)
+    # Inline ternary expressions
+    if isa(exout, Expr) && exout.head == :if
+        try
+            tf = eval(exout.args[1])
+            exout = tf?exout.args[2]:exout.args[3]
+        catch
+        end
+    end
+    exout
+end
+
+# Given :i and 3, this generates :i_3
+inlineanonymous(base::Symbol, ext) = symbol(string(base)*"_"*string(ext))
+
+# Replace a symbol by a value or a "coded" symbol
+# E.g., for d = 3,
+#    lreplace(:d, :d, 3) -> 3
+#    lreplace(:i_d, :d, 3) -> :i_3
+#    lreplace(:i_{d-1}, :d, 3) -> :i_2
+# This follows LaTeX notation.
+lreplace(ex, sym::Symbol, val) = lreplace!(copy(ex), sym, val, Regex("_"*string(sym)*"(\$|(?=_))"))
+lreplace!(arg, sym::Symbol, val, r) = arg
+function lreplace!(s::Symbol, sym::Symbol, val, r::Regex)
+    if (s == sym)
+        return val
+    end
+    symbol(replace(string(s), r, "_"*string(val)))
+end
+function lreplace!(ex::Expr, sym::Symbol, val, r)
+    # Curly-brace notation, which acts like parentheses
+    if ex.head == :curly && length(ex.args) == 2 && isa(ex.args[1], Symbol) && endswith(string(ex.args[1]), "_")
+        excurly = lreplace!(ex.args[2], sym, val, r)
+        return symbol(string(ex.args[1])*string(poparithmetic(excurly)))
+    end
+    for i in 1:length(ex.args)
+        ex.args[i] = lreplace!(ex.args[i], sym, val, r)
+    end
+    ex
+end
+
+poplinenum(arg) = arg
+function poplinenum(ex::Expr)
+    if ex.head == :block
+        if length(ex.args) == 1
+            return ex.args[1]
+        elseif length(ex.args) == 2 && ex.args[1].head == :line
+            return ex.args[2]
+        end
+    end
+    ex
+end
+
+# Handle very simple arithmetic at the expression level
+poparithmetic(ex) = ex
+function poparithmetic(ex::Expr)
+    for i = 1:length(ex.args)
+        ex.args[i] = poparithmetic(ex.args[i])
+    end
+    if ex.head == :call && in(ex.args[1], (:+, :-, :*, :/)) && all([isa(ex.args[i], Number) for i = 2:length(ex.args)])
+        return eval(ex)
+    elseif ex.head == :call && (ex.args[1] == :+ || ex.args[1] == :-) && length(ex.args) == 3 && ex.args[3] == 0
+        # simplify x+0 and x-0
+        return ex.args[2]
+    end
+    ex
+end
+
+end
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
new file mode 100644
index 0000000000000..a1d3011ccc4be
--- /dev/null
+++ b/base/multidimensional.jl
@@ -0,0 +1,526 @@
+### TODO: implement bitarray functions in terms of cartesian and delete gen_cartesian_map
+
+### From array.jl
+
+@ngenerate N function checksize(A::AbstractArray, I::NTuple{N, Any}...)
+    @nexprs N d->(size(A, d) == length(I_d) || throw(DimensionMismatch("Index $d has length $(length(I_d)), but size(A, $d) = $(size(A,d))")))
+    nothing
+end
+
+# Version that uses cartesian indexing for src
+@ngenerate N function getindex!(dest::Array, src::AbstractArray, I::NTuple{N,Union(Real,AbstractVector)}...)
+    checksize(dest, I...)
+    checkbounds(src, I...)
+    @nexprs N d->(J_d = to_index(I_d))
+    k = 1
+    @nloops N i dest d->(j_d = J_d[i_d]) begin
+        @inbounds dest[k] = (@nref N src j)
+        k += 1
+    end
+    dest
+end
+
+# Version that uses linear indexing for src
+@ngenerate N function getindex!(dest::Array, src::Array, I::NTuple{N,Union(Real,AbstractVector)}...)
+    checksize(dest, I...)
+    checkbounds(src, I...)
+    @nexprs N d->(J_d = to_index(I_d))
+    stride_1 = 1
+    @nexprs N d->(stride_{d+1} = stride_d*size(src,d))
+    @nexprs N d->(offset_d = 1)  # only really need offset_$N = 1
+    k = 1
+    @nloops N i dest d->(offset_{d-1} = offset_d + (J_d[i_d]-1)*stride_d) begin
+        @inbounds dest[k] = src[offset_0]
+        k += 1
+    end
+    dest
+end
+
+@ngenerate N getindex(A::Array, I::NTuple{N,Union(Real,AbstractVector)}...) = getindex!(similar(A, eltype(A), index_shape(I...)), A, I...)
+
+
+@ngenerate N function setindex!(A::Array, x, I::NTuple{N,Union(Real,AbstractArray)}...)
+    checkbounds(A, I...)
+    @nexprs N d->(J_d = to_index(I_d))
+    stride_1 = 1
+    @nexprs N d->(stride_{d+1} = stride_d*size(A,d))
+    @nexprs N d->(offset_d = 1)  # really only need offset_$N = 1
+    if !isa(x, AbstractArray)
+        @nloops N i d->(1:length(J_d)) d->(offset_{d-1} = offset_d + (J_d[i_d]-1)*stride_d) begin
+            @inbounds A[offset_0] = x
+        end
+    else
+        X = x
+        setindex_shape_check(X, I...)
+        # TODO? A variant that can use cartesian indexing for RHS
+        k = 1
+        @nloops N i d->(1:length(J_d)) d->(offset_{d-1} = offset_d + (J_d[i_d]-1)*stride_d) begin
+            @inbounds A[offset_0] = X[k]
+            k += 1
+        end
+    end
+    A
+end
+
+
+@ngenerate N function findn{T,N}(A::AbstractArray{T,N})
+    nnzA = nnz(A)
+    @nexprs N d->(I_d = Array(Int, nnzA))
+    k = 1
+    @nloops N i A begin
+        @inbounds if (@nref N A i) != zero(T)
+            @nexprs N d->(I_d[k] = i_d)
+            k += 1
+        end
+    end
+    @ntuple N I
+end
+
+
+### subarray.jl
+
+# Here we want to skip creating the dict-based cached version,
+# so use the ngenerate function
+function gen_getindex_body(N::Int)
+    quote
+        strd_1 = 1
+        @nexprs $N d->(@inbounds strd_{d+1} = strd_d*s.dims[d])
+        ind -= 1
+        indp = s.first_index
+        @nexprs $N d->begin
+            i = div(ind, strd_{$N-d+1})
+            @inbounds indp += i*s.strides[$N-d+1]
+            ind -= i*strd_{$N-d+1}
+        end
+        s.parent[indp]
+    end
+end
+
+eval(ngenerate(:N, :(getindex{T}(s::SubArray{T,N}, ind::Integer)), gen_getindex_body, 2:5, false))
+
+
+function gen_setindex!_body(N::Int)
+    quote
+        strd_1 = 1
+        @nexprs $N d->(@inbounds strd_{d+1} = strd_d*s.dims[d])
+        ind -= 1
+        indp = s.first_index
+        @nexprs $N d->begin
+            i = div(ind, strd_{$N-d+1})
+            @inbounds indp += i*s.strides[$N-d+1]
+            ind -= i*strd_{$N-d+1}
+        end
+        s.parent[indp] = v
+    end
+end
+
+eval(ngenerate(:N, :(setindex!{T}(s::SubArray{T,N}, v, ind::Integer)), gen_setindex!_body, 2:5, false))
+
+
+### from abstractarray.jl
+
+@ngenerate N function fill!{T,N}(A::AbstractArray{T,N}, x)
+    @nloops N i A begin
+        @inbounds (@nref N A i) = x
+    end
+    return A
+end
+
+## code generator for specializing on the number of dimensions ##
+
+#otherbodies are the bodies that reside between loops, if its a 2 dimension array.
+function make_loop_nest(vars, ranges, body)
+    otherbodies = cell(length(vars),2)
+    #println(vars)
+    for i = 1:2*length(vars)
+        otherbodies[i] = nothing
+    end
+    make_loop_nest(vars, ranges, body, otherbodies)
+end
+
+function make_loop_nest(vars, ranges, body, otherbodies)
+    expr = body
+    len = size(otherbodies)[1]
+    for i=1:length(vars)
+        v = vars[i]
+        r = ranges[i]
+        l = otherbodies[i]
+        j = otherbodies[i+len]
+        expr = quote
+            $l
+            for ($v) = ($r)
+                $expr
+            end
+            $j
+        end
+    end
+    expr
+end
+
+
+## genbodies() is a function that creates an array (potentially 2d),
+## where the first element is inside the inner most array, and the last
+## element is outside most loop, and all the other arguments are
+## between each loop. If it creates a 2d array, it just means that it
+## specifies what it wants to do before and after each loop.
+## If genbodies creates an array it must of length N.
+function gen_cartesian_map(cache, genbodies, ranges, exargnames, exargs...)
+    if ranges === ()
+        ranges = (1,)
+    end
+    N = length(ranges)
+    if !haskey(cache,N)
+        if isdefined(genbodies,:code)
+            mod = genbodies.code.module
+        else
+            mod = Main
+        end
+        dimargnames = { symbol(string("_d",i)) for i=1:N }
+        ivars = { symbol(string("_i",i)) for i=1:N }
+        bodies = genbodies(ivars)
+
+        ## creating a 2d array, to pass as bodies
+        if isa(bodies,Array)
+            if (ndims(bodies)==2)
+                #println("2d array noticed")
+                body = bodies[1]
+                bodies = bodies[2:end,:]
+            elseif (ndims(bodies)==1)
+                #println("1d array noticed")
+                body = bodies[1]
+                bodies_tmp = cell(N,2)
+                for i = 1:N
+                    bodies_tmp[i] = bodies[i+1]
+                    bodies_tmp[i+N] = nothing
+                end
+                bodies = bodies_tmp
+            end
+        else
+            #println("no array noticed")
+            body = bodies
+            bodies = cell(N,2)
+            for i=1:2*N
+                bodies[i] = nothing
+            end
+        end
+        fexpr =
+        quote
+            local _F_
+            function _F_($(dimargnames...), $(exargnames...))
+                $(make_loop_nest(ivars, dimargnames, body, bodies))
+            end
+            _F_
+        end
+        f = eval(mod,fexpr)
+        cache[N] = f
+    else
+        f = cache[N]
+    end
+    return f(ranges..., exargs...)
+end
+
+
+### from bitarray.jl
+
+# note: we can gain some performance if the first dimension is a range;
+# TODO: extend to I:Union(Real,AbstractArray)... (i.e. not necessarily contiguous)
+let getindex_cache = nothing
+    global getindex
+    function getindex(B::BitArray, I0::Range1{Int}, I::Union(Real,Range1{Int})...)
+        # the < should become a != once
+        # the stricter indexing behaviour is enforced
+        if ndims(B) < 1 + length(I)
+            error("wrong number of dimensions")
+        end
+        checkbounds(B, I0, I...)
+        X = BitArray(index_shape(I0, I...))
+        nI = 1 + length(I)
+
+        I = map(x->(isa(x,Real) ? (to_index(x):to_index(x)) : to_index(x)), I[1:nI-1])
+
+        f0 = first(I0)
+        l0 = length(I0)
+
+        gap_lst = Int[last(r)-first(r)+1 for r in I]
+        stride_lst = Array(Int, nI)
+        stride = 1
+        ind = f0
+        for k = 1 : nI - 1
+            stride *= size(B, k)
+            stride_lst[k] = stride
+            ind += stride * (first(I[k]) - 1)
+            gap_lst[k] *= stride
+        end
+        # we only need nI-1 elements, the last one
+        # is dummy (used in bodies[k,2] below)
+        stride_lst[nI] = 0
+
+        if ndims(X) == 1
+            copy_chunks(X.chunks, 1, B.chunks, ind, l0)
+            return X
+        end
+
+        if is(getindex_cache,nothing)
+            getindex_cache = Dict()
+        end
+
+        gen_cartesian_map(getindex_cache,
+            ivars->begin
+                bodies = cell(nI, 2)
+                bodies[1] = quote
+                        copy_chunks(X.chunks, storeind, B.chunks, ind, l0)
+                        storeind += l0
+                        ind += stride_lst[loop_ind]
+                    end
+                for k = 2 : nI
+                    bodies[k, 1] = quote
+                        loop_ind -= 1
+                    end
+                    bodies[k, 2] = quote
+                        ind -= gap_lst[loop_ind]
+                        loop_ind += 1
+                        ind += stride_lst[loop_ind]
+                    end
+                end
+                return bodies
+            end,
+            I, (:B, :X, :storeind, :ind, :l0, :stride_lst, :gap_lst, :loop_ind),
+            B, X, 1, ind, l0, stride_lst, gap_lst, nI)
+        return X
+    end
+end
+
+let getindex_cache = nothing
+    global getindex
+    function getindex(B::BitArray, I::Union(Real,AbstractVector)...)
+        checkbounds(B, I...)
+        I = to_index(I)
+        X = BitArray(index_shape(I...))
+        Xc = X.chunks
+
+        if is(getindex_cache,nothing)
+            getindex_cache = Dict()
+        end
+        gen_cartesian_map(getindex_cache, ivars -> quote
+                #faster X[storeind] = B[$(ivars...)]
+                setindex_unchecked(Xc, B[$(ivars...)], ind)
+                ind += 1
+            end, I, (:B, :Xc, :ind), B, Xc, 1)
+        return X
+    end
+end
+
+let setindex_cache = nothing
+    global setindex_array2bitarray_ranges
+    function setindex_array2bitarray_ranges(B::BitArray, X::BitArray, I0::Range1{Int}, I::Range1{Int}...)
+        nI = 1 + length(I)
+        if ndims(B) != nI
+            error("wrong number of dimensions in assigment")
+        end
+        lI = length(I0)
+        for r in I
+            lI *= length(r)
+        end
+        if length(X) != lI
+            error("array assignment dimensions mismatch")
+        end
+        if lI == 0
+            return B
+        end
+        f0 = first(I0)
+        l0 = length(I0)
+        if nI == 1
+            copy_chunks(B.chunks, f0, X.chunks, 1, l0)
+            return B
+        end
+        if is(setindex_cache,nothing)
+            setindex_cache = Dict()
+        end
+        gap_lst = [last(r)-first(r)+1 for r in I]
+        stride_lst = Array(Int, nI)
+        stride = 1
+        ind = f0
+        @inbounds for k = 1 : nI - 1
+            stride *= size(B, k)
+            stride_lst[k] = stride
+            ind += stride * (first(I[k]) - 1)
+            gap_lst[k] *= stride
+        end
+        # we only need nI-1 elements, the last one
+        # is dummy (used in bodies[k,2] below)
+        stride_lst[nI] = 0
+
+        gen_cartesian_map(setindex_cache,
+            ivars->begin
+                bodies = cell(nI, 2)
+                bodies[1] = quote
+                        copy_chunks(B.chunks, ind, X.chunks, refind, l0)
+                        refind += l0
+                        ind += stride_lst[loop_ind]
+                    end
+                for k = 2 : nI
+                    bodies[k, 1] = quote
+                        loop_ind -= 1
+                    end
+                    bodies[k, 2] = quote
+                        ind -= gap_lst[loop_ind]
+                        loop_ind += 1
+                        ind += stride_lst[loop_ind]
+                    end
+                end
+                return bodies
+            end,
+            I, (:B, :X, :refind, :ind, :l0, :stride_lst, :gap_lst, :loop_ind),
+            B, X, 1, ind, l0, stride_lst, gap_lst, nI)
+        return B
+    end
+end
+
+let setindex_cache = nothing
+    global setindex!
+    function setindex!(B::BitArray, X::AbstractArray, I::Union(Real,AbstractArray)...)
+        I = to_index(I)
+        nel = 1
+        for idx in I
+            nel *= length(idx)
+        end
+        if length(X) != nel
+            error("argument dimensions must match")
+        end
+        if ndims(X) > 1
+            for i = 1:length(I)
+                if size(X,i) != length(I[i])
+                    error("argument dimensions must match")
+                end
+            end
+        end
+        if is(setindex_cache,nothing)
+            setindex_cache = Dict()
+        end
+        gen_cartesian_map(setindex_cache,
+            ivars->:(B[$(ivars...)] = X[refind]; refind += 1),
+            I,
+            (:B, :X, :refind),
+            B, X, 1)
+        return B
+    end
+end
+
+let setindex_cache = nothing
+    global setindex!
+    function setindex!(B::BitArray, x, I::Union(Real,AbstractArray)...)
+        x = convert(Bool, x)
+        checkbounds(B, I...)
+        I = to_index(I)
+        if is(setindex_cache,nothing)
+            setindex_cache = Dict()
+        end
+        gen_cartesian_map(setindex_cache, ivars->:(B[$(ivars...)] = x),
+            I,
+            (:B, :x),
+            B, x)
+        return B
+    end
+end
+
+let findn_cache = nothing
+global findn
+function findn(B::BitArray)
+    ndimsB = ndims(B)
+    nnzB = nnz(B)
+    I = ntuple(ndimsB, x->Array(Int, nnzB))
+    if nnzB > 0
+        ranges = ntuple(ndims(B), d->(1:size(B,d)))
+
+        if is(findn_cache,nothing)
+            findn_cache = Dict()
+        end
+
+        gen_cartesian_map(findn_cache, findn_one, ranges,
+                          (:B, :I, :count), B, I, 1)
+    end
+    return I
+end
+end
+
+let permutedims_cache = nothing, stridenames::Array{Any,1} = {}
+global permutedims!
+function permutedims!(P::BitArray,B::BitArray, perm)
+    dimsB = size(B)
+    ndimsB = length(dimsB)
+    (ndimsB == length(perm) && isperm(perm)) || error("no valid permutation of dimensions")
+	dimsP = size(P)
+    for i = 1:length(perm)
+        dimsP[i] == dimsB[perm[i]] || error("destination tensor of incorrect size")
+    end
+	
+    ranges = ntuple(ndimsB, i->(1:dimsP[i]))
+    while length(stridenames) < ndimsB
+        push!(stridenames, gensym())
+    end
+
+    #calculates all the strides
+    strides = [ prod(dimsB[1:(perm[dim]-1)])::Int for dim = 1:length(perm) ]
+
+    #Creates offset, because indexing starts at 1
+    offset = 0
+    for i in strides
+        offset+=i
+    end
+    offset = 1-offset
+
+    if isa(B,SubArray)
+        offset += (B.first_index-1)
+        B = B.parent
+    end
+
+    if is(permutedims_cache,nothing)
+        permutedims_cache = Dict()
+    end
+
+    gen_cartesian_map(permutedims_cache, iv->permute_one_dim(iv,stridenames), ranges,
+                      tuple(:B, :P, :perm, :offset, stridenames[1:ndimsB]...),
+                      B, P, perm, offset, strides...)
+
+    return P
+end
+function permutedims!(P::Array,B::StridedArray, perm)
+    dimsB = size(B)
+    ndimsB = length(dimsB)
+    (ndimsB == length(perm) && isperm(perm)) || error("no valid permutation of dimensions")
+	dimsP = size(P)
+    for i = 1:length(perm)
+        dimsP[i] == dimsB[perm[i]] || error("destination tensor of incorrect size")
+    end
+	
+    ranges = ntuple(ndimsB, i->(1:dimsP[i]))
+    while length(stridenames) < ndimsB
+        push!(stridenames, gensym())
+    end
+
+    #calculates all the strides
+    strides = [ stride(B, perm[dim]) for dim = 1:length(perm) ]
+
+    #Creates offset, because indexing starts at 1
+    offset = 0
+    for i in strides
+        offset+=i
+    end
+    offset = 1-offset
+
+    if isa(B,SubArray)
+        offset += (B.first_index-1)
+        B = B.parent
+    end
+
+    if is(permutedims_cache,nothing)
+        permutedims_cache = Dict()
+    end
+
+    gen_cartesian_map(permutedims_cache, iv->permute_one_dim(iv,stridenames), ranges,
+                      tuple(:B, :P, :perm, :offset, stridenames[1:ndimsB]...),
+                      B, P, perm, offset, strides...)
+
+    return P
+end
+end # let
diff --git a/base/reducedim.jl b/base/reducedim.jl
index 36da09502d313..a0b3a6cd04c8c 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -1,4 +1,3 @@
-
 ## Functions to compute the reduced shape
 
 # for reductions that expand 0 dims to 1
@@ -45,381 +44,90 @@ function regionsize(a, region)
 end
 
 
-##### (original) reduction codes for arbitrary arrays
-
-reducedim(f::Function, A, region, v0) =
-    reducedim(f, A, region, v0, similar(A, reduced_dims(A, region)))
+### Generic reduction functions
 
-maximum{T}(A::AbstractArray{T}, region) =
-    isempty(A) ? similar(A,reduced_dims0(A,region)) : reducedim(scalarmax,A,region,typemin(T))
-minimum{T}(A::AbstractArray{T}, region) =
-    isempty(A) ? similar(A,reduced_dims0(A,region)) : reducedim(scalarmin,A,region,typemax(T))
-sum{T}(A::AbstractArray{T}, region)  = reducedim(+,A,region,zero(T))
-prod{T}(A::AbstractArray{T}, region) = reducedim(*,A,region,one(T))
+reducedim(f::Function, A, region, initial) = reducedim!(f, reduction_init(A, region, initial), A)
 
-all(A::AbstractArray{Bool}, region) = reducedim(&,A,region,true)
-any(A::AbstractArray{Bool}, region) = reducedim(|,A,region,false)
-sum(A::AbstractArray{Bool}, region) = reducedim(+,A,region,0,similar(A,Int,reduced_dims(A,region)))
+reducedim(f::Function, A, region, initial, R) = reducedim!(f, fill!(R, initial), A)
 
-prod(A::AbstractArray{Bool}, region) =
-    error("use all() instead of prod() for boolean arrays")
-
-# TODO:
-# - find out why inner loop with dimsA[i] instead of size(A,i) is way too slow
-
-let reducedim_cache = nothing
-# generate the body of the N-d loop to compute a reduction
-function gen_reducedim_func(n, f)
-    ivars = { symbol(string("i",i)) for i=1:n }
-    # limits and vars for reduction loop
-    lo    = { symbol(string("lo",i)) for i=1:n }
-    hi    = { symbol(string("hi",i)) for i=1:n }
-    rvars = { symbol(string("r",i)) for i=1:n }
-    setlims = { quote
-        # each dim of reduction is either 1:sizeA or ivar:ivar
-        if in($i,region)
-            $(lo[i]) = 1
-            $(hi[i]) = size(A,$i)
-        else
-            $(lo[i]) = $(hi[i]) = $(ivars[i])
-        end
-               end for i=1:n }
-    rranges = { :( $(lo[i]):$(hi[i]) ) for i=1:n }  # lo:hi for all dims
-    body =
-    quote
-        _tot = v0
-        $(setlims...)
-        $(make_loop_nest(rvars, rranges,
-                         :(_tot = ($f)(_tot, A[$(rvars...)]))))
-        R[_ind] = _tot
-        _ind += 1
-    end
-    quote
+function reducedim!_function(N::Int, f::Function)
+    body = gen_reduction_body(N, f)
+    @eval begin
         local _F_
-        function _F_(f, A, region, R, v0)
-            _ind = 1
-            $(make_loop_nest(ivars, { :(1:size(R,$i)) for i=1:n }, body))
+        function _F_(R, A)
+            $body
         end
         _F_
     end
 end
 
-global reducedim
-function reducedim(f::Function, A, region, v0, R)
-    ndimsA = ndims(A)
-
-    if is(reducedim_cache,nothing)
-        reducedim_cache = Dict()
+let reducedim_cache = Dict()
+# reducedim! assumes that R has already been initialized with a seed value
+global reducedim!
+function reducedim!(f::Function, R, A)
+    if isempty(R)
+        return R
     end
-
-    key = ndimsA
-    fname = :f
-
-    if  (is(f,+)     && (fname=:+;true)) ||
-        (is(f,*)     && (fname=:*;true)) ||
-        (is(f,scalarmax)   && (fname=:scalarmax;true)) ||
-        (is(f,scalarmin)   && (fname=:scalarmin;true)) ||
-        (is(f,&)     && (fname=:&;true)) ||
-        (is(f,|)     && (fname=:|;true))
-        key = (fname, ndimsA)
-    end
-
+    ndimsA = ndims(A)
+    key = (ndimsA, f)
     if !haskey(reducedim_cache,key)
-        fexpr = gen_reducedim_func(ndimsA, fname)
-        func = eval(fexpr)
+        func = reducedim!_function(ndimsA, f)
         reducedim_cache[key] = func
     else
         func = reducedim_cache[key]
     end
-
-    func(f, A, region, R, v0)
-
-    return R
+    func(R, A)
 end
-end
-
-
+end  # let reducedim_cache
 
-##### Below are optimized codes for contiguous arrays
-
-function rcompress_dims{N}(siz::NTuple{N,Int}, region)
-    isrd = fill(false, N)
-    for k in region
-        if 1 <= k <= N
-            isrd[k] = true
-        end
-    end
-    
-    sdims = Int[]
-    sizehint(sdims, N)
-    b = isrd[1]
-    n = siz[1]
-    i = 2
-    while i <= N
-        if isrd[i] == b
-            n *= siz[i]
-        else
-            b = !b
-            push!(sdims, n)
-            n = siz[i]
-        end
-        i += 1
-    end
-    if n != 1
-        push!(sdims, n)
-    end
-    return (isrd[1], sdims)
-end
-
-function generate_reducedim_funcs(fname, params, args, sizexpr, comb, sker, ker0!, ker1!)
-    # Parameters:
-    #
-    # - fname:   the interface function name (e.g. sum, maximum)
-    # - params:  a list of input parameters in function signatures
-    # - args:    a list of argument symbols 
-    # - sizexpr: the expression that calculates the input size
-    # - comb:   the combination operation (e.g. +)
-    # - sker:   a kernel function that reduces a vector (or a range of it) to a scalar
-    # - ker0!:  a kernel that initializes an accumulator array using the first column of terms
-    # - ker1!:  a kernel that accumulates a term column to an accumulating column
-    #
-
-    fname! = symbol("$(fname)!")
-    fa! = symbol("$(fname)_a!")
-    fb! = symbol("$(fname)_b!")
-    
+# Generate the body for a reduction function reduce!(f, R, A), using binary operation f,
+# where R is the output and A is the input.
+# R must have already been set to the appropriate size and initialized with the seed value
+function gen_reduction_body(N, f::Function)
+    F = Expr(:quote, f)
     quote
-        global $(fname!)
-        function $(fname!)(dst::Array, $(params...), dim::Integer)
-            siz = $(sizexpr)
-            nd = length(siz)
-            if 1 <= dim <= nd
-                if dim == 1
-                    $(fa!)(true, dst, 0, $(args...), 0, prod(siz[2:nd]), siz[1])
-                elseif dim == nd
-                    $(fb!)(true, dst, 0, $(args...), 0, siz[nd], prod(siz[1:nd-1]))
-                else
-                    $(fb!)(true, dst, 0, $(args...), 0, prod(siz[dim+1:nd]), siz[dim], prod(siz[1:dim-1]))
-                end
-            else
-                $(ker0!)(dst, 1, $(args...), 1, prod(siz))
-            end
-            dst
-        end
-
-        function $(fname!)(dst::Array, $(params...), region)
-            if length(region) == 1
-                $(fname!)(dst, $(args...), region[1])
-            else
-                isrd1, secs = rcompress_dims($(sizexpr), region)
-                if isrd1
-                    $(fa!)(true, dst, 0, $(args...), 0, secs[end:-1:1]...)
-                else
-                    $(fb!)(true, dst, 0, $(args...), 0, secs[end:-1:1]...)
-                end                
-            end
-            dst
-        end
-    
-        # $(fa!)
-        global $(fa!)
-        function $(fa!)(isinit::Bool, dst::Array, od::Int, $(params...), oa::Int, n1::Int)
-            if isinit
-                dst[od+1] = $(sker)($(args...), oa+1, oa+n1)
-            else
-                dst[od+1] = $(comb)(dst[od+1], $(sker)($(args...), oa+1, oa+n1))
-            end
-        end
-
-        function $(fa!)(isinit::Bool, dst::Array, od::Int, $(params...), oa::Int, n1::Int, n2::Int)
-            if isinit
-                for j = 1:n1
-                    alast = oa + n2
-                    dst[od+j] = $(sker)($(args...), oa+1, alast)
-                    oa = alast
-                end
-            else
-                for j = 1:n1
-                    alast = oa + n2
-                    dst[od+j] = $(comb)(dst[od+j], $(sker)($(args...), oa+1, alast))
-                    oa = alast
-                end
-            end
-        end
-
-        function $(fa!)(isinit::Bool, dst::Array, od::Int, $(params...), oa::Int, n1::Int, n2::Int, n3::Int, ns::Int...)
-            as::Int = *(n2, n3, ns...)
-            if length(ns) & 1 == 0
-                $(fa!)(isinit, dst, od, $(args...), oa, n2, n3, ns...)
-                oa += as    
-    
-                for j = 2:n1
-                    $(fa!)(false, dst, od, $(args...), oa, n2, n3, ns...)
-                    oa += as
-                end                                 
-            else 
-                ds::Int = *(n3, ns[2:2:end]...)
-                for j = 1:n1
-                    $(fa!)(isinit, dst, od, $(args...), oa, n2, n3, ns...)
-                    od += ds
-                    oa += as
-                end   
-            end
-        end
-
-        # $(fb!)
-        global $(fb!)
-        function $(fb!)(isinit::Bool, dst::Array, od::Int, $(params...), oa::Int, n1::Int)
-            if isinit
-                $(ker0!)(dst, od+1, $(args...), oa+1, n1)
-            else
-                $(ker1!)(dst, od+1, $(args...), oa+1, n1)
-            end
-        end
-
-        function $(fb!)(isinit::Bool, dst::Array, od::Int, $(params...), oa::Int, n1::Int, n2::Int)
-            if isinit
-                $(ker0!)(dst, od+1, $(args...), oa+1, n2)
-            else
-                $(ker1!)(dst, od+1, $(args...), oa+1, n2)        
-            end
-            oa += n2
-            for j = 2:n1
-                $(ker1!)(dst, od+1, $(args...), oa+1, n2)
-                oa += n2
-            end
-        end
-
-        function $(fb!)(isinit::Bool, dst::Array, od::Int, $(params...), oa::Int, n1::Int, n2::Int, n3::Int, ns::Int...)
-            as = *(n2, n3, ns...)
-            if length(ns) & 1 == 0
-                ds::Int = *(n3, ns[2:2:end]...)
-                for j = 1:n1
-                    $(fb!)(isinit, dst, od, $(args...), oa, n2, n3, ns...)
-                    od += ds
-                    oa += as
-                end
-            else
-                $(fb!)(isinit, dst, od, $(args...), oa, n2, n3, ns...)
-                oa += as
-    
-                for j = 2:n1
-                    $(fb!)(false, dst, od, $(args...), oa, n2, n3, ns...)
-                    oa += as
+        (isempty(R) || isempty(A)) && return R
+        for i = 1:$N
+            (size(R, i) == size(A, i) || size(R, i) == 1) || throw(DimensionMismatch("Reduction on array of size $(size(A)) with output of size $(size(R))"))
+        end
+        @nextract $N sizeR d->size(R,d)
+        if size(R, 1) < size(A, 1)
+            @nloops $N i d->(d>1? (1:size(A,d)) : (1:1)) d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+                @inbounds tmp = (@nref $N R j)
+                for i_1 = 1:size(A,1)
+                    @inbounds tmp = ($F)(tmp, (@nref $N A i))
                 end
+                @inbounds (@nref $N R j) = tmp
             end
-        end
-    end
-end
-
-function generate_reducedim_funcs(fname, comb, sker, ker0!, ker1!)
-    # specialized method to generate functions with single input arguments
-    generate_reducedim_funcs(fname, [:(a::Array)], [:a], :(size(a)), comb, sker, ker0!, ker1!)
-end
-
-macro code_reducedim(fname, comb, sker, ker0, ker1)
-    esc(generate_reducedim_funcs(fname, comb, sker, ker0, ker1))
-end
-
-
-# sum
-
-function vcopy!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
-    for i = 1:n
-        @inbounds dst[od] = a[oa]
-        od += 1
-        oa += 1
-    end    
-end
-
-vcopy!{T}(dst::Array{T}, od::Int, a::Array{T}, oa::Int, n::Int) = copy!(dst, od, a, oa, n)
-
-function vadd!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
-    for i = 1:n
-        @inbounds dst[od] += a[oa]
-        od += 1
-        oa += 1
-    end
-end
-
-@code_reducedim sum (+) sum_seq vcopy! vadd!
-
-function sum{R}(rt::Type{R}, a::Array, region)
-    dst = Array(R, reduced_dims(a, region))
-    if !isempty(dst)
-        if isempty(a)
-            fill!(dst, zero(R))
         else
-            sum!(dst, a, region)
-        end
-    end
-    return dst
-end
-
-sum{T}(a::Array{T}, region) = sum(T, a, region)
-sum(a::Array{Bool}, region) = sum(Int, a, region)
-
-# prod
-
-function vmultiply!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
-    for i = 1:n
-        @inbounds dst[od] *= a[oa]
-        od += 1
-        oa += 1
-    end
-end
-
-@code_reducedim prod (*) prod_rgn vcopy! vmultiply!
-
-function prod{R}(rt::Type{R}, a::Array, region)
-    dst = Array(R, reduced_dims(a, region))
-    if !isempty(dst)
-        if isempty(a)
-            fill!(dst, one(R))
-        else
-            prod!(dst, a, region)
+            @nloops $N i A d->(j_d = sizeR_d==1 ? 1 : i_d) begin
+                @inbounds (@nref $N R j) = ($F)((@nref $N R j), (@nref $N A i))
+            end
         end
+        R
     end
-    return dst
 end
 
-prod{T}(a::Array{T}, region) = prod(T, a, region)
+reduction_init{T}(A::AbstractArray, region, initial::T) = fill!(similar(A,T,reduced_dims(A,region)), initial)
 
 
-# maximum & minimum
-
-function vmax!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
-    for i = 1:n
-        @inbounds dst[od] = scalarmax(dst[od], a[oa])
-        od += 1
-        oa += 1
-    end
-end
-
-function vmin!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
-    for i = 1:n
-        @inbounds dst[od] = scalarmin(dst[od], a[oa])
-        od += 1
-        oa += 1
-    end
-end
-
-@code_reducedim maximum scalarmax maximum_rgn vcopy! vmax!
-@code_reducedim minimum scalarmin minimum_rgn vcopy! vmin!
-
-function maximum{T}(a::Array{T}, region)
-    dst = Array(T, reduced_dims0(a, region))
-    if !isempty(dst)
-        maximum!(dst, a, region)
-    end
-    return dst
-end
-
-function minimum{T}(a::Array{T}, region)
-    dst = Array(T, reduced_dims0(a, region))
-    if !isempty(dst)
-        minimum!(dst, a, region)
-    end
-    return dst
-end
+### Pre-generated cases
+# For performance, these bypass reducedim_cache
 
+all(A::AbstractArray{Bool}, region) = all!(reduction_init(A,region,true), A)
+eval(ngenerate(:N, :(all!{N}(R::AbstractArray, A::AbstractArray{Bool,N})), N->gen_reduction_body(N, &)))
+any(A::AbstractArray{Bool}, region) = any!(reduction_init(A,region,false), A)
+eval(ngenerate(:N, :(any!{N}(R::AbstractArray, A::AbstractArray{Bool,N})), N->gen_reduction_body(N, |)))
+maximum{T}(A::AbstractArray{T}, region) =
+    isempty(A) ? similar(A,reduced_dims0(A,region)) : maximum!(reduction_init(A,region,typemin(T)), A)
+eval(ngenerate(:N, :(maximum!{T,N}(R::AbstractArray, A::AbstractArray{T,N})), N->gen_reduction_body(N, scalarmax)))
+minimum{T}(A::AbstractArray{T}, region) =
+    isempty(A) ? similar(A,reduced_dims0(A,region)) : minimum!(reduction_init(A,region,typemax(T)), A)
+eval(ngenerate(:N, :(minimum!{T,N}(R::AbstractArray, A::AbstractArray{T,N})), N->gen_reduction_body(N, scalarmin)))
+sum{T}(A::AbstractArray{T}, region) = sum!(reduction_init(A,region,zero(T)), A)
+sum(A::AbstractArray{Bool}, region) = sum!(reduction_init(A,region,0), A)
+eval(ngenerate(:N, :(sum!{T,N}(R::AbstractArray, A::AbstractArray{T,N})), N->gen_reduction_body(N, +)))
+prod{T}(A::AbstractArray{T}, region) = prod!(reduction_init(A,region,one(T)), A)
+eval(ngenerate(:N, :(prod!{T,N}(R::AbstractArray, A::AbstractArray{T,N})), N->gen_reduction_body(N, *)))
+
+prod(A::AbstractArray{Bool}, region) = error("use all() instead of prod() for boolean arrays")
diff --git a/base/subarray.jl b/base/subarray.jl
index 6f187999db200..fdc38be506c1e 100644
--- a/base/subarray.jl
+++ b/base/subarray.jl
@@ -220,56 +220,6 @@ getindex(s::SubArray, i0::Real, i1::Real, i2::Real, i3::Real, i4::Real, i5::Real
 
 getindex(s::SubArray, i::Integer) = s[ind2sub(size(s), i)...]
 
-function getindex{T}(s::SubArray{T,2}, ind::Integer)
-    @inbounds strd2 = s.dims[1]
-    ind -= 1
-    i2 = div(ind,strd2)
-    i1 = ind-i2*strd2
-    s.parent[s.first_index + i1*s.strides[1] + i2*s.strides[2]]
-end
-
-function getindex{T}(s::SubArray{T,3}, ind::Integer)
-    @inbounds strd2 = s.dims[1]
-    @inbounds strd3 = strd2*s.dims[2]
-    ind -= 1
-    i3 = div(ind,strd3)
-    ind -= i3*strd3
-    i2 = div(ind,strd2)
-    i1 = ind-i2*strd2
-    s.parent[s.first_index + i1*s.strides[1] + i2*s.strides[2] + i3*s.strides[3]]
-end
-
-function getindex{T}(s::SubArray{T,4}, ind::Integer)
-    @inbounds strd2 = s.dims[1]
-    @inbounds strd3 = strd2*s.dims[2]
-    @inbounds strd4 = strd3*s.dims[3]
-    ind -= 1
-    i4 = div(ind,strd4)
-    ind -= i4*strd4
-    i3 = div(ind,strd3)
-    ind -= i3*strd3
-    i2 = div(ind,strd2)
-    i1 = ind-i2*strd2
-    s.parent[s.first_index + i1*s.strides[1] + i2*s.strides[2] + i3*s.strides[3] + i4*s.strides[4]]
-end
-
-function getindex{T}(s::SubArray{T,5}, ind::Integer)
-    @inbounds strd2 = s.dims[1]
-    @inbounds strd3 = strd2*s.dims[2]
-    @inbounds strd4 = strd3*s.dims[3]
-    @inbounds strd5 = strd4*s.dims[4]
-    ind -= 1
-    i5 = div(ind,strd5)
-    ind -= i5*strd5
-    i4 = div(ind,strd4)
-    ind -= i4*strd4
-    i3 = div(ind,strd3)
-    ind -= i3*strd3
-    i2 = div(ind,strd2)
-    i1 = ind-i2*strd2
-    s.parent[s.first_index + i1*s.strides[1] + i2*s.strides[2] + i3*s.strides[3] + i4*s.strides[4] + i5*s.strides[5]]
-end
-
 function getindex(s::SubArray, is::Integer...)
     index = s.first_index
     for i = 1:length(is)
@@ -396,60 +346,6 @@ end
 
 setindex!(s::SubArray, v, i::Integer) = setindex!(s, v, ind2sub(size(s), i)...)
 
-function setindex!{T}(s::SubArray{T,2}, v, ind::Integer)
-    @inbounds strd2 = s.dims[1]
-    ind -= 1
-    i2 = div(ind,strd2)
-    i1 = ind-i2*strd2
-    s.parent[s.first_index + i1*s.strides[1] + i2*s.strides[2]] = v
-    s
-end
-
-function setindex!{T}(s::SubArray{T,3}, v, ind::Integer)
-    @inbounds strd2 = s.dims[1]
-    @inbounds strd3 = strd2*s.dims[2]
-    ind -= 1
-    i3 = div(ind,strd3)
-    ind -= i3*strd3
-    i2 = div(ind,strd2)
-    i1 = ind-i2*strd2
-    s.parent[s.first_index + i1*s.strides[1] + i2*s.strides[2] + i3*s.strides[3]] = v
-    s
-end
-
-function setindex!{T}(s::SubArray{T,4}, v, ind::Integer)
-    @inbounds strd2 = s.dims[1]
-    @inbounds strd3 = strd2*s.dims[2]
-    @inbounds strd4 = strd3*s.dims[3]
-    ind -= 1
-    i4 = div(ind,strd4)
-    ind -= i4*strd4
-    i3 = div(ind,strd3)
-    ind -= i3*strd3
-    i2 = div(ind,strd2)
-    i1 = ind-i2*strd2
-    s.parent[s.first_index + i1*s.strides[1] + i2*s.strides[2] + i3*s.strides[3] + i4*s.strides[4]] = v
-    s
-end
-
-function setindex!{T}(s::SubArray{T,5}, v, ind::Integer)
-    @inbounds strd2 = s.dims[1]
-    @inbounds strd3 = strd2*s.dims[2]
-    @inbounds strd4 = strd3*s.dims[3]
-    @inbounds strd5 = strd4*s.dims[4]
-    ind -= 1
-    i5 = div(ind,strd5)
-    ind -= i5*strd5
-    i4 = div(ind,strd4)
-    ind -= i4*strd4
-    i3 = div(ind,strd3)
-    ind -= i3*strd3
-    i2 = div(ind,strd2)
-    i1 = ind-i2*strd2
-    s.parent[s.first_index + i1*s.strides[1] + i2*s.strides[2] + i3*s.strides[3] + i4*s.strides[4] + i5*s.strides[5]] = v
-    s
-end
-
 function setindex!(s::SubArray, v, is::Integer...)
     index = s.first_index
     for i = 1:length(is)
diff --git a/base/sysimg.jl b/base/sysimg.jl
index c142a56922c27..44d3c03ac1e2f 100644
--- a/base/sysimg.jl
+++ b/base/sysimg.jl
@@ -108,6 +108,11 @@ importall .Printf
 include("file.jl")
 include("methodshow.jl")
 
+# multidimensional arrays
+include("cartesian.jl")
+using .Cartesian
+include("multidimensional.jl")
+
 # core math functions
 include("floatfuncs.jl")
 include("math.jl")
diff --git a/test/arrayops.jl b/test/arrayops.jl
index 7e709fa73229e..2e6f5ebc3f0ff 100644
--- a/test/arrayops.jl
+++ b/test/arrayops.jl
@@ -715,6 +715,12 @@ end
 
 # fill
 @test fill!(Array(Float64,1),-0.0)[1] === -0.0
+A = ones(3,3)
+S = sub(A, 2, 1:3)
+fill!(S, 2)
+S = sub(A, 1:2, 3)
+fill!(S, 3)
+@test A == [1 1 3; 2 2 3; 1 1 1]
 
 # splice!
 for idx in {1, 2, 5, 9, 10, 1:0, 2:1, 1:1, 2:2, 1:2, 2:4, 9:8, 10:9, 9:9, 10:10,
diff --git a/test/broadcast.jl b/test/broadcast.jl
index d2588da552076..13b9c27c06c3e 100644
--- a/test/broadcast.jl
+++ b/test/broadcast.jl
@@ -21,8 +21,6 @@ for arr in (identity, as_sub)
     @test broadcast(+, arr([1  0]), arr([1, 4])) == [2 1; 5 4]
     @test broadcast(+, arr([1, 0]), arr([1  4])) == [2 5; 1 4]
     @test broadcast(+, arr([1, 0]), arr([1, 4])) == [2, 4]
-    @test broadcast(+) == 0
-    @test broadcast(*) == 1
 
     @test arr(eye(2)) .+ arr([1, 4]) == arr([2 1; 4 5])
     @test arr(eye(2)) .+ arr([1  4]) == arr([2 4; 1 5])
@@ -44,8 +42,19 @@ for arr in (identity, as_sub)
 
     M = arr([11 12; 21 22])
     @test broadcast_getindex(M, eye(Int, 2)+1,arr([1, 2])) == [21 11; 12 22]
+    @test_throws broadcast_getindex(M, eye(Int, 2)+1,arr([1, -1]))
+    @test_throws broadcast_getindex(M, eye(Int, 2)+1,arr([1, 2]), [2])
+    @test broadcast_getindex(M, eye(Int, 2)+1,arr([2, 1]), [1]) == [22 12; 11 21]
 
     A = arr(zeros(2,2))
     broadcast_setindex!(A, arr([21 11; 12 22]), eye(Int, 2)+1,arr([1, 2]))
     @test A == M
+    broadcast_setindex!(A, 5, [1,2], [2 2])
+    @test A == [11 5; 21 5]
+    broadcast_setindex!(A, 7, [1,2], [1 2])
+    @test A == fill(7, 2, 2)
+    A = arr(zeros(3,3))
+    broadcast_setindex!(A, 10:12, 1:3, 1:3)
+    @test A == diagm(10:12)
+    @test_throws broadcast_setindex!(A, 7, [1,-1], [1 2])
 end
diff --git a/test/reducedim.jl b/test/reducedim.jl
index b66d227d10ab6..7b54868ce2d29 100644
--- a/test/reducedim.jl
+++ b/test/reducedim.jl
@@ -1,45 +1,17 @@
-
-# Supporting routines
-
-@test Base.rcompress_dims((3, 4), 1) == (true,  [3, 4])
-@test Base.rcompress_dims((3, 4), 2) == (false, [3, 4])
-@test Base.rcompress_dims((3, 4), 3) == (false, [12])
-@test Base.rcompress_dims((3, 4), (1, 2)) == (true, [12])
-
-@test Base.rcompress_dims((3, 4, 5), 1) == (true,  [3, 20])
-@test Base.rcompress_dims((3, 4, 5), 2) == (false, [3, 4, 5])
-@test Base.rcompress_dims((3, 4, 5), 3) == (false, [12, 5])
-@test Base.rcompress_dims((3, 4, 5), (1, 2)) == (true,  [12, 5])
-@test Base.rcompress_dims((3, 4, 5), (1, 3)) == (true,  [3, 4, 5])
-@test Base.rcompress_dims((3, 4, 5), (2, 3)) == (false, [3, 20])
-@test Base.rcompress_dims((3, 4, 5), (1, 2, 3)) == (true, [60])
-
-@test Base.rcompress_dims((3, 4, 5, 2), 1) == (true,  [3, 40])
-@test Base.rcompress_dims((3, 4, 5, 2), 2) == (false, [3, 4, 10])
-@test Base.rcompress_dims((3, 4, 5, 2), 3) == (false, [12, 5, 2])
-@test Base.rcompress_dims((3, 4, 5, 2), 4) == (false, [60, 2])
-@test Base.rcompress_dims((3, 4, 5, 2), (1, 2)) == (true,  [12, 10])
-@test Base.rcompress_dims((3, 4, 5, 2), (1, 3)) == (true,  [3, 4, 5, 2])
-@test Base.rcompress_dims((3, 4, 5, 2), (1, 4)) == (true,  [3, 20, 2])
-@test Base.rcompress_dims((3, 4, 5, 2), (2, 3)) == (false, [3, 20, 2])
-@test Base.rcompress_dims((3, 4, 5, 2), (2, 4)) == (false, [3, 4, 5, 2])
-@test Base.rcompress_dims((3, 4, 5, 2), (3, 4)) == (false, [12, 10])
-@test Base.rcompress_dims((3, 4, 5, 2), (1, 2, 3)) == (true,  [60, 2])
-@test Base.rcompress_dims((3, 4, 5, 2), (1, 2, 4)) == (true,  [12, 5, 2])
-@test Base.rcompress_dims((3, 4, 5, 2), (1, 3, 4)) == (true,  [3, 4, 10])
-@test Base.rcompress_dims((3, 4, 5, 2), (2, 3, 4)) == (false, [3, 40])
-@test Base.rcompress_dims((3, 4, 5, 2), (1, 2, 3, 4)) == (true, [120])
-
 # main tests
 
-safe_sum{T}(A::Array{T}, region) = reducedim(+, A, region, zero(T))
-safe_prod{T}(A::Array{T}, region) = reducedim(*, A, region, one(T))
-safe_maximum{T}(A::Array{T}, region) = reducedim(Base.scalarmax, A, region, typemin(T))
-safe_minimum{T}(A::Array{T}, region) = reducedim(Base.scalarmin, A, region, typemax(T))
+function safe_mapslices(op, A, region)
+    newregion = intersect(region, 1:ndims(A))
+    return isempty(newregion) ? A : mapslices(op, A, newregion)
+end
+safe_sum{T}(A::Array{T}, region) = safe_mapslices(sum, A, region)
+safe_prod{T}(A::Array{T}, region) = safe_mapslices(prod, A, region)
+safe_maximum{T}(A::Array{T}, region) = safe_mapslices(maximum, A, region)
+safe_minimum{T}(A::Array{T}, region) = safe_mapslices(minimum, A, region)
 
 Areduc = rand(3, 4, 5, 6)
 for region in {
-	1, 2, 3, 4, 5, (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4), 
+        1, 2, 3, 4, 5, (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4),
 	(1, 2, 3), (1, 3, 4), (2, 3, 4), (1, 2, 3, 4)}
 
 	@test_approx_eq sum(Areduc, region) safe_sum(Areduc, region)
@@ -48,3 +20,8 @@ for region in {
 	@test_approx_eq minimum(Areduc, region) safe_minimum(Areduc, region)
 end
 
+@test reducedim((a,b) -> a|b, [true false; false false], 1, false) == [true false]
+R = reducedim((a,b) -> a+b, [1 2; 3 4], 2, 0.0)
+@test eltype(R) == Float64
+@test_approx_eq R [3,7]
+@test reducedim((a,b) -> a+b, [1 2; 3 4], 1, 0) == [4 6]