Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support mapreduce over dimensions with SkipMissing #28027

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 209 additions & 5 deletions base/missing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,9 @@ float(A::AbstractArray{Missing}) = A
skipmissing(itr)

Return an iterator over the elements in `itr` skipping [`missing`](@ref) values.
In addition to supporting any function taking iterators, the resulting object
implements reductions over dimensions (i.e. the `dims` argument to
[`mapreduce`](@ref), [`reduce`](@ref) and special functions like [`sum`](@ref)).

Use [`collect`](@ref) to obtain an `Array` containing the non-`missing` values in
`itr`. Note that even if `itr` is a multidimensional array, the result will always
Expand All @@ -154,9 +157,6 @@ of the input.

# Examples
```jldoctest
julia> sum(skipmissing([1, missing, 2]))
3

julia> collect(skipmissing([1, missing, 2]))
2-element Array{Int64,1}:
1
Expand All @@ -166,6 +166,31 @@ julia> collect(skipmissing([1 missing; 2 missing]))
2-element Array{Int64,1}:
1
2

julia> sum(skipmissing([1, missing, 2]))
3

julia> B = [1 missing; 3 4]
2×2 Array{Union{Missing, Int64},2}:
1 missing
3 4

julia> sum(skipmissing(B), dims=1)
1×2 Array{Int64,2}:
4 4

julia> sum(skipmissing(B), dims=2)
2×1 Array{Int64,2}:
1
7

julia> reduce(*, skipmissing(B), dims=1)
1×2 Array{Int64,2}:
3 4

julia> mapreduce(cos, +, skipmissing(B), dims=1)
1×2 Array{Float64,2}:
-0.44969 -0.653644
```
"""
skipmissing(itr) = SkipMissing(itr)
Expand All @@ -192,8 +217,8 @@ end
# Optimized mapreduce implementation
# The generic method is faster when !(eltype(A) >: Missing) since it does not need
# additional loops to identify the two first non-missing values of each block
mapreduce(f, op, itr::SkipMissing{<:AbstractArray}) =
_mapreduce(f, op, IndexStyle(itr.x), eltype(itr.x) >: Missing ? itr : itr.x)
mapreduce(f, op, itr::SkipMissing{<:AbstractArray}; dims=:, kw...) =
_mapreduce_dim(f, op, kw.data, eltype(itr.x) >: Missing ? itr : itr.x, dims)

function _mapreduce(f, op, ::IndexLinear, itr::SkipMissing{<:AbstractArray})
A = itr.x
Expand Down Expand Up @@ -280,6 +305,185 @@ mapreduce_impl(f, op, A::SkipMissing, ifirst::Integer, ilast::Integer) =
end
end

# mapreduce over dimensions implementation

_mapreduce_dim(f, op, nt::NamedTuple{(:init,)}, itr::SkipMissing{<:AbstractArray}, ::Colon) =
mapfoldl(f, op, itr; nt...)

_mapreduce_dim(f, op, ::NamedTuple{()}, itr::SkipMissing{<:AbstractArray}, ::Colon) =
_mapreduce(f, op, IndexStyle(itr.x), itr)

_mapreduce_dim(f, op, nt::NamedTuple{(:init,)}, itr::SkipMissing{<:AbstractArray}, dims) =
mapreducedim!(f, op, reducedim_initarray(itr, dims, nt.init), itr)

_mapreduce_dim(f, op, ::NamedTuple{()}, itr::SkipMissing{<:AbstractArray}, dims) =
mapreducedim!(f, op, reducedim_init(f, op, itr, dims), itr)

reducedim_initarray(itr::SkipMissing{<:AbstractArray}, region, init, ::Type{R}) where {R} =
reducedim_initarray(itr.x, region, init, R)
reducedim_initarray(itr::SkipMissing{<:AbstractArray}, region, init::T) where {T} =
reducedim_initarray(itr.x, region, init, T)

reducedim_init(f, op::Union{typeof(+),typeof(add_sum)},
itr::SkipMissing{<:AbstractArray}, region) =
_reducedim_init(f, op, zero, sum, itr, region)
reducedim_init(f, op::Union{typeof(*),typeof(mul_prod)},
itr::SkipMissing{<:AbstractArray}, region) =
_reducedim_init(f, op, one, prod, itr, region)
reducedim_init(f::Union{typeof(abs),typeof(abs2)}, op::typeof(max),
itr::SkipMissing{<:AbstractArray}, region) =
reducedim_initarray(itr, region, zero(f(zero(eltype(itr)))))
reducedim_init(f, op::typeof(&), itr::SkipMissing{<:AbstractArray}, region) =
reducedim_initarray(itr, region, true)
reducedim_init(f, op::typeof(|), itr::SkipMissing{<:AbstractArray}, region) =
reducedim_initarray(itr, region, false)

# initialization when computing minima and maxima requires a little care
for (f1, f2, initval) in ((:min, :max, :Inf), (:max, :min, :(-Inf)))
@eval function reducedim_init(f, op::typeof($f1), itr::SkipMissing{<:AbstractArray}, region)
A = itr.x
T = eltype(itr)

# First compute the reduce indices. This will throw an ArgumentError
# if any region is invalid
ri = reduced_indices(A, region)

# Next, throw if reduction is over a region with length zero
any(i -> isempty(axes(A, i)), region) && _empty_reduce_error()

# Make a view of the first slice of the region
A1 = view(A, ri...)

if isempty(A1)
# If the slice is empty just return non-view, non-missing version as the initial array
return similar(A1, eltype(itr))
else
# otherwise use the min/max of the first slice as initial value
v0 = mapreduce(f, $f2, A1)

R = similar(A1, typeof(v0))

# if any value is missing in first slice, look for first
# non-missing value in each slice
if v0 === missing
v0 = nonmissingval(f, $f2, itr, R)
R = similar(A1, typeof(v0))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove.

end

# but NaNs need to be avoided as initial values
v0 = v0 != v0 ? typeof(v0)($initval) : v0

# equivalent to reducedim_initarray, but we need R in advance
return fill!(R, v0)
end
end
end

# Iterate until we've encountered at least one non-missing value in each slice,
# and return the min/max non-missing value of all clices
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# and return the min/max non-missing value of all clices
# and return the min/max non-missing value of all slices

function nonmissingval(f, op::Union{typeof(min), typeof(max)},
itr::SkipMissing{<:AbstractArray}, R::AbstractArray)
A = itr.x
lsiz = check_reducedims(R,A)
indsAt, indsRt = safe_tail(axes(A)), safe_tail(axes(R)) # handle d=1 manually
keep, Idefault = Broadcast.shapeindexer(indsRt)
i = findfirst(!ismissing, A)
i === nothing && throw(ArgumentError("cannot reduce over array with only missing values"))
@inbounds v = A[i]
if reducedim1(R, A)
# keep track of state using a single variable when reducing along the first dimension
i1 = first(axes1(R))
@inbounds for IA in CartesianIndices(indsAt)
IR = Broadcast.newindex(IA, keep, Idefault)
filled = false
for i in axes(A, 1)
x = A[i, IA]
if x !== missing
v = op(v, f(x))
filled = true
break
end
end
if !filled
throw(ArgumentError("cannot reduce over slices with only missing values"))
end
end
else
filled = fill!(similar(R, Bool), false)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you use an external boolean array to keep the "stage" of reduction, using _InitialValue + BottomRF as now done in foldl might be simpler. However, this would need to use Array{Union{T,Missing,_InitialValue}} as the destination/state. It would be nice if there is an API to get Array{Union{T,Missing}} from Array{Union{T,Missing,_InitialValue}} by only re-writing the type tag part of the array. Is there such an API?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually we could simply use missing here to indicate a slice for which we haven't found a non-missing value yet. This is because in the end the initialized array should contain missing only if the slice contains only missing values, in which case an error is thrown.

(Note that the current code just returns a single value, which is the smallest/largest value in the whole array. This relies on the assumption that all entries in the array can be compared with <, which isn't necessarily the case. So returning the smallest/largest value for each slice would make sense, in which case the strategy I describe above should work.)

Regarding your question, I don't think such a conversion method exists, but there's a related issue: #26681.

allfilled = false
@inbounds for IA in CartesianIndices(indsAt)
IR = Broadcast.newindex(IA, keep, Idefault)
for i in axes(A, 1)
x = A[i, IA]
if x !== missing
v = op(v, f(x))
filled[i,IR] = true
end
end
(allfilled = all(filled)) && break
end
if !allfilled
throw(ArgumentError("cannot reduce over slices with only missing values"))
end
end
v
end

function _mapreducedim!(f, op, R::AbstractArray, itr::SkipMissing{<:AbstractArray})
A = itr.x
lsiz = check_reducedims(R,A)
isempty(A) && return R

if has_fast_linear_indexing(A) && lsiz > 16
# use mapreduce_impl, which is probably better tuned to achieve higher performance
nslices = div(length(A), lsiz)
ibase = first(LinearIndices(A))-1
for i = 1:nslices
x = mapreduce_impl(f, op, itr, ibase+1, ibase+lsiz)
if x !== nothing
@inbounds R[i] = op(R[i], something(x))
end
ibase += lsiz
end
return R
end
indsAt, indsRt = safe_tail(axes(A)), safe_tail(axes(R)) # handle d=1 manually
keep, Idefault = Broadcast.shapeindexer(indsRt)
if reducedim1(R, A)
# keep the accumulator as a local variable when reducing along the first dimension
i1 = first(axes1(R))
@inbounds for IA in CartesianIndices(indsAt)
IR = Broadcast.newindex(IA, keep, Idefault)
r = R[i1,IR]
@simd for i in axes(A, 1)
x = A[i, IA]
if x !== missing
r = op(r, f(x))
end
end

R[i1,IR] = r
end
else
@inbounds for IA in CartesianIndices(indsAt)
IR = Broadcast.newindex(IA, keep, Idefault)
@simd for i in axes(A, 1)
x = A[i, IA]
if x !== missing
R[i,IR] = op(R[i,IR], f(x))
end
end
end
end
return R
end

mapreducedim!(f, op, R::AbstractArray, A::SkipMissing{<:AbstractArray}) =
(_mapreducedim!(f, op, R, A); R)

reducedim!(op, R::AbstractArray{RT}, A::SkipMissing{<:AbstractArray}) where {RT} =
mapreducedim!(identity, op, R, A)

"""
coalesce(x, y...)

Expand Down
Loading