Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changed recode to accept more general collection types #290

Merged
merged 8 commits into from
Apr 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 20 additions & 9 deletions src/recode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,19 @@ recode!(dest::CategoricalArray, src::AbstractArray, pairs::Pair...) =
recode!(dest::CategoricalArray, src::CategoricalArray, pairs::Pair...) =
recode!(dest, src, nothing, pairs...)

"""
    recode_in(x, collection)

Return `true` if `x` is a member of `collection`, and `false` otherwise.

Membership follows `isequal` semantics:

- the fallback method checks whether any element of `collection` is `isequal` to `x`;
- for a `Set`, `in` is used, as it is faster than the fallback and equivalent to it;
- when `collection` is `missing`, the result is always `false`.

A user-defined type can add its own method to provide a suitable membership test.
"""
@inline function recode_in(x, ::Missing)
    # A lone `missing` is not a collection, so nothing can be a member of it.
    return false
end
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
# `Set` lookup: delegate to `in` rather than scanning every element.
@inline function recode_in(x, collection::Set)
    return x in collection
end

# Generic fallback: `x` is a member if it is `isequal` to any element of `collection`.
@inline function recode_in(x, collection)
    return any(elem -> isequal(x, elem), collection)
end

function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs::Pair...) where {T}
if length(dest) != length(src)
throw(DimensionMismatch("dest and src must be of the same length (got $(length(dest)) and $(length(src)))"))
Expand All @@ -46,8 +59,8 @@ function recode!(dest::AbstractArray{T}, src::AbstractArray, default::Any, pairs

for j in 1:length(pairs)
p = pairs[j]
if ((isa(p.first, Union{AbstractArray, Tuple}) && any(x ≅ y for y in p.first)) ||
x ≅ p.first)
# we use isequal and recode_in because we cannot really distinguish scalars from collections
if x ≅ p.first || recode_in(x, p.first)
dest[i] = p.second
@goto nextitem
end
Expand Down Expand Up @@ -99,8 +112,8 @@ function recode!(dest::CategoricalArray{T}, src::AbstractArray, default::Any, pa

for j in 1:length(pairs)
p = pairs[j]
if ((isa(p.first, Union{AbstractArray, Tuple}) && any(x ≅ y for y in p.first)) ||
x ≅ p.first)
# we use isequal and recode_in because we cannot really distinguish scalars from collections
if x ≅ p.first || recode_in(x, p.first)
drefs[i] = dupvals ? pairmap[j] : j
@goto nextitem
end
Expand Down Expand Up @@ -166,7 +179,7 @@ function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray,

for l in srclevels
if !(any(x -> x ≅ l, firsts) ||
any(f -> isa(f, Union{AbstractArray, Tuple}) && any(l ≅ y for y in f), firsts))
any(f -> recode_in(l, f), firsts))
try
push!(keptlevels, l)
catch err
Expand Down Expand Up @@ -200,8 +213,7 @@ function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray,
# For missing values (0 if no missing in pairs' keys)
levelsmap[1] = 0
for p in pairs
if ((isa(p.first, Union{AbstractArray, Tuple}) && any(ismissing, p.first)) ||
ismissing(p.first))
if (ismissing(p.first) || any(ismissing, p.first))
levelsmap[1] = get(dest.pool, p.second)
break
end
Expand All @@ -214,8 +226,7 @@ function recode!(dest::CategoricalArray{T, N, R}, src::CategoricalArray,
@inbounds for (i, l) in enumerate(srclevels)
for j in 1:length(pairs)
p = pairs[j]
if ((isa(p.first, Union{AbstractArray, Tuple}) && any(l ≅ y for y in p.first)) ||
l ≅ p.first)
if l ≅ p.first || recode_in(l, p.first)
levelsmap[i+1] = pairmap[j]
@goto nextitem
end
Expand Down
57 changes: 57 additions & 0 deletions test/16_recode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,48 @@ end

# Infix alias for `isequal`, under which `missing` and `NaN` compare equal to themselves.
const ≅ = isequal

# Unit tests for the internal `recode_in` membership helper, one nested testset
# per kind of collection.
@testset "recode_in" begin
    @testset "collection is a string" begin
        # A string iterates over its characters, so only `Char`s can be members
        @test !CategoricalArrays.recode_in("a", "ab")
        @test CategoricalArrays.recode_in('a', "ab")
        @test !CategoricalArrays.recode_in('c', "ab")
        @test !CategoricalArrays.recode_in(missing, "b")
    end
    @testset "collection without missing" begin
        @test CategoricalArrays.recode_in(1, [1, 2])
        @test !CategoricalArrays.recode_in(1, [2, 3])
    end
    @testset "collection with missing" begin
        @test CategoricalArrays.recode_in(1, [1, 2, missing])
        @test !CategoricalArrays.recode_in(1, [2, missing])
        # `isequal` semantics: `missing` is found among the elements
        @test CategoricalArrays.recode_in(missing, [1, 2, missing])
    end
    @testset "collection is a single value" begin
        # A number iterates over itself once
        @test CategoricalArrays.recode_in(1, 1)
        # A lone `missing` is never treated as a collection
        @test !CategoricalArrays.recode_in(1, missing)
        @test !CategoricalArrays.recode_in(missing, missing)
    end
    @testset "tuple without missing" begin
        @test CategoricalArrays.recode_in(1, (1, 2))
        @test !CategoricalArrays.recode_in(1, (2, 3))
    end
    @testset "tuple with missing" begin
        @test CategoricalArrays.recode_in(1, (1, 2, missing))
        @test !CategoricalArrays.recode_in(1, (2, missing))
        @test CategoricalArrays.recode_in(missing, (1, 2, missing))
    end
    @testset "nested arrays" begin
        @test CategoricalArrays.recode_in([1,2], [[1, 2], [3, 4]])
        @test !CategoricalArrays.recode_in([1, 3], [[1, 2], [3, 4]])
    end
    @testset "NaN in array" begin
        @test CategoricalArrays.recode_in(NaN, [1, 2, NaN])
        @test !CategoricalArrays.recode_in(NaN, [1, 2, 3])
        @test CategoricalArrays.recode_in(2, [1, 2, NaN])
        @test !CategoricalArrays.recode_in(3, [1, 2, NaN])
    end
    @testset "collection is a Set" begin
        # The dedicated `Set` method must agree with the generic fallback
        @test CategoricalArrays.recode_in(1, Set([1, 2]))
        @test !CategoricalArrays.recode_in(3, Set([1, 2]))
        @test CategoricalArrays.recode_in(missing, Set([1, missing]))
        @test !CategoricalArrays.recode_in(missing, Set([1, 2]))
        @test CategoricalArrays.recode_in(NaN, Set([1.0, NaN]))
    end
    @testset "empty collection" begin
        # Nothing is a member of an empty collection
        @test !CategoricalArrays.recode_in(1, Int[])
        @test !CategoricalArrays.recode_in(1, ())
        @test !CategoricalArrays.recode_in(1, Set{Int}())
        @test !CategoricalArrays.recode_in('a', "")
    end
end

## Test recode!, used by recode

# Test both recoding into x itself and into an uninitialized vector
Expand All @@ -29,6 +71,21 @@ const ≅ = isequal
end
end

# `recode!` must accept a `Set` as the key of a pair, for every combination of
# source and destination array type (the last `y` recodes `x` in place).
@testset "Recoding from $(typeof(x)) to $(typeof(y)) using a Set as the first argument in a pair" for
    x in ([1:10;], CategoricalArray(1:10), CategoricalArray{Union{Int, Missing}}(1:10)),
    y in (similar(x), Array{Int}(undef, size(x)),
          CategoricalArray{Int}(undef, size(x)),
          CategoricalArray{Union{Int, Missing}}(undef, size(x)), x)

    expected = [100, 0, 0, 0, -1, 6, 7, 8, -1, -1]
    result = @inferred recode!(y, x, 1=>100, 2:4=>0, Set([5; 9:10])=>-1)
    # The call must hand back the destination array itself
    @test result === y
    @test y == expected
    if y isa CategoricalArray
        @test levels(y) == [6, 7, 8, 100, 0, -1]
        @test !isordered(y)
    end
end

@testset "Recoding from $(typeof(x)) to $(typeof(y)) with duplicate recoded values" for
x in ([1:10;], CategoricalArray(1:10), CategoricalArray{Union{Int, Missing}}(1:10)),
y in (similar(x), Array{Int}(undef, size(x)),
Expand Down