Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions src/counts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -255,10 +255,10 @@ raw counts.
- `:dict`: use `Dict`-based method which is generally slower but uses less
RAM and is safe for any data type.
"""
function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T
function addcounts!(cm::Dict, x; alg = :auto)
# if it's safe to be sorted using radixsort then it should be faster
# albeit using more RAM
if radixsort_safe(T) && (alg == :auto || alg == :radixsort)
if radixsort_safe(eltype(x)) && (alg == :auto || alg == :radixsort)
addcounts_radixsort!(cm, x)
elseif alg == :radixsort
throw(ArgumentError("`alg = :radixsort` is chosen but type `radixsort_safe($T)` did not return `true`; use `alg = :auto` or `alg = :dict` instead"))
Expand All @@ -269,7 +269,7 @@ function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T
end

"""Dict-based addcounts method"""
function addcounts_dict!(cm::Dict{T}, x::AbstractArray{T}) where T
function addcounts_dict!(cm::Dict{T}, x) where T
for v in x
index = ht_keyindex2!(cm, v)
if index > 0
Expand Down Expand Up @@ -318,8 +318,7 @@ const BaseRadixSortSafeTypes = Union{Int8, Int16, Int32, Int64, Int128,
Float32, Float64}

"Can the type be safely sorted by radixsort"
function radixsort_safe(::Type{T}) where T
    # One method covers every type: the subtype test is `true` exactly for
    # members of `BaseRadixSortSafeTypes` and `false` for everything else.
    # Keeping a single definition avoids a second method with an equivalent
    # `::Type` signature overwriting an earlier one.
    return T <: BaseRadixSortSafeTypes
end

function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T
last_sx = sx[1]
Expand Down Expand Up @@ -353,6 +352,13 @@ function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T
return _addcounts_radix_sort_loop!(cm, sx)
end

# Fall-back for `x` a general iterator rather than an `AbstractArray{T}`.
# The elements are materialised with `collect`, sorted with `RadixSort`, and
# handed to `_addcounts_radix_sort_loop!`, which adds the counts to `cm`.
function addcounts_radixsort!(cm::Dict{T}, x) where T
    xs = collect(x)
    # The counting loop seeds itself from the first sorted element, so an
    # iterator that yields nothing must stop here: there is nothing to add.
    isempty(xs) && return cm
    sx = sort!(xs, alg = RadixSort)
    return _addcounts_radix_sort_loop!(cm, sx)
end


function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real}
n = length(x)
length(wv) == n || throw(DimensionMismatch())
Expand Down Expand Up @@ -388,7 +394,15 @@ of occurrences.
"""
function countmap(x::AbstractArray{T}; alg = :auto) where {T}
    # Start from an empty integer tally keyed by the element type of `x`,
    # then let `addcounts!` fill it using the requested algorithm.
    tally = Dict{T,Int}()
    return addcounts!(tally, x; alg = alg)
end
function countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real}
    # Weighted variant: the tally's value type is the weight element type `W`.
    tally = Dict{T,W}()
    return addcounts!(tally, x, wv)
end

# Fall-back for `x` a general iterator (for example a `skipmissing` wrapper).
# Accepts the same `alg` keyword as the `AbstractArray` method and forwards it
# to `addcounts!`; the default `:auto` keeps existing `countmap(x)` calls
# behaving as before.
function countmap(x; alg = :auto)
    T = eltype(x)
    cm = Dict{T,Int}()
    if T <: Union{Bool, UInt8, UInt16, Int8, Int16}
        # Materialise the iterator so the call can reach the faster
        # `addcounts!` specialisation for these small element types.
        return addcounts!(cm, collect(x); alg = alg)
    end
    return addcounts!(cm, x; alg = alg)
end

"""
proportionmap(x)
Expand Down
12 changes: 11 additions & 1 deletion test/counts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,24 @@ StatsBase.addcounts_radixsort!(cm,xx2)

# testing the Dict-based addcounts, called once on `xx` directly and once on
# `skipmissing(xx)`, which is a lazy iterator rather than an array
# (`xx` is not defined in this excerpt; presumably set up earlier in the file)
cm = Dict{Int, Int}()
cm_itr = Dict{Int, Int}()
# direct input
StatsBase.addcounts_dict!(cm,xx)
@test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000)
# iterator input: must produce exactly the same counts as the direct call
StatsBase.addcounts_dict!(cm_itr,skipmissing(xx))
@test cm_itr == cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000)

# test weighted countmap
# (`x` and `w` are not defined in this excerpt; presumably set up earlier)
cm = countmap(x, weights(w))
@test cm["a"] == 5.5
@test cm["b"] == 4.5
@test cm["c"] == 3.5

# test countmap for general iterators: `skipmissing` drops the `missing`
# entry, so only the five strings are counted
xx_missing = skipmissing([missing, "b", "a", "a", "b", "c"])
cm_missing = countmap(xx_missing)

@test cm_missing["a"] == 2
@test cm_missing["b"] == 2
@test cm_missing["c"] == 1

# passing `w` directly must give the same result as passing `weights(w)`
@test cm == countmap(x, w)

pm = proportionmap(x, weights(w))
Expand Down