Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 28 additions & 7 deletions src/counts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,10 @@ raw counts.
- `:dict`: use `Dict`-based method which is generally slower but uses less
RAM and is safe for any data type.
"""
function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T
function addcounts!(cm::Dict, x; alg = :auto)
    # Look up the element type once here so that `_addcounts!` can dispatch on
    # it explicitly; `x` may be any iterable, not only an `AbstractArray`.
    return _addcounts!(eltype(x), cm, x; alg = alg)
end

# manual dispatch for `x` being iterator
function _addcounts!(::Type{T}, cm::Dict, x; alg = :auto) where T
# if it's safe to be sorted using radixsort then it should be faster
# albeit using more RAM
if radixsort_safe(T) && (alg == :auto || alg == :radixsort)
Expand All @@ -269,7 +272,7 @@ function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T
end

"""Dict-based addcounts method"""
function addcounts_dict!(cm::Dict{T}, x::AbstractArray{T}) where T
function addcounts_dict!(cm::Dict{T}, x) where T
for v in x
index = ht_keyindex2!(cm, v)
if index > 0
Expand All @@ -286,14 +289,27 @@ end
# faster results and less memory usage. However we still wish to enable others
# to write generic algorithms, therefore the methods below still accept the
# `alg` argument but it is ignored.
function addcounts!(cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored)
"""
    _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored)

Add the number of `true` and `false` entries of the `Bool` array `x` to the
running totals stored in `cm`, and return `cm`. Both keys are always written,
even when their count is zero. The `alg` keyword is accepted but not used.
"""
function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored)
    # Summing a Bool array yields the number of `true` entries.
    ntrue = sum(x)
    cm[true] = get(cm, true, 0) + ntrue
    # Every entry that is not `true` is `false`.
    cm[false] = get(cm, false, 0) + length(x) - ntrue
    return cm
end

function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16}
# specialized for `Bool` iterators
"""
    _addcounts!(::Type{Bool}, cm::Dict{Bool}, x; alg = :ignored)

Add the number of `true` and `false` values produced by the iterable `x` to the
running totals stored in `cm`, and return `cm`. `x` is traversed exactly once
and is never indexed or asked for its length. Both keys are always written,
even when their count is zero. The `alg` keyword is accepted but not used.
"""
function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x; alg = :ignored)
    ntrue = 0
    total = 0
    # One pass gathers both tallies, since `x` need not support `length`.
    for flag in x
        total += 1
        ntrue += flag
    end
    cm[true] = get(cm, true, 0) + ntrue
    cm[false] = get(cm, false, 0) + total - ntrue
    return cm
end

function _addcounts!(::Type{T}, cm::Dict{T}, x; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16}
counts = zeros(Int, 2^(8sizeof(T)))

@inbounds for xi in x
Expand All @@ -318,8 +334,7 @@ const BaseRadixSortSafeTypes = Union{Int8, Int16, Int32, Int64, Int128,
Float32, Float64}

"Can the type be safely sorted by radixsort"
radixsort_safe(::Type{T}) where {T<:BaseRadixSortSafeTypes} = true
radixsort_safe(::Type) = false
function radixsort_safe(::Type{T}) where {T}
    # `true` exactly when `T` is a subtype of the `BaseRadixSortSafeTypes` union.
    return T <: BaseRadixSortSafeTypes
end

function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T
last_sx = sx[1]
Expand Down Expand Up @@ -353,6 +368,12 @@ function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T
return _addcounts_radix_sort_loop!(cm, sx)
end

# fall-back for `x` an iterator
"""
    addcounts_radixsort!(cm::Dict{T}, x) where T

Radix-sort based counting for a general iterable `x`: the elements are
collected into an array, sorted with `RadixSort`, and the sorted runs are
tallied into `cm`. Returns `cm`.

If `x` yields no elements, `cm` is returned unchanged.
"""
function addcounts_radixsort!(cm::Dict{T}, x) where T
    # Materialize the iterator; sorting needs a mutable, indexable buffer.
    sx = collect(x)
    # The counting loop starts by reading `sx[1]`, which would throw a
    # `BoundsError` for an empty (or fully filtered) input. With nothing to
    # count, `cm` is already the correct result.
    isempty(sx) && return cm
    sort!(sx, alg = RadixSort)
    return _addcounts_radix_sort_loop!(cm, sx)
end

function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real}
n = length(x)
length(wv) == n || throw(DimensionMismatch())
Expand Down Expand Up @@ -386,7 +407,7 @@ of occurrences.
- `:dict`: use `Dict`-based method which is generally slower but uses less
RAM and is safe for any data type.
"""
countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), x; alg = alg)
function countmap(x; alg = :auto)
    # Keys use the iterable's element type; occurrence counts are `Int`.
    counts = Dict{eltype(x),Int}()
    return addcounts!(counts, x; alg = alg)
end
function countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real}
    # Weighted totals are stored in the weights' element type `W`.
    weighted = Dict{T,W}()
    return addcounts!(weighted, x, wv)
end


Expand Down
17 changes: 14 additions & 3 deletions test/counts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,24 @@ StatsBase.addcounts_radixsort!(cm,xx2)

# testing the Dict-based addcounts
# Two empty accumulators: `cm` receives `xx` directly, `cm_itr` receives the
# same data wrapped in `skipmissing`, i.e. as a non-array iterator.
# NOTE(review): `xx` is defined outside this excerpt; the expected totals below
# presumably reflect its contents — confirm against the fixture.
cm = Dict{Int, Int}()
cm_itr = Dict{Int, Int}()
StatsBase.addcounts_dict!(cm,xx)
@test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000)
StatsBase.addcounts_dict!(cm_itr,skipmissing(xx))
# The iterator path must give exactly the same counts as the array path.
@test cm_itr == cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000)

# test countmap for general iterators
# Weighted countmap: each key's value is checked against a non-integer total.
# NOTE(review): `x` and `w` are defined outside this excerpt; the expected
# totals presumably follow from those fixtures — confirm.
cm = countmap(x, weights(w))
@test cm["a"] == 5.5
@test cm["b"] == 4.5
@test cm["c"] == 3.5

# countmap on a non-array iterator: `skipmissing` drops the leading `missing`,
# leaving "b", "a", "a", "b", "c" to be counted.
xx_missing = skipmissing([missing, "b", "a", "a", "b", "c"])
cm_missing = countmap(xx_missing)

# Expected occurrences of each remaining string.
@test cm_missing["a"] == 2
@test cm_missing["b"] == 2
@test cm_missing["c"] == 1

@test cm == countmap(x, w)

pm = proportionmap(x, weights(w))
Expand All @@ -119,11 +129,12 @@ pm = proportionmap(x, weights(w))

# testing small bits type
# Bool input: three `true` and two `false` values.
bx = [true, false, true, true, false]
@test countmap(bx) == Dict(true => 3, false => 2)
# The same data wrapped in `skipmissing` (an iterator) must agree with the array result.
@test countmap(skipmissing(bx)) == countmap(bx) == Dict(true => 3, false => 2)

# Small integer types: the sample includes each type's extreme values
# (`typemin(T)`, `typemax(T)`) plus a repeated value (8 appears twice).
for T in [UInt8, UInt16, Int8, Int16]
tx = T[typemin(T), 8, typemax(T), 19, 8]
@test countmap(tx) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1)
# Same values behind `skipmissing`, so countmap sees an iterator instead of an array.
tx_missing = skipmissing(T[typemin(T), 8, typemax(T), 19, 8])
# Array and iterator inputs must give identical counts.
@test countmap(tx) == countmap(tx_missing) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1)
end

@testset "views" begin
Expand Down