JuliaStats · nalimilan · Oct 6, 2020 · Oct 2, 2020 · Oct 2, 2020 · Oct 2, 2020
diff --git a/src/counts.jl b/src/counts.jl
@@ -255,7 +255,10 @@ raw counts.
 - `:dict`:           use `Dict`-based method which is generally slower but uses less
                      RAM and is safe for any data type.
 """
-function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T
+addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x; alg = alg)
+
+# manual dispatch on `eltype(x)`, so that `x` may be any iterator and not only an array
+function _addcounts!(::Type{T}, cm::Dict, x; alg = :auto) where T
     # if it's safe to be sorted using radixsort then it should be faster
     # albeit using more RAM
     if radixsort_safe(T) && (alg == :auto || alg == :radixsort)
@@ -269,7 +272,7 @@ function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T
 end
 
 """Dict-based addcounts method"""
-function addcounts_dict!(cm::Dict{T}, x::AbstractArray{T}) where T
+function addcounts_dict!(cm::Dict{T}, x) where T
     for v in x
         index = ht_keyindex2!(cm, v)
         if index > 0
@@ -286,14 +289,27 @@ end
 # faster results and less memory usage. However we still wish to enable others
 # to write generic algorithms, therefore the methods below still accept the 
 # `alg` argument but it is ignored.
-function addcounts!(cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored)
+function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored)
     sumx = sum(x)
     cm[true] = get(cm, true, 0) + sumx
     cm[false] = get(cm, false, 0) + length(x) - sumx
     cm
 end
 
-function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16}
+# specialized for `Bool` iterators
+function _addcounts!(::Type{Bool}, cm::Dict{Bool}, x; alg = :ignored)
+    # tally the `true`s and the total in one pass; `alg` is accepted but never read
+    ntrue, nseen = 0, 0
+    for flag in x
+        ntrue += flag
+        nseen += 1
+    end
+    cm[true] = get(cm, true, 0) + ntrue
+    cm[false] = get(cm, false, 0) + nseen - ntrue
+    return cm
+end
+
+function _addcounts!(::Type{T}, cm::Dict{T}, x; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16}
     counts = zeros(Int, 2^(8sizeof(T)))
 
     @inbounds for xi in x
@@ -318,8 +334,7 @@ const BaseRadixSortSafeTypes = Union{Int8, Int16, Int32, Int64, Int128,
                                      Float32, Float64}
 
 "Can the type be safely sorted by radixsort"
-radixsort_safe(::Type{T}) where {T<:BaseRadixSortSafeTypes} = true
-radixsort_safe(::Type) = false
+radixsort_safe(::Type{T}) where {T} = T <: BaseRadixSortSafeTypes
 
 function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T
     last_sx = sx[1]
@@ -353,6 +368,12 @@ function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T
     return _addcounts_radix_sort_loop!(cm, sx)
 end
 
+# fallback for when `x` is a general iterator: it is collected into a vector before sorting
+function addcounts_radixsort!(cm::Dict{T}, x) where T
+    sx = sort!(collect(x), alg = RadixSort)  # `collect` copies, so in-place sort is safe
+    return isempty(sx) ? cm : _addcounts_radix_sort_loop!(cm, sx)  # loop indexes sx[1]
+end
+
 function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real}
     n = length(x)
     length(wv) == n || throw(DimensionMismatch())
@@ -386,7 +407,7 @@ of occurrences.
 - `:dict`:           use `Dict`-based method which is generally slower but uses less
                      RAM and is safe for any data type.
 """
-countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), x; alg = alg)
+countmap(x; alg = :auto) = addcounts!(Dict{eltype(x), Int}(), x; alg = alg)
 countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv)
 
 

diff --git a/test/counts.jl b/test/counts.jl
@@ -102,14 +102,24 @@ StatsBase.addcounts_radixsort!(cm,xx2)
 
 # testing the Dict-based addcounts
 cm = Dict{Int, Int}()
+cm_itr = Dict{Int, Int}()
 StatsBase.addcounts_dict!(cm,xx)
-@test cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000)
+StatsBase.addcounts_dict!(cm_itr,skipmissing(xx))
+@test cm_itr == cm == Dict(1 => 200_000, 3 => 100_000, 6 => 100_000)
 
+# test weighted countmap, then countmap for general iterators
 cm = countmap(x, weights(w))
 @test cm["a"] == 5.5
 @test cm["b"] == 4.5
 @test cm["c"] == 3.5
 
+xx_missing = skipmissing([missing, "b", "a", "a", "b", "c"])
+cm_missing = countmap(xx_missing)
+
+@test cm_missing["a"] == 2
+@test cm_missing["b"] == 2
+@test cm_missing["c"] == 1
+
 @test cm == countmap(x, w)
 
 pm = proportionmap(x, weights(w))
@@ -119,11 +129,12 @@ pm = proportionmap(x, weights(w))
 
 # testing small bits type
 bx = [true, false, true, true, false]
-@test countmap(bx) == Dict(true => 3, false => 2)
+@test countmap(skipmissing(bx)) == countmap(bx) == Dict(true => 3, false => 2)
 
 for T in [UInt8, UInt16, Int8, Int16]
     tx = T[typemin(T), 8, typemax(T), 19, 8]
-    @test countmap(tx) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1)
+    tx_missing = skipmissing(T[typemin(T), 8, typemax(T), 19, 8])
+    @test countmap(tx) == countmap(tx_missing) == Dict(typemin(T) => 1, typemax(T) => 1, 8 => 2, 19 => 1)
 end
 
 @testset "views" begin