-
Notifications
You must be signed in to change notification settings - Fork 194
/
counts.jl
400 lines (321 loc) · 13 KB
/
counts.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
# Counts of discrete values
#################################################
#
# counts on given levels
#
#################################################
const IntUnitRange{T<:Integer} = UnitRange{T}
if isdefined(Base, :ht_keyindex2)
const ht_keyindex2! = Base.ht_keyindex2
else
using Base: ht_keyindex2!
end
#### functions for counting a single list of integers (1D)
"""
addcounts!(r, x, levels::UnitRange{<:Int}, [wv::AbstractWeights])
Add the number of occurrences in `x` of each value in `levels` to an existing
array `r`. If a weighting vector `wv` is specified, the sum of weights is used
rather than the raw counts.
"""
function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange)
# add counts of integers from x to r
k = length(levels)
length(r) == k || throw(DimensionMismatch())
m0 = levels[1]
m1 = levels[end]
b = m0 - 1
@inbounds for i in 1 : length(x)
xi = x[i]
if m0 <= xi <= m1
r[xi - b] += 1
end
end
return r
end
function addcounts!(r::AbstractArray, x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights)
k = length(levels)
length(r) == k || throw(DimensionMismatch())
m0 = levels[1]
m1 = levels[end]
b = m0 - 1
@inbounds for i in 1 : length(x)
xi = x[i]
if m0 <= xi <= m1
r[xi - b] += wv[i]
end
end
return r
end
"""
counts(x, [wv::AbstractWeights])
counts(x, levels::UnitRange{<:Integer}, [wv::AbstractWeights])
counts(x, k::Integer, [wv::AbstractWeights])
Count the number of times each value in `x` occurs. If `levels` is provided, only values
falling in that range will be considered (the others will be ignored without
raising an error or a warning). If an integer `k` is provided, only values in the
range `1:k` will be considered.
If a weighting vector `wv` is specified, the sum of the weights is used rather than the
raw counts.
The output is a vector of length `length(levels)`.
"""
function counts end
counts(x::IntegerArray, levels::IntUnitRange) =
addcounts!(zeros(Int, length(levels)), x, levels)
counts(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) =
addcounts!(zeros(eltype(wv), length(levels)), x, levels, wv)
counts(x::IntegerArray, k::Integer) = counts(x, 1:k)
counts(x::IntegerArray, k::Integer, wv::AbstractWeights) = counts(x, 1:k, wv)
counts(x::IntegerArray) = counts(x, span(x))
counts(x::IntegerArray, wv::AbstractWeights) = counts(x, span(x), wv)
"""
proportions(x, levels=span(x), [wv::AbstractWeights])
Return the proportion of values in the range `levels` that occur in `x`.
Equivalent to `counts(x, levels) / length(x)`. If a weighting vector `wv`
is specified, the sum of the weights is used rather than the raw counts.
"""
proportions(x::IntegerArray, levels::IntUnitRange) = counts(x, levels) .* inv(length(x))
proportions(x::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) =
counts(x, levels, wv) .* inv(sum(wv))
"""
proportions(x, k::Integer, [wv::AbstractWeights])
Return the proportion of integers in 1 to `k` that occur in `x`.
"""
proportions(x::IntegerArray, k::Integer) = proportions(x, 1:k)
proportions(x::IntegerArray, k::Integer, wv::AbstractWeights) = proportions(x, 1:k, wv)
proportions(x::IntegerArray) = proportions(x, span(x))
proportions(x::IntegerArray, wv::AbstractWeights) = proportions(x, span(x), wv)
#### functions for counting a single list of integers (2D)
function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange})
# add counts of integers from x to r
n = length(x)
length(y) == n || throw(DimensionMismatch())
xlevels, ylevels = levels
kx = length(xlevels)
ky = length(ylevels)
size(r) == (kx, ky) || throw(DimensionMismatch())
mx0 = xlevels[1]
mx1 = xlevels[end]
my0 = ylevels[1]
my1 = ylevels[end]
bx = mx0 - 1
by = my0 - 1
for i = 1:n
xi = x[i]
yi = y[i]
if (mx0 <= xi <= mx1) && (my0 <= yi <= my1)
r[xi - bx, yi - by] += 1
end
end
return r
end
function addcounts!(r::AbstractArray, x::IntegerArray, y::IntegerArray,
levels::NTuple{2,IntUnitRange}, wv::AbstractWeights)
# add counts of integers from x to r
n = length(x)
length(y) == length(wv) == n || throw(DimensionMismatch())
xlevels, ylevels = levels
kx = length(xlevels)
ky = length(ylevels)
size(r) == (kx, ky) || throw(DimensionMismatch())
mx0 = xlevels[1]
mx1 = xlevels[end]
my0 = ylevels[1]
my1 = ylevels[end]
bx = mx0 - 1
by = my0 - 1
for i = 1:n
xi = x[i]
yi = y[i]
if (mx0 <= xi <= mx1) && (my0 <= yi <= my1)
r[xi - bx, yi - by] += wv[i]
end
end
return r
end
# facet functions
function counts(x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange})
addcounts!(zeros(Int, length(levels[1]), length(levels[2])), x, y, levels)
end
function counts(x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::AbstractWeights)
addcounts!(zeros(eltype(wv), length(levels[1]), length(levels[2])), x, y, levels, wv)
end
counts(x::IntegerArray, y::IntegerArray, levels::IntUnitRange) =
counts(x, y, (levels, levels))
counts(x::IntegerArray, y::IntegerArray, levels::IntUnitRange, wv::AbstractWeights) =
counts(x, y, (levels, levels), wv)
counts(x::IntegerArray, y::IntegerArray, ks::NTuple{2,Integer}) =
counts(x, y, (1:ks[1], 1:ks[2]))
counts(x::IntegerArray, y::IntegerArray, ks::NTuple{2,Integer}, wv::AbstractWeights) =
counts(x, y, (1:ks[1], 1:ks[2]), wv)
counts(x::IntegerArray, y::IntegerArray, k::Integer) = counts(x, y, (1:k, 1:k))
counts(x::IntegerArray, y::IntegerArray, k::Integer, wv::AbstractWeights) =
counts(x, y, (1:k, 1:k), wv)
counts(x::IntegerArray, y::IntegerArray) = counts(x, y, (span(x), span(y)))
counts(x::IntegerArray, y::IntegerArray, wv::AbstractWeights) = counts(x, y, (span(x), span(y)), wv)
proportions(x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}) =
counts(x, y, levels) .* inv(length(x))
proportions(x::IntegerArray, y::IntegerArray, levels::NTuple{2,IntUnitRange}, wv::AbstractWeights) =
counts(x, y, levels, wv) .* inv(sum(wv))
proportions(x::IntegerArray, y::IntegerArray, ks::NTuple{2,Integer}) =
proportions(x, y, (1:ks[1], 1:ks[2]))
proportions(x::IntegerArray, y::IntegerArray, ks::NTuple{2,Integer}, wv::AbstractWeights) =
proportions(x, y, (1:ks[1], 1:ks[2]), wv)
proportions(x::IntegerArray, y::IntegerArray, k::Integer) = proportions(x, y, (1:k, 1:k))
proportions(x::IntegerArray, y::IntegerArray, k::Integer, wv::AbstractWeights) =
proportions(x, y, (1:k, 1:k), wv)
proportions(x::IntegerArray, y::IntegerArray) = proportions(x, y, (span(x), span(y)))
proportions(x::IntegerArray, y::IntegerArray, wv::AbstractWeights) =
proportions(x, y, (span(x), span(y)), wv)
#################################################
#
# countmap on unknown levels
#
# These methods are based on dictionaries, and
# can be used on any kind of hashable values.
#
#################################################
## auxiliary functions
function _normalize_countmap(cm::Dict{T}, s::Real) where T
r = Dict{T,Float64}()
for (k, c) in cm
r[k] = c / s
end
return r
end
## 1D
"""
addcounts!(dict, x[, wv]; alg = :auto)
Add counts based on `x` to a count map. New entries will be added if new values come up.
If a weighting vector `wv` is specified, the sum of the weights is used rather than the
raw counts.
`alg` can be one of:
- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use
`:radixsort`, otherwise use `:dict`.
- `:radixsort`: if `radixsort_safe(eltype(x)) == true` then use the
[radix sort](https://en.wikipedia.org/wiki/Radix_sort)
algorithm to sort the input vector which will generally lead to
shorter running time. However the radix sort algorithm creates a
copy of the input vector and hence uses more RAM. Choose `:dict`
if the amount of available RAM is a limitation.
- `:dict`: use `Dict`-based method which is generally slower but uses less
RAM and is safe for any data type.
"""
function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :auto) where T
# if it's safe to be sorted using radixsort then it should be faster
# albeit using more RAM
if radixsort_safe(T) && (alg == :auto || alg == :radixsort)
addcounts_radixsort!(cm, x)
elseif alg == :radixsort
throw(ArgumentError("`alg = :radixsort` is chosen but type `radixsort_safe($T)` did not return `true`; use `alg = :auto` or `alg = :dict` instead"))
else
addcounts_dict!(cm,x)
end
return cm
end
"""Dict-based addcounts method"""
function addcounts_dict!(cm::Dict{T}, x::AbstractArray{T}) where T
for v in x
index = ht_keyindex2!(cm, v)
if index > 0
@inbounds cm.vals[index] += 1
else
@inbounds Base._setindex!(cm, 1, v, -index)
end
end
return cm
end
# If the bits type is of small size i.e. it can have up to 65536 distinct values
# then it is always better to apply a counting-sort like reduce algorithm for
# faster results and less memory usage. However we still wish to enable others
# to write generic algorithms, therefore the methods below still accept the
# `alg` argument but it is ignored.
function addcounts!(cm::Dict{Bool}, x::AbstractArray{Bool}; alg = :ignored)
sumx = sum(x)
cm[true] = get(cm, true, 0) + sumx
cm[false] = get(cm, false, 0) + length(x) - sumx
cm
end
function addcounts!(cm::Dict{T}, x::AbstractArray{T}; alg = :ignored) where T <: Union{UInt8, UInt16, Int8, Int16}
counts = zeros(Int, 2^(8sizeof(T)))
@inbounds for xi in x
counts[Int(xi) - typemin(T) + 1] += 1
end
for (i, c) in zip(typemin(T):typemax(T), counts)
if c != 0
index = ht_keyindex2!(cm, i)
if index > 0
@inbounds cm.vals[index] += c
else
@inbounds Base._setindex!(cm, c, i, -index)
end
end
end
cm
end
const BaseRadixSortSafeTypes = Union{Int8, Int16, Int32, Int64, Int128,
UInt8, UInt16, UInt32, UInt64, UInt128,
Float32, Float64}
"Can the type be safely sorted by radixsort"
radixsort_safe(::Type{T}) where {T<:BaseRadixSortSafeTypes} = true
radixsort_safe(::Type) = false
function _addcounts_radix_sort_loop!(cm::Dict{T}, sx::AbstractArray{T}) where T
last_sx = sx[1]
tmpcount = get(cm, last_sx, 0) + 1
# now the data is sorted: can just run through and accumulate values before
# adding into the Dict
@inbounds for i in 2:length(sx)
sxi = sx[i]
if last_sx == sxi
tmpcount += 1
else
cm[last_sx] = tmpcount
last_sx = sxi
tmpcount = get(cm, last_sx, 0) + 1
end
end
cm[sx[end]] = tmpcount
return cm
end
function addcounts_radixsort!(cm::Dict{T}, x::AbstractArray{T}) where T
# sort the x using radixsort
sx = sort(x, alg = RadixSort)
# Delegate the loop to a separate function since sort might not
# be inferred in Julia 0.6 after SortingAlgorithms is loaded.
# It seems that sort is inferred in Julia 0.7.
return _addcounts_radix_sort_loop!(cm, sx)
end
function addcounts!(cm::Dict{T}, x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real}
n = length(x)
length(wv) == n || throw(DimensionMismatch())
z = zero(W)
for i = 1 : n
@inbounds xi = x[i]
@inbounds wi = wv[i]
cm[xi] = get(cm, xi, z) + wi
end
return cm
end
"""
countmap(x; alg = :auto)
Return a dictionary mapping each unique value in `x` to its number
of occurrences.
- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use
`:radixsort`, otherwise use `:dict`.
- `:radixsort`: if `radixsort_safe(eltype(x)) == true` then use the
[radix sort](https://en.wikipedia.org/wiki/Radix_sort)
algorithm to sort the input vector which will generally lead to
shorter running time. However the radix sort algorithm creates a
copy of the input vector and hence uses more RAM. Choose `:dict`
if the amount of available RAM is a limitation.
- `:dict`: use `Dict`-based method which is generally slower but uses less
RAM and is safe for any data type.
"""
countmap(x::AbstractArray{T}; alg = :auto) where {T} = addcounts!(Dict{T,Int}(), x; alg = alg)
countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv)
"""
proportionmap(x)
Return a dictionary mapping each unique value in `x` to its
proportion in `x`.
"""
proportionmap(x::AbstractArray) = _normalize_countmap(countmap(x), length(x))
proportionmap(x::AbstractArray, wv::AbstractWeights) = _normalize_countmap(countmap(x, wv), sum(wv))