From b90f488ee9033f31c335dad985ab3b8c7b80f087 Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Wed, 16 Dec 2015 12:25:57 +0000 Subject: [PATCH] improve quantile: reduce allocations, use partial sort (cherry picked from commit f6ad90e42b9e69886f407476b0ade0680f165198) ref #14413 --- base/docs/helpdb.jl | 7 --- base/float.jl | 4 ++ base/statistics.jl | 118 ++++++++++++++++++++++++++++++++------------ doc/stdlib/math.rst | 20 +++++--- test/statistics.jl | 6 ++- 5 files changed, 108 insertions(+), 47 deletions(-) diff --git a/base/docs/helpdb.jl b/base/docs/helpdb.jl index 37d8aaf759802..747f2b0ec8c8c 100644 --- a/base/docs/helpdb.jl +++ b/base/docs/helpdb.jl @@ -3116,13 +3116,6 @@ with empty collections (see ``reduce(op, itr)``). """ mapreduce(f, op, itr) -doc""" - quantile!(v, p) - -Like `quantile`, but overwrites the input vector. -""" -quantile! - doc""" accept(server[,client]) diff --git a/base/float.jl b/base/float.jl index c3302cd8a9795..eacecf7def9cc 100644 --- a/base/float.jl +++ b/base/float.jl @@ -120,6 +120,10 @@ convert(::Type{AbstractFloat}, x::UInt128) = convert(Float64, x) # LOSSY float(x) = convert(AbstractFloat, x) +# for constructing arrays +float{T<:Number}(::Type{T}) = typeof(float(zero(T))) +float{T}(::Type{T}) = Any + for Ti in (Int8, Int16, Int32, Int64) @eval begin unsafe_trunc(::Type{$Ti}, x::Float32) = box($Ti,fptosi($Ti,unbox(Float32,x))) diff --git a/base/statistics.jl b/base/statistics.jl index 5440db590d49a..3e2ee60d4fb08 100644 --- a/base/statistics.jl +++ b/base/statistics.jl @@ -509,51 +509,105 @@ median{T}(v::AbstractArray{T}, region) = mapslices(median!, v, region) # for now, use the R/S definition of quantile; may want variants later # see ?quantile in R -- this is type 7 -# TODO: need faster implementation (use select!?) -# -function quantile!(v::AbstractVector, q::AbstractVector) - isempty(v) && throw(ArgumentError("empty data array")) - isempty(q) && throw(ArgumentError("empty quantile array")) +""" + quantile!([q, ] v, p; sorted=false) + +Compute the quantile(s) of a vector `v` at the probabilities `p`, with optional output into +array `q` (if not provided, a new output array is created). The keyword argument `sorted` +indicates whether `v` can be assumed to be sorted; if `false` (the default), then the +elements of `v` may be partially sorted. + +The elements of `p` should be on the interval [0,1], and `v` should not have any `NaN` +values. + +Quantiles are computed via linear interpolation between the points `((k-1)/(n-1), v[k])`, +for `k = 1:n` where `n = length(v)`. This corresponds to Definition 7 of Hyndman and Fan +(1996), and is the same as the R default. - # make sure the quantiles are in [0,1] - q = bound_quantiles(q) +* Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages", + *The American Statistician*, Vol. 50, No. 4, pp. 361-365 +""" +function quantile!(q::AbstractArray, v::AbstractVector, p::AbstractArray; + sorted::Bool=false) + size(p) == size(q) || throw(DimensionMismatch()) + + isempty(v) && throw(ArgumentError("empty data vector")) lv = length(v) - lq = length(q) + if !sorted + minp, maxp = extrema(p) + lo = floor(Int,1+minp*(lv-1)) + hi = ceil(Int,1+maxp*(lv-1)) - index = 1 .+ (lv-1)*q - lo = floor(Int,index) - hi = ceil(Int,index) - sort!(v) + # only need to perform partial sort + sort!(v, 1, lv, PartialQuickSort(lo:hi), Base.Sort.Forward) + end isnan(v[end]) && throw(ArgumentError("quantiles are undefined in presence of NaNs")) - i = find(index .> lo) - r = float(v[lo]) - h = (index.-lo)[i] - r[i] = (1.-h).*r[i] + h.*v[hi[i]] - return r + + for i = 1:length(p) + @inbounds q[i] = _quantile(v,p[i]) + end + return q end -""" - quantile(v, ps) +quantile!(v::AbstractVector, p::AbstractArray; sorted::Bool=false) = + quantile!(similar(p,float(eltype(v))), v, p; sorted=sorted) -Compute the quantiles of a vector `v` at a specified set of probability values `ps`. Note: Julia does not ignore `NaN` values in the computation. -""" -quantile(v::AbstractVector, q::AbstractVector) = quantile!(copy(v),q) -""" - quantile(v, p) +function quantile!(v::AbstractVector, p::Real; + sorted::Bool=false) + isempty(v) && throw(ArgumentError("empty data vector")) -Compute the quantile of a vector `v` at the probability `p`. Note: Julia does not ignore `NaN` values in the computation. -""" -quantile(v::AbstractVector, q::Number) = quantile(v,[q])[1] + lv = length(v) + if !sorted + lo = floor(Int,1+p*(lv-1)) + hi = ceil(Int,1+p*(lv-1)) -function bound_quantiles(qs::AbstractVector) - epsilon = 100*eps() - if (any(qs .< -epsilon) || any(qs .> 1+epsilon)) - throw(ArgumentError("quantiles out of [0,1] range")) + # only need to perform partial sort + sort!(v, 1, lv, PartialQuickSort(lo:hi), Base.Sort.Forward) end - [min(1,max(0,q)) for q = qs] + isnan(v[end]) && throw(ArgumentError("quantiles are undefined in presence of NaNs")) + + return _quantile(v,p) end +# Core quantile lookup function: assumes `v` sorted +@inline function _quantile(v::AbstractVector, p::Real) + T = float(eltype(v)) + isnan(p) && return T(NaN) + + lv = length(v) + index = 1 + (lv-1)*p + 1 <= index <= lv || error("input probability out of [0,1] range") + + indlo = floor(index) + i = trunc(Int,indlo) + + if index == indlo + return T(v[i]) + else + h = T(index - indlo) + return (1-h)*T(v[i]) + h*T(v[i+1]) + end +end + + +""" + quantile(v, p; sorted=false) + +Compute the quantile(s) of a vector `v` at a specified probability or vector `p`. The +keyword argument `sorted` indicates whether `v` can be assumed to be sorted. + +The `p` should be on the interval [0,1], and `v` should not have any `NaN` values. + +Quantiles are computed via linear interpolation between the points `((k-1)/(n-1), v[k])`, +for `k = 1:n` where `n = length(v)`. This corresponds to Definition 7 of Hyndman and Fan +(1996), and is the same as the R default. + +* Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages", + *The American Statistician*, Vol. 50, No. 4, pp. 361-365 +""" +quantile(v::AbstractVector, p; sorted::Bool=false) = + quantile!(sorted ? v : copy!(similar(v),v), p; sorted=sorted) ##### histogram ##### diff --git a/doc/stdlib/math.rst b/doc/stdlib/math.rst index 9e3734e067011..ff4d1423c2682 100644 --- a/doc/stdlib/math.rst +++ b/doc/stdlib/math.rst @@ -1673,23 +1673,29 @@ Statistics Compute the midpoints of the bins with edges ``e``\ . The result is a vector/range of length ``length(e) - 1``\ . Note: Julia does not ignore ``NaN`` values in the computation. -.. function:: quantile(v, ps) +.. function:: quantile(v, p; sorted=false) .. Docstring generated from Julia source - Compute the quantiles of a vector ``v`` at a specified set of probability values ``ps``\ . Note: Julia does not ignore ``NaN`` values in the computation. + Compute the quantile(s) of a vector ``v`` at a specified probability or vector ``p``\ . The keyword argument ``sorted`` indicates whether ``v`` can be assumed to be sorted. -.. function:: quantile(v, p) + The ``p`` should be on the interval [0,1], and ``v`` should not have any ``NaN`` values. - .. Docstring generated from Julia source + Quantiles are computed via linear interpolation between the points ``((k-1)/(n-1), v[k])``\ , for ``k = 1:n`` where ``n = length(v)``\ . This corresponds to Definition 7 of Hyndman and Fan (1996), and is the same as the R default. - Compute the quantile of a vector ``v`` at the probability ``p``\ . Note: Julia does not ignore ``NaN`` values in the computation. + * Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages", *The American Statistician*, Vol. 50, No. 4, pp. 361-365 -.. function:: quantile!(v, p) +.. function:: quantile!([q, ] v, p; sorted=false) .. Docstring generated from Julia source - Like ``quantile``\ , but overwrites the input vector. + Compute the quantile(s) of a vector ``v`` at the probabilities ``p``\ , with optional output into array ``q`` (if not provided, a new output array is created). The keyword argument ``sorted`` indicates whether ``v`` can be assumed to be sorted; if ``false`` (the default), then the elements of ``v`` may be partially sorted. + + The elements of ``p`` should be on the interval [0,1], and ``v`` should not have any ``NaN`` values. + + Quantiles are computed via linear interpolation between the points ``((k-1)/(n-1), v[k])``\ , for ``k = 1:n`` where ``n = length(v)``\ . This corresponds to Definition 7 of Hyndman and Fan (1996), and is the same as the R default. + + * Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages", *The American Statistician*, Vol. 50, No. 4, pp. 361-365 .. function:: cov(v1[, v2][, vardim=1, corrected=true, mean=nothing]) diff --git a/test/statistics.jl b/test/statistics.jl index af65a4cb15eb7..9fa097e3d7f6e 100644 --- a/test/statistics.jl +++ b/test/statistics.jl @@ -289,8 +289,12 @@ end @test midpoints(Float64[1.0:1.0:10.0;]) == Float64[1.5:1.0:9.5;] @test quantile([1,2,3,4],0.5) == 2.5 +@test quantile([1,2,3,4],[0.5]) == [2.5] @test quantile([1., 3],[.25,.5,.75])[2] == median([1., 3]) -@test quantile([0.:100.;],[.1,.2,.3,.4,.5,.6,.7,.8,.9])[1] == 10.0 +@test quantile(100.0:-1.0:0.0, 0.0:0.1:1.0) == collect(0.0:10.0:100.0) +@test quantile(0.0:100.0, 0.0:0.1:1.0, sorted=true) == collect(0.0:10.0:100.0) +@test quantile(100f0:-1f0:0.0, 0.0:0.1:1.0) == collect(0f0:10f0:100f0) + # test invalid hist nbins argument (#9999) @test_throws ArgumentError hist(Int[], -1)