From b6295d10ab6ee89ba9ee6aa3323847cf0a9483aa Mon Sep 17 00:00:00 2001 From: Matthieu Gomez Date: Wed, 8 Nov 2017 16:28:46 -0800 Subject: [PATCH 01/12] modify weighted quantile (aweights + fweights) Solves #313 --- src/weights.jl | 71 ++++++++++++------------ test/weights.jl | 140 +++++++++++++++++++++++++++++++----------------- 2 files changed, 126 insertions(+), 85 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index e4dddb5d6..1443e5f70 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -537,10 +537,6 @@ wmedian(v::RealVector, w::AbstractWeights{<:Real}) = median(v, w) ###### Weighted quantile ##### -# http://stats.stackexchange.com/questions/13169/defining-quantiles-over-a-weighted-sample -# In the non weighted version, we compute a vector of index h(N, p) -# and take interpolation between floor and ceil of this index -# Here there is a supplementary function from index to weighted index k -> Sk """ quantile(v, w::AbstractWeights, p) @@ -549,15 +545,16 @@ Compute the weighted quantiles of a vector `x` at a specified set of probability values `p`, using weights given by a weight vector `w` (of type `AbstractWeights`). Weights must not be negative. The weights and data vectors must have the same length. -The quantile for `p` is defined as follows. Denoting -``S_k = (k-1)w_k + (n-1) \\sum_{i= N - # out was initialized with maximum v - return out - end - k += 1 - Skold, vkold = Sk, vk - vk, wk = vw[k] - Sk = (k - 1) * wk + (N - 1) * cumulative_weight + h = p[i] * (w.sum - 1) + 1 + while Sk <= h + k += 1 + if k > length(vw) + # out was initialized with maximum v + return out end - # in particular, Sk is different from Skold - g = (h - Skold) / (Sk - Skold) - out[ppermute[i]] = vkold + g * (vk - vkold) + Skold, vkold = Sk, vk + vk, wk = vw[k] + Sk += wk + end + if isa(w, FrequencyWeights) + out[ppermute[i]] = vkold + min(h - Skold, 1) * (vk - vkold) + else + out[ppermute[i]] = vkold + (h - Skold) / (Sk - Skold) * (vk - vkold) end end return out diff --git a/test/weights.jl b/test/weights.jl index 7764535c7..87e400fdd 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -261,7 +261,48 @@ end @test_throws ErrorException median(data, f(wt)) end -@testset "Quantile $f" for f in weight_funcs + +# Quantile fweights +@testset "Quantile fweights" begin + data = ( + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10], + [7, 1, 2, 4, 10, 15], + [1, 2, 4, 7, 10, 15], + [0, 10, 20, 30], + [1, 2, 3, 4, 5], + [1, 2, 3, 4, 5], + [30, 40, 50, 60, 35], + [2, 0.6, 1.3, 0.3, 0.3, 1.7, 0.7, 1.7], + [1, 2, 2], + [3.7, 3.3, 3.5, 2.8], + [100, 125, 123, 60, 45, 56, 66], + [2, 2, 2, 2, 2, 2], + [2.3], + [-2, -3, 1, 2, -10], + [1, 2, 3, 4, 5], + [5, 4, 3, 2, 1], + [-2, 2, -1, 3, 6], + [-10, 1, 1, -10, -10], + ) + p = [0.0, 0.25, 0.5, 0.75, 1.0] + for x in data + @test quantile(x, fweights(ones(Int64, length(x))), p) ≈ quantile(x, p) + end + # zero don't count + x = [1, 2, 3, 4, 5] + @test quantile(x, fweights([0,1,1,1,0]), p) ≈ quantile([2, 3, 4], p) + # repetitions dont count + @test quantile(x, fweights([0,1,2,1,0]), p) ≈ quantile([2, 3, 3, 4], p) + # Issue #313 + @test quantile(x, fweights([0,1,2,1,0]), p) ≈ quantile([2, 3, 3, 4], p) + @test quantile([1, 2], fweights([1, 1]), 0.25) ≈ 1.25 + @test quantile([1, 2], fweights([2, 2]), 0.25) ≈ 1.0 +end + + +@testset "Quantile aweights" begin + data = ( [7, 1, 2, 4, 10], [7, 1, 2, 4, 10], @@ -284,81 +325,84 @@ end [-10, 1, 1, -10, -10], ) wt = ( - f([1, 1/3, 1/3, 1/3, 1]), - f([1, 1, 1, 1, 1]), - f([1, 1/3, 1/3, 1/3, 1, 1]), - f([1/3, 1/3, 1/3, 1, 1, 1]), - f([30, 191, 9, 0]), - f([10, 1, 1, 1, 9]), - f([10, 1, 1, 1, 900]), - f([1, 3, 5, 4, 2]), - f([2, 2, 5, 1, 2, 2, 1, 6]), - f([0.1, 0.1, 0.8]), - f([5, 5, 4, 1]), - f([30, 56, 144, 24, 55, 43, 67]), - f([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]), - f([12]), - f([7, 1, 1, 1, 6]), - f([1, 0, 0, 0, 2]), - f([1, 2, 3, 4, 5]), - f([0.1, 0.2, 0.3, 0.2, 0.1]), - f([1, 1, 1, 1, 1]), + [1, 1/3, 1/3, 1/3, 1], + [1, 1, 1, 1, 1], + [1, 1/3, 1/3, 1/3, 1, 1], + [1/3, 1/3, 1/3, 1, 1, 1], + [30, 191, 9, 0], + [10, 1, 1, 1, 9], + [10, 1, 1, 1, 900], + [1, 3, 5, 4, 2], + [2, 2, 5, 1, 2, 2, 1, 6], + [0.1, 0.1, 0.8], + [5, 5, 4, 1], + [30, 56, 144, 24, 55, 43, 67], + [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], + [12], + [7, 1, 1, 1, 6], + [1, 0, 0, 0, 2], + [1, 2, 3, 4, 5], + [0.1, 0.2, 0.3, 0.2, 0.1], + [1, 1, 1, 1, 1], ) quantile_answers = ( - [1.0,3.6000000000000005,6.181818181818182,8.2,10.0], - [1.0,2.0,4.0,7.0,10.0], - [1.0,4.75,8.0,10.833333333333334,15.0], - [1.0,4.75,8.0,10.833333333333334,15.0], - [0.0,6.1387900355871885,11.600000000000001,15.912500000000001,30.0], - [1.0,1.5365853658536586,2.5999999999999996,4.405405405405405,5.0], - [1.0,4.239377950569287,4.492918633712858,4.746459316856429,5.0], - [30.0,38.75,45.714285714285715,52.85714285714286,60.0], - [0.3,0.6903846153846154,1.484,1.7,2.0], - [1.0,2.0,2.0,2.0,2.0], - [2.8,3.3361111111111112,3.4611111111111112,3.581578947368421,3.7], - [45.0,59.88593155893536,100.08846153846153,118.62115384615385,125.0], - [2.0,2.0,2.0,2.0,2.0], - [2.3,2.3,2.3,2.3,2.3], - [-10.0,-5.52,-2.5882352941176467,-0.9411764705882351,2.0], - [1.0,1.75,4.25,4.625,5.0], - [1.0,1.625,2.3333333333333335,3.25,5.0], - [-2.0,-0.5384615384615388,1.5384615384615383,2.6999999999999997,6.0], - [-10.0,-10.0,-10.0,1.0,1.0] + [1.8, 3.4, 4.6, 5.5, 6.4], + [1.0, 2.0, 4.0, 7.0, 10.0], + [2.0, 4.5, 6.0, 7.5, 9.0], + [2.0, 4.5, 6.0, 7.5, 9.0], + [2.44328, 20.0, 20.0, 20.0, 20.0], + [0.44, 5.0, 5.0, 5.0, 5.0], + [4.18844, 5.0, 5.0, 5.0, 5.0], + [35.0, 56.25, 60.0, 60.0, 60.0], + [0.3, 1.7, 2.0, 2.0, 2.0], + [2.0, 2.0, 2.0, 2.0, 2.0], + [3.075, 3.7, 3.7, 3.7, 3.7], + [46.2425, 125.0, 125.0, 125.0, 125.0], + [2.0, 2.0, 2.0, 2.0, 2.0], + [2.3, 2.3, 2.3, 2.3, 2.3], + [-5.33333, 1.2, 2.0, 2.0, 2.0], + [2.0, 3.5, 5.0, 5.0, 5.0], + [0.6, 3.75, 5.0, 5.0, 5.0], + [-1.73333, -1.74833, -1.76333, -1.77833, -1.79333], + [-10.0, -10.0, -10.0, 1.0, 1.0], ) p = [0.0, 0.25, 0.5, 0.75, 1.0] srand(10) for i = 1:length(data) - @test quantile(data[i], wt[i], p) ≈ quantile_answers[i] + @test quantile(data[i], aweights(wt[i]), p) ≈ quantile_answers[i] atol = 1e-3 for j = 1:10 # order of p does not matter reorder = sortperm(rand(length(p))) - @test quantile(data[i], wt[i], p[reorder]) ≈ quantile_answers[i][reorder] + @test quantile(data[i], aweights(wt[i]), p[reorder]) ≈ quantile_answers[i][reorder] atol = 1e-3 end for j = 1:10 # order of w does not matter reorder = sortperm(rand(length(data[i]))) - @test quantile(data[i][reorder], f(wt[i][reorder]), p) ≈ quantile_answers[i] + @test quantile(data[i][reorder], aweights(wt[i][reorder]), p) ≈ quantile_answers[i] atol = 1e-3 end end # w = 1 corresponds to base quantile for i = 1:length(data) - @test quantile(data[i], f(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) + @test quantile(data[i], aweights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) atol = 1e-3 for j = 1:10 prandom = rand(4) - @test quantile(data[i], f(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) + @test quantile(data[i], aweights(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) atol = 1e-3 end end # other syntaxes v = [7, 1, 2, 4, 10] w = [1, 1/3, 1/3, 1/3, 1] - answer = 6.181818181818182 - @test quantile(data[1], f(w), 0.5) ≈ answer - @test wquantile(data[1], f(w), [0.5]) ≈ [answer] - @test wquantile(data[1], f(w), 0.5) ≈ answer + answer = 4.6 + @test quantile(data[1], aweights(w), 0.5) ≈ answer + @test wquantile(data[1], aweights(w), [0.5]) ≈ [answer] + @test wquantile(data[1], aweights(w), 0.5) ≈ answer @test wquantile(data[1], w, [0.5]) ≈ [answer] @test wquantile(data[1], w, 0.5) ≈ answer end + + + end # @testset StatsBase.Weights From b21a2cb3146cb7c6dca153536288dafe73ece480 Mon Sep 17 00:00:00 2001 From: Matthieu Gomez Date: Wed, 8 Nov 2017 20:58:28 -0800 Subject: [PATCH 02/12] update --- src/weights.jl | 46 +++++++++++++++++++++++++++++---------------- test/weights.jl | 50 ++++++++++++++++++++++++------------------------- 2 files changed, 55 insertions(+), 41 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index 1443e5f70..a62aeb53c 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -546,14 +546,13 @@ values `p`, using weights given by a weight vector `w` (of type `AbstractWeights Weights must not be negative. The weights and data vectors must have the same length. With frequency weights, the function returns the same result as `quantile` for a vector with repeated values. -With non frequency weights, denote N the length of the vector, w the vector of weights normalized to sum to N, `h = p (N - 1) + 1` and ``S_k = N \\sum_{i<=k}w_i/\\sum_{i<=N}w_i``, define ``x_{k+1}`` the smallest element of `x` such that ``S_{k+1}`` is strictly superior to `h`. The function returns +With non frequency weights, denote N the length of the vector, w the vector of weights normalized to sum to 1, `h = p (N - 1) + 1` and ``S_k = 1 + (k-1) * wk + (N-1) \\sum_{i<=k}w_i/\\sum_{i<=N}w_i``, define ``x_{k+1}`` the smallest element of `x` such that ``S_{k+1}`` is strictly superior to `h`. The function returns ``x_k + \\gamma (x_{k+1} -x_k)`` with ``\\gamma = (h - S_k)/(S_{k+1}-S_k)`` In particular, when `w` is a vector of one, the function returns the same result as `quantile`. - """ -function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where {V, W<:Real} +function quantile{V, W <: Real}(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) # checks isempty(v) && error("quantile of an empty array is undefined") @@ -570,11 +569,13 @@ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where nz = find(w.values) #normalize if non frequencyweight if !isa(w, FrequencyWeights) - wvalues = wvalues / w.sum * length(nz) + wvalues = wvalues / w.sum end + wsum = sum(wvalues) #remove zeros weights and sort vw = sort!(collect(zip(view(v, nz), view(wvalues, nz)))) + N = length(vw) # prepare percentiles ppermute = sortperm(p) @@ -587,25 +588,38 @@ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where # start looping on quantiles Sk, Skold = zero(W), zero(W) - vk, vkold = zero(V), zero(V) + vk, vkold, cumwk, wk = zero(V), zero(V), zero(V), zero(V) k = 0 for i in 1:length(p) - h = p[i] * (w.sum - 1) + 1 - while Sk <= h - k += 1 - if k > length(vw) - # out was initialized with maximum v - return out - end - Skold, vkold = Sk, vk - vk, wk = vw[k] - Sk += wk - end if isa(w, FrequencyWeights) + h = p[i] * (wsum - 1) + 1 + while Sk <= h + k += 1 + if k > N + # out was initialized with maximum v + return out + end + Skold, vkold = Sk, vk + vk, wk = vw[k] + Sk += wk + end out[ppermute[i]] = vkold + min(h - Skold, 1) * (vk - vkold) else + # https://stats.stackexchange.com/questions/13169/defining-quantiles-over-a-weighted-sample + h = p[i] * (N - 1) + 1 + while Sk <= h + k += 1 + if k > N + # out was initialized with maximum v + return out + end + Skold, vkold = Sk, vk + cumwk += wk + vk, wk = vw[k] + Sk = 1 + (k - 1) * wk + (N - 1) * cumwk + end out[ppermute[i]] = vkold + (h - Skold) / (Sk - Skold) * (vk - vkold) end end diff --git a/test/weights.jl b/test/weights.jl index 87e400fdd..9c974d69c 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -346,25 +346,25 @@ end [1, 1, 1, 1, 1], ) quantile_answers = ( - [1.8, 3.4, 4.6, 5.5, 6.4], - [1.0, 2.0, 4.0, 7.0, 10.0], - [2.0, 4.5, 6.0, 7.5, 9.0], - [2.0, 4.5, 6.0, 7.5, 9.0], - [2.44328, 20.0, 20.0, 20.0, 20.0], - [0.44, 5.0, 5.0, 5.0, 5.0], - [4.18844, 5.0, 5.0, 5.0, 5.0], - [35.0, 56.25, 60.0, 60.0, 60.0], - [0.3, 1.7, 2.0, 2.0, 2.0], - [2.0, 2.0, 2.0, 2.0, 2.0], - [3.075, 3.7, 3.7, 3.7, 3.7], - [46.2425, 125.0, 125.0, 125.0, 125.0], - [2.0, 2.0, 2.0, 2.0, 2.0], - [2.3, 2.3, 2.3, 2.3, 2.3], - [-5.33333, 1.2, 2.0, 2.0, 2.0], - [2.0, 3.5, 5.0, 5.0, 5.0], - [0.6, 3.75, 5.0, 5.0, 5.0], - [-1.73333, -1.74833, -1.76333, -1.77833, -1.79333], - [-10.0, -10.0, -10.0, 1.0, 1.0], + [1.0, 3.6, 6.18182, 8.2, 10.0], + [1.0, 2.0, 4.0, 7.0, 10.0], + [1.0, 4.75, 8.0, 10.8333, 15.0], + [1.0, 4.75, 8.0, 10.8333, 15.0], + [0.0, 4.58167, 9.16335, 14.4976, 20.0], + [1.0, 1.53659, 2.6, 4.40541, 5.0], + [1.0, 4.23938, 4.49292, 4.74646, 5.0], + [30.0, 38.75, 45.7143, 52.8571, 60.0], + [0.3, 0.690385, 1.484, 1.7, 2.0], + [1.0, 2.0, 2.0, 2.0, 2.0], + [2.8, 3.33611, 3.46111, 3.58158, 3.7], + [45.0, 59.8859, 100.088, 118.621, 125.0], + [2.0, 2.0, 2.0, 2.0, 2.0], + [2.3, 2.3, 2.3, 2.3, 2.3], + [-10.0, -5.52, -2.58824, -0.941176, 2.0], + [1.0, 2.0, 3.0, 4.0, 5.0], + [1.0, 1.625, 2.33333, 3.25, 5.0], + [-2.0, -0.538462, 1.53846, 2.7, 6.0], + [-10.0, -10.0, -10.0, 1.0, 1.0], ) p = [0.0, 0.25, 0.5, 0.75, 1.0] @@ -394,12 +394,12 @@ end # other syntaxes v = [7, 1, 2, 4, 10] w = [1, 1/3, 1/3, 1/3, 1] - answer = 4.6 - @test quantile(data[1], aweights(w), 0.5) ≈ answer - @test wquantile(data[1], aweights(w), [0.5]) ≈ [answer] - @test wquantile(data[1], aweights(w), 0.5) ≈ answer - @test wquantile(data[1], w, [0.5]) ≈ [answer] - @test wquantile(data[1], w, 0.5) ≈ answer + answer = 6.1818 + @test quantile(data[1], aweights(w), 0.5) ≈ answer atol = 1e-4 + @test wquantile(data[1], aweights(w), [0.5]) ≈ [answer] atol = 1e-4 + @test wquantile(data[1], aweights(w), 0.5) ≈ answer atol = 1e-4 + @test wquantile(data[1], w, [0.5]) ≈ [answer] atol = 1e-4 + @test wquantile(data[1], w, 0.5) ≈ answer atol = 1e-4 end From ed3d858f4f947136c0c97fb27e6d6751bd56f46b Mon Sep 17 00:00:00 2001 From: Matthieu Gomez Date: Wed, 8 Nov 2017 21:00:44 -0800 Subject: [PATCH 03/12] julia 0.6 --- src/weights.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/weights.jl b/src/weights.jl index a62aeb53c..e51800d02 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -552,7 +552,7 @@ In particular, when `w` is a vector of one, the function returns the same result """ -function quantile{V, W <: Real}(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) +function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where {V, W <: Real} # checks isempty(v) && error("quantile of an empty array is undefined") From 8ef45111a838293bad2534b38afc0158ff0e7001 Mon Sep 17 00:00:00 2001 From: Matthieu Gomez Date: Thu, 9 Nov 2017 13:07:44 -0800 Subject: [PATCH 04/12] Update for comments --- src/weights.jl | 35 +++++++++++------------ test/weights.jl | 74 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 71 insertions(+), 38 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index e51800d02..af5bb8c38 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -541,19 +541,21 @@ wmedian(v::RealVector, w::AbstractWeights{<:Real}) = median(v, w) """ quantile(v, w::AbstractWeights, p) -Compute the weighted quantiles of a vector `x` at a specified set of probability -values `p`, using weights given by a weight vector `w` (of type `AbstractWeights`). +Compute the weighted quantiles of a vector ``x`` at a specified set of probability +values ``p``, using weights given by a weight vector ``w`` (of type `AbstractWeights`). Weights must not be negative. The weights and data vectors must have the same length. -With frequency weights, the function returns the same result as `quantile` for a vector with repeated values. -With non frequency weights, denote N the length of the vector, w the vector of weights normalized to sum to 1, `h = p (N - 1) + 1` and ``S_k = 1 + (k-1) * wk + (N-1) \\sum_{i<=k}w_i/\\sum_{i<=N}w_i``, define ``x_{k+1}`` the smallest element of `x` such that ``S_{k+1}`` is strictly superior to `h`. The function returns -``x_k + \\gamma (x_{k+1} -x_k)`` with ``\\gamma = (h - S_k)/(S_{k+1}-S_k)`` -In particular, when `w` is a vector of one, the function returns the same result as `quantile`. -""" - - -function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where {V, W <: Real} - +With [FrequencyWeights](@ref FrequencyWeights), the function returns the same result as +`quantile` for a vector with repeated values. +Otherwise, denote N the length of the vector, w the vector of weights normalized to sum to + 1, ``h = p (N - 1) + 1`` a real valued index and + ``S_k = 1 + (k-1)w_k + (N-1) \\sum_{i Date: Thu, 9 Nov 2017 15:06:58 -0800 Subject: [PATCH 05/12] Simplify aweight on the model of fweight --- src/weights.jl | 47 ++++++++++++++++------------------------------- test/weights.jl | 40 ++++++++++++++++++++-------------------- 2 files changed, 36 insertions(+), 51 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index af5bb8c38..59d59216c 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -547,9 +547,8 @@ Weights must not be negative. The weights and data vectors must have the same le With [FrequencyWeights](@ref FrequencyWeights), the function returns the same result as `quantile` for a vector with repeated values. -Otherwise, denote N the length of the vector, w the vector of weights normalized to sum to - 1, ``h = p (N - 1) + 1`` a real valued index and - ``S_k = 1 + (k-1)w_k + (N-1) \\sum_{i N - # out was initialized with maximum v - return out - end - Skold, vkold = Sk, vk - vk, wk = vw[k] - Sk += wk + else + h = p[i] * (wsum - vw[1][2]) + vw[1][2] + end + while Sk <= h + k += 1 + if k > N + # out was initialized with maximum v + return out end + Skold, vkold = Sk, vk + vk, wk = vw[k] + Sk += wk + end + if isa(w, FrequencyWeights) out[ppermute[i]] = vkold + min(h - Skold, 1) * (vk - vkold) else - # https://stats.stackexchange.com/questions/13169/defining-quantiles-over-a-weighted-sample - h = p[i] * (N - 1) + 1 - while Sk <= h - k += 1 - if k > N - # out was initialized with maximum v - return out - end - Skold, vkold = Sk, vk - cumwk += wk - vk, wk = vw[k] - Sk = 1 + (k - 1) * wk + (N - 1) * cumwk - end out[ppermute[i]] = vkold + (h - Skold) / (Sk - Skold) * (vk - vkold) end end diff --git a/test/weights.jl b/test/weights.jl index afcdb03df..5ed11b3d0 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -378,25 +378,25 @@ end [1, 1, 1, 1, 1], ) quantile_answers = ( - [1.0, 3.6, 6.18182, 8.2, 10.0], - [1.0, 2.0, 4.0, 7.0, 10.0], - [1.0, 4.75, 8.0, 10.8333, 15.0], - [1.0, 4.75, 8.0, 10.8333, 15.0], - [0.0, 4.58167, 9.16335, 14.4976, 20.0], - [1.0, 1.53659, 2.6, 4.40541, 5.0], - [1.0, 4.23938, 4.49292, 4.74646, 5.0], - [30.0, 38.75, 45.7143, 52.8571, 60.0], - [0.3, 0.690385, 1.484, 1.7, 2.0], - [1.0, 2.0, 2.0, 2.0, 2.0], - [2.8, 3.33611, 3.46111, 3.58158, 3.7], - [45.0, 59.8859, 100.088, 118.621, 125.0], - [2.0, 2.0, 2.0, 2.0, 2.0], - [2.3, 2.3, 2.3, 2.3, 2.3], - [-10.0, -5.52, -2.58824, -0.941176, 2.0], - [1.0, 2.0, 3.0, 4.0, 5.0], - [1.0, 1.625, 2.33333, 3.25, 5.0], - [-2.0, -0.538462, 1.53846, 2.7, 6.0], - [-10.0, -10.0, -10.0, 1.0, 1.0], + [1.0, 4.0, 6.0, 8.0, 10.0], + [1.0, 2.0, 4.0, 7.0, 10.0], + [1.0, 4.75, 7.5, 10.4167, 15.0], + [1.0, 4.75, 7.5, 10.4167, 15.0], + [0.0, 2.6178, 5.2356, 7.8534, 20.0], + [1.0, 4.0, 4.33333, 4.66667, 5.0], + [1.0, 4.2475, 4.49833, 4.74917, 5.0], + [30.0, 37.5, 44.0, 51.25, 60.0], + [0.3, 0.7, 1.3, 1.7, 2.0], + [1.0, 2.0, 2.0, 2.0, 2.0], + [2.8, 3.15, 3.4, 3.56, 3.7], + [45.0, 62.1493, 102.875, 117.41, 125.0], + [2.0, 2.0, 2.0, 2.0, 2.0], + [2.3, 2.3, 2.3, 2.3, 2.3], + [-10.0, -2.78571, -2.42857, -2.07143, 2.0], + [1.0, 2.0, 3.0, 4.0, 5.0], + [1.0, 1.625, 2.33333, 3.25, 5.0], + [-2.0, -1.33333, 0.5, 2.5, 6.0], + [-10.0, -10.0, -10.0, 1.0, 1.0] ) p = [0.0, 0.25, 0.5, 0.75, 1.0] @@ -429,7 +429,7 @@ end # Syntax v = [7, 1, 2, 4, 10] w = [1, 1/3, 1/3, 1/3, 1] - answer = 6.181 + answer = 6.0 @test quantile(data[1], aweights(w), 0.5) ≈ answer atol = 1e-3 @test wquantile(data[1], aweights(w), [0.5]) ≈ [answer] atol = 1e-3 @test wquantile(data[1], aweights(w), 0.5) ≈ answer atol = 1e-3 From 57203fbca58c5a3c22d725488d8ec9c095d0794b Mon Sep 17 00:00:00 2001 From: Matthieu Gomez Date: Thu, 9 Nov 2017 15:19:26 -0800 Subject: [PATCH 06/12] no need for initalization --- src/weights.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/weights.jl b/src/weights.jl index 59d59216c..20e9c3c0a 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -583,7 +583,7 @@ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where fill!(out, vw[end][1]) # start looping on quantiles - Sk, Skold,cumwk, wk = zero(W), zero(W), zero(W), zero(W) + Sk, Skold = zero(W), zero(W) vk, vkold= zero(V), zero(V) k = 0 From 370afd5bc508005721e7b61cdb2707c78b30d1a6 Mon Sep 17 00:00:00 2001 From: Matthieu Gomez Date: Thu, 9 Nov 2017 15:25:01 -0800 Subject: [PATCH 07/12] definition --- src/weights.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index 20e9c3c0a..a7e9e9591 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -547,8 +547,8 @@ Weights must not be negative. The weights and data vectors must have the same le With [FrequencyWeights](@ref FrequencyWeights), the function returns the same result as `quantile` for a vector with repeated values. -With non FrequencyWeights, denote N the length of the vector, w the vector of weights, ``h = p (\\sum_{i<= N}w_i - w_1) + w_1`` the index corresponding to the probability `p` and - ``S_k = \\sum_{i<=k}w_i`` the weighted index corresponding to each observation, +With non FrequencyWeights, denote N the length of the vector, w the vector of weights, ``h = p (\\sum_{i<= N}w_i - w_1) + w_1`` the cumulative weight corresponding to the probability `p` and + ``S_k = \\sum_{i<=k}w_i`` the cumulative weight for to each observation, define ``x_{k+1}`` the smallest element of ``x`` such that ``S_{k+1}`` is strictly superior to ``h``. The function returns``x_k + \\gamma (x_{k+1} -x_k)`` with ``\\gamma = (h - S_k)/(S_{k+1}-S_k)``. In particular, when ``w`` is a vector From f9e315744fa2d7d7d27f1e48f4aba6fd5b327d71 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Wed, 24 Jan 2018 14:19:09 -0500 Subject: [PATCH 08/12] Update --- src/weights.jl | 29 +++++++++--------- test/weights.jl | 78 ++++++++++++++++++++++++------------------------- 2 files changed, 54 insertions(+), 53 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index a7e9e9591..9a9260ef6 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -541,18 +541,19 @@ wmedian(v::RealVector, w::AbstractWeights{<:Real}) = median(v, w) """ quantile(v, w::AbstractWeights, p) -Compute the weighted quantiles of a vector ``x`` at a specified set of probability -values ``p``, using weights given by a weight vector ``w`` (of type `AbstractWeights`). +Compute the weighted quantiles of a vector `v` at a specified set of probability +values `p`, using weights given by a weight vector `w` (of type `AbstractWeights`). Weights must not be negative. The weights and data vectors must have the same length. -With [FrequencyWeights](@ref FrequencyWeights), the function returns the same result as +With [`FrequencyWeights`](@ref), the function returns the same result as `quantile` for a vector with repeated values. -With non FrequencyWeights, denote N the length of the vector, w the vector of weights, ``h = p (\\sum_{i<= N}w_i - w_1) + w_1`` the cumulative weight corresponding to the probability `p` and - ``S_k = \\sum_{i<=k}w_i`` the cumulative weight for to each observation, - define ``x_{k+1}`` the smallest element of ``x`` such that ``S_{k+1}`` is strictly - superior to ``h``. The function returns``x_k + \\gamma (x_{k+1} -x_k)`` - with ``\\gamma = (h - S_k)/(S_{k+1}-S_k)``. In particular, when ``w`` is a vector - of one, the function returns the same result as `quantile`. +With non FrequencyWeights, denote ``N`` the length of the vector, ``w`` the vector of weights, +``h = p (\\sum_{i<= N}w_i - w_1) + w_1`` the cumulative weight corresponding to the +probability ``p`` and ``S_k = \\sum_{i<=k}w_i`` the cumulative weight for each +observation, define ``v_{k+1}`` the smallest element of ``v`` such that ``S_{k+1}`` +is strictly superior to ``h``. The weighted ``p`` quantile is given by ``v_k + \\gamma (v_{k+1} -v_k)`` +with ``\\gamma = (h - S_k)/(S_{k+1}-S_k)``. In particular, when ``w`` is a vector +of ones, the function returns the same result as `quantile`. """ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where {V,W<:Real} # checks @@ -567,10 +568,10 @@ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where end - #remove zeros weights and sort - wsum = sum(w.value) - nz = .!iszero.(w.values) - vw = sort!(collect(zip(view(v, nz), view(wvalues, nz)))) + # remove zeros weights and sort + wsum = sum(w) + nz = .!iszero.(w) + vw = sort!(collect(zip(view(v, nz), view(w, nz)))) N = length(vw) # prepare percentiles @@ -584,7 +585,7 @@ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where # start looping on quantiles Sk, Skold = zero(W), zero(W) - vk, vkold= zero(V), zero(V) + vk, vkold = zero(V), zero(V) k = 0 for i in 1:length(p) diff --git a/test/weights.jl b/test/weights.jl index 5ed11b3d0..166d4477d 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -286,25 +286,25 @@ end [-10, 1, 1, -10, -10], ) wt = ( - Int[3, 1, 1, 1, 3], - Int[1, 1, 1, 1, 1], - Int[3, 1, 1, 1, 3, 3], - Int[1, 1, 1, 3, 3, 3], - Int[30, 191, 9, 0], - Int[10, 1, 1, 1, 9], - Int[10, 1, 1, 1, 900], - Int[1, 3, 5, 4, 2], - Int[2, 2, 5, 0, 2, 2, 1, 6], - Int[1, 1, 8], - Int[5, 5, 4, 1], - Int[30, 56, 144, 24, 55, 43, 67], - Int[1, 2, 3, 4, 5, 6], - Int[12], - Int[7, 1, 1, 1, 6], - Int[1, 0, 0, 0, 2], - Int[1, 2, 3, 4, 5], - Int[1, 2, 3, 2, 1], - Int[0, 1, 1, 1, 1], + [3, 1, 1, 1, 3], + [1, 1, 1, 1, 1], + [3, 1, 1, 1, 3, 3], + [1, 1, 1, 3, 3, 3], + [30, 191, 9, 0], + [10, 1, 1, 1, 9], + [10, 1, 1, 1, 900], + [1, 3, 5, 4, 2], + [2, 2, 5, 0, 2, 2, 1, 6], + [1, 1, 8], + [5, 5, 4, 1], + [30, 56, 144, 24, 55, 43, 67], + [1, 2, 3, 4, 5, 6], + [12], + [7, 1, 1, 1, 6], + [1, 0, 0, 0, 2], + [1, 2, 3, 4, 5], + [1, 2, 3, 2, 1], + [0, 1, 1, 1, 1], ) p = [0.0, 0.25, 0.5, 0.75, 1.0] function _rep(x::AbstractVector, lengths::AbstractVector{Int}) @@ -380,61 +380,61 @@ end quantile_answers = ( [1.0, 4.0, 6.0, 8.0, 10.0], [1.0, 2.0, 4.0, 7.0, 10.0], - [1.0, 4.75, 7.5, 10.4167, 15.0], - [1.0, 4.75, 7.5, 10.4167, 15.0], - [0.0, 2.6178, 5.2356, 7.8534, 20.0], - [1.0, 4.0, 4.33333, 4.66667, 5.0], - [1.0, 4.2475, 4.49833, 4.74917, 5.0], + [1.0, 4.75, 7.5, 10.4166667, 15.0], + [1.0, 4.75, 7.5, 10.4166667, 15.0], + [0.0, 2.6178010, 5.2356021, 7.8534031, 20.0], + [1.0, 4.0, 4.3333333, 4.6666667, 5.0], + [1.0, 4.2475, 4.4983333, 4.7491667, 5.0], [30.0, 37.5, 44.0, 51.25, 60.0], [0.3, 0.7, 1.3, 1.7, 2.0], [1.0, 2.0, 2.0, 2.0, 2.0], [2.8, 3.15, 3.4, 3.56, 3.7], - [45.0, 62.1493, 102.875, 117.41, 125.0], + [45.0, 62.149253, 102.875, 117.4097222, 125.0], [2.0, 2.0, 2.0, 2.0, 2.0], [2.3, 2.3, 2.3, 2.3, 2.3], - [-10.0, -2.78571, -2.42857, -2.07143, 2.0], + [-10.0, -2.7857143, -2.4285714, -2.0714286, 2.0], [1.0, 2.0, 3.0, 4.0, 5.0], - [1.0, 1.625, 2.33333, 3.25, 5.0], - [-2.0, -1.33333, 0.5, 2.5, 6.0], + [1.0, 1.625, 2.3333333, 3.25, 5.0], + [-2.0, -1.3333333, 0.5, 2.5, 6.0], [-10.0, -10.0, -10.0, 1.0, 1.0] ) p = [0.0, 0.25, 0.5, 0.75, 1.0] srand(10) for i = 1:length(data) - @test quantile(data[i], aweights(wt[i]), p) ≈ quantile_answers[i] atol = 1e-3 + @test quantile(data[i], aweights(wt[i]), p) ≈ quantile_answers[i] atol = 1e-5 for j = 1:10 # order of p does not matter reorder = sortperm(rand(length(p))) - @test quantile(data[i], aweights(wt[i]), p[reorder]) ≈ quantile_answers[i][reorder] atol = 1e-3 + @test quantile(data[i], aweights(wt[i]), p[reorder]) ≈ quantile_answers[i][reorder] atol = 1e-5 end for j = 1:10 # order of w does not matter reorder = sortperm(rand(length(data[i]))) - @test quantile(data[i][reorder], aweights(wt[i][reorder]), p) ≈ quantile_answers[i] atol = 1e-3 + @test quantile(data[i][reorder], aweights(wt[i][reorder]), p) ≈ quantile_answers[i] atol = 1e-5 end end # w = 1 corresponds to base quantile for i = 1:length(data) - @test quantile(data[i], aweights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) atol = 1e-3 + @test quantile(data[i], aweights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) atol = 1e-5 for j = 1:10 prandom = rand(4) - @test quantile(data[i], aweights(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) atol = 1e-3 + @test quantile(data[i], aweights(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) atol = 1e-5 end end # test zeros are removed for i = 1:length(data) - @test quantile(vcat(1.0, data[i]), aweights(vcat(0.0, wt[i])), p) ≈ quantile_answers[i] atol = 1e-3 + @test quantile(vcat(1.0, data[i]), aweights(vcat(0.0, wt[i])), p) ≈ quantile_answers[i] atol = 1e-5 end # Syntax v = [7, 1, 2, 4, 10] w = [1, 1/3, 1/3, 1/3, 1] answer = 6.0 - @test quantile(data[1], aweights(w), 0.5) ≈ answer atol = 1e-3 - @test wquantile(data[1], aweights(w), [0.5]) ≈ [answer] atol = 1e-3 - @test wquantile(data[1], aweights(w), 0.5) ≈ answer atol = 1e-3 - @test wquantile(data[1], w, [0.5]) ≈ [answer] atol = 1e-3 - @test wquantile(data[1], w, 0.5) ≈ answer atol = 1e-3 + @test quantile(data[1], aweights(w), 0.5) ≈ answer atol = 1e-5 + @test wquantile(data[1], aweights(w), [0.5]) ≈ [answer] atol = 1e-5 + @test wquantile(data[1], aweights(w), 0.5) ≈ answer atol = 1e-5 + @test wquantile(data[1], w, [0.5]) ≈ [answer] atol = 1e-5 + @test wquantile(data[1], w, 0.5) ≈ answer atol = 1e-5 end end # @testset StatsBase.Weights \ No newline at end of file From 899f39561365644e0550a19906ff40cbba5b70a9 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Wed, 24 Jan 2018 14:20:06 -0500 Subject: [PATCH 09/12] conflict --- src/weights.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/weights.jl b/src/weights.jl index 9a9260ef6..de1b5d821 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -580,7 +580,7 @@ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where p = bound_quantiles(p) # prepare out vector - out = Vector{typeof(zero(V)/1)}(length(p)) + out = Vector{typeof(zero(V)/1)}(uninitialized, length(p)) fill!(out, vw[end][1]) # start looping on quantiles From 0736c02c0805b529aa0713d2b5f8a5c863c4c402 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Thu, 25 Jan 2018 11:28:02 +0100 Subject: [PATCH 10/12] Minor fixes --- src/weights.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/weights.jl b/src/weights.jl index de1b5d821..98669b0c8 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -547,12 +547,12 @@ Weights must not be negative. The weights and data vectors must have the same le With [`FrequencyWeights`](@ref), the function returns the same result as `quantile` for a vector with repeated values. -With non FrequencyWeights, denote ``N`` the length of the vector, ``w`` the vector of weights, +With non `FrequencyWeights`, denote ``N`` the length of the vector, ``w`` the vector of weights, ``h = p (\\sum_{i<= N}w_i - w_1) + w_1`` the cumulative weight corresponding to the probability ``p`` and ``S_k = \\sum_{i<=k}w_i`` the cumulative weight for each -observation, define ``v_{k+1}`` the smallest element of ``v`` such that ``S_{k+1}`` +observation, define ``v_{k+1}`` the smallest element of `v` such that ``S_{k+1}`` is strictly superior to ``h``. The weighted ``p`` quantile is given by ``v_k + \\gamma (v_{k+1} -v_k)`` -with ``\\gamma = (h - S_k)/(S_{k+1}-S_k)``. In particular, when ``w`` is a vector +with ``\\gamma = (h - S_k)/(S_{k+1}-S_k)``. In particular, when `w` is a vector of ones, the function returns the same result as `quantile`. """ function quantile(v::RealVector{V}, w::AbstractWeights{W}, p::RealVector) where {V,W<:Real} From 1e12c7764af54993eca7348dde8eae1edc13676f Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Thu, 25 Jan 2018 11:32:05 +0100 Subject: [PATCH 11/12] Test pweights and weights --- test/weights.jl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/test/weights.jl b/test/weights.jl index 166d4477d..b129129b5 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -334,7 +334,7 @@ end @test quantile([1, 2], fweights([2, 2]), 0.25) ≈ 1.0 end -@testset "Quantile aweights" begin +@testset "Quantile aweights, pweights and weights" for f in (aweights, pweights, weights) data = ( [7, 1, 2, 4, 10], [7, 1, 2, 4, 10], @@ -402,39 +402,39 @@ end srand(10) for i = 1:length(data) - @test quantile(data[i], aweights(wt[i]), p) ≈ quantile_answers[i] atol = 1e-5 + @test quantile(data[i], f(wt[i]), p) ≈ quantile_answers[i] atol = 1e-5 for j = 1:10 # order of p does not matter reorder = sortperm(rand(length(p))) - @test quantile(data[i], aweights(wt[i]), p[reorder]) ≈ quantile_answers[i][reorder] atol = 1e-5 + @test quantile(data[i], f(wt[i]), p[reorder]) ≈ quantile_answers[i][reorder] atol = 1e-5 end for j = 1:10 # order of w does not matter reorder = sortperm(rand(length(data[i]))) - @test quantile(data[i][reorder], aweights(wt[i][reorder]), p) ≈ quantile_answers[i] atol = 1e-5 + @test quantile(data[i][reorder], f(wt[i][reorder]), p) ≈ quantile_answers[i] atol = 1e-5 end end # w = 1 corresponds to base quantile for i = 1:length(data) - @test quantile(data[i], aweights(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) atol = 1e-5 + @test quantile(data[i], f(ones(Int64, length(data[i]))), p) ≈ quantile(data[i], p) atol = 1e-5 for j = 1:10 prandom = rand(4) - @test quantile(data[i], aweights(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) atol = 1e-5 + @test quantile(data[i], f(ones(Int64, length(data[i]))), prandom) ≈ quantile(data[i], prandom) atol = 1e-5 end end # test zeros are removed for i = 1:length(data) - @test quantile(vcat(1.0, data[i]), aweights(vcat(0.0, wt[i])), p) ≈ quantile_answers[i] atol = 1e-5 + @test quantile(vcat(1.0, data[i]), f(vcat(0.0, wt[i])), p) ≈ quantile_answers[i] atol = 1e-5 end # Syntax v = [7, 1, 2, 4, 10] w = [1, 1/3, 1/3, 1/3, 1] answer = 6.0 - @test quantile(data[1], aweights(w), 0.5) ≈ answer atol = 1e-5 - @test wquantile(data[1], aweights(w), [0.5]) ≈ [answer] atol = 1e-5 - @test wquantile(data[1], aweights(w), 0.5) ≈ answer atol = 1e-5 - @test wquantile(data[1], w, [0.5]) ≈ [answer] atol = 1e-5 - @test wquantile(data[1], w, 0.5) ≈ answer atol = 1e-5 + @test quantile(data[1], f(w), 0.5) ≈ answer atol = 1e-5 + @test wquantile(data[1], f(w), [0.5]) ≈ [answer] atol = 1e-5 + @test wquantile(data[1], f(w), 0.5) ≈ answer atol = 1e-5 + @test wquantile(data[1], w, [0.5]) ≈ [answer] atol = 1e-5 + @test wquantile(data[1], w, 0.5) ≈ answer atol = 1e-5 end -end # @testset StatsBase.Weights \ No newline at end of file +end # @testset StatsBase.Weights From 2e72f48c9c63767047ad393f2b2ed6670eb7b044 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Thu, 25 Jan 2018 11:55:22 +0100 Subject: [PATCH 12/12] Fix test --- test/weights.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/weights.jl b/test/weights.jl index df6779f5a..5cc9a52ea 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -358,7 +358,7 @@ end end # Issue #313 - @test quantile(x, fweights([0,1,2,1,0]), p) ≈ quantile([2, 3, 3, 4], p) + @test quantile([1, 2, 3, 4, 5], fweights([0,1,2,1,0]), p) ≈ quantile([2, 3, 3, 4], p) @test quantile([1, 2], fweights([1, 1]), 0.25) ≈ 1.25 @test quantile([1, 2], fweights([2, 2]), 0.25) ≈ 1.0 end