Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make covariance and correlation work for iterators, skipmissing in particular. #34

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 73 additions & 7 deletions src/Statistics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,17 @@ end
_vmean(x::AbstractVector, vardim::Int) = mean(x)
_vmean(x::AbstractMatrix, vardim::Int) = mean(x, dims=vardim)

_collect_if_itr(x::Any) = collect(x)
_collect_if_itr(x::AbstractVector) = x

function _matrix_error(x, y)
if !(x isa AbstractVector || y isa AbstractVector) && (x isa AbstractArray || y isa AbstractArray)
s = "Covariance and correlation between a non-vector array and a non-vector iterator" *
"is currently disallowed. `collect` one of the arguments."
throw(ArgumentError(s))
end
end

# core functions

unscaled_covzm(x::AbstractVector{<:Number}) = sum(abs2, x)
Expand All @@ -495,6 +506,7 @@ unscaled_covzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int) =

# covzm (with centered data)

nalimilan marked this conversation as resolved.
Show resolved Hide resolved
covzm(itr::Any; corrected::Bool = true) = covzm(collect(itr); corrected = corrected)
covzm(x::AbstractVector; corrected::Bool=true) = unscaled_covzm(x) / (length(x) - Int(corrected))
function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true)
C = unscaled_covzm(x, vardim)
Expand All @@ -504,6 +516,10 @@ function covzm(x::AbstractMatrix, vardim::Int=1; corrected::Bool=true)
A .= A .* b
return A
end
function covzm(x::Any, y::Any; corrected::Bool = true)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In what case covzm can get Any? It is an internal method and I thought it can only get already processed data.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same question applies to covm below.

_matrix_error(x, y)
covzm(_collect_if_itr(x), _collect_if_itr(y); corrected = corrected)
end
covzm(x::AbstractVector, y::AbstractVector; corrected::Bool=true) =
unscaled_covzm(x, y) / (length(x) - Int(corrected))
function covzm(x::AbstractVecOrMat, y::AbstractVecOrMat, vardim::Int=1; corrected::Bool=true)
Expand All @@ -518,22 +534,34 @@ end
# covm (with provided mean)
## Use map(t -> t - xmean, x) instead of x .- xmean to allow for Vector{Vector}
## which can't be handled by broadcast
covm(itr::Any, itrmean; corrected::Bool=true) =
covm(collect(itr), itrmean; corrected=corrected)
covm(x::AbstractVector, xmean; corrected::Bool=true) =
covzm(map(t -> t - xmean, x); corrected=corrected)
covm(x::AbstractMatrix, xmean, vardim::Int=1; corrected::Bool=true) =
covzm(x .- xmean, vardim; corrected=corrected)
function covm(x::Any, xmean, y::Any, ymean; corrected::Bool=true)
_matrix_error(x, y)
covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected)
end
covm(x::AbstractVector, xmean, y::AbstractVector, ymean; corrected::Bool=true) =
covzm(map(t -> t - xmean, x), map(t -> t - ymean, y); corrected=corrected)
covm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1; corrected::Bool=true) =
covzm(x .- xmean, y .- ymean, vardim; corrected=corrected)

# cov (API)
"""
cov(x::AbstractVector; corrected::Bool=true)
cov(itr::Any; corrected::Bool=true)
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved

Compute the variance of the vector `x`. If `corrected` is `true` (the default) then the sum
is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where `n = length(x)`.
Compute the variance of the iterator `itr`. If `corrected` is `true` (the default) then the sum
is scaled with `n-1`, whereas the sum is scaled with `n` if `corrected` is `false` where
``n`` is the number of elements.
"""
function cov(itr::Any; corrected::Bool=true)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we want to allow 0 or more than 2 dimensional arrays here?

x = collect(itr)
meanx = mean(x)
covzm(map!(t -> t - meanx, x, x); corrected=corrected)
end
cov(x::AbstractVector; corrected::Bool=true) = covm(x, mean(x); corrected=corrected)

"""
Expand All @@ -547,13 +575,23 @@ cov(X::AbstractMatrix; dims::Int=1, corrected::Bool=true) =
covm(X, _vmean(X, dims), dims; corrected=corrected)

"""
cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true)
cov(x::Any, y::Any; corrected::Bool=true)

Compute the covariance between the vectors `x` and `y`. If `corrected` is `true` (the
Compute the covariance between the iterators `x` and `y`. If `corrected` is `true` (the
default), computes ``\\frac{1}{n-1}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*`` where
``*`` denotes the complex conjugate and `n = length(x) = length(y)`. If `corrected` is
``*`` denotes the complex conjugate and ``n`` the number of elements. If `corrected` is
`false`, computes ``\\frac{1}{n}\\sum_{i=1}^n (x_i-\\bar x) (y_i-\\bar y)^*``.
"""
function cov(x::Any, y::Any; corrected::Bool=true)
_matrix_error(x, y)
cx = collect(x)
cy = collect(y)
meanx = mean(cx)
meany = mean(cy)
dx = map!(t -> t - meanx, cx, cx)
dy = map!(t -> t - meany, cy, cy)
covzm(dx, dy; corrected=corrected)
end
cov(x::AbstractVector, y::AbstractVector; corrected::Bool=true) =
covm(x, mean(x), y, mean(y); corrected=corrected)

Expand Down Expand Up @@ -629,8 +667,17 @@ function cov2cor!(C::AbstractMatrix, xsd::AbstractArray, ysd::AbstractArray)
return C
end

function _return_one(itr)
if Base.IteratorEltype(itr) isa Base.HasEltype && isconcrete(eltype(itr))
return one(real(eltype(itr)))
else
return one(real(eltype(collect(itr))))
end
end

# corzm (non-exported, with centered data)

corzm(itr::Any) = _return_one(itr)
corzm(x::AbstractVector{T}) where {T} = one(real(T))
function corzm(x::AbstractMatrix, vardim::Int=1)
c = unscaled_covzm(x, vardim)
Expand All @@ -645,8 +692,13 @@ corzm(x::AbstractMatrix, y::AbstractMatrix, vardim::Int=1) =

# corm

corm(itr::Any, itrmean) = _return_one(itr)
corm(x::AbstractVector{T}, xmean) where {T} = one(real(T))
corm(x::AbstractMatrix, xmean, vardim::Int=1) = corzm(x .- xmean, vardim)
function corm(x::Any, mx, y::Any, my)
_matrix_error(x, y)
corm(_collect_if_itr(x), mx, _collect_if_itr(y), my)
end
function corm(x::AbstractVector, mx, y::AbstractVector, my)
require_one_based_indexing(x, y)
n = length(x)
Expand Down Expand Up @@ -675,10 +727,11 @@ corm(x::AbstractVecOrMat, xmean, y::AbstractVecOrMat, ymean, vardim::Int=1) =

# cor
"""
cor(x::AbstractVector)
cor(itr::Any)

Return the number one.
"""
cor(itr::Any) = _return_one(itr)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we touch this part of code then I do not understand the following (this is in general a separate PR, but we implement this method here, so I would like to clarify what is intended):

julia> x = rand(10, 2)
10×2 Array{Float64,2}:
 0.281236  0.0338547
 0.691944  0.830649
 0.627939  0.62187
 0.251539  0.162161
 0.649065  0.627302
 0.67754   0.227709
 0.904292  0.481443
 0.768511  0.439196
 0.56268   0.885131
 0.520348  0.0185026

julia> y = collect(eachrow(x))
10-element Array{SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true},1}:
 [0.28123641854321346, 0.03385469973171129]
 [0.6919437304763723, 0.8306494675149141]
 [0.6279387501997036, 0.6218700109315964]
 [0.2515386883173856, 0.16216070470557398]
 [0.6490648176650291, 0.6273015737594985]
 [0.6775400549397448, 0.22770867432380815]
 [0.9042917317032804, 0.4814426050177454]
 [0.7685107010600403, 0.4391959677108144]
 [0.562680390776801, 0.8851305746997942]
 [0.5203479821836228, 0.018502575073925165]

julia> cov(x)
2×2 Array{Float64,2}:
 0.0409994  0.0321085
 0.0321085  0.0983311

julia> cov(y)
2×2 Array{Float64,2}:
 0.0409994  0.0321085
 0.0321085  0.0983311

julia> cor(x)
2×2 Array{Float64,2}:
 1.0       0.505691
 0.505691  1.0

julia> cor(y)
ERROR: MethodError: no method matching zero(::Type{SubArray{Float64,1,Array{Float64,2},Tuple{Int64,Base.Slice{Base.OneTo{Int64}}},true}})

and I do not understand why cov and cor behave differently in how they handle x and y.

cor(x::AbstractVector) = one(real(eltype(x)))

"""
Expand All @@ -688,6 +741,19 @@ Compute the Pearson correlation matrix of the matrix `X` along the dimension `di
"""
cor(X::AbstractMatrix; dims::Int=1) = corm(X, _vmean(X, dims), dims)

"""
cor(x::Any, y::Any)

Compute the Pearson correlation between iterators `x` and `y`.
"""
function cor(x::Any, y::Any)
_matrix_error(x, y)
cx = _collect_if_itr(x)
cy = _collect_if_itr(y)

corm(cx, mean(cx), cy, mean(cy))
end

"""
cor(x::AbstractVector, y::AbstractVector)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this docstring which is a special case of the previous one.


Expand Down
49 changes: 45 additions & 4 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -339,12 +339,17 @@ Y = [6.0 2.0;
x1 = vec(X[1,:])
y1 = vec(Y[1,:])
end
x1_itr = (x1i for x1i in x1)
y1_itr = skipmissing(y1)

c = zm ? Statistics.covm(x1, 0, corrected=cr) :
cov(x1, corrected=cr)
c_itr = zm ? Statistics.covm(x1_itr, 0, corrected=cr) :
cov(x1_itr, corrected=cr)
@test isa(c, Float64)
@test c Cxx[1,1]
@test c == c_itr == Cxx[1,1]
@inferred cov(x1, corrected=cr)
@inferred cov(x1_itr, corrected=cr)

@test cov(X) == Statistics.covm(X, mean(X, dims=1))
C = zm ? Statistics.covm(X, 0, vd, corrected=cr) :
Expand All @@ -356,9 +361,16 @@ Y = [6.0 2.0;
@test cov(x1, y1) == Statistics.covm(x1, mean(x1), y1, mean(y1))
c = zm ? Statistics.covm(x1, 0, y1, 0, corrected=cr) :
cov(x1, y1, corrected=cr)
c_itr = zm ? Statistics.covm(x1_itr, 0, y1_itr, 0, corrected=cr) :
cov(x1_itr, y1_itr, corrected=cr)
c_itrx = zm ? Statistics.covm(x1_itr, 0, y1, 0, corrected=cr) :
cov(x1_itr, y1, corrected=cr)
c_itry = zm ? Statistics.covm(x1, 0, y1_itr, 0, corrected=cr) :
cov(x1, y1_itr, corrected=cr)
@test isa(c, Float64)
@test c Cxy[1,1]
@test c == c_itr == c_itrx == c_itry == Cxy[1,1]
@inferred cov(x1, y1, corrected=cr)
@inferred cov(x1_itr, y1_itr, corrected=cr)

if vd == 1
@test cov(x1, Y) == Statistics.covm(x1, mean(x1), Y, mean(Y, dims=1))
Expand Down Expand Up @@ -386,6 +398,17 @@ Y = [6.0 2.0;
@inferred cov(X, Y, dims=vd, corrected=cr)
end

@testset "errors for `cov` with non-array iterators and matrices" begin
x1_itr = (xi for xi in X[:, 1])
y1_itr = skipmissing(Y[:, 1])
@test_throws ArgumentError Statistics.covzm(X, y1_itr)
@test_throws ArgumentError Statistics.covzm(x1_itr, Y)
@test_throws ArgumentError Statistics.covm(X, mean(X, dims = 1), y1_itr, mean(y1_itr))
@test_throws ArgumentError Statistics.covm(x1_itr, mean(x1_itr), Y, mean(Y, dims = 1))
@test_throws ArgumentError cov(X, y1_itr)
@test_throws ArgumentError cov(x1_itr, Y)
end

@testset "floating point accuracy for `cov` of large numbers" begin
A = [4.0, 7.0, 13.0, 16.0]
C = A .+ 1.0e10
Expand Down Expand Up @@ -426,11 +449,15 @@ end
x1 = vec(X[1,:])
y1 = vec(Y[1,:])
end
x1_itr = (x1i for x1i in x1)
y1_itr = skipmissing(y1)

c = zm ? Statistics.corm(x1, 0) : cor(x1)
c_itr = zm ? Statistics.corm(x1_itr, 0) : cor(x1_itr)
@test isa(c, Float64)
@test c ≈ Cxx[1,1]
@test c ≈ c_itr ≈ Cxx[1,1]
@inferred cor(x1)
@inferred cor(x1_itr)

@test cor(X) == Statistics.corm(X, mean(X, dims=1))
C = zm ? Statistics.corm(X, 0, vd) : cor(X, dims=vd)
Expand All @@ -440,9 +467,14 @@ end

@test cor(x1, y1) == Statistics.corm(x1, mean(x1), y1, mean(y1))
c = zm ? Statistics.corm(x1, 0, y1, 0) : cor(x1, y1)
c_itr = zm ? Statistics.corm(x1_itr, 0, y1_itr, 0) : cor(x1_itr, y1_itr)
c_itrx = zm ? Statistics.corm(x1_itr, 0, y1, 0) : cor(x1_itr, y1)
c_itry = zm ? Statistics.corm(x1, 0, y1_itr, 0) : cor(x1, y1_itr)

@test isa(c, Float64)
@test c ≈ Cxy[1,1]
@test c == c_itr == c_itrx == c_itry ≈ Cxy[1,1]
@inferred cor(x1, y1)
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
@inferred cor(x1_itr, y1_itr)

if vd == 1
@test cor(x1, Y) == Statistics.corm(x1, mean(x1), Y, mean(Y, dims=1))
Expand Down Expand Up @@ -477,6 +509,15 @@ end
@test cor(tmp, tmp) <= 1.0
@test cor(tmp, tmp2) <= 1.0
end

@testset "errors for `cor` with non-array iterators and matrices" begin
x1_itr = (xi for xi in X[:, 1])
y1_itr = skipmissing(Y[:, 1])
@test_throws ArgumentError Statistics.corm(X, mean(X, dims = 1), y1_itr, mean(y1_itr))
@test_throws ArgumentError Statistics.corm(x1_itr, mean(x1_itr), Y, mean(Y, dims = 1))
@test_throws ArgumentError cor(X, y1_itr)
@test_throws ArgumentError cor(x1_itr, Y)
end
end

@testset "quantile" begin
Expand Down