From f6702e29f8b301540627944bb4a8eead1d18fb5e Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Fri, 2 May 2014 22:25:43 -0400 Subject: [PATCH 1/2] faster sprand, new randsubseq (fixes #6714 and #6708) --- NEWS.md | 4 ++ base/abstractarray.jl | 41 ++++++++++++++++++++ base/exports.jl | 2 + base/sparse/sparsematrix.jl | 76 +++++++++++++++++-------------------- doc/stdlib/base.rst | 13 +++++++ doc/stdlib/sparse.rst | 12 +++--- 6 files changed, 100 insertions(+), 48 deletions(-) diff --git a/NEWS.md b/NEWS.md index 42f8a134d4538..b424517d39eed 100644 --- a/NEWS.md +++ b/NEWS.md @@ -175,6 +175,8 @@ Library improvements * `eigs(A, sigma)` now uses shift-and-invert for nonzero shifts `sigma` and inverse iteration for `which="SM"`. If `sigma==nothing` (the new default), computes ordinary (forward) iterations. ([#5776]) + * `sprand` is faster, and whether any entry is nonzero is now determined independently with the specified probability ([#6726]). + * Dense linear algebra for special matrix types * Interconversions between the special matrix types `Diagonal`, `Bidiagonal`, @@ -253,6 +255,8 @@ Library improvements * Extended API for ``cov`` and ``cor``, which accept keyword arguments ``vardim``, ``corrected``, and ``mean`` ([#6273]) + * New functions `randsubseq` and `randsubseq!` to create a random subsequence of an array ([#6726]) + Deprecated or removed --------------------- diff --git a/base/abstractarray.jl b/base/abstractarray.jl index b11e1f5d0a043..f4d811ec46981 100644 --- a/base/abstractarray.jl +++ b/base/abstractarray.jl @@ -1314,3 +1314,44 @@ push!(A, a, b, c...) = push!(push!(A, a, b), c...) unshift!(A) = A unshift!(A, a, b) = unshift!(unshift!(A, b), a) unshift!(A, a, b, c...) = unshift!(unshift!(A, c...), a, b) + +# Fill S (resized as needed) with a random subsequence of A, where +# each element of A is included in S with independent probability p. +# (Note that this is different from the problem of finding a random +# size-m subset of A where m is fixed!) +function randsubseq!(S::AbstractArray, A::AbstractArray, p::Real) + 0 <= p <= 1 || throw(ArgumentError("probability $p not in [0,1]")) + n = length(A) + p == 1 && return copy!(resize!(S, n), A) + empty!(S) + p == 0 && return S + nexpected = p * length(A) + sizehint(S, iround(nexpected + 5*sqrt(nexpected))) + if p > 0.15 # empirical threshold for trivial O(n) algorithm to be better + for i = 1:n + rand() <= p && push!(S, A[i]) + end + else + # Skip through A, in order, from each element i to the next element i+s + # included in S. The probability that the next included element is + # s==k (k > 0) is (1-p)^(k-1) * p, and hence the probability (CDF) that + # s is in {1,...,k} is 1-(1-p)^k = F(k). Thus, we can draw the skip s + # from this probability distribution via the discrete inverse-transform + # method: s = iceil(F^{-1}(u)) where u = rand(), which is simply + # s = iceil(log(rand()) / log1p(-p)). + L = 1 / log1p(-p) + i = 0 + while true + s = log(rand()) * L # note that rand() < 1, so s > 0 + s >= n - i && return S # compare before iceil to avoid overflow + push!(S, A[i += iceil(s)]) + end + # [This algorithm is similar in spirit to, but much simpler than, + # the one by Vitter for a related problem in "Faster methods for + # random sampling," Comm. ACM Magazine 7, 703-718 (1984).] + end + return S +end + +randsubseq{T}(A::AbstractArray{T}, p::Real) = randsubseq!(T[], A, p) + diff --git a/base/exports.jl b/base/exports.jl index 55ebf638c4b43..9d4e47d79ddf2 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -545,6 +545,8 @@ export promote_shape, randcycle, randperm, + randsubseq!, + randsubseq, range, reducedim, repmat, diff --git a/base/sparse/sparsematrix.jl b/base/sparse/sparsematrix.jl index 07c880fe5d8b4..2a08e6c0fb629 100644 --- a/base/sparse/sparsematrix.jl +++ b/base/sparse/sparsematrix.jl @@ -335,56 +335,48 @@ function findnz{Tv,Ti}(S::SparseMatrixCSC{Tv,Ti}) return (I, J, V) end -truebools(n::Integer) = ones(Bool, n) -function sprand{T}(m::Integer, n::Integer, density::FloatingPoint, rng::Function,::Type{T}=eltype(rng(1))) - 0 <= density <= 1 || throw(ArgumentError("density must be between 0 and 1")) +function sprand{T}(m::Integer, n::Integer, density::FloatingPoint, + rng::Function,::Type{T}=eltype(rng(1))) + 0 <= density <= 1 || throw(ArgumentError("$density not in [0,1]")) N = n*m N == 0 && return spzeros(T,m,n) N == 1 && return rand() <= density ? sparse(rng(1)) : spzeros(T,1,1) - # if density < 0.5, we'll randomly generate the indices to set - # otherwise, we'll randomly generate the indices to skip - K = (density > 0.5) ? N*(1-density) : N*density - # Use Newton's method to invert the birthday problem - l = log(1.0-1.0/N) - k = K - k = k + ((1-K/N)*exp(-k*l) - 1)/l - k = k + ((1-K/N)*exp(-k*l) - 1)/l # for K m ? m : int(k) # roundoff-error/underflow paranoia + randsubseq!(rows, 1:m-ik, density) + push!(rows, m-ik+1) + append!(I, rows) + nrows = length(rows) + Jlen = length(J) + resize!(J, Jlen+nrows) + for i = Jlen+1:length(J) + J[i] = j end end - I, J = ind2sub((m,n), uind) - return sparse_IJ_sorted!(I, J, rng(length(uind)), m, n, +) # it will never need to combine + return sparse_IJ_sorted!(I, J, rng(length(I)), m, n, +) # it will never need to combine end + sprand(m::Integer, n::Integer, density::FloatingPoint) = sprand(m,n,density,rand,Float64) sprandn(m::Integer, n::Integer, density::FloatingPoint) = sprand(m,n,density,randn,Float64) +truebools(n::Integer) = ones(Bool, n) sprandbool(m::Integer, n::Integer, density::FloatingPoint) = sprand(m,n,density,truebools,Bool) spones{T}(S::SparseMatrixCSC{T}) = diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst index 7a1aec99e3d81..e0e327fb896c8 100644 --- a/doc/stdlib/base.rst +++ b/doc/stdlib/base.rst @@ -3814,6 +3814,19 @@ Indexing, Assignment, and Concatenation Throw an error if the specified indexes are not in bounds for the given array. +.. function:: randsubseq(A, p) -> Vector + + Return a vector consisting of a random subsequence of the given array ``A``, + where each element of ``A`` is included (in order) with independent + probability ``p``. (Complexity is linear in ``p*length(A)``, so this + function is efficient even if ``p`` is small and ``A`` is large.) + +.. function:: randsubseq!(S, A, p) + + Like ``randsubseq``, but the results are stored in ``S`` (which is + resized as needed). + + Array functions ~~~~~~~~~~~~~~~ diff --git a/doc/stdlib/sparse.rst b/doc/stdlib/sparse.rst index 798d28655a0cb..cee6c6b8bc42d 100644 --- a/doc/stdlib/sparse.rst +++ b/doc/stdlib/sparse.rst @@ -55,17 +55,17 @@ Sparse matrices support much of the same set of operations as dense matrices. Th Construct a sparse diagonal matrix. ``B`` is a tuple of vectors containing the diagonals and ``d`` is a tuple containing the positions of the diagonals. In the case the input contains only one diagonaly, ``B`` can be a vector (instead of a tuple) and ``d`` can be the diagonal position (instead of a tuple), defaulting to 0 (diagonal). Optionally, ``m`` and ``n`` specify the size of the resulting sparse matrix. -.. function:: sprand(m,n,density[,rng]) +.. function:: sprand(m,n,p[,rng]) - Create a random sparse matrix with the specified density. Nonzeros are sampled from the distribution specified by ``rng``. The uniform distribution is used in case ``rng`` is not specified. + Create a random ``m`` by ``n`` sparse matrix, in which the probability of any element being nonzero is independently given by ``p`` (and hence the mean density of nonzeros is also exactly ``p``). Nonzero values are sampled from the distribution specified by ``rng``. The uniform distribution is used in case ``rng`` is not specified. -.. function:: sprandn(m,n,density) +.. function:: sprandn(m,n,p) - Create a random sparse matrix of specified density with nonzeros sampled from the normal distribution. + Create a random ``m`` by ``n`` sparse matrix with the specified (independent) probability ``p`` of any entry being nonzero, where nonzero values are sampled from the normal distribution. -.. function:: sprandbool(m,n,density) +.. function:: sprandbool(m,n,p) - Create a random sparse boolean matrix with the specified density. + Create a random ``m`` by ``n`` sparse boolean matrix with the specified (independent) probability ``p`` of any entry being ``true``. .. function:: etree(A[, post]) From ef94b35d88740c622dc28a75bbb4a6c46617824c Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Mon, 5 May 2014 16:43:53 -0400 Subject: [PATCH 2/2] mention Bernoulli sampling in randsubseq docs --- doc/stdlib/base.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst index e0e327fb896c8..df70638fb2a2b 100644 --- a/doc/stdlib/base.rst +++ b/doc/stdlib/base.rst @@ -3819,7 +3819,8 @@ Indexing, Assignment, and Concatenation Return a vector consisting of a random subsequence of the given array ``A``, where each element of ``A`` is included (in order) with independent probability ``p``. (Complexity is linear in ``p*length(A)``, so this - function is efficient even if ``p`` is small and ``A`` is large.) + function is efficient even if ``p`` is small and ``A`` is large.) Technically, + this process is known as "Bernoulli sampling" of ``A``. .. function:: randsubseq!(S, A, p)