Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] replace chernoff bounds with exact probabilities #2268

Merged
merged 3 commits into from
Sep 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion src/sourmash/distance_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from dataclasses import dataclass, field
from scipy.optimize import brentq
from scipy.stats import norm as scipy_norm
from scipy.stats import binom
import numpy as np
from math import log, exp

Expand Down Expand Up @@ -168,7 +169,8 @@ def set_size_chernoff(set_size, scaled, *, relative_error=0.05):
Computes the probability that the estimate: sketch_size * scaled deviates from the true
set_size by more than relative_error. This relies on the fact that the sketch_size
is binomially distributed with parameters sketch_size and 1/scale. The two-sided Chernoff
bounds are used.
bounds are used. This is depreciated in favor of set_size_exact_prob due to the later
being accurate even for very small set sizes
@param set_size: The number of distinct k-mers in the given set
@param relative_error: the desired relative error (defaults to 5%)
@return: float (the upper bound probability)
Expand All @@ -177,6 +179,28 @@ def set_size_chernoff(set_size, scaled, *, relative_error=0.05):
return upper_bound


def set_size_exact_prob(set_size, scaled, *, relative_error=0.05):
"""
Computes the exact probability that the estimate: sketch_size * scaled deviates from the true
set_size by more than relative_error. This relies on the fact that the sketch_size
is binomially distributed with parameters sketch_size and 1/scale. The CDF of the binomial distribution
is used.
@param set_size: The number of distinct k-mers in the given set
@param relative_error: the desired relative error (defaults to 5%)
@return: float (the upper bound probability)
"""
# Need to check if the edge case is an integer or not. If not, don't include it in the equation
pmf_arg = -set_size/scaled * (relative_error - 1)
if pmf_arg == int(pmf_arg):
prob = binom.cdf(set_size/scaled * (relative_error + 1), set_size, 1/scaled) - \
binom.cdf(-set_size/scaled * (relative_error - 1), set_size, 1/scaled) + \
binom.pmf(-set_size/scaled * (relative_error - 1), set_size, 1/scaled)
else:
prob = binom.cdf(set_size / scaled * (relative_error + 1), set_size, 1 / scaled) - \
binom.cdf(-set_size / scaled * (relative_error - 1), set_size, 1 / scaled)
return prob


def get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, scaled_fraction):
"""helper function
Note that scaled here needs to be between 0 and 1
Expand Down
4 changes: 2 additions & 2 deletions src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ class MinHash - core MinHash class.
class FrozenMinHash - read-only MinHash class.
"""
from __future__ import unicode_literals, division
from .distance_utils import jaccard_to_distance, containment_to_distance, set_size_chernoff
from .distance_utils import jaccard_to_distance, containment_to_distance, set_size_exact_prob
from .logging import notify

import numpy as np
Expand Down Expand Up @@ -1051,7 +1051,7 @@ def size_is_accurate(self, relative_error=0.20, confidence=0.95):
if any([not (0 <= relative_error <= 1), not (0 <= confidence <= 1)]):
raise ValueError("Error: relative error and confidence values must be between 0 and 1.")
# to do: replace unique_dataset_hashes with HLL estimation when it gets implemented
probability = set_size_chernoff(self.unique_dataset_hashes, self.scaled, relative_error=relative_error)
probability = set_size_exact_prob(self.unique_dataset_hashes, self.scaled, relative_error=relative_error)
return probability >= confidence


Expand Down
37 changes: 35 additions & 2 deletions tests/test_distance_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sourmash.distance_utils import (containment_to_distance, get_exp_probability_nothing_common,
handle_seqlen_nkmers, jaccard_to_distance,
ANIResult, ciANIResult, jaccardANIResult, var_n_mutated,
set_size_chernoff)
set_size_chernoff, set_size_exact_prob)

def test_aniresult():
res = ANIResult(0.4, 0.1)
Expand Down Expand Up @@ -416,4 +416,37 @@ def test_set_size_chernoff():
set_size = 10
s = 1/.01
value_from_mathematica = -1
assert np.abs(set_size_chernoff(set_size, s,relative_error=rel_error) - value_from_mathematica) < eps
assert np.abs(set_size_chernoff(set_size, s, relative_error=rel_error) - value_from_mathematica) < eps


def test_set_size_exact_prob():
# values obtained from Mathematica
# specifically: Probability[Abs[X*s - n]/n <= delta,
# X \[Distributed] BinomialDistribution[n, 1/s]] // N
set_size = 100
scaled = 2
relative_error = 0.05
prob = set_size_exact_prob(set_size, scaled, relative_error=relative_error)
true_prob = 0.382701
np.testing.assert_array_almost_equal(true_prob, prob, decimal=3)

set_size = 200
scaled = 5
relative_error = 0.15
prob = set_size_exact_prob(set_size, scaled, relative_error=relative_error)
true_prob = 0.749858
np.testing.assert_array_almost_equal(true_prob, prob, decimal=3)

set_size = 10
scaled = 10
relative_error = 0.10
prob = set_size_exact_prob(set_size, scaled, relative_error=relative_error)
true_prob = 0.38742
np.testing.assert_array_almost_equal(true_prob, prob, decimal=3)

set_size = 1000
scaled = 10
relative_error = 0.10
prob = set_size_exact_prob(set_size, scaled, relative_error=relative_error)
true_prob = 0.73182
np.testing.assert_array_almost_equal(true_prob, prob, decimal=3)
18 changes: 10 additions & 8 deletions tests/test_minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import itertools
import pickle
import math
import numpy as np

import pytest

Expand Down Expand Up @@ -3144,9 +3145,9 @@ def test_containment_ani_ci_tiny_testdata():

m2_cani_m1 = mh2.containment_ani(mh1, estimate_ci=True)
print(m2_cani_m1)
assert m2_cani_m1.ani == None
# from the formula ANI = c^(1/k) for c=3/4 and k=21
np.testing.assert_almost_equal(m2_cani_m1.ani, 0.986394259982259, decimal=3)
m2_cani_m1.size_is_inaccurate = False
assert m2_cani_m1.ani == 0.986394259982259
assert m2_cani_m1.ani_low == None
assert m2_cani_m1.ani_high == None

Expand Down Expand Up @@ -3207,7 +3208,7 @@ def test_minhash_set_size_estimate_is_accurate():
f2 = utils.get_test_data('2+63.fa.sig')
mh1 = sourmash.load_one_signature(f1, ksize=31).minhash
mh2 = sourmash.load_one_signature(f2).minhash
mh1_ds = mh1.downsample(scaled=10000)
mh1_ds = mh1.downsample(scaled=100000)
# check accuracy using default thresholds (rel_err= 0.2, confidence=0.95)
assert mh1.size_is_accurate() == True
assert mh1_ds.size_is_accurate() == False
Expand All @@ -3219,7 +3220,7 @@ def test_minhash_set_size_estimate_is_accurate():

# change prob
assert mh1.size_is_accurate(confidence=0.5) == True
assert mh1.size_is_accurate(confidence=1) == False
assert mh1.size_is_accurate(relative_error=0.001, confidence=1) == False

# check that relative error and confidence must be between 0 and 1
with pytest.raises(ValueError) as exc:
Expand All @@ -3236,15 +3237,16 @@ def test_minhash_set_size_estimate_is_accurate():


def test_minhash_ani_inaccurate_size_est():
# TODO: It's actually really tricky to get the set size to be inaccurate. Eg. For a scale factor of 10000,
# you would need
f1 = utils.get_test_data('2.fa.sig')
f2 = utils.get_test_data('2+63.fa.sig')
mh1 = sourmash.load_one_signature(f1, ksize=31).minhash
mh2 = sourmash.load_one_signature(f2).minhash
# downsample
mh1_ds = mh1.downsample(scaled=10000)
mh2_ds = mh2.downsample(scaled=10000)

assert mh1.size_is_accurate(relative_error=0.05, confidence=0.95) == False
mh1_ds = mh1.downsample(scaled=100000)
mh2_ds = mh2.downsample(scaled=100000)
assert mh1.size_is_accurate(relative_error=0.05, confidence=0.95) == True
assert mh1.size_is_accurate() == True
assert mh1_ds.size_is_accurate() == False
assert mh2.size_is_accurate() == True
Expand Down
22 changes: 11 additions & 11 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -5946,9 +5946,9 @@ def test_search_ani_containment_fail(runtmp):
print(row)
assert search_result_names == list(row.keys())
assert round(float(row['similarity']), 3) == 0.967
assert row['ani'] == ""

assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err
assert row['ani'] == "0.998906999319701"
# With PR #2268, this error message should not appear
#assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err


def test_search_ani_containment_estimate_ci(runtmp):
Expand Down Expand Up @@ -6185,14 +6185,14 @@ def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather):
assert row['query_name'] == 'tr1 4'
assert row['query_md5'] == 'c9d5a795'
assert row['query_bp'] == '910'
assert row['query_containment_ani']== ''
assert row['query_containment_ani_low']== ''
assert row['query_containment_ani_high']== ''
assert row['match_containment_ani'] == ''
assert row['match_containment_ani_low'] == ''
assert row['match_containment_ani_high'] == ''
assert row['average_containment_ani'] == ''
assert row['max_containment_ani'] ==''
assert row['query_containment_ani'] == '1.0'
assert row['query_containment_ani_low'] == '1.0'
assert row['query_containment_ani_high'] == '1.0'
assert row['match_containment_ani'] == '1.0'
assert row['match_containment_ani_low'] == '1.0'
assert row['match_containment_ani_high'] == '1.0'
assert row['average_containment_ani'] == '1.0'
assert row['max_containment_ani'] == '1.0'
assert row['potential_false_negative'] == 'False'


Expand Down