Skip to content

Commit

Permalink
[MRG] report both weighted and unweighted % recovered in gather (#2301)
Browse files Browse the repository at this point in the history
* report both weighted and unweighted % recovered in gather

* address no matches

* add gather and multigather abund nomatch tests

* add tests for new output

* update docs
  • Loading branch information
ctb authored Sep 28, 2022
1 parent 719e7d5 commit 2246e71
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 16 deletions.
8 changes: 7 additions & 1 deletion doc/classifying-signatures.md
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,10 @@ The output below is the CSV output for a fictional metagenome.

The first column, `f_unique_to_query`, is the fraction of the database
match that is _unique_ with respect to the original query. It will
always decrease as you get more matches.
always decrease as you get more matches. The sum of
`f_unique_to_query` across all rows is what is reported in by gather
as the fraction of query k-mers hit by the recovered matches
(unweighted) and should never be greater than 1!

The second column, `f_match_orig`, is how much of the match is in the
_original_ query. For this fictional metagenome, each match is
Expand All @@ -236,6 +239,9 @@ f_unique_to_query f_match_orig f_match f_orig_query
0.10709413369713507 1.0 1.0 0.10709413369713507
0.10368349249658936 1.0 0.3134020618556701 0.33083219645293316
```
Where there are overlapping matches (e.g. to multiple
*E. coli* species in a gut metagenome) the overlaps will be represented
multiple times in this column.

A few quick notes for the algorithmic folk out there --

Expand Down
33 changes: 24 additions & 9 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -812,7 +812,11 @@ def gather(args):
estimate_ani_ci=args.estimate_ani_ci)

screen_width = _get_screen_width()
for result, weighted_missed in gather_iter:
sum_f_uniq_found = 0.
result = None
for result in gather_iter:
sum_f_uniq_found += result.f_unique_to_query

if not len(found): # first result? print header.
if is_abundance:
print_results("")
Expand Down Expand Up @@ -854,11 +858,13 @@ def gather(args):
if args.num_results and len(found) == args.num_results:
print_results(f'(truncated gather because --num-results={args.num_results})')

p_covered = (1 - weighted_missed) * 100
if is_abundance:
print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query')
else:
print_results(f'the recovered matches hit {p_covered:.1f}% of the query (unweighted)')
if is_abundance and result:
p_covered = result.sum_weighted_found / result.total_weighted_hashes
p_covered *= 100
print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.')

print_results(f'the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted).')

print_results('')
if gather_iter.scaled != query.minhash.scaled:
print_results(f'WARNING: final scaled was {gather_iter.scaled}, vs query scaled of {query.minhash.scaled}')
Expand Down Expand Up @@ -932,6 +938,8 @@ def multigather(args):
# need a query to get ksize, moltype for db loading
query = next(iter(sourmash_args.load_file_as_signatures(inp_files[0], ksize=args.ksize, select_moltype=moltype)))

notify(f'loaded first query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})')

databases = sourmash_args.load_dbs_and_sigs(args.db, query, False,
fail_on_empty_database=args.fail_on_empty_database)

Expand Down Expand Up @@ -994,7 +1002,10 @@ def multigather(args):
ident_mh=ident_mh)

screen_width = _get_screen_width()
for result, weighted_missed in gather_iter:
sum_f_uniq_found = 0.
result = None
for result in gather_iter:
sum_f_uniq_found += result.f_unique_to_query
if not len(found): # first result? print header.
if is_abundance:
print_results("")
Expand Down Expand Up @@ -1035,8 +1046,12 @@ def multigather(args):
# basic reporting
print_results('\nfound {} matches total;', len(found))

print_results('the recovered matches hit {:.1f}% of the query',
(1 - weighted_missed) * 100)
if is_abundance and result:
p_covered = result.sum_weighted_found / result.total_weighted_hashes
p_covered *= 100
print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.')

print_results(f'the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted).')
print_results('')

if not found:
Expand Down
5 changes: 2 additions & 3 deletions src/sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,12 +783,11 @@ def __next__(self):
new_query_mh.remove_many(found_mh)
new_query = SourmashSignature(new_query_mh)

# compute weighted_missed for remaining query hashes
# compute weighted information for remaining query hashes
query_hashes = set(query_mh.hashes) - set(found_mh.hashes)
n_weighted_missed = sum((orig_query_abunds[k] for k in query_hashes))
n_weighted_missed += self.noident_query_sum_abunds
sum_weighted_found = sum_abunds - n_weighted_missed
weighted_missed = n_weighted_missed / sum_abunds

# build a GatherResult
result = GatherResult(self.orig_query, best_match,
Expand All @@ -809,7 +808,7 @@ def __next__(self):
self.query = new_query
self.orig_query_mh = orig_query_mh

return result, weighted_missed
return result


###
Expand Down
57 changes: 54 additions & 3 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3451,12 +3451,27 @@ def approx_equal(a, b, n=5):
remaining_mh.remove_many(match.minhash.hashes.keys())


def test_gather_nomatch(runtmp):
def test_gather_nomatch(runtmp, linear_gather, prefetch_gather):
testdata_query = utils.get_test_data(
'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')
testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')

runtmp.sourmash('gather', testdata_query, testdata_match)
runtmp.sourmash('gather', testdata_query, testdata_match,
linear_gather, prefetch_gather)

print(runtmp.last_result.out)
print(runtmp.last_result.err)

assert 'found 0 matches total' in runtmp.last_result.out
assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out


def test_gather_abund_nomatch(runtmp, linear_gather, prefetch_gather):
testdata_query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
testdata_match = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')

runtmp.sourmash('gather', testdata_query, testdata_match,
linear_gather, prefetch_gather)

print(runtmp.last_result.out)
print(runtmp.last_result.err)
Expand Down Expand Up @@ -4562,6 +4577,7 @@ def test_gather_abund_1_1(runtmp, linear_gather, prefetch_gather):
assert 'genome-s12.fa.gz' not in out

assert "the recovered matches hit 100.0% of the abundance-weighted query" in out
assert "the recovered matches hit 100.0% of the query k-mers (unweighted)" in out


def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather):
Expand Down Expand Up @@ -4701,7 +4717,8 @@ def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gath

print(out)
print(err)
assert "the recovered matches hit 100.0% of the query (unweighted)" in out
assert "the recovered matches hit 100.0% of the abundance-weighted query" not in out
assert "the recovered matches hit 100.0% of the query k-mers (unweighted)" in out

# when we project s10x10-s11 (r2+r3), 10:1 abundance,
# onto s10 and s11 genomes with gather --ignore-abundance, we get:
Expand Down Expand Up @@ -4803,6 +4820,10 @@ def test_multigather_output_unassigned_with_abundance(runtmp):
print(c.last_result.out)
print(c.last_result.err)

out = c.last_result.out
assert "the recovered matches hit 91.0% of the abundance-weighted query." in out
assert "the recovered matches hit 57.2% of the query k-mers (unweighted)." in out

assert os.path.exists(c.output('r3.fa.unassigned.sig'))

nomatch = sourmash.load_one_signature(c.output('r3.fa.unassigned.sig'))
Expand Down Expand Up @@ -4857,6 +4878,36 @@ def test_multigather_empty_db_nofail(runtmp):
assert "loaded 50 total signatures from 2 locations" in err
assert "after selecting signatures compatible with search, 0 remain." in err


def test_multigather_nomatch(runtmp):
testdata_query = utils.get_test_data(
'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')
testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')

runtmp.sourmash('multigather', '--query', testdata_query,
'--db', testdata_match, '-k', '31')

print(runtmp.last_result.out)
print(runtmp.last_result.err)

assert 'found 0 matches total' in runtmp.last_result.out
assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out


def test_multigather_abund_nomatch(runtmp):
testdata_query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
testdata_match = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')

runtmp.sourmash('multigather', '--query', testdata_query,
'--db', testdata_match)

print(runtmp.last_result.out)
print(runtmp.last_result.err)

assert 'found 0 matches total' in runtmp.last_result.out
assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out


def test_sbt_categorize(runtmp):
testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
Expand Down

0 comments on commit 2246e71

Please sign in to comment.