Skip to content

Commit

Permalink
Add --ignore-abundance to "search" and "categorize" (#543)
Browse files Browse the repository at this point in the history
* Add ignore_abundance to search_databases
* Add ignore_abundance to search and categorize
* Containment cannot ignore abundance
* Add note about abundance vs containment
* Add assertion to make sure --ignore-abundance flag produces different results
* Use signatures with abundance for categorize test
  • Loading branch information
olgabot authored and luizirber committed Sep 17, 2018
1 parent 3ffac2d commit c202750
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 4 deletions.
11 changes: 9 additions & 2 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,9 @@ def search(args):
help='number of results to report')
parser.add_argument('--containment', action='store_true',
help='evaluate containment rather than similarity')
parser.add_argument('--ignore-abundance', action='store_true',
help='do NOT use k-mer abundances if present. Note: '
'has no effect if --containment is specified')
parser.add_argument('--scaled', type=float, default=0,
help='downsample query to this scaled factor (yields greater speed)')
parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
Expand Down Expand Up @@ -786,7 +789,7 @@ def search(args):
# do the actual search
results = search_databases(query, databases,
args.threshold, args.containment,
args.best_only)
args.best_only, args.ignore_abundance)

n_matches = len(results)
if args.best_only:
Expand Down Expand Up @@ -839,6 +842,8 @@ def categorize(args):
parser.add_argument('--threshold', default=0.08, type=float,
help='minimum threshold for reporting matches (default=0.08)')
parser.add_argument('--traverse-directory', action="store_true")
parser.add_argument('--ignore-abundance', action='store_true',
help='do NOT use k-mer abundances if present')

sourmash_args.add_moltype_args(parser)

Expand Down Expand Up @@ -879,7 +884,9 @@ def categorize(args):

for leaf in tree.find(search_fn, query, args.threshold):
if leaf.data.md5sum() != query.md5sum(): # ignore self.
results.append((query.similarity(leaf.data), leaf.data))
similarity = query.similarity(
leaf.data, ignore_abundance=args.ignore_abundance)
results.append((similarity, leaf.data))

best_hit_sim = 0.0
best_hit_query_name = ""
Expand Down
6 changes: 4 additions & 2 deletions sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@ def format_bp(bp):
return '???'


def search_databases(query, databases, threshold, do_containment, best_only):
def search_databases(query, databases, threshold, do_containment, best_only,
ignore_abundance):
# set up the search & score function(s) - similarity vs containment
search_fn = search_minhashes
query_match = lambda x: query.similarity(x, downsample=True)
query_match = lambda x: query.similarity(
x, downsample=True, ignore_abundance=ignore_abundance)
if do_containment:
search_fn = search_minhashes_containment
query_match = lambda x: query.contained_by(x, downsample=True)
Expand Down
84 changes: 84 additions & 0 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -972,6 +972,43 @@ def test_search():
assert '93.0%' in out


def test_search_ignore_abundance():
    """Check that ``search --ignore-abundance`` changes the reported match.

    Signatures for two short FASTA files are computed with
    ``--track-abundance``; the same search is then run without and with
    ``--ignore-abundance`` and the two outputs must differ.
    """
    with utils.TempDirectory() as location:
        fasta_paths = [utils.get_test_data(name)
                       for name in ('short.fa', 'short2.fa')]
        compute_cmd = ['compute', '-k', '31', '--track-abundance']
        compute_cmd.extend(fasta_paths)
        status, out, err = utils.runscript('sourmash', compute_cmd,
                                           in_directory=location)

        signatures = ['short.fa.sig', 'short2.fa.sig']

        # Plain search: no --ignore-abundance flag is passed.
        status1, out1, err1 = utils.runscript('sourmash',
                                              ['search'] + signatures,
                                              in_directory=location)
        print(status1, out1, err1)
        assert '1 matches' in out1
        assert '81.5%' in out1

        # Same query and subject, this time with --ignore-abundance.
        flagged_cmd = ['search', '--ignore-abundance'] + signatures
        status2, out2, err2 = utils.runscript('sourmash', flagged_cmd,
                                              in_directory=location)
        print(status2, out2, err2)
        assert '1 matches' in out2
        assert '93.0%' in out2

        # The flag must actually change what is reported.
        assert out1 != out2


def test_search_csv():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
Expand Down Expand Up @@ -2998,6 +3035,53 @@ def test_sbt_categorize():
assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv


def test_sbt_categorize_ignore_abundance():
    """Check that ``categorize --ignore-abundance`` changes the similarity.

    A database named ``thebestdatabase`` is indexed from one signature in
    the ``gather-abund`` test data, and a second signature from the same
    directory is categorized against it twice: once with default settings
    and once with ``--ignore-abundance``.  The reported similarity is
    expected to move from about 0.44 to about 0.88.
    """
    with utils.TempDirectory() as location:
        query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
        against_list = [
            utils.get_test_data('gather-abund/reads-s10-s11.sig'),
        ]

        # Build the database that both categorize runs search against.
        args = ['index', '--dna', '-k', '21', 'thebestdatabase'] + against_list
        status2, out2, err2 = utils.runscript('sourmash', args,
                                              in_directory=location)

        # --- Categorize without ignoring abundance ---
        args = ['categorize', 'thebestdatabase',
                '--ksize', '21', '--dna', '--csv', 'out3.csv', query]
        status3, out3, err3 = utils.runscript('sourmash', args,
                                              in_directory=location)

        print(out3)
        print(err3)

        assert 'for 1-1, found: 0.44 1-1' in err3

        # Read through a context manager so the handle is closed promptly.
        with open(os.path.join(location, 'out3.csv')) as csv_fp:
            out_csv3 = csv_fp.read()
        assert 'reads-s10x10-s11.sig,1-1,1-1,0.4398' in out_csv3

        # --- Now categorize with ignored abundance ---
        args = ['categorize', '--ignore-abundance',
                '--ksize', '21', '--dna', '--csv', 'out4.csv',
                'thebestdatabase', query]
        status4, out4, err4 = utils.runscript('sourmash', args,
                                              in_directory=location)

        print(out4)
        print(err4)

        assert 'for 1-1, found: 0.88 1-1' in err4

        with open(os.path.join(location, 'out4.csv')) as csv_fp:
            out_csv4 = csv_fp.read()
        assert 'reads-s10x10-s11.sig,1-1,1-1,0.87699' in out_csv4

        # Make sure ignoring abundance produces a different output!
        assert err3 != err4


def test_sbt_categorize_already_done():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
Expand Down

0 comments on commit c202750

Please sign in to comment.