Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add --ignore-abundance to "search" and "categorize" #543

Merged
merged 8 commits into from
Sep 17, 2018
11 changes: 9 additions & 2 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,9 @@ def search(args):
help='number of results to report')
parser.add_argument('--containment', action='store_true',
help='evaluate containment rather than similarity')
parser.add_argument('--ignore-abundance', action='store_true',
help='do NOT use k-mer abundances if present. Note: '
'has no effect if --containment is specified')
parser.add_argument('--scaled', type=float, default=0,
help='downsample query to this scaled factor (yields greater speed)')
parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
Expand Down Expand Up @@ -786,7 +789,7 @@ def search(args):
# do the actual search
results = search_databases(query, databases,
args.threshold, args.containment,
args.best_only)
args.best_only, args.ignore_abundance)

n_matches = len(results)
if args.best_only:
Expand Down Expand Up @@ -839,6 +842,8 @@ def categorize(args):
parser.add_argument('--threshold', default=0.08, type=float,
help='minimum threshold for reporting matches (default=0.08)')
parser.add_argument('--traverse-directory', action="store_true")
parser.add_argument('--ignore-abundance', action='store_true',
help='do NOT use k-mer abundances if present')

sourmash_args.add_moltype_args(parser)

Expand Down Expand Up @@ -879,7 +884,9 @@ def categorize(args):

for leaf in tree.find(search_fn, query, args.threshold):
if leaf.data.md5sum() != query.md5sum(): # ignore self.
results.append((query.similarity(leaf.data), leaf.data))
similarity = query.similarity(
leaf.data, ignore_abundance=args.ignore_abundance)
results.append((similarity, leaf.data))

best_hit_sim = 0.0
best_hit_query_name = ""
Expand Down
6 changes: 4 additions & 2 deletions sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@ def format_bp(bp):
return '???'


def search_databases(query, databases, threshold, do_containment, best_only):
def search_databases(query, databases, threshold, do_containment, best_only,
ignore_abundance):
# set up the search & score function(s) - similarity vs containment
search_fn = search_minhashes
query_match = lambda x: query.similarity(x, downsample=True)
query_match = lambda x: query.similarity(
x, downsample=True, ignore_abundance=ignore_abundance)
if do_containment:
search_fn = search_minhashes_containment
query_match = lambda x: query.contained_by(x, downsample=True)
Expand Down
84 changes: 84 additions & 0 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -972,6 +972,43 @@ def test_search():
assert '93.0%' in out


def test_search_ignore_abundance():
    """Check that ``search --ignore-abundance`` changes the reported match.

    Signatures for two short FASTA files are computed with
    ``--track-abundance``; the same search is then run with and without
    ``--ignore-abundance`` and the two reports are expected to differ.
    """
    with utils.TempDirectory() as location:
        fasta_paths = [utils.get_test_data(name)
                       for name in ('short.fa', 'short2.fa')]

        # Compute k=31 signatures that carry abundance information; the
        # searches below read short.fa.sig / short2.fa.sig from `location`.
        compute_args = ['compute', '-k', '31', '--track-abundance']
        compute_args.extend(fasta_paths)
        utils.runscript('sourmash', compute_args, in_directory=location)

        sig_args = ['short.fa.sig', 'short2.fa.sig']

        # Default search: abundances are taken into account.
        weighted = utils.runscript('sourmash',
                                   ['search'] + sig_args,
                                   in_directory=location)
        weighted_status, weighted_out, weighted_err = weighted
        print(weighted_status, weighted_out, weighted_err)
        assert '1 matches' in weighted_out
        assert '81.5%' in weighted_out

        # Same search, but with abundances explicitly ignored.
        flat = utils.runscript('sourmash',
                               ['search', '--ignore-abundance'] + sig_args,
                               in_directory=location)
        flat_status, flat_out, flat_err = flat
        print(flat_status, flat_out, flat_err)
        assert '1 matches' in flat_out
        assert '93.0%' in flat_out

        # The two modes must not produce identical reports.
        assert weighted_out != flat_out


def test_search_csv():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
Expand Down Expand Up @@ -2998,6 +3035,53 @@ def test_sbt_categorize():
assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv


def test_sbt_categorize_ignore_abundance():
    """Check that ``categorize --ignore-abundance`` changes the similarity.

    An SBT index is built from a single signature, then the same query is
    categorized twice: once with default settings and once with
    ``--ignore-abundance``. The reported similarity (on stderr and in the
    CSV output) is expected to differ between the two runs.
    """
    with utils.TempDirectory() as location:
        query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
        against_list = [
            utils.get_test_data('gather-abund/reads-s10-s11.sig'),
        ]

        # Build the database that both categorize runs search against.
        args = ['index', '--dna', '-k', '21', 'thebestdatabase'] + against_list
        utils.runscript('sourmash', args, in_directory=location)

        # --- Categorize without ignoring abundance ---
        args = ['categorize', 'thebestdatabase',
                '--ksize', '21', '--dna', '--csv', 'out3.csv', query]
        status3, out3, err3 = utils.runscript('sourmash', args,
                                              in_directory=location)

        print(out3)
        print(err3)

        assert 'for 1-1, found: 0.44 1-1' in err3

        # Use a context manager so the CSV file handle is closed promptly.
        with open(os.path.join(location, 'out3.csv')) as csv_fp:
            out_csv3 = csv_fp.read()
        assert 'reads-s10x10-s11.sig,1-1,1-1,0.4398' in out_csv3

        # --- Now categorize with ignored abundance ---
        args = ['categorize', '--ignore-abundance',
                '--ksize', '21', '--dna', '--csv', 'out4.csv',
                'thebestdatabase', query]
        status4, out4, err4 = utils.runscript('sourmash', args,
                                              in_directory=location)

        print(out4)
        print(err4)

        assert 'for 1-1, found: 0.88 1-1' in err4

        with open(os.path.join(location, 'out4.csv')) as csv_fp:
            out_csv4 = csv_fp.read()
        assert 'reads-s10x10-s11.sig,1-1,1-1,0.87699' in out_csv4

        # Make sure ignoring abundance produces a different output!
        assert err3 != err4


def test_sbt_categorize_already_done():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
Expand Down