From 71abc6aa3dcd00090237c30aaa52830557c1023f Mon Sep 17 00:00:00 2001 From: olgabot Date: Fri, 14 Sep 2018 15:50:42 -0700 Subject: [PATCH 1/7] Add ignore_abundance to search_databases --- sourmash/search.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sourmash/search.py b/sourmash/search.py index 86e43fb69e..cee6360415 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -28,13 +28,16 @@ def format_bp(bp): return '???' -def search_databases(query, databases, threshold, do_containment, best_only): +def search_databases(query, databases, threshold, do_containment, best_only, + ignore_abundance): # set up the search & score function(s) - similarity vs containment search_fn = search_minhashes - query_match = lambda x: query.similarity(x, downsample=True) + query_match = lambda x: query.similarity( + x, downsample=True, ignore_abundance=ignore_abundance) if do_containment: search_fn = search_minhashes_containment - query_match = lambda x: query.contained_by(x, downsample=True) + query_match = lambda x: query.contained_by( + x, downsample=True, ignore_abundance=ignore_abundance) results = [] found_md5 = set() From 05ecfd5449b5e7cba99d04349c60822cfd62af9d Mon Sep 17 00:00:00 2001 From: olgabot Date: Fri, 14 Sep 2018 15:51:04 -0700 Subject: [PATCH 2/7] Add ignore_abundnance to search and categorize --- sourmash/commands.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index ddee5ac411..7a6ff2efa9 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -744,6 +744,8 @@ def search(args): help='number of results to report') parser.add_argument('--containment', action='store_true', help='evaluate containment rather than similarity') + parser.add_argument('--ignore-abundance', action='store_true', + help='do NOT use k-mer abundances if present') parser.add_argument('--scaled', type=float, default=0, help='downsample query to this scaled factor (yields greater speed)') parser.add_argument('-o', '--output', type=argparse.FileType('wt'), @@ -786,7 +788,7 @@ def search(args): # do the actual search results = search_databases(query, databases, args.threshold, args.containment, - args.best_only) + args.best_only, args.ignore_abundance) n_matches = len(results) if args.best_only: @@ -838,6 +840,8 @@ def categorize(args): parser.add_argument('-k', '--ksize', type=int, default=None) parser.add_argument('--threshold', default=0.08, type=float) parser.add_argument('--traverse-directory', action="store_true") + parser.add_argument('--ignore-abundance', action='store_true', + help='do NOT use k-mer abundances if present') sourmash_args.add_moltype_args(parser) @@ -878,7 +882,9 @@ def categorize(args): for leaf in tree.find(search_fn, query, args.threshold): if leaf.data.md5sum() != query.md5sum(): # ignore self. - results.append((query.similarity(leaf.data), leaf.data)) + similarity = query.similarity( + leaf.data, ignore_abundance=args.ignore_abundance) + results.append((similarity, leaf.data)) best_hit_sim = 0.0 best_hit_query_name = "" From d18c6a35395f358f90fe803a974d92370af627d9 Mon Sep 17 00:00:00 2001 From: olgabot Date: Fri, 14 Sep 2018 16:21:55 -0700 Subject: [PATCH 3/7] Start writing tests --- tests/test_sourmash.py | 94 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index de5da3cac4..d26e0e90f1 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -972,6 +972,40 @@ def test_search(): assert '93.0%' in out +def test_search_ignore_abundance(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '31', + '--track-abundance', + testdata1, testdata2], + in_directory=location) + + + + # Make sure there's different percent matches when using or + # not using abundance + status1, out1, err1 = utils.runscript('sourmash', + ['search', + 'short.fa.sig', + 'short2.fa.sig'], + in_directory=location) + print(status1, out1, err1) + assert '1 matches' in out1 + assert '81.5%' in out1 + + status2, out2, err2 = utils.runscript('sourmash', + ['search', + '--ignore-abundance', + 'short.fa.sig', + 'short2.fa.sig'], + in_directory=location) + print(status2, out2, err2) + assert '1 matches' in out2 + assert '93.0%' in out2 + + def test_search_csv(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa') @@ -2998,6 +3032,66 @@ def test_sbt_categorize(): assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv +def test_sbt_categorize_ignore_abundance(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('genome-s10.fa.gz') + testdata2 = utils.get_test_data('genome-s11.fa.gz') + testdata3 = utils.get_test_data('genome-s12.fa.gz') + testdata4 = utils.get_test_data('genome-s10+s11.fa.gz') + + shutil.copyfile(testdata1, os.path.join(location, 'genome-s10.fa.gz')) + shutil.copyfile(testdata2, os.path.join(location, 'genome-s11.fa.gz')) + shutil.copyfile(testdata3, os.path.join(location, 'genome-s12.fa.gz')) + shutil.copyfile(testdata4, os.path.join(location, 'genome-s10+s11.fa.gz')) + + status1, out1, err1 = utils.runscript('sourmash', + ['compute', + testdata1, testdata2, testdata3, + testdata4, + '--track-abundance'], + in_directory=location) + + # omit 3 + args = ['index', '--dna', '-k', '21', 'thebestdatabase', + 'genome-s10.fa.gz.sig', + 'genome-s11.fa.gz.sig'] + status2, out2, err2 = utils.runscript('sourmash', args, + in_directory=location) + + # --- Categorize without ignoring abundance --- + args = ['categorize', 'thebestdatabase', '--traverse-directory', '.', + '--ksize', '21', '--dna', '--csv', 'out.csv'] + status3, out3, err3 = utils.runscript('sourmash', args, + in_directory=location) + + print(out3) + print(err3) + + # mash dist genome-s10.fa.gz genome-s10+s11.fa.gz + # yields 521/1000 ==> ~0.5 + assert 'for s10+s11, found: 0.50 genome-s10.fa.gz' in err3 + + out_csv = open(os.path.join(location, 'out.csv')).read() + assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv + + # --- Now categorize with ignored abundance --- + args = ['categorize', 'thebestdatabase', '--traverse-directory', '.', + '--ignore-abundance', + '--ksize', '21', '--dna', '--csv', 'out.csv'] + status4, out4, err4 = utils.runscript('sourmash', args, + in_directory=location) + + print(out4) + print(err4) + + # mash dist genome-s10.fa.gz genome-s10+s11.fa.gz + # yields 521/1000 ==> ~0.5 + assert 'for s10+s11, found: 0.50 genome-s10.fa.gz' in err4 + + out_csv = open(os.path.join(location, 'out.csv')).read() + assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv + + def test_sbt_categorize_already_done(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') From 2785033756eea3419638ee5f1c8fc1c3a43a4c2b Mon Sep 17 00:00:00 2001 From: olgabot Date: Fri, 14 Sep 2018 16:27:24 -0700 Subject: [PATCH 4/7] Containment cannot ignore abundance --- sourmash/search.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sourmash/search.py b/sourmash/search.py index cee6360415..4177d8b752 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -36,8 +36,7 @@ def search_databases(query, databases, threshold, do_containment, best_only, x, downsample=True, ignore_abundance=ignore_abundance) if do_containment: search_fn = search_minhashes_containment - query_match = lambda x: query.contained_by( - x, downsample=True, ignore_abundance=ignore_abundance) + query_match = lambda x: query.contained_by(x, downsample=True) results = [] found_md5 = set() From e88cece10f35dff6c789ee0559a135dba4682452 Mon Sep 17 00:00:00 2001 From: olgabot Date: Fri, 14 Sep 2018 16:28:39 -0700 Subject: [PATCH 5/7] Add note about abundance vs containment --- sourmash/commands.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index 7a6ff2efa9..1851839cce 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -745,7 +745,8 @@ def search(args): parser.add_argument('--containment', action='store_true', help='evaluate containment rather than similarity') parser.add_argument('--ignore-abundance', action='store_true', - help='do NOT use k-mer abundances if present') + help='do NOT use k-mer abundances if present. Note: ' + 'has no effect if --containment is specified') parser.add_argument('--scaled', type=float, default=0, help='downsample query to this scaled factor (yields greater speed)') parser.add_argument('-o', '--output', type=argparse.FileType('wt'), From 9a8fc8bbcbb1a90bf8478aead51e234c792df3ff Mon Sep 17 00:00:00 2001 From: olgabot Date: Fri, 14 Sep 2018 16:44:49 -0700 Subject: [PATCH 6/7] Add assertion to make sure --ignore-abundance flag produces different results --- tests/test_sourmash.py | 56 +++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index d26e0e90f1..c8abbf60a1 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1005,6 +1005,9 @@ def test_search_ignore_abundance(): assert '1 matches' in out2 assert '93.0%' in out2 + # Make sure results are different! + assert out1 != out2 + def test_search_csv(): with utils.TempDirectory() as location: @@ -3034,62 +3037,49 @@ def test_sbt_categorize(): def test_sbt_categorize_ignore_abundance(): with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('genome-s10.fa.gz') - testdata2 = utils.get_test_data('genome-s11.fa.gz') - testdata3 = utils.get_test_data('genome-s12.fa.gz') - testdata4 = utils.get_test_data('genome-s10+s11.fa.gz') - - shutil.copyfile(testdata1, os.path.join(location, 'genome-s10.fa.gz')) - shutil.copyfile(testdata2, os.path.join(location, 'genome-s11.fa.gz')) - shutil.copyfile(testdata3, os.path.join(location, 'genome-s12.fa.gz')) - shutil.copyfile(testdata4, os.path.join(location, 'genome-s10+s11.fa.gz')) - status1, out1, err1 = utils.runscript('sourmash', - ['compute', - testdata1, testdata2, testdata3, - testdata4, - '--track-abundance'], - in_directory=location) + query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') + against_list = ['genome-s10', 'genome-s11', 'genome-s12'] + against_list = [ 'gather-abund/' + i + '.fa.gz.sig' \ + for i in against_list ] + against_list = [ utils.get_test_data(i) for i in against_list ] # omit 3 - args = ['index', '--dna', '-k', '21', 'thebestdatabase', - 'genome-s10.fa.gz.sig', - 'genome-s11.fa.gz.sig'] + args = ['index', '--dna', '-k', '21', 'thebestdatabase'] + against_list status2, out2, err2 = utils.runscript('sourmash', args, in_directory=location) # --- Categorize without ignoring abundance --- - args = ['categorize', 'thebestdatabase', '--traverse-directory', '.', - '--ksize', '21', '--dna', '--csv', 'out.csv'] + args = ['categorize', 'thebestdatabase', + '--ksize', '21', '--dna', '--csv', 'out3.csv', query] status3, out3, err3 = utils.runscript('sourmash', args, in_directory=location) print(out3) print(err3) - # mash dist genome-s10.fa.gz genome-s10+s11.fa.gz - # yields 521/1000 ==> ~0.5 - assert 'for s10+s11, found: 0.50 genome-s10.fa.gz' in err3 + assert 'for 1-1, found: 0.57 tests/test-data/genome-s10.fa.gz' in err3 - out_csv = open(os.path.join(location, 'out.csv')).read() - assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv + out_csv3 = open(os.path.join(location, 'out3.csv')).read() + assert 'reads-s10x10-s11.sig,1-1,tests/test-data/genome-s10.fa.gz,0.57' in out_csv3 # --- Now categorize with ignored abundance --- - args = ['categorize', 'thebestdatabase', '--traverse-directory', '.', - '--ignore-abundance', - '--ksize', '21', '--dna', '--csv', 'out.csv'] + args = ['categorize', '--ignore-abundance', + '--ksize', '21', '--dna', '--csv', 'out4.csv', + 'thebestdatabase', query] status4, out4, err4 = utils.runscript('sourmash', args, in_directory=location) print(out4) print(err4) - # mash dist genome-s10.fa.gz genome-s10+s11.fa.gz - # yields 521/1000 ==> ~0.5 - assert 'for s10+s11, found: 0.50 genome-s10.fa.gz' in err4 + assert 'for 1-1, found: 0.57 tests/test-data/genome-s10.fa.gz' in err4 - out_csv = open(os.path.join(location, 'out.csv')).read() - assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv + out_csv4 = open(os.path.join(location, 'out4.csv')).read() + assert '/reads-s10x10-s11.sig,1-1,tests/test-data/genome-s10.fa.gz,0.57' in out_csv4 + + # Make sure ignoring abundance produces a different output! + assert err3 != err4 def test_sbt_categorize_already_done(): From 537c276f2dc953b43a72ec0842d5b98c9009030a Mon Sep 17 00:00:00 2001 From: olgabot Date: Mon, 17 Sep 2018 08:57:10 -0700 Subject: [PATCH 7/7] Use signatures with abundance for categorize test --- tests/test_sourmash.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index c8abbf60a1..b6ba2989d1 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -3039,8 +3039,8 @@ def test_sbt_categorize_ignore_abundance(): with utils.TempDirectory() as location: query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') - against_list = ['genome-s10', 'genome-s11', 'genome-s12'] - against_list = [ 'gather-abund/' + i + '.fa.gz.sig' \ + against_list = ['reads-s10-s11'] + against_list = [ 'gather-abund/' + i + '.sig' \ for i in against_list ] against_list = [ utils.get_test_data(i) for i in against_list ] @@ -3058,10 +3058,10 @@ def test_sbt_categorize_ignore_abundance(): print(out3) print(err3) - assert 'for 1-1, found: 0.57 tests/test-data/genome-s10.fa.gz' in err3 + assert 'for 1-1, found: 0.44 1-1' in err3 out_csv3 = open(os.path.join(location, 'out3.csv')).read() - assert 'reads-s10x10-s11.sig,1-1,tests/test-data/genome-s10.fa.gz,0.57' in out_csv3 + assert 'reads-s10x10-s11.sig,1-1,1-1,0.4398' in out_csv3 # --- Now categorize with ignored abundance --- args = ['categorize', '--ignore-abundance', @@ -3073,10 +3073,10 @@ def test_sbt_categorize_ignore_abundance(): print(out4) print(err4) - assert 'for 1-1, found: 0.57 tests/test-data/genome-s10.fa.gz' in err4 + assert 'for 1-1, found: 0.88 1-1' in err4 out_csv4 = open(os.path.join(location, 'out4.csv')).read() - assert '/reads-s10x10-s11.sig,1-1,tests/test-data/genome-s10.fa.gz,0.57' in out_csv4 + assert 'reads-s10x10-s11.sig,1-1,1-1,0.87699' in out_csv4 # Make sure ignoring abundance produces a different output! assert err3 != err4