Add --ignore-abundance to "search" and "categorize" (#543)

* Add ignore_abundance to search_databases
* Add ignore_abundance to search and categorize
* Containment cannot ignore abundance
* Add note about abundance vs containment
* Add assertion to make sure --ignore-abundance flag produces different results
* Use signatures with abundance for categorize test
sourmash-bio · Sep 17, 2018 · c202750 · c202750
1 parent 3ffac2d
commit c202750
Show file tree

Hide file tree

Showing 3 changed files with 97 additions and 4 deletions.
diff --git a/sourmash/commands.py b/sourmash/commands.py
@@ -744,6 +744,9 @@ def search(args):
                         help='number of results to report')
     parser.add_argument('--containment', action='store_true',
                         help='evaluate containment rather than similarity')
+    parser.add_argument('--ignore-abundance', action='store_true',
+                        help='do NOT use k-mer abundances if present. Note: '
+                             'has no effect if --containment is specified')
     parser.add_argument('--scaled', type=float, default=0,
                         help='downsample query to this scaled factor (yields greater speed)')
     parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
@@ -786,7 +789,7 @@ def search(args):
     # do the actual search
     results = search_databases(query, databases,
                                args.threshold, args.containment,
-                               args.best_only)
+                               args.best_only, args.ignore_abundance)
 
     n_matches = len(results)
     if args.best_only:
@@ -839,6 +842,8 @@ def categorize(args):
     parser.add_argument('--threshold', default=0.08, type=float,
                        help='minimum threshold for reporting matches (default=0.08)')
     parser.add_argument('--traverse-directory', action="store_true")
+    parser.add_argument('--ignore-abundance', action='store_true',
+                        help='do NOT use k-mer abundances if present')
 
     sourmash_args.add_moltype_args(parser)
 
@@ -879,7 +884,9 @@ def categorize(args):
 
         for leaf in tree.find(search_fn, query, args.threshold):
             if leaf.data.md5sum() != query.md5sum(): # ignore self.
-                results.append((query.similarity(leaf.data), leaf.data))
+                similarity = query.similarity(
+                    leaf.data, ignore_abundance=args.ignore_abundance)
+                results.append((similarity, leaf.data))
 
         best_hit_sim = 0.0
         best_hit_query_name = ""

diff --git a/sourmash/search.py b/sourmash/search.py
@@ -28,10 +28,12 @@ def format_bp(bp):
     return '???'
 
 
-def search_databases(query, databases, threshold, do_containment, best_only):
+def search_databases(query, databases, threshold, do_containment, best_only,
+                     ignore_abundance):
     # set up the search & score function(s) - similarity vs containment
     search_fn = search_minhashes
-    query_match = lambda x: query.similarity(x, downsample=True)
+    query_match = lambda x: query.similarity(
+        x, downsample=True, ignore_abundance=ignore_abundance)
     if do_containment:
         search_fn = search_minhashes_containment
         query_match = lambda x: query.contained_by(x, downsample=True)

diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
@@ -972,6 +972,43 @@ def test_search():
         assert '93.0%' in out
 
 
+def test_search_ignore_abundance():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        testdata2 = utils.get_test_data('short2.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31',
+                                            '--track-abundance',
+                                            testdata1, testdata2],
+                                           in_directory=location)
+
+
+
+        # Make sure there's different percent matches when using or
+        # not using abundance
+        status1, out1, err1 = utils.runscript('sourmash',
+                                           ['search',
+                                            'short.fa.sig',
+                                            'short2.fa.sig'],
+                                           in_directory=location)
+        print(status1, out1, err1)
+        assert '1 matches' in out1
+        assert '81.5%' in out1
+
+        status2, out2, err2 = utils.runscript('sourmash',
+                                           ['search',
+                                            '--ignore-abundance',
+                                            'short.fa.sig',
+                                            'short2.fa.sig'],
+                                           in_directory=location)
+        print(status2, out2, err2)
+        assert '1 matches' in out2
+        assert '93.0%' in out2
+
+        # Make sure results are different!
+        assert out1 != out2
+
+
 def test_search_csv():
     with utils.TempDirectory() as location:
         testdata1 = utils.get_test_data('short.fa')
@@ -2998,6 +3035,53 @@ def test_sbt_categorize():
         assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv
 
 
+def test_sbt_categorize_ignore_abundance():
+    with utils.TempDirectory() as location:
+
+        query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
+        against_list = ['reads-s10-s11']
+        against_list = [ 'gather-abund/' + i + '.sig' \
+                         for i in against_list ]
+        against_list = [ utils.get_test_data(i) for i in against_list ]
+
+        # omit 3
+        args = ['index', '--dna', '-k', '21', 'thebestdatabase'] + against_list
+        status2, out2, err2 = utils.runscript('sourmash', args,
+                                           in_directory=location)
+
+        # --- Categorize without ignoring abundance ---
+        args = ['categorize', 'thebestdatabase',
+                '--ksize', '21', '--dna', '--csv', 'out3.csv', query]
+        status3, out3, err3 = utils.runscript('sourmash', args,
+                                           in_directory=location)
+
+        print(out3)
+        print(err3)
+
+        assert 'for 1-1, found: 0.44 1-1' in err3
+
+        out_csv3 = open(os.path.join(location, 'out3.csv')).read()
+        assert 'reads-s10x10-s11.sig,1-1,1-1,0.4398' in out_csv3
+
+        # --- Now categorize with ignored abundance ---
+        args = ['categorize', '--ignore-abundance',
+                '--ksize', '21', '--dna', '--csv', 'out4.csv',
+                'thebestdatabase', query]
+        status4, out4, err4 = utils.runscript('sourmash', args,
+                                           in_directory=location)
+
+        print(out4)
+        print(err4)
+
+        assert 'for 1-1, found: 0.88 1-1' in err4
+
+        out_csv4 = open(os.path.join(location, 'out4.csv')).read()
+        assert 'reads-s10x10-s11.sig,1-1,1-1,0.87699' in out_csv4
+
+        # Make sure ignoring abundance produces a different output!
+        assert err3 != err4
+
+
 def test_sbt_categorize_already_done():
     with utils.TempDirectory() as location:
         testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')