From 71abc6aa3dcd00090237c30aaa52830557c1023f Mon Sep 17 00:00:00 2001
From: olgabot <olga.botvinnik@gmail.com>
Date: Fri, 14 Sep 2018 15:50:42 -0700
Subject: [PATCH 1/7] Add ignore_abundance to search_databases

---
 sourmash/search.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/sourmash/search.py b/sourmash/search.py
index 86e43fb69e..cee6360415 100644
--- a/sourmash/search.py
+++ b/sourmash/search.py
@@ -28,13 +28,16 @@ def format_bp(bp):
     return '???'
 
 
-def search_databases(query, databases, threshold, do_containment, best_only):
+def search_databases(query, databases, threshold, do_containment, best_only,
+                     ignore_abundance):
     # set up the search & score function(s) - similarity vs containment
     search_fn = search_minhashes
-    query_match = lambda x: query.similarity(x, downsample=True)
+    query_match = lambda x: query.similarity(
+        x, downsample=True, ignore_abundance=ignore_abundance)
     if do_containment:
         search_fn = search_minhashes_containment
-        query_match = lambda x: query.contained_by(x, downsample=True)
+        query_match = lambda x: query.contained_by(
+            x, downsample=True, ignore_abundance=ignore_abundance)
 
     results = []
     found_md5 = set()

From 05ecfd5449b5e7cba99d04349c60822cfd62af9d Mon Sep 17 00:00:00 2001
From: olgabot <olga.botvinnik@gmail.com>
Date: Fri, 14 Sep 2018 15:51:04 -0700
Subject: [PATCH 2/7] Add ignore_abundnance to search and categorize

---
 sourmash/commands.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sourmash/commands.py b/sourmash/commands.py
index ddee5ac411..7a6ff2efa9 100644
--- a/sourmash/commands.py
+++ b/sourmash/commands.py
@@ -744,6 +744,8 @@ def search(args):
                         help='number of results to report')
     parser.add_argument('--containment', action='store_true',
                         help='evaluate containment rather than similarity')
+    parser.add_argument('--ignore-abundance', action='store_true',
+                        help='do NOT use k-mer abundances if present')
     parser.add_argument('--scaled', type=float, default=0,
                         help='downsample query to this scaled factor (yields greater speed)')
     parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
@@ -786,7 +788,7 @@ def search(args):
     # do the actual search
     results = search_databases(query, databases,
                                args.threshold, args.containment,
-                               args.best_only)
+                               args.best_only, args.ignore_abundance)
 
     n_matches = len(results)
     if args.best_only:
@@ -838,6 +840,8 @@ def categorize(args):
     parser.add_argument('-k', '--ksize', type=int, default=None)
     parser.add_argument('--threshold', default=0.08, type=float)
     parser.add_argument('--traverse-directory', action="store_true")
+    parser.add_argument('--ignore-abundance', action='store_true',
+                        help='do NOT use k-mer abundances if present')
 
     sourmash_args.add_moltype_args(parser)
 
@@ -878,7 +882,9 @@ def categorize(args):
 
         for leaf in tree.find(search_fn, query, args.threshold):
             if leaf.data.md5sum() != query.md5sum(): # ignore self.
-                results.append((query.similarity(leaf.data), leaf.data))
+                similarity = query.similarity(
+                    leaf.data, ignore_abundance=args.ignore_abundance)
+                results.append((similarity, leaf.data))
 
         best_hit_sim = 0.0
         best_hit_query_name = ""

From d18c6a35395f358f90fe803a974d92370af627d9 Mon Sep 17 00:00:00 2001
From: olgabot <olga.botvinnik@gmail.com>
Date: Fri, 14 Sep 2018 16:21:55 -0700
Subject: [PATCH 3/7] Start writing tests

---
 tests/test_sourmash.py | 94 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index de5da3cac4..d26e0e90f1 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -972,6 +972,40 @@ def test_search():
         assert '93.0%' in out
 
 
+def test_search_ignore_abundance():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        testdata2 = utils.get_test_data('short2.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31',
+                                            '--track-abundance',
+                                            testdata1, testdata2],
+                                           in_directory=location)
+
+
+
+        # Make sure there's different percent matches when using or
+        # not using abundance
+        status1, out1, err1 = utils.runscript('sourmash',
+                                           ['search',
+                                            'short.fa.sig',
+                                            'short2.fa.sig'],
+                                           in_directory=location)
+        print(status1, out1, err1)
+        assert '1 matches' in out1
+        assert '81.5%' in out1
+
+        status2, out2, err2 = utils.runscript('sourmash',
+                                           ['search',
+                                            '--ignore-abundance',
+                                            'short.fa.sig',
+                                            'short2.fa.sig'],
+                                           in_directory=location)
+        print(status2, out2, err2)
+        assert '1 matches' in out2
+        assert '93.0%' in out2
+
+
 def test_search_csv():
     with utils.TempDirectory() as location:
         testdata1 = utils.get_test_data('short.fa')
@@ -2998,6 +3032,66 @@ def test_sbt_categorize():
         assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv
 
 
+def test_sbt_categorize_ignore_abundance():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('genome-s10.fa.gz')
+        testdata2 = utils.get_test_data('genome-s11.fa.gz')
+        testdata3 = utils.get_test_data('genome-s12.fa.gz')
+        testdata4 = utils.get_test_data('genome-s10+s11.fa.gz')
+
+        shutil.copyfile(testdata1, os.path.join(location, 'genome-s10.fa.gz'))
+        shutil.copyfile(testdata2, os.path.join(location, 'genome-s11.fa.gz'))
+        shutil.copyfile(testdata3, os.path.join(location, 'genome-s12.fa.gz'))
+        shutil.copyfile(testdata4, os.path.join(location, 'genome-s10+s11.fa.gz'))
+
+        status1, out1, err1 = utils.runscript('sourmash',
+                                           ['compute',
+                                            testdata1, testdata2, testdata3,
+                                            testdata4,
+                                            '--track-abundance'],
+                                           in_directory=location)
+
+        # omit 3
+        args = ['index', '--dna', '-k', '21', 'thebestdatabase',
+                'genome-s10.fa.gz.sig',
+                'genome-s11.fa.gz.sig']
+        status2, out2, err2 = utils.runscript('sourmash', args,
+                                           in_directory=location)
+
+        # --- Categorize without ignoring abundance ---
+        args = ['categorize', 'thebestdatabase', '--traverse-directory', '.',
+                '--ksize', '21', '--dna', '--csv', 'out.csv']
+        status3, out3, err3 = utils.runscript('sourmash', args,
+                                           in_directory=location)
+
+        print(out3)
+        print(err3)
+
+        # mash dist genome-s10.fa.gz genome-s10+s11.fa.gz
+        # yields 521/1000 ==> ~0.5
+        assert 'for s10+s11, found: 0.50 genome-s10.fa.gz' in err3
+
+        out_csv = open(os.path.join(location, 'out.csv')).read()
+        assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv
+
+        # --- Now categorize with ignored abundance ---
+        args = ['categorize', 'thebestdatabase', '--traverse-directory', '.',
+                '--ignore-abundance',
+                '--ksize', '21', '--dna', '--csv', 'out.csv']
+        status4, out4, err4 = utils.runscript('sourmash', args,
+                                           in_directory=location)
+
+        print(out4)
+        print(err4)
+
+        # mash dist genome-s10.fa.gz genome-s10+s11.fa.gz
+        # yields 521/1000 ==> ~0.5
+        assert 'for s10+s11, found: 0.50 genome-s10.fa.gz' in err4
+
+        out_csv = open(os.path.join(location, 'out.csv')).read()
+        assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv
+
+
 def test_sbt_categorize_already_done():
     with utils.TempDirectory() as location:
         testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')

From 2785033756eea3419638ee5f1c8fc1c3a43a4c2b Mon Sep 17 00:00:00 2001
From: olgabot <olga.botvinnik@gmail.com>
Date: Fri, 14 Sep 2018 16:27:24 -0700
Subject: [PATCH 4/7] Containment cannot ignore abundance

---
 sourmash/search.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sourmash/search.py b/sourmash/search.py
index cee6360415..4177d8b752 100644
--- a/sourmash/search.py
+++ b/sourmash/search.py
@@ -36,8 +36,7 @@ def search_databases(query, databases, threshold, do_containment, best_only,
         x, downsample=True, ignore_abundance=ignore_abundance)
     if do_containment:
         search_fn = search_minhashes_containment
-        query_match = lambda x: query.contained_by(
-            x, downsample=True, ignore_abundance=ignore_abundance)
+        query_match = lambda x: query.contained_by(x, downsample=True)
 
     results = []
     found_md5 = set()

From e88cece10f35dff6c789ee0559a135dba4682452 Mon Sep 17 00:00:00 2001
From: olgabot <olga.botvinnik@gmail.com>
Date: Fri, 14 Sep 2018 16:28:39 -0700
Subject: [PATCH 5/7] Add note about abundance vs containment

---
 sourmash/commands.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sourmash/commands.py b/sourmash/commands.py
index 7a6ff2efa9..1851839cce 100644
--- a/sourmash/commands.py
+++ b/sourmash/commands.py
@@ -745,7 +745,8 @@ def search(args):
     parser.add_argument('--containment', action='store_true',
                         help='evaluate containment rather than similarity')
     parser.add_argument('--ignore-abundance', action='store_true',
-                        help='do NOT use k-mer abundances if present')
+                        help='do NOT use k-mer abundances if present. Note: '
+                             'has no effect if --containment is specified')
     parser.add_argument('--scaled', type=float, default=0,
                         help='downsample query to this scaled factor (yields greater speed)')
     parser.add_argument('-o', '--output', type=argparse.FileType('wt'),

From 9a8fc8bbcbb1a90bf8478aead51e234c792df3ff Mon Sep 17 00:00:00 2001
From: olgabot <olga.botvinnik@gmail.com>
Date: Fri, 14 Sep 2018 16:44:49 -0700
Subject: [PATCH 6/7] Add assertion to make sure --ignore-abundance flag
 produces different results

---
 tests/test_sourmash.py | 56 +++++++++++++++++-------------------------
 1 file changed, 23 insertions(+), 33 deletions(-)

diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index d26e0e90f1..c8abbf60a1 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -1005,6 +1005,9 @@ def test_search_ignore_abundance():
         assert '1 matches' in out2
         assert '93.0%' in out2
 
+        # Make sure results are different!
+        assert out1 != out2
+
 
 def test_search_csv():
     with utils.TempDirectory() as location:
@@ -3034,62 +3037,49 @@ def test_sbt_categorize():
 
 def test_sbt_categorize_ignore_abundance():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('genome-s10.fa.gz')
-        testdata2 = utils.get_test_data('genome-s11.fa.gz')
-        testdata3 = utils.get_test_data('genome-s12.fa.gz')
-        testdata4 = utils.get_test_data('genome-s10+s11.fa.gz')
-
-        shutil.copyfile(testdata1, os.path.join(location, 'genome-s10.fa.gz'))
-        shutil.copyfile(testdata2, os.path.join(location, 'genome-s11.fa.gz'))
-        shutil.copyfile(testdata3, os.path.join(location, 'genome-s12.fa.gz'))
-        shutil.copyfile(testdata4, os.path.join(location, 'genome-s10+s11.fa.gz'))
 
-        status1, out1, err1 = utils.runscript('sourmash',
-                                           ['compute',
-                                            testdata1, testdata2, testdata3,
-                                            testdata4,
-                                            '--track-abundance'],
-                                           in_directory=location)
+        query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
+        against_list = ['genome-s10', 'genome-s11', 'genome-s12']
+        against_list = [ 'gather-abund/' + i + '.fa.gz.sig' \
+                         for i in against_list ]
+        against_list = [ utils.get_test_data(i) for i in against_list ]
 
         # omit 3
-        args = ['index', '--dna', '-k', '21', 'thebestdatabase',
-                'genome-s10.fa.gz.sig',
-                'genome-s11.fa.gz.sig']
+        args = ['index', '--dna', '-k', '21', 'thebestdatabase'] + against_list
         status2, out2, err2 = utils.runscript('sourmash', args,
                                            in_directory=location)
 
         # --- Categorize without ignoring abundance ---
-        args = ['categorize', 'thebestdatabase', '--traverse-directory', '.',
-                '--ksize', '21', '--dna', '--csv', 'out.csv']
+        args = ['categorize', 'thebestdatabase',
+                '--ksize', '21', '--dna', '--csv', 'out3.csv', query]
         status3, out3, err3 = utils.runscript('sourmash', args,
                                            in_directory=location)
 
         print(out3)
         print(err3)
 
-        # mash dist genome-s10.fa.gz genome-s10+s11.fa.gz
-        # yields 521/1000 ==> ~0.5
-        assert 'for s10+s11, found: 0.50 genome-s10.fa.gz' in err3
+        assert 'for 1-1, found: 0.57 tests/test-data/genome-s10.fa.gz' in err3
 
-        out_csv = open(os.path.join(location, 'out.csv')).read()
-        assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv
+        out_csv3 = open(os.path.join(location, 'out3.csv')).read()
+        assert 'reads-s10x10-s11.sig,1-1,tests/test-data/genome-s10.fa.gz,0.57' in out_csv3
 
         # --- Now categorize with ignored abundance ---
-        args = ['categorize', 'thebestdatabase', '--traverse-directory', '.',
-                '--ignore-abundance',
-                '--ksize', '21', '--dna', '--csv', 'out.csv']
+        args = ['categorize', '--ignore-abundance',
+                '--ksize', '21', '--dna', '--csv', 'out4.csv',
+                'thebestdatabase', query]
         status4, out4, err4 = utils.runscript('sourmash', args,
                                            in_directory=location)
 
         print(out4)
         print(err4)
 
-        # mash dist genome-s10.fa.gz genome-s10+s11.fa.gz
-        # yields 521/1000 ==> ~0.5
-        assert 'for s10+s11, found: 0.50 genome-s10.fa.gz' in err4
+        assert 'for 1-1, found: 0.57 tests/test-data/genome-s10.fa.gz' in err4
 
-        out_csv = open(os.path.join(location, 'out.csv')).read()
-        assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv
+        out_csv4 = open(os.path.join(location, 'out4.csv')).read()
+        assert '/reads-s10x10-s11.sig,1-1,tests/test-data/genome-s10.fa.gz,0.57' in out_csv4
+
+        # Make sure ignoring abundance produces a different output!
+        assert err3 != err4
 
 
 def test_sbt_categorize_already_done():

From 537c276f2dc953b43a72ec0842d5b98c9009030a Mon Sep 17 00:00:00 2001
From: olgabot <olga.botvinnik@gmail.com>
Date: Mon, 17 Sep 2018 08:57:10 -0700
Subject: [PATCH 7/7] Use signatures with abundance for categorize test

---
 tests/test_sourmash.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index c8abbf60a1..b6ba2989d1 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -3039,8 +3039,8 @@ def test_sbt_categorize_ignore_abundance():
     with utils.TempDirectory() as location:
 
         query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
-        against_list = ['genome-s10', 'genome-s11', 'genome-s12']
-        against_list = [ 'gather-abund/' + i + '.fa.gz.sig' \
+        against_list = ['reads-s10-s11']
+        against_list = [ 'gather-abund/' + i + '.sig' \
                          for i in against_list ]
         against_list = [ utils.get_test_data(i) for i in against_list ]
 
@@ -3058,10 +3058,10 @@ def test_sbt_categorize_ignore_abundance():
         print(out3)
         print(err3)
 
-        assert 'for 1-1, found: 0.57 tests/test-data/genome-s10.fa.gz' in err3
+        assert 'for 1-1, found: 0.44 1-1' in err3
 
         out_csv3 = open(os.path.join(location, 'out3.csv')).read()
-        assert 'reads-s10x10-s11.sig,1-1,tests/test-data/genome-s10.fa.gz,0.57' in out_csv3
+        assert 'reads-s10x10-s11.sig,1-1,1-1,0.4398' in out_csv3
 
         # --- Now categorize with ignored abundance ---
         args = ['categorize', '--ignore-abundance',
@@ -3073,10 +3073,10 @@ def test_sbt_categorize_ignore_abundance():
         print(out4)
         print(err4)
 
-        assert 'for 1-1, found: 0.57 tests/test-data/genome-s10.fa.gz' in err4
+        assert 'for 1-1, found: 0.88 1-1' in err4
 
         out_csv4 = open(os.path.join(location, 'out4.csv')).read()
-        assert '/reads-s10x10-s11.sig,1-1,tests/test-data/genome-s10.fa.gz,0.57' in out_csv4
+        assert 'reads-s10x10-s11.sig,1-1,1-1,0.87699' in out_csv4
 
         # Make sure ignoring abundance produces a different output!
         assert err3 != err4