[WIP] SBT scaffold #1201

Closed · wants to merge 8 commits
2 changes: 2 additions & 0 deletions include/sourmash.h
@@ -263,6 +263,8 @@ uintptr_t nodegraph_get_kmer(const SourmashNodegraph *ptr, const char *kmer);

const uint64_t *nodegraph_hashsizes(const SourmashNodegraph *ptr, uintptr_t *size);

uintptr_t nodegraph_intersection_count(const SourmashNodegraph *ptr, const SourmashNodegraph *optr);

uintptr_t nodegraph_ksize(const SourmashNodegraph *ptr);

uintptr_t nodegraph_matches(const SourmashNodegraph *ptr, const SourmashKmerMinHash *mh_ptr);
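
The new nodegraph_intersection_count entry point backs the Nodegraph.intersection_count method added in sourmash/nodegraph.py below. A minimal sketch of how it might be exercised from Python, assuming the khmer-style Nodegraph constructor and count() API; the filter parameters and k-mers are placeholder values, and the returned count is the bit-level overlap of the two filters, which only approximates the true k-mer overlap:

    from sourmash.nodegraph import Nodegraph

    # Two Bloom filters with identical parameters (ksize, table size,
    # number of tables); sizes here are placeholders for illustration.
    a = Nodegraph(3, 10000, 4)
    b = Nodegraph(3, 10000, 4)

    for kmer in ("AAA", "CCC", "GGG"):
        a.count(kmer)
    for kmer in ("CCC", "GGG", "TTT"):
        b.count(kmer)

    # Number of set bits present in both filters; with this few
    # elements it should track the two shared k-mers closely.
    print(a.intersection_count(b))
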
4 changes: 3 additions & 1 deletion sourmash/cli/index.py
@@ -55,7 +55,9 @@ def subparser(subparsers):
)
subparser.add_argument(
'-x', '--bf-size', metavar='S', type=float, default=1e5,
help='Bloom filter size used for internal nodes'
help='Maximum Bloom filter size used for internal nodes. Set to 0 '
'to calculate an optimal size given the complexity of the datasets '
'(using a HyperLogLog to estimate the number of unique k-mers)'
)
subparser.add_argument(
'-f', '--force', action='store_true',
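
To make "an optimal size" concrete: with -x 0 the tree can derive Bloom filter parameters from an estimate of the number of unique k-mers instead of a fixed cap. A sketch using the optimal_size helper added in sourmash/nodegraph.py below; the k-mer count is a made-up stand-in for the HyperLogLog estimate:

    from sourmash.nodegraph import optimal_size

    num_kmers = 5_000_000   # hypothetical HLL estimate

    # Cap memory and see the implied false positive rate...
    params = optimal_size(num_kmers, mem_cap=2e7)
    print(params.num_htables, params.htable_size, params.fp_rate)

    # ...or fix a target false positive rate and see the memory cost.
    params = optimal_size(num_kmers, fp_rate=0.05)
    print(params.num_htables, params.htable_size, params.mem_use)
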
20 changes: 18 additions & 2 deletions sourmash/commands.py
@@ -327,10 +327,20 @@ def index(args):
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)

batch = False
if args.append:
tree = load_sbt_index(args.sbt_name)
else:
tree = create_sbt_index(args.bf_size, n_children=args.n_children)
bf_size = args.bf_size
if bf_size == 0:
bf_size = None

tree = create_sbt_index(bf_size, n_children=args.n_children)
batch = True

# TODO: set up storage here
storage_info = tree._setup_storage(args.sbt_name)
tree.storage = storage_info.storage

if args.sparseness < 0 or args.sparseness > 1.0:
error('sparseness must be in range [0.0, 1.0].')
@@ -375,7 +385,7 @@ def index(args):
ss.minhash = ss.minhash.downsample(scaled=args.scaled)
scaleds.add(ss.minhash.scaled)

tree.insert(ss)
tree.insert(ss, batch=batch)
n += 1

if not ss:
@@ -398,6 +408,8 @@ def index(args):
error('nums = {}; scaleds = {}', repr(nums), repr(scaleds))
sys.exit(-1)

tree = tree.finish()

notify('')

# did we load any!?
@@ -406,6 +418,10 @@ def index(args):
sys.exit(-1)

notify('loaded {} sigs; saving SBT under "{}"', n, args.sbt_name)
# TODO: if all nodes are already saved (like in the scaffold/batch case)
# we can potentially set structure_only=True here. An alternative is to
# modify Node.save to verify if the data is already saved or still needs
# to be saved (dirty flag?)
tree.save(args.sbt_name, sparseness=args.sparseness)


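
Pulled out of the diff above, the batch path through index() looks roughly like this. The batch= keyword on insert() and the finish() call are additions from this WIP branch, so treat this as a sketch of the intended flow rather than a stable API; signatures and the output name are placeholders:

    from sourmash.sbtmh import create_sbt_index

    # bf_size=None lets the tree pick Bloom filter sizes itself
    # (the scaffold/batch case, -x 0 on the command line).
    tree = create_sbt_index(None, n_children=2)

    for ss in signatures:            # signatures loaded elsewhere
        tree.insert(ss, batch=True)  # defer internal-node construction

    tree = tree.finish()             # build internal nodes in one pass
    tree.save("my_index", sparseness=0.0)
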
27 changes: 26 additions & 1 deletion sourmash/logging.py
@@ -1,13 +1,27 @@
import atexit
import os
import sys
from io import StringIO

_quiet = False
_debug = False
def set_quiet(val, print_debug=False):
global _quiet, _debug
global _quiet, _debug, _trace
_quiet = bool(val)
_debug = bool(print_debug)

if "SOURMASH_TRACE" in os.environ:
_trace = open(os.environ["SOURMASH_TRACE"], "w")

@atexit.register
def flush_and_close():
global _trace
_trace.flush()
_trace.close()
else:
_trace = None


def print_results(s, *args, **kwargs):
if _quiet:
@@ -41,6 +55,17 @@ def debug(s, *args, **kwargs):
sys.stderr.flush()


def trace(s, *args, **kwargs):
"Low level execution information (even more verbose than debug)"
if not _trace:
return

print(s.format(*args, **kwargs), file=_trace,
end=kwargs.get('end', u'\n'))
if kwargs.get('flush'):
_trace.flush()


def error(s, *args, **kwargs):
"A simple error logging function => stderr."
print(u'\r\033[K', end=u'', file=sys.stderr)
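
Usage of the trace hook wired up above: point SOURMASH_TRACE at a file before calling set_quiet(), which opens the file; trace() calls then write there, and the atexit handler flushes and closes it at interpreter shutdown. A small sketch (the path is arbitrary):

    import os
    os.environ["SOURMASH_TRACE"] = "/tmp/sourmash.trace"

    from sourmash.logging import set_quiet, trace

    set_quiet(False)                  # opens the trace file as a side effect
    trace("inserting node {}", 42)    # written to /tmp/sourmash.trace
    trace("batch size: {}", 128, flush=True)
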
69 changes: 69 additions & 0 deletions sourmash/nodegraph.py
@@ -1,5 +1,7 @@
# -*- coding: UTF-8 -*-

from collections import namedtuple
import math
from struct import pack, unpack
import sys
from tempfile import NamedTemporaryFile
@@ -86,6 +88,14 @@ def matches(self, mh):

return self._methodcall(lib.nodegraph_matches, mh._objptr)

def intersection_count(self, other):
if isinstance(other, Nodegraph):
return self._methodcall(lib.nodegraph_intersection_count, other._objptr)
else:
# FIXME: we could take MinHash and sets here too (or anything that can be
# converted to a list of ints...)
raise TypeError("Must be a Nodegraph")

def to_khmer_nodegraph(self):
import khmer
try:
@@ -159,3 +169,62 @@ def calc_expected_collisions(graph, force=False, max_false_pos=.2):
raise SystemExit(1)

return fp_all


def optimal_size(num_kmers, mem_cap=None, fp_rate=None):
"""
Utility function for estimating optimal nodegraph args.

- num_kmers: number of unique k-mers [required]
- mem_cap: the allotted amount of memory [optional, conflicts with fp_rate]
- fp_rate: the desired false positive rate [optional, conflicts with mem_cap]
"""
if all((num_kmers is not None, mem_cap is not None, fp_rate is None)):
return estimate_optimal_with_K_and_M(num_kmers, mem_cap)
elif all((num_kmers is not None, mem_cap is None, fp_rate is not None)):
return estimate_optimal_with_K_and_f(num_kmers, fp_rate)
else:
raise TypeError("num_kmers and either mem_cap or fp_rate"
" must be defined.")


def estimate_optimal_with_K_and_M(num_kmers, mem_cap):
"""
Estimate optimal nodegraph args.

- num_kmers: number of unique k-mers
- mem_cap: the allotted amount of memory
"""
n_tables = math.log(2) * (mem_cap / float(num_kmers))
int_n_tables = int(n_tables)
if int_n_tables == 0:
int_n_tables = 1
ht_size = int(mem_cap / int_n_tables)
mem_cap = ht_size * int_n_tables
fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** int_n_tables
res = namedtuple("result", ["num_htables", "htable_size", "mem_use",
"fp_rate"])
return res(int_n_tables, ht_size, mem_cap, fp_rate)


def estimate_optimal_with_K_and_f(num_kmers, des_fp_rate):
"""
Estimate optimal memory.

- num_kmers: the number of unique kmers
- des_fp_rate: the desired false positive rate
"""
n_tables = math.log(des_fp_rate, 0.5)
int_n_tables = int(n_tables)
if int_n_tables == 0:
int_n_tables = 1

ht_size = int(-num_kmers / (
math.log(1 - des_fp_rate ** (1 / float(int_n_tables)))))
ht_size = max(ht_size, 1)
mem_cap = ht_size * int_n_tables
fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** int_n_tables

res = namedtuple("result", ["num_htables", "htable_size", "mem_use",
"fp_rate"])
return res(int_n_tables, ht_size, mem_cap, fp_rate)
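
A worked example to make the memory-capped estimator concrete: for 1 million unique k-mers under an 8 MB budget, n_tables = ln(2) * (8e6 / 1e6) ≈ 5.54, truncated to 5 tables of 1,600,000 entries each, and the false positive rate is (1 - e^(-1e6/1.6e6))^5 ≈ 0.022. Running this against the helpers on this branch:

    from sourmash.nodegraph import estimate_optimal_with_K_and_M

    res = estimate_optimal_with_K_and_M(num_kmers=1_000_000, mem_cap=8_000_000)
    print(res.num_htables)   # 5
    print(res.htable_size)   # 1600000
    print(res.mem_use)       # 8000000
    print(res.fp_rate)       # ~0.0217
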