[WIP] SBT scaffold #1201

Closed · wants to merge 8 commits
2 changes: 2 additions & 0 deletions include/sourmash.h
@@ -263,6 +263,8 @@ uintptr_t nodegraph_get_kmer(const SourmashNodegraph *ptr, const char *kmer);

const uint64_t *nodegraph_hashsizes(const SourmashNodegraph *ptr, uintptr_t *size);

uintptr_t nodegraph_intersection_count(const SourmashNodegraph *ptr, const SourmashNodegraph *optr);

uintptr_t nodegraph_ksize(const SourmashNodegraph *ptr);

uintptr_t nodegraph_matches(const SourmashNodegraph *ptr, const SourmashKmerMinHash *mh_ptr);
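
The new nodegraph_intersection_count entry point backs the Nodegraph.intersection_count method added in sourmash/nodegraph.py below. A minimal sketch of how it might be exercised from Python, assuming the khmer-style Nodegraph constructor and count() API; the filter parameters and k-mers are placeholder values, and the returned count is the bit-level overlap of the two filters, which only approximates the true k-mer overlap:

    from sourmash.nodegraph import Nodegraph

    # Two Bloom filters with identical parameters (ksize, table size,
    # number of tables); sizes here are placeholders for illustration.
    a = Nodegraph(3, 10000, 4)
    b = Nodegraph(3, 10000, 4)

    for kmer in ("AAA", "CCC", "GGG"):
        a.count(kmer)
    for kmer in ("CCC", "GGG", "TTT"):
        b.count(kmer)

    # Number of set bits present in both filters; with this few
    # elements it should track the two shared k-mers closely.
    print(a.intersection_count(b))
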
4 changes: 3 additions & 1 deletion sourmash/cli/index.py
@@ -55,7 +55,9 @@ def subparser(subparsers):
)
subparser.add_argument(
'-x', '--bf-size', metavar='S', type=float, default=1e5,
help='Bloom filter size used for internal nodes'
help='Maximum Bloom filter size used for internal nodes. Set to 0 '
'to calculate an optimal size given the complexity of the datasets '
'(using a HyperLogLog to estimate the number of unique k-mers)'
)
subparser.add_argument(
'-f', '--force', action='store_true',
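
To make "an optimal size" concrete: with -x 0 the tree can derive Bloom filter parameters from an estimate of the number of unique k-mers instead of a fixed cap. A sketch using the optimal_size helper added in sourmash/nodegraph.py below; the k-mer count is a made-up stand-in for the HyperLogLog estimate:

    from sourmash.nodegraph import optimal_size

    num_kmers = 5_000_000   # hypothetical HLL estimate

    # Cap memory and see the implied false positive rate...
    params = optimal_size(num_kmers, mem_cap=2e7)
    print(params.num_htables, params.htable_size, params.fp_rate)

    # ...or fix a target false positive rate and see the memory cost.
    params = optimal_size(num_kmers, fp_rate=0.05)
    print(params.num_htables, params.htable_size, params.mem_use)
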
20 changes: 18 additions & 2 deletions sourmash/commands.py
@@ -327,10 +327,20 @@ def index(args):
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)

batch = False
if args.append:
tree = load_sbt_index(args.sbt_name)
else:
tree = create_sbt_index(args.bf_size, n_children=args.n_children)
bf_size = args.bf_size
if bf_size == 0:
bf_size = None

tree = create_sbt_index(bf_size, n_children=args.n_children)
batch = True

# TODO: set up storage here
storage_info = tree._setup_storage(args.sbt_name)
tree.storage = storage_info.storage

if args.sparseness < 0 or args.sparseness > 1.0:
error('sparseness must be in range [0.0, 1.0].')
@@ -375,7 +385,7 @@ def index(args):
ss.minhash = ss.minhash.downsample(scaled=args.scaled)
scaleds.add(ss.minhash.scaled)

tree.insert(ss)
tree.insert(ss, batch=batch)
n += 1

if not ss:
@@ -398,6 +408,8 @@ def index(args):
error('nums = {}; scaleds = {}', repr(nums), repr(scaleds))
sys.exit(-1)

tree = tree.finish()

notify('')

# did we load any!?
@@ -406,6 +418,10 @@ def index(args):
sys.exit(-1)

notify('loaded {} sigs; saving SBT under "{}"', n, args.sbt_name)
# TODO: if all nodes are already saved (like in the scaffold/batch case)
# we can potentially set structure_only=True here. An alternative is to
# modify Node.save to verify if the data is already saved or still needs
# to be saved (dirty flag?)
tree.save(args.sbt_name, sparseness=args.sparseness)


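
Pulled out of the diff above, the batch path through index() looks roughly like this. The batch= keyword on insert() and the finish() call are additions from this WIP branch, so treat this as a sketch of the intended flow rather than a stable API; signatures and the output name are placeholders:

    from sourmash.sbtmh import create_sbt_index

    # bf_size=None lets the tree pick Bloom filter sizes itself
    # (the scaffold/batch case, -x 0 on the command line).
    tree = create_sbt_index(None, n_children=2)

    for ss in signatures:            # signatures loaded elsewhere
        tree.insert(ss, batch=True)  # defer internal-node construction

    tree = tree.finish()             # build internal nodes in one pass
    tree.save("my_index", sparseness=0.0)
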
27 changes: 26 additions & 1 deletion sourmash/logging.py
@@ -1,13 +1,27 @@
import atexit
import os
import sys
from io import StringIO

_quiet = False
_debug = False
def set_quiet(val, print_debug=False):
global _quiet, _debug
global _quiet, _debug, _trace
_quiet = bool(val)
_debug = bool(print_debug)

if "SOURMASH_TRACE" in os.environ:
_trace = open(os.environ["SOURMASH_TRACE"], "w")

@atexit.register
def flush_and_close():
global _trace
_trace.flush()
_trace.close()
else:
_trace = None


def print_results(s, *args, **kwargs):
if _quiet:
@@ -41,6 +55,17 @@ def debug(s, *args, **kwargs):
sys.stderr.flush()


def trace(s, *args, **kwargs):
"Low level execution information (even more verbose than debug)"
if not _trace:
return

print(s.format(*args, **kwargs), file=_trace,
end=kwargs.get('end', u'\n'))
if kwargs.get('flush'):
_trace.flush()


def error(s, *args, **kwargs):
"A simple error logging function => stderr."
print(u'\r\033[K', end=u'', file=sys.stderr)
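
Usage of the trace hook wired up above: point SOURMASH_TRACE at a file before calling set_quiet(), which opens the file; trace() calls then write there, and the atexit handler flushes and closes it at interpreter shutdown. A small sketch (the path is arbitrary):

    import os
    os.environ["SOURMASH_TRACE"] = "/tmp/sourmash.trace"

    from sourmash.logging import set_quiet, trace

    set_quiet(False)                  # opens the trace file as a side effect
    trace("inserting node {}", 42)    # written to /tmp/sourmash.trace
    trace("batch size: {}", 128, flush=True)
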
69 changes: 69 additions & 0 deletions sourmash/nodegraph.py
@@ -1,5 +1,7 @@
# -*- coding: UTF-8 -*-

from collections import namedtuple
import math
from struct import pack, unpack
import sys
from tempfile import NamedTemporaryFile
@@ -86,6 +88,14 @@ def matches(self, mh):

return self._methodcall(lib.nodegraph_matches, mh._objptr)

def intersection_count(self, other):
if isinstance(other, Nodegraph):
return self._methodcall(lib.nodegraph_intersection_count, other._objptr)
else:
# FIXME: we could take MinHash and sets here too (or anything that can be
# converted to a list of ints...)
raise TypeError("Must be a Nodegraph")

def to_khmer_nodegraph(self):
import khmer
try:
@@ -159,3 +169,62 @@ def calc_expected_collisions(graph, force=False, max_false_pos=.2):
raise SystemExit(1)

return fp_all


def optimal_size(num_kmers, mem_cap=None, fp_rate=None):
"""
Utility function for estimating optimal nodegraph args.

- num_kmers: number of unique k-mers [required]
- mem_cap: the allotted amount of memory [optional, conflicts with fp_rate]
- fp_rate: the desired false positive rate [optional, conflicts with mem_cap]
"""
if all((num_kmers is not None, mem_cap is not None, fp_rate is None)):
return estimate_optimal_with_K_and_M(num_kmers, mem_cap)
elif all((num_kmers is not None, mem_cap is None, fp_rate is not None)):
return estimate_optimal_with_K_and_f(num_kmers, fp_rate)
else:
raise TypeError("num_kmers and either mem_cap or fp_rate"
" must be defined.")


def estimate_optimal_with_K_and_M(num_kmers, mem_cap):
"""
Estimate optimal nodegraph args.

- num_kmers: number of unique k-mers
- mem_cap: the allotted amount of memory
"""
n_tables = math.log(2) * (mem_cap / float(num_kmers))
int_n_tables = int(n_tables)
if int_n_tables == 0:
int_n_tables = 1
ht_size = int(mem_cap / int_n_tables)
mem_cap = ht_size * int_n_tables
fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** int_n_tables
res = namedtuple("result", ["num_htables", "htable_size", "mem_use",
"fp_rate"])
return res(int_n_tables, ht_size, mem_cap, fp_rate)


def estimate_optimal_with_K_and_f(num_kmers, des_fp_rate):
"""
Estimate optimal memory.

- num_kmers: the number of unique kmers
- des_fp_rate: the desired false positive rate
"""
n_tables = math.log(des_fp_rate, 0.5)
int_n_tables = int(n_tables)
if int_n_tables == 0:
int_n_tables = 1

ht_size = int(-num_kmers / (
math.log(1 - des_fp_rate ** (1 / float(int_n_tables)))))
ht_size = max(ht_size, 1)
mem_cap = ht_size * int_n_tables
fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** int_n_tables

res = namedtuple("result", ["num_htables", "htable_size", "mem_use",
"fp_rate"])
return res(int_n_tables, ht_size, mem_cap, fp_rate)
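
A worked example to make the memory-capped estimator concrete: for 1 million unique k-mers under an 8 MB budget, n_tables = ln(2) * (8e6 / 1e6) ≈ 5.54, truncated to 5 tables of 1,600,000 entries each, and the false positive rate is (1 - e^(-1e6/1.6e6))^5 ≈ 0.022. Running this against the helpers on this branch:

    from sourmash.nodegraph import estimate_optimal_with_K_and_M

    res = estimate_optimal_with_K_and_M(num_kmers=1_000_000, mem_cap=8_000_000)
    print(res.num_htables)   # 5
    print(res.htable_size)   # 1600000
    print(res.mem_use)       # 8000000
    print(res.fp_rate)       # ~0.0217
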