Skip to content

Commit

Permalink
add a HLL impl, move alphabet stuff to encodings
Browse files Browse the repository at this point in the history
hll ffi
expose cardinality, add_hash and add_sequence
  • Loading branch information
luizirber committed Oct 12, 2020
1 parent 8596341 commit 5522636
Show file tree
Hide file tree
Showing 25 changed files with 5,186 additions and 687 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ doc: build .PHONY
cd doc && make html

include/sourmash.h: src/core/src/lib.rs \
src/core/src/ffi/hyperloglog.rs \
src/core/src/ffi/minhash.rs \
src/core/src/ffi/signature.rs \
src/core/src/ffi/nodegraph.rs \
Expand Down
21 changes: 21 additions & 0 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ enum SourmashErrorCode {
SOURMASH_ERROR_CODE_INVALID_HASH_FUNCTION = 1104,
SOURMASH_ERROR_CODE_READ_DATA = 1201,
SOURMASH_ERROR_CODE_STORAGE = 1202,
SOURMASH_ERROR_CODE_HLL_PRECISION_BOUNDS = 1301,
SOURMASH_ERROR_CODE_IO = 100001,
SOURMASH_ERROR_CODE_UTF8_ERROR = 100002,
SOURMASH_ERROR_CODE_PARSE_INT = 100003,
Expand All @@ -45,6 +46,8 @@ typedef uint32_t SourmashErrorCode;

typedef struct SourmashComputeParameters SourmashComputeParameters;

typedef struct SourmashHyperLogLog SourmashHyperLogLog;

typedef struct SourmashKmerMinHash SourmashKmerMinHash;

typedef struct SourmashNodegraph SourmashNodegraph;
Expand Down Expand Up @@ -115,6 +118,24 @@ bool computeparams_track_abundance(const SourmashComputeParameters *ptr);

uint64_t hash_murmur(const char *kmer, uint64_t seed);

void hll_add_hash(SourmashHyperLogLog *ptr, uint64_t hash);

void hll_add_sequence(SourmashHyperLogLog *ptr, const char *sequence, uintptr_t insize, bool force);

uintptr_t hll_cardinality(const SourmashHyperLogLog *ptr);

void hll_free(SourmashHyperLogLog *ptr);

uintptr_t hll_ksize(const SourmashHyperLogLog *ptr);

void hll_merge(SourmashHyperLogLog *ptr, const SourmashHyperLogLog *optr);

SourmashHyperLogLog *hll_new(void);

void hll_update_mh(SourmashHyperLogLog *ptr, const SourmashKmerMinHash *optr);

SourmashHyperLogLog *hll_with_error_rate(double error_rate, uintptr_t ksize);

void kmerminhash_add_from(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other);

void kmerminhash_add_hash(SourmashKmerMinHash *ptr, uint64_t h);
Expand Down
52 changes: 52 additions & 0 deletions sourmash/hll.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -*- coding: UTF-8 -*-

import sys
from tempfile import NamedTemporaryFile

from ._lowlevel import ffi, lib
from .utils import RustObject, rustcall, decode_str
from .exceptions import SourmashError
from .minhash import to_bytes, MinHash


class HLL(RustObject):
__dealloc_func__ = lib.hll_free

def __init__(self, error_rate, ksize):
self._objptr = lib.hll_with_error_rate(error_rate, ksize)

def __len__(self):
return self.cardinality()

def cardinality(self):
return self._methodcall(lib.hll_cardinality)

@property
def ksize(self):
return self._methodcall(lib.hll_ksize)

def add_sequence(self, sequence, force=False):
"Add a sequence into the sketch."
self._methodcall(lib.hll_add_sequence, to_bytes(sequence), len(sequence), force)

def add_kmer(self, kmer):
"Add a kmer into the sketch."
if len(kmer) != self.ksize:
raise ValueError("kmer to add is not {} in length".format(self.ksize))
self.add_sequence(kmer)

def add(self, h):
if isinstance(h, str):
return self.add_kmer(h)
return self._methodcall(lib.hll_add_hash, h)

def update(self, other):
if isinstance(other, HLL):
return self._methodcall(lib.hll_merge, other._objptr)
elif isinstance(other, MinHash):
return self._methodcall(lib.hll_update_mh, other._objptr)
else:
# FIXME: we could take sets here too (or anything that can be
# converted to a list of ints...)
raise TypeError("Must be a HyperLogLog or MinHash")

61 changes: 61 additions & 0 deletions sourmash/nodegraph.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# -*- coding: UTF-8 -*-

from collections import namedtuple
import math
from struct import pack, unpack
import sys
from tempfile import NamedTemporaryFile
Expand Down Expand Up @@ -159,3 +161,62 @@ def calc_expected_collisions(graph, force=False, max_false_pos=.2):
raise SystemExit(1)

return fp_all


def optimal_size(num_kmers, mem_cap=None, fp_rate=None):
"""
Utility function for estimating optimal nodegraph args.
- num_kmers: number of unique kmers [required]
- mem_cap: the allotted amount of memory [optional, conflicts with f]
- fp_rate: the desired false positive rate [optional, conflicts with M]
"""
if all((num_kmers is not None, mem_cap is not None, fp_rate is None)):
return estimate_optimal_with_K_and_M(num_kmers, mem_cap)
elif all((num_kmers is not None, mem_cap is None, fp_rate is not None)):
return estimate_optimal_with_K_and_f(num_kmers, fp_rate)
else:
raise TypeError("num_kmers and either mem_cap or fp_rate"
" must be defined.")


def estimate_optimal_with_K_and_M(num_kmers, mem_cap):
"""
Estimate optimal nodegraph args.
- num_kmers: number of unique kmer
- mem_cap: the allotted amount of memory
"""
n_tables = math.log(2) * (mem_cap / float(num_kmers))
int_n_tables = int(n_tables)
if int_n_tables == 0:
int_n_tables = 1
ht_size = int(mem_cap / int_n_tables)
mem_cap = ht_size * int_n_tables
fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** int_n_tables
res = namedtuple("result", ["num_htables", "htable_size", "mem_use",
"fp_rate"])
return res(int_n_tables, ht_size, mem_cap, fp_rate)


def estimate_optimal_with_K_and_f(num_kmers, des_fp_rate):
"""
Estimate optimal memory.
- num_kmers: the number of unique kmers
- des_fp_rate: the desired false positive rate
"""
n_tables = math.log(des_fp_rate, 0.5)
int_n_tables = int(n_tables)
if int_n_tables == 0:
int_n_tables = 1

ht_size = int(-num_kmers / (
math.log(1 - des_fp_rate ** (1 / float(int_n_tables)))))
ht_size = max(ht_size, 1)
mem_cap = ht_size * int_n_tables
fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** int_n_tables

res = namedtuple("result", ["num_htables", "htable_size", "mem_use",
"fp_rate"])
return res(int_n_tables, ht_size, mem_cap, fp_rate)
26 changes: 17 additions & 9 deletions sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ def search_transcript(node, seq, threshold):
from .sbt_storage import FSStorage, IPFSStorage, RedisStorage, ZipStorage
from .logging import error, notify, debug
from .index import Index
from .nodegraph import Nodegraph, extract_nodegraph_info, calc_expected_collisions
from .nodegraph import Nodegraph, extract_nodegraph_info, calc_expected_collisions, optimal_size
from .hll import HLL

STORAGES = {
'FSStorage': FSStorage,
Expand Down Expand Up @@ -1284,24 +1285,35 @@ def scaffold(datasets, storage):
InternalNode = namedtuple("InternalNode", "element left right")
BinaryLeaf = namedtuple("BinaryLeaf", "element")

hll = HLL(0.01, 1)
heap = []
for i, d1 in enumerate(datasets):
try:
d1_mh = d1.data.minhash
except AttributeError:
d1_mh = d1.minhash
hll.update(d1_mh)

for j, d2 in enumerate(datasets):
if i > j:
if isinstance(d1, SigLeaf):
common = d1.data.minhash.count_common(d2.data.minhash)
common = d1_mh.count_common(d2.data.minhash)
elif isinstance(d1, Leaf):
# TODO: Nodegraph comparison
raise NotImplementedError()
elif isinstance(d1, SourmashSignature):
common = d1.minhash.count_common(d2.minhash)
common = d1_mh.count_common(d2.minhash)
else:
raise ValueError("Input not supported")

# heapq defaults to a min heap,
# invert "common" here to avoid having to use the
# internal methods for a max heap
heapq.heappush(heap, (-common, i, j))
#n_unique_hashes = max(len(hll), 1)
n_unique_hashes = len(hll)
num_htables, htable_size, mem_use, fp_rate = optimal_size(n_unique_hashes, fp_rate=0.01)
factory = GraphFactory(31, htable_size, num_htables)

processed = set()
total_datasets = len(datasets)
Expand Down Expand Up @@ -1329,7 +1341,6 @@ def scaffold(datasets, storage):
else:
raise ValueError("Input not supported")

# TODO: keep an HLL to count unique elements
new_internal = InternalNode(element, BinaryLeaf(d1), BinaryLeaf(d2))
next_round.append(new_internal)

Expand Down Expand Up @@ -1371,9 +1382,6 @@ def scaffold(datasets, storage):

# TODO: save d into storage and unload it

# TODO: proper factory using the HLL
factory = GraphFactory(31, 1e5, 4)

# Finish processing leaves, start dealing with internal nodes in next_round
while len(next_round) > 1:
current_round = next_round
Expand Down Expand Up @@ -1403,7 +1411,7 @@ def scaffold(datasets, storage):
next_round.append(new_internal)

# TODO: make Node (data = Nodegraphs) for d1 and d2
# (maybe use len(HLL) to set Nodegraph size?)
# use the factory to create Nodegraph
# TODO: save d1 and d2 Nodes into storage and unload them
# PROBLEM: d1,d2 don't have stable names,
# and the SBT format uses internal.N where N is
Expand Down Expand Up @@ -1431,7 +1439,7 @@ def scaffold(datasets, storage):
next_round.append(new_internal)

# TODO: make Node (data = Nodegraphs) for d
# (maybe use len(HLL) to set Nodegraph size?)
# use the factory to create Nodegraph
# TODO: save d into storage and unload it
# PROBLEM: d doesn't have a stable name,
# and the SBT format uses internal.N where N is
Expand Down
4 changes: 2 additions & 2 deletions src/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ parallel = ["rayon"]

[dependencies]
backtrace = "=0.3.46" # later versions require rust 1.40
bytecount = "0.6.0"
byteorder = "1.3.4"
cfg-if = "1.0"
failure = "0.1.8" # can remove after .backtrace() is available in std::error::Error
finch = { version = "0.3.0", optional = true }
fixedbitset = "0.3.0"
getset = "0.1.1"
log = "0.4.8"
md5 = "0.7.0"
murmurhash3 = "0.0.5"
Expand All @@ -40,7 +41,6 @@ serde_json = "1.0.53"
primal-check = "0.2.3"
thiserror = "1.0"
typed-builder = "0.7.0"
getset = "0.1.1"

[target.'cfg(all(target_arch = "wasm32", target_vendor="unknown"))'.dependencies.wasm-bindgen]
version = "0.2.62"
Expand Down
3 changes: 2 additions & 1 deletion src/core/src/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ use wasm_bindgen::prelude::*;
use getset::{CopyGetters, Getters, Setters};
use typed_builder::TypedBuilder;

use crate::encodings::HashFunctions;
use crate::index::MHBT;
use crate::signature::Signature;
use crate::sketch::minhash::{max_hash_for_scaled, HashFunctions, KmerMinHashBTree};
use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHashBTree};
use crate::sketch::Sketch;
use crate::Error;

Expand Down
Loading

0 comments on commit 5522636

Please sign in to comment.