-
Notifications
You must be signed in to change notification settings - Fork 1
/
Bin_index.py
executable file
·69 lines (59 loc) · 2.05 KB
/
Bin_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python3
from random import sample, seed
"""Creates indices as follows:
select N random sites
extract allele array from selected sites
convert to bits by allele number % 2 (or other divisor)
convert remainder to int and store in DB with cgMLST object ID
"""
class Bin_indices:
def __init__(self, indexlengths, schema, modulo=2):
self.indices = list()
for indexlength in indexlengths:
self.indices.append(
Bin_index(indexlength, schema=schema, modulo=modulo))
def search(self, item):
hits = set()
for index in self.indices:
hits.update(index.search(item))
return hits
def __len__(self):
return len(self.indices[0])
class Bin_index:
def __init__(self, indexlength, schema, randseed=None, modulo=2):
self.schema = schema
self.seed = randseed
seed(self.seed)
self.N = indexlength
self.sites = sample(range(len(self.schema)), self.N)
self.modulo = modulo
self.dict = dict()
def add(self, alleles):
idx = self.index_alleles(alleles)
self.dict.setdefault(idx,list()).append(alleles)
def search(self, alleles):
idx = self.index_alleles(alleles)
return self.dict.get(idx,list())
def index_alleles(self, alleles):
return int("".join([hex(alleles[x] % self.modulo)[2:] for x in self.sites]),self.modulo)
def accept_prob(self, threshold):
Paccept = 1
for i in range(self.N):
Paccept = Paccept * (1-(1-1/self.modulo)*(threshold+i)/(len(self.schema)))
return Paccept
def __len__(self):
return self.N
if __name__ == "__main__":
## Calculation to show how different index lengths result in
# accept probabilities at different mismatch/loci ratios
indexlength = 10
m = 10
N = 2500
alleles = list(range(N))
indices = Bin_indices([indexlength]*5, alleles, modulo=2)
print(len(indices))
for indexlength in [8, 16, 32, 64, 128]:
for m in [2, 5, 10, 50, 100, 200, 400, 1000, 2000]:
index = Bin_index(indexlength, alleles, modulo=2, randseed=3)
print("indexlength: {} m: {} N: {}, Paccept: {}".format(indexlength,
m, N, index.accept_prob(m)))