-
Notifications
You must be signed in to change notification settings - Fork 17
/
indexit.py
67 lines (51 loc) · 1.62 KB
/
indexit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pymultihash as pmh
import re
from bs4 import BeautifulSoup
import myrequests as requests
import base64
IPFSGateway = "http://blamestross.com/ipfs/"
INDEX_PATH = "index.json"
def onecount(bloomint):
    """Return the number of 1 bits in the integer ``bloomint``.

    Zero and negative values yield 0, the same result the earlier
    divide-by-two loop gave because it only ran while the value was
    positive.
    """
    if bloomint <= 0:
        return 0
    # bin() renders all binary digits in one pass; halving a multi-thousand-bit
    # integer once per bit re-copied the whole number on every step.
    return bin(bloomint).count("1")
def generateBloomFilter(wordlist):
    """Fold every word in ``wordlist`` into one integer bit field and return it.

    For each word, up to ten 256-bit groups are produced.  Each group is the
    bitwise AND of ten successive values of a hash chain (each link is the
    hash of the previous one), and the groups are concatenated by shifting
    the word's pattern left 256 bits before appending the next group.  The
    per-word patterns are OR-ed together into the returned integer.  An
    empty ``wordlist`` yields 0.
    """
    f = 0  # accumulated filter across all words
    j = 0  # rebound by the inner ``for j`` loop; only read by the debug print
    for w in wordlist:
        hashInt = 0  # bit pattern contributed by this word
        # 0x12 is presumably the multihash code for SHA2-256 -- TODO confirm
        hashVal = pmh.genHash(w, 0x12)
        for i in range(0, 10):
            try:
                tmpInt = 2**256-1  # start with all 256 bits set
                for j in range(0,10):
                    # AND-ing successive hashes keeps only the bits they share
                    tmpInt &= pmh.parseHash(hashVal)
                    hashVal = pmh.genHash(hashVal, 0x12)  # advance the chain
                hashInt = (hashInt << 256) | tmpInt
            except Exception as e:
                # Best effort: report the failure and keep going.
                # NOTE(review): when this branch runs, no group is appended for
                # this ``i``, so the word's pattern ends up with fewer groups.
                print("error ",e)
                print(hashVal, w, i, j, len(wordlist))
                # NOTE(review): assumed to belong to the except handler -- confirm
                hashVal = pmh.genHash(hashVal, 0x12)
        f |= hashInt
        j += 1  # no effect on the result; ``j`` is reset by the inner loop
    return f
def wordInFilter(bloomInt, testWord):
    """Report whether ``testWord`` may be present in the filter ``bloomInt``.

    The word is encoded with ``generateBloomFilter`` -- the same routine
    that builds the filters -- and the result is True only when every bit
    of that encoding is also set in ``bloomInt``.

    The earlier implementation compared a single hash of the word against
    the filter.  That value does not correspond to the multi-group pattern
    ``generateBloomFilter`` stores for a word, so words that had been
    indexed were not reliably reported as present.

    A True result can be a false positive, as with any Bloom filter.
    """
    # Encode the word exactly as it would have been encoded when indexed.
    wordBits = generateBloomFilter([testWord])
    return (bloomInt & wordBits) == wordBits
def filterInFilter(bloomInt, testInt):
    """Return True when every bit set in ``testInt`` is also set in ``bloomInt``."""
    # Bits of testInt that survive the mask; all of them must survive.
    overlap = bloomInt & testInt
    return overlap == testInt
def tokenizeHTML(html):
    """Extract the distinct lower-cased words from an HTML document.

    The markup is removed with BeautifulSoup's ``get_text``, the remaining
    text is cut at spaces, newlines and tabs, and each piece is stripped
    and lower-cased.  Pieces shorter than two characters are dropped.  The
    result is a list of unique tokens in no particular order.
    """
    text = BeautifulSoup(html, 'html.parser').get_text()
    tokens = set()
    for piece in re.split(r'[ \n\t]', text):
        token = piece.strip().lower()
        if len(token) > 1:
            tokens.add(token)
    return list(tokens)
def indexFile(IPFSHash):
    """Fetch a document through the IPFS gateway and build its Bloom filter.

    ``IPFSHash`` is appended to ``IPFSGateway`` to form the URL.  The text
    of the response is tokenized with ``tokenizeHTML`` and the tokens are
    passed to ``generateBloomFilter``, whose integer result is returned.
    """
    response = requests.get(IPFSGateway + IPFSHash)
    print("got request")
    words = tokenizeHTML(response.text)
    return generateBloomFilter(words)