-
Notifications
You must be signed in to change notification settings - Fork 0
/
inverted_index.py
71 lines (61 loc) · 1.83 KB
/
inverted_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import MapReduce
import sys
from collections import *
from math import *
from nltk.corpus import PlaintextCorpusReader
import io
"""
Inverted Index for Vector Space model of Information Retrieval in the Simple Python
MapReduce Framework. Takes documents as input and create an index of tf-idf scores
for each term in the document.
"""
mr = MapReduce.MapReduce()
# =============================
corpus_root = './Data/books'
docs = PlaintextCorpusReader(corpus_root, '.*')
fields = docs.fileids()
number_of_documents = len(fields)
def mapper(key, value):
    """Map step: emit (term, [doc_id, normalized_tf]) for each distinct term.

    key: document identifier
    value: document contents (raw text; tokens are whitespace-separated)

    Term frequency is normalized by the document's maximum raw term
    frequency so that long documents are not favored.
    """
    word_freq = defaultdict(int)
    for word in value.split():
        word_freq[word] += 1
    if not word_freq:
        return  # empty document: nothing to emit
    max_freq = max(word_freq.values())
    # Emit once per distinct term; emitting per occurrence would only
    # produce duplicate [doc_id, tf] pairs the reducer has to discard.
    for word, freq in word_freq.items():
        mr.emit_intermediate(word, [key, float(freq) / float(max_freq)])
def reducer(key, list_of_values):
    """Reduce step: emit (term, doc_count, [[doc_id, tf*idf], ...]).

    key: term
    list_of_values: [doc_id, normalized term frequency] pairs, possibly
        containing duplicates (one per occurrence of the term in a document)
    """
    # Deduplicate (doc_id, tf) pairs; a set gives O(1) membership tests
    # instead of the O(n) scan of a growing list.
    seen = set()
    postings = []
    for doc_id, tf in list_of_values:
        pair = (doc_id, tf)
        if pair not in seen:
            seen.add(pair)
            postings.append([doc_id, tf])
    count = len(postings)
    # idf = log2(N / document_frequency)
    idf = log(float(number_of_documents) / float(count), 2)
    # Build fresh [doc_id, tf*idf] entries rather than mutating the
    # caller's input lists in place.
    index = [[doc_id, tf * idf] for doc_id, tf in postings]
    # output = (term, number of docs containing the term, tf-idf postings)
    mr.emit((key, count, index))
#output = [term, no of docs containing the term, tf-idf]
# # =============================
# def createIndex(file):
#     mr.execute(file, mapper, reducer)
# Driver: read the whole input file and run the MapReduce job over it.
# NOTE(review): the file is decoded as UTF-16 and handed to execute() as one
# big string — confirm MapReduce.execute expects raw text rather than a file
# handle or a list of (doc_id, contents) records.
readpath = './InvertedIndex/big_input.json'
with open(readpath,"r",encoding = "utf16") as f:
    data = f.read()
# print(text)
mr.execute(data, mapper, reducer)