-
Notifications
You must be signed in to change notification settings - Fork 0
/
Task1-First3Runs.py
221 lines (177 loc) · 6.84 KB
/
Task1-First3Runs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import Indexer
import io
import os
import math
import string
from collections import Counter
# GLOBAL CONSTANTS
CURRENT_DIR = os.getcwd()
ps = string.punctuation
# Translation table mapping every punctuation character to a space.
# BUG FIX: str.maketrans(x, y) requires x and y to be the same length, so the
# replacement string must be one space per punctuation character — the original
# single-space argument raises ValueError at import time.
trans = str.maketrans(ps, " " * len(ps))
OUTPUT_DIR = "Outputs/"
DOC_TOKEN_COUNT = {}   # doc_id -> token count of that document (filled from Indexer)
INVERTED_INDEX = {}    # term -> {doc_id: term frequency} (filled from Indexer)
QUERY_ID = 0           # id of the query currently being scored (set in __main__)
with open("common_words", 'r') as f:
    STOP_WORDS = f.read().splitlines()
def avg_doc_len():
    """Return the mean document length (in tokens) over the indexed corpus."""
    return float(sum(DOC_TOKEN_COUNT.values())) / float(len(DOC_TOKEN_COUNT))
def query_index(sent_q):
    """Build a mini inverted index restricted to the query's terms.

    Terms missing from INVERTED_INDEX map to an empty posting dict so callers
    can iterate uniformly.
    """
    return {term: INVERTED_INDEX.get(term, {}) for term in sent_q.lower().split()}
def read_rel_info():
    """Return the doc ids judged relevant for the current QUERY_ID that are
    also present in the indexed corpus (in corpus iteration order).

    Reads cacm.rel.txt, whose lines look like: <qid> Q0 <doc_id> <grade>.
    """
    relevant_docs = set()  # set membership: O(1) lookups instead of list scans
    q_id = str(QUERY_ID)
    with io.open("cacm.rel.txt", 'r', encoding="utf-8") as relevance_file:
        # Iterate the file lazily instead of materializing readlines().
        for line in relevance_file:
            values = line.split()
            if values and values[0] == q_id:
                relevant_docs.add(values[2])
    # Keep the original return type (list) and corpus ordering.
    return [doc_id for doc_id in DOC_TOKEN_COUNT if doc_id in relevant_docs]
def rel_doc_count(docs_with_term, relevant_docs):
    """Count how many documents containing the term are judged relevant."""
    return sum(1 for doc_id in docs_with_term if doc_id in relevant_docs)
def BM25_score(new_q):
    # Computes BM25 scores (with relevance feedback) for all documents that
    # contain at least one query term.
    # Returns a map of document ids to their accumulated BM25 score.
    new_q = new_q.lower()
    DOC_SCORE = {}
    rel_docs = read_rel_info()          # judged-relevant docs for current QUERY_ID
    R = len(rel_docs)                   # number of known relevant documents
    q_tf = Counter(new_q.split())       # term frequency within the query itself
    new_q_index = query_index(new_q)    # term -> {doc_id: tf} for query terms only
    avdl = avg_doc_len()                # average document length over the corpus
    N = len(DOC_TOKEN_COUNT)            # total number of documents
    # Standard BM25 tuning constants.
    k1 = 1.2
    k2 = 100
    b = 0.75
    for query_term in new_q.split():
        qf = q_tf[query_term]                       # term frequency in the query
        n = len(new_q_index[query_term])            # number of docs containing the term
        if query_term in INVERTED_INDEX:
            r = rel_doc_count(INVERTED_INDEX[query_term], rel_docs)  # relevant docs containing term
        else:
            r = 0
        dl = 0
        for doc in new_q_index[query_term]:
            f = new_q_index[query_term][doc]        # term frequency in this document
            if doc in DOC_TOKEN_COUNT:
                dl = DOC_TOKEN_COUNT[doc]
            # NOTE(review): if a doc id were missing from DOC_TOKEN_COUNT, dl
            # would silently keep the previous document's length — presumably
            # every indexed doc has a token count; verify against Indexer.
            # K normalizes term frequency by document length.
            K = k1 * ((1 - b) + (b * (float(dl) / float(avdl))))
            # Robertson–Spärck Jones relevance weight (with 0.5 smoothing).
            relevance_part = math.log(((r + 0.5) / (R - r + 0.5)) / ((n - r + 0.5) / (N - n - R + r + 0.5)))
            k1_part = ((k1 + 1) * f) / (K + f)      # document term-frequency component
            k2_part = ((k2 + 1) * qf) / (k2 + qf)   # query term-frequency component
            if doc in DOC_SCORE:
                DOC_SCORE[doc] += (relevance_part * k1_part * k2_part)
            else:
                DOC_SCORE[doc] = (relevance_part * k1_part * k2_part)
    # return doc scores
    return DOC_SCORE
def QLM_score(new_q):
    # Computes query-likelihood (Jelinek-Mercer smoothed) log scores for all
    # documents containing at least one query term.
    # Returns a map of document ids to their QLM score.
    DOC_SCORE_QLM = {}
    C = 0                       # total number of word occurrences in the collection
    lambda_value = 0.35         # smoothing weight for the collection model
    new_q_index = query_index(new_q)
    # Accumulate the collection size C over all documents.
    # (Initializing every doc's score to 0 was deliberately left out, so docs
    # with no query terms simply get no entry.)
    for doc in DOC_TOKEN_COUNT:
        # DOC_SCORE_QLM[doc] = 0
        C = C + DOC_TOKEN_COUNT[doc]  # total number of words in collection
    for query_term in new_q.split():
        cq = 0
        for doc in new_q_index[query_term]:
            cq = cq + new_q_index[query_term][doc]  # total occurrence of query term in collection
        for doc in new_q_index[query_term]:
            D = DOC_TOKEN_COUNT[doc]                # total number of words in doc
            fq = new_q_index[query_term][doc]       # total occurrence of query term in doc
            # Mix the document model with the collection model (JM smoothing).
            first_part = float(1 - lambda_value) * (fq / D)
            second_part = float(lambda_value) * (cq / C)
            if doc in DOC_SCORE_QLM:
                DOC_SCORE_QLM[doc] += math.log(first_part + second_part)
            else:
                DOC_SCORE_QLM[doc] = math.log(first_part + second_part)
    # return doc scores in descending order.
    return DOC_SCORE_QLM
def tfidf_score(new_q):
    """Compute tf-idf scores for every document containing a query term.

    Returns a map of document ids to their summed tf-idf weight.
    """
    DOC_SCORE_TFIDF = {}
    new_q_index = query_index(new_q)
    for term, postings in new_q_index.items():
        # Smoothed inverse document frequency for this query term.
        idf = 1.0 + math.log(float(len(DOC_TOKEN_COUNT)) / float(len(postings) + 1))
        for doc_id, freq in postings.items():
            # Term frequency normalized by document length.
            tf = float(freq) / float(DOC_TOKEN_COUNT[doc_id])
            DOC_SCORE_TFIDF[doc_id] = DOC_SCORE_TFIDF.get(doc_id, 0) + tf * idf
    return DOC_SCORE_TFIDF
def write_to_file(doc_scores, q_id, output_file):
    """Append the top-100 scored documents to OUTPUT_DIR/<output_file>.txt.

    Each line is TREC-eval format: "<qid> Q0 <doc> <rank> <score> <tag>".
    BUG FIX: the original iterated range(1, min(len, 101)), which skipped
    index 0 — the single best-scoring document was dropped from every run.
    """
    with open(OUTPUT_DIR + output_file + ".txt", "a+") as out_file:
        # Stable sort preserves insertion order among tied scores, matching
        # the original sorted(..., key=doc_scores.get, reverse=True).
        ranked = sorted(doc_scores.items(), key=lambda kv: kv[1], reverse=True)
        for rank, (doc, score) in enumerate(ranked[:100], start=1):
            out_file.write(str(q_id) + " Q0 " + doc + " " + str(rank) + " " + str(score) + " " + output_file + "\n")
if __name__ == '__main__':
    # Build the unigram inverted index and pull its statistics into this
    # module's globals, which the scoring functions read.
    Indexer.unigram_index(False)
    INVERTED_INDEX = Indexer.INVERTED_INDEX
    DOC_TOKEN_COUNT = Indexer.DOC_TOKEN_COUNT
    models = ["BM25RelevanceRun", "TFIDFRun", "QLRun"]
    # Remove stale run files — write_to_file opens in append mode.
    for model in models:
        if os.path.exists(OUTPUT_DIR + model + ".txt"):
            os.remove(OUTPUT_DIR + model + ".txt")
    # Parse cacm.query.txt: each query is wrapped in <DOC>...</DOC> tags.
    # BUG FIX: the file handle was never closed; use a with-statement.
    queries = []
    query = ""
    with open("cacm.query.txt", 'r') as query_file:
        for line in query_file:
            if line == "\n":
                continue
            if line.startswith("<DOCNO>") or line.startswith("<DOC>"):
                continue
            if line.startswith("</DOC>"):
                queries.append(query.strip().lower())
                query = ""
                continue
            # Strip punctuation (via the module-level translation table).
            query += " " + line.rstrip("\n").strip().translate(trans)
    # Score every query with all three models and write the run files.
    for q in queries:
        QUERY_ID += 1
        scores = BM25_score(q)
        OUTPUT_FILE = models[0]
        write_to_file(scores, QUERY_ID, OUTPUT_FILE)
        scores = tfidf_score(q)
        OUTPUT_FILE = models[1]
        write_to_file(scores, QUERY_ID, OUTPUT_FILE)
        scores = QLM_score(q)
        OUTPUT_FILE = models[2]
        write_to_file(scores, QUERY_ID, OUTPUT_FILE)
        print("Completed Retrieval for query : " + q)
    print("End of Retrieval.")