-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDocSimilarity.py
145 lines (117 loc) · 6.18 KB
/
DocSimilarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#===========================================================================================================================
# Tafadzwa Pasipanodya
# Computer Science SYE
# Python Module to provide sentence comparison functionality for a multi-document summarization application
#===========================================================================================================================
import nltk
import os
import math
import TextProcessing
class DocSimiliarity(object):
    """TF-IDF based sentence-similarity helper for multi-document summarization.

    A "sentence" here is any object exposing:
      - getWordFreqs()    -> dict mapping stemmed word -> occurrence count
      - getStemmedWords() -> iterable of the sentence's stemmed words

    NOTE: the class name keeps its original spelling ("Similiarity") because
    external callers reference it.
    """

    def __init__(self):
        # Text-processing helper (tokenizing/stemming) used by callers.
        self.text = TextProcessing.TextProcessing()

    def TFs(self, sentences):
        """Aggregate term frequencies over a whole document cluster.

        Preconditions: sentences is a list of sentence objects (see class docstring).
        Returns: dict mapping word -> total frequency across all sentences.
        """
        tfs = {}
        for sent in sentences:
            for word, freq in sent.getWordFreqs().items():
                tfs[word] = tfs.get(word, 0) + freq
        return tfs

    def TFw(self, word, sentence):
        """Return the term frequency of `word` in `sentence` (0 if absent)."""
        return sentence.getWordFreqs().get(word, 0)

    def IDFs(self, sentences):
        """Compute inverse document frequencies for every word in the cluster.

        Each sentence is treated as one "document": a word repeated within a
        single sentence contributes 1 to its document frequency, not more.
        (The previous implementation counted every occurrence, so n could
        exceed N and yield negative IDF values.)

        Preconditions: sentences is a non-None list of sentence objects.
        Returns: dict mapping word -> log10(N / n), where N is the number of
                 sentences and n the number of sentences containing the word.
        """
        N = len(sentences)
        # docFreqs[word] = number of sentences that contain `word`.
        docFreqs = {}
        for sent in sentences:
            wordFreqs = sent.getWordFreqs()
            # set() so a word repeated inside one sentence is counted once.
            for word in set(sent.getStemmedWords()):
                # Skip stemmed words with no recorded frequency (defensive,
                # mirrors the original guard).
                if wordFreqs.get(word, 0) != 0:
                    docFreqs[word] = docFreqs.get(word, 0) + 1
        # n >= 1 for every counted word, so no division-by-zero is possible.
        idfs = {}
        for word, n in docFreqs.items():
            idfs[word] = math.log10(float(N) / n)
        return idfs

    def IDF(self, word, idfs):
        """Look up the precomputed IDF value for `word`.

        Preconditions: word is a stemmed string present in `idfs`.
        Raises: KeyError if the word was not in the cluster.
        Returns: float IDF value.
        """
        return idfs[word]

    def sim(self, sentence1, sentence2, idfs):
        """Cosine similarity between two sentences under TF-IDF weighting.

        Preconditions: sentence1/sentence2 are sentence objects; idfs is the
                       dict produced by IDFs() covering all their words.
        Returns: float in [0, 1] normally; float("-inf") as a sentinel when a
                 zero denominator occurs (e.g. every shared word has IDF 0),
                 so such pairs sort below all genuine scores.
        """
        numerator = 0.0
        denom1 = 0.0
        denom2 = 0.0
        # Dot product of the two TF-IDF vectors (only words of sentence2 can
        # contribute a non-zero product term).
        for word in sentence2.getStemmedWords():
            numerator += (self.TFw(word, sentence2)
                          * self.TFw(word, sentence1)
                          * self.IDF(word, idfs) ** 2)
        # Squared Euclidean norms of each vector.
        for word in sentence1.getStemmedWords():
            denom2 += (self.TFw(word, sentence1) * self.IDF(word, idfs)) ** 2
        for word in sentence2.getStemmedWords():
            denom1 += (self.TFw(word, sentence2) * self.IDF(word, idfs)) ** 2
        try:
            return numerator / (math.sqrt(denom1) * math.sqrt(denom2))
        except ZeroDivisionError:
            # Degenerate vectors (all-zero IDF weights): sentinel score.
            return float("-inf")