-
Notifications
You must be signed in to change notification settings - Fork 0
/
probabilities.py
277 lines (233 loc) · 11.7 KB
/
probabilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
from __future__ import division
__author__ = 'sagar, chitesh'
'''
Class holds all the probabilities matrices and maps. USed by all other modules for calculations.
Performs all the operations from the probabilities
'''
from collections import Counter, defaultdict
import logging
import re
from constants import Constants
# set debug level
logging.basicConfig(level=logging.INFO)
class Probabilities:
word_prob = Counter() # P(W|S)
speech_prob = Counter() # P(S)w
speech_rel_prob = Counter() # P(Si+1|Si)
total_speech_count = 0 # total occurrences of speech in train data
unknown_words = defaultdict(str) # stores new words and best possible speech for them
default_unknown_prob = 1.0
total_sentences = 0
suffix_counter = Counter() # suffixes of words and their counter
# Best - MEMM
word_prev_speech_prob = defaultdict(Counter)
word_next_speech_prob = defaultdict(Counter)
word_count = Counter()
@staticmethod
def train_from_data(data):
# data is in the form <word, part_of_speech><word, part_of_speech>
# Let's parse each line one by one and calculate P(W|S), P(S), and P(Si+1|Si)
# positions of words and tags in train data
logging.info("Started working on training data, let's see what we have got!")
WORD_POS = 0
SPEECH_POS = 1
Probabilities.total_speech_count = 2 # start and end symbols
for line in data:
# number of sentences
Probabilities.total_sentences += 1
# (PS1) probability of first part of speech
Probabilities.speech_rel_prob[line[SPEECH_POS][0] + Constants.delimiter + Constants.first_word] += 1
# (PSn) probability of last part of speech
Probabilities.speech_rel_prob[Constants.last_word +
Constants.delimiter + line[SPEECH_POS][len(line[0]) - 1]] += 1
previous_speech = ""
previous_word = ""
for j in range(0, len(line[WORD_POS])):
word = line[WORD_POS][j]
speech = line[SPEECH_POS][j]
# transition probabilities
if previous_speech:
tran_key = speech + Constants.delimiter + previous_speech
Probabilities.speech_rel_prob[tran_key] += 1
previous_speech = speech
# total number of words in the train data
Probabilities.total_speech_count += 1
# increase count for corresponding speech
Probabilities.speech_prob[speech] += 1
# create unique key in the form word+speech e.g. <the,det> -> thedet to save the counts for it
# increase count for corresponding word-speech
key = word + Constants.delimiter + speech
Probabilities.word_prob[key] += 1
# separate last 3 letters as suffix from data and maintain counter for it
if len(word) > 2:
suffix = word[-3:]
Probabilities.suffix_counter[suffix + Constants.delimiter + speech] += 1
# Best - MEMM
# P(word/prev-speech)
Probabilities.word_prev_speech_prob[word][previous_speech] += 1
if previous_word:
# P(word/next-speech)
Probabilities.word_next_speech_prob[previous_word][speech] += 1
previous_word = word
Probabilities.word_count[word] += 1
Probabilities.apply_smoothing()
logging.info("Finished learning from data. Total words : " + str(Probabilities.total_speech_count) + ". " +
"whoa! nice training set. " if Probabilities.total_speech_count > 50000 else " hmm, I know a better "
"training set.")
# Updates the transition probabilities to avoid case of zero probability. Also converts counts to the
# corresponding probability
@staticmethod
def apply_smoothing():
speeches = Probabilities.speech_prob.keys()
for current in speeches:
Probabilities.speech_rel_prob[current + Constants.delimiter + Constants.first_word] += 1
Probabilities.speech_rel_prob[Constants.last_word + Constants.delimiter + current] += 1
Probabilities.speech_prob[current] += 2
Probabilities.total_speech_count += 2
for previous in speeches:
Probabilities.speech_rel_prob[current + Constants.delimiter + previous] += 1
Probabilities.speech_prob[current] += 1
Probabilities.speech_prob[previous] += 1
Probabilities.total_speech_count += 2
Probabilities.convert_word_speech_probabilities()
Probabilities.convert_transition_probabilities()
Probabilities.convert_speech_probabilities()
Probabilities.default_unknown_prob = 0.5 / Probabilities.total_speech_count
# convert count(words|speech) to P(words|speech)
@staticmethod
def convert_word_speech_probabilities():
for word_speech in Probabilities.word_prob.keys():
speech = word_speech.split(Constants.delimiter)[1]
Probabilities.word_prob[word_speech] /= Probabilities.speech_prob[speech]
# convert count(speech) to P(speech)
@staticmethod
def convert_speech_probabilities():
for speech in Probabilities.speech_prob.keys():
Probabilities.speech_prob[speech] /= Probabilities.total_speech_count
# convert count(Si+1|Si) to P(Si+1|Si)
@staticmethod
def convert_transition_probabilities():
for key in Probabilities.speech_rel_prob.keys():
speech = key.split(Constants.delimiter)
# if probability of coming first or last in sentence
if speech[1] == Constants.first_word or speech[0] == Constants.last_word:
Probabilities.speech_rel_prob[key] /= Probabilities.total_sentences
else:
Probabilities.speech_rel_prob[key] /= Probabilities.speech_prob[speech[0]]
# returns P(Si+1|Si)
@staticmethod
def get_transition_prob(current, previous):
key = current + Constants.delimiter + previous
return Probabilities.speech_rel_prob[key]
# P(S1)
@staticmethod
def get_first_speech_prob(speech):
return Probabilities.get_transition_prob(speech, Constants.first_word)
# P(Sn)
@staticmethod
def get_last_speech_prob(speech):
return Probabilities.get_transition_prob(Constants.last_word, speech)
# returns P(word|speech)
@staticmethod
def get_word_probability(word, speech):
key = word + Constants.delimiter + speech
prob = Probabilities.word_prob[key]
# word is seen first time for this speech
if prob == 0:
# check if its totally new word
best_speech = Probabilities.unknown_words[word]
if not best_speech:
# yes, its completely new word
prob = Probabilities.default_unknown_prob
# if heuristically calculated speech matches current speech
elif best_speech == speech:
prob = 1.0
else:
prob = Probabilities.default_unknown_prob
return prob
@staticmethod
def get_naive_word_probability(word, speech):
key = word + Constants.delimiter + speech
prob = Probabilities.word_prob[key]
return prob
@staticmethod
def get_posterior_word_probability(word, speech):
key = word + Constants.delimiter + speech
prob = Probabilities.word_prob[key]
if prob == 0:
prob = Probabilities.default_unknown_prob
return prob
@staticmethod
def get_best_possible_speech(word):
speech = Constants.default_speech_tag
if Probabilities.contains_digit(word):
speech = Constants.number_tag
elif word.endswith("ly"):
speech = Constants.adverb_tag
elif word.endswith("-like"):
speech = Constants.adjective_tag
elif word.endswith("ed"):
speech = Constants.verb_tag
# Check suffix of word to calculate best possible speech
elif len(word) > 2:
max_speech = ""
max_speech_cnt = 0
total_cnt = 0
for s in Probabilities.speech_prob.keys():
count = Probabilities.suffix_counter[word[-3:] + Constants.delimiter + s]
total_cnt += count
if max_speech_cnt < count:
max_speech_cnt = count
max_speech = s
# if 60% of time same speech comes for given suffix, use it
if max_speech and max_speech_cnt / total_cnt > 0.6:
speech = max_speech
Probabilities.unknown_words[word] = speech
return speech
@staticmethod
def contains_digit(word):
return bool(re.compile("\d").search(word))
@staticmethod
# Best - MEMM
def convert_word_seq_speech_probabilities():
for word in Probabilities.word_count.keys():
for prev_speech in Probabilities.speech_prob.keys():
Probabilities.word_prev_speech_prob[word][prev_speech] = (Probabilities.word_prev_speech_prob[word][
prev_speech] + 1) / \
Probabilities.word_count[word]
for next_speech in Probabilities.speech_prob.keys():
Probabilities.word_next_speech_prob[word][next_speech] = (Probabilities.word_next_speech_prob[word][
next_speech] + 1) / \
Probabilities.word_count[word]
@staticmethod
def get_prev_word_speech_probability(word, speech):
prob = Probabilities.word_prev_speech_prob[word][speech]
return prob
@staticmethod
def get_next_word_speech_probability(word, speech):
prob = Probabilities.word_next_speech_prob[word][speech]
return prob
@staticmethod
def convert_algo_results(results, sentence):
previous_speech = ""
for wordIndex in range(len(sentence)):
word = sentence[wordIndex]
if Probabilities.word_count[word] == 0:
for resultIndex in range(len(results)):
if wordIndex != 0:
previous_speech = results[resultIndex][wordIndex - 1]
if len(sentence) != 1 and wordIndex != len(sentence) - 1:
next_speech = results[resultIndex][wordIndex + 1]
if wordIndex == 0:
Probabilities.word_prev_speech_prob[word][next_speech] += 1
elif wordIndex == len(sentence):
Probabilities.word_next_speech_prob[word][previous_speech] += 1
else:
Probabilities.word_next_speech_prob[word][previous_speech] += 1
Probabilities.word_prev_speech_prob[word][next_speech] += 1
for next in Probabilities.speech_prob.keys():
Probabilities.word_next_speech_prob[word][next] = (Probabilities.word_next_speech_prob[word][
next] + 1) / (len(results) + 1)
for prev in Probabilities.speech_prob.keys():
Probabilities.word_prev_speech_prob[word][prev] = (Probabilities.word_next_speech_prob[word][
prev] + 1) / (len(results) + 1)