-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmonomodel.py
90 lines (70 loc) · 2.41 KB
/
monomodel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pickle
import sys
import os
# Sentence-boundary markers. Each one is stored as both the "word" and the
# "tag" of a boundary entry in the [word, tag] sequence.
start_tag = '<s>'
# The value is '<', a literal backslash, 's', '>' (not '</s>'). The backslash
# is escaped explicitly because "\s" is not a recognised escape sequence.
end_tag = '<\\s>'
# Tag lookup table: filled from mapping.txt by the script entry point and read
# by accumulate1().
main_map = {}
def ngram_cal(count_word, n_of_word, n_of_tag, gram):
    """Count every window of ``gram`` consecutive entries of ``count_word``.

    Args:
        count_word: Sequence of ``[word, tag]`` pairs, in corpus order.
        n_of_word: Dict mapping a space-joined word n-gram to its count;
            updated in place.
        n_of_tag: Dict mapping a space-joined tag n-gram to its count;
            updated in place.
        gram: Window length, i.e. the ``n`` of the n-gram.

    Returns:
        The tuple ``(n_of_word, n_of_tag)`` -- the same dict objects that
        were passed in, after the update.
    """
    # A sequence shorter than ``gram`` yields an empty range, so nothing is
    # counted in that case.
    for start in range(len(count_word) - gram + 1):
        window = count_word[start:start + gram]
        word_key = ' '.join(pair[0] for pair in window)
        tag_key = ' '.join(pair[1] for pair in window)
        n_of_tag[tag_key] = n_of_tag.get(tag_key, 0) + 1
        n_of_word[word_key] = n_of_word.get(word_key, 0) + 1
    return n_of_word, n_of_tag
def accumulate(file, count_word):
    """Append the ``[word, tag]`` pairs read from ``file`` to ``count_word``.

    Every non-blank line is split on whitespace; field 0 is used as the word
    and field 2 as the tag. A blank (or whitespace-only) line closes the
    current sentence with an ``end_tag`` entry and opens the next one with a
    ``start_tag`` entry. The whole file is additionally wrapped in one leading
    ``start_tag`` entry and one trailing ``end_tag`` entry.

    Args:
        file: Path of the text file to read.
        count_word: List of ``[word, tag]`` pairs; extended in place.

    Returns:
        A tuple ``(count_word, hindi, eng)``. The last two items are always
        empty dicts; they are kept so callers can still unpack three values.

    Raises:
        IOError/OSError: if ``file`` cannot be opened.
        IndexError: if a non-blank line has fewer than three fields.
    """
    # ``with`` guarantees the handle is closed, even if a malformed line
    # raises part-way through the file.
    with open(file, 'r') as finput:
        count_word.append([start_tag, start_tag])
        for line in finput:
            tokens = line.split()
            if tokens:
                count_word.append([tokens[0], tokens[2]])
            else:
                # No fields at all: sentence boundary.
                count_word.append([end_tag, end_tag])
                count_word.append([start_tag, start_tag])
        count_word.append([end_tag, end_tag])
    return count_word, {}, {}
def accumulate1(file, count_word):
    """Append mapped ``[word, tag]`` pairs read from ``file`` to ``count_word``.

    Every non-blank line is split on whitespace; field 1 is used as the word
    and field 4 is translated through the module-level ``main_map`` to get
    the tag. A blank (or whitespace-only) line closes the current sentence
    with an ``end_tag`` entry and opens the next one with a ``start_tag``
    entry. The whole file is additionally wrapped in one leading
    ``start_tag`` entry and one trailing ``end_tag`` entry.

    Args:
        file: Path of the text file to read.
        count_word: List of ``[word, tag]`` pairs; extended in place.

    Returns:
        A tuple ``(count_word, hindi, eng)``. The last two items are always
        empty dicts; they are kept so callers can still unpack three values.

    Raises:
        IOError/OSError: if ``file`` cannot be opened.
        IndexError: if a non-blank line has fewer than five fields.
        KeyError: if field 4 of a line is not a key of ``main_map``.
    """
    # ``with`` guarantees the handle is closed, even if a malformed line
    # raises part-way through the file.
    with open(file, 'r') as finput:
        count_word.append([start_tag, start_tag])
        for line in finput:
            tokens = line.split()
            if tokens:
                count_word.append([tokens[1], main_map[tokens[4]]])
            else:
                # No fields at all: sentence boundary.
                count_word.append([end_tag, end_tag])
                count_word.append([start_tag, start_tag])
        count_word.append([end_tag, end_tag])
    return count_word, {}, {}
if __name__=="__main__":
    # Two input paths are required: the first is read by accumulate(), the
    # second by accumulate1().
    if len(sys.argv) < 3:
        sys.exit('usage: %s <file> <file2>' % sys.argv[0])
    first_path = sys.argv[1]
    second_path = sys.argv[2]

    # mapping.txt holds one "key:value" entry per line; the text before the
    # first ':' maps to the stripped text between the first and second ':'.
    with open('mapping.txt') as map_f:
        for entry in map_f:
            if not entry.strip():
                continue  # ignore blank lines instead of failing on them
            fields = entry.split(':')
            main_map[fields[0]] = fields[1].strip()

    n_of_word = {}   # structure - {'space-joined word n-gram': count}
    n_of_tag = {}    # structure - {'space-joined tag n-gram': count}
    count_word = []  # structure - [[word,tag],[word,tag]...]
    count_word, hindi, eng = accumulate(first_path, count_word)
    count_word, hindi, eng = accumulate1(second_path, count_word)

    # Count bigrams and trigrams into the same pair of dicts.
    for gram in range(2, 4):
        n_of_word, n_of_tag = ngram_cal(count_word, n_of_word, n_of_tag, gram)
    # Single-argument call form prints the same under Python 2 and Python 3.
    print(n_of_word)

    with open('monotag.pkl', 'wb') as tag_out:
        pickle.dump(n_of_tag, tag_out)
    with open('monoword.pkl', 'wb') as word_out:
        pickle.dump(n_of_word, word_out)