-
Notifications
You must be signed in to change notification settings - Fork 14
/
compute_vocab_pos.py
128 lines (110 loc) · 3.97 KB
/
compute_vocab_pos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import io
import json
import os.path as osp
import sys

import nltk
# Every POS tag that gets its own bucket in the statistics. The order of this
# list is the order in which the per-tag dictionaries are created.
nltk_tags = [
    # punctuation and symbol tags
    "$", "--", ",", ".", "''", "(", ")", "``",
    # word-class tags
    "CC", "CD", "DT", "EX", "FW", "IN",
    "JJ", "JJR", "JJS",
    "LS", "MD",
    "NN", "NNP", "NNPS", "NNS",
    "PDT", "POS", "PRP", "PRP$",
    "RB", "RBR", "RBS", "RP",
    "SYM", "TO", "UH",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
    "WDT", "WP", "WP$", "WRB",
    # trailing punctuation tag
    ":",
]

# Convenience groupings of the tags above.
nouns = ["NN", "NNP", "NNPS", "NNS"]
verbs = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]

adjectivesJJ = ["JJ"]
adjectivesJJR = ["JJR"]
adjectivesJJS = ["JJS"]
adjectives = adjectivesJJ + adjectivesJJR + adjectivesJJS

numbers = ["CD"]
adverbs = ["RB", "RBR", "RBS"]
determiners = ["DT"]
prepositions = ["IN"]
particles = ["RP"]
pronouns = ["PRP", "PRP$"]
def parsePos(pos, tagsDict):
    """Fold one tagged sentence into the running per-tag word counts.

    Args:
        pos: iterable of ``(word, tag)`` pairs, e.g. the output of
            ``nltk.pos_tag``.
        tagsDict: dict mapping a POS tag to a ``{word: count}`` dict. It is
            updated in place.

    Returns:
        A ``(tagsDict, phraseDict)`` tuple. ``tagsDict`` is the same object
        that was passed in. ``phraseDict`` maps every tag in ``nltk_tags`` to
        the list of lower-cased words of this sentence carrying that tag, in
        sentence order (empty list when the tag does not occur).
    """
    phraseDict = {t: [] for t in nltk_tags}
    for word, tag in pos:
        word = word.lower()
        # A tagger may emit a tag that is not listed in nltk_tags (for example
        # "#"); create its bucket on demand instead of raising KeyError.
        counts = tagsDict.setdefault(tag, {})
        counts[word] = counts.get(word, 0) + 1
        phraseDict.setdefault(tag, []).append(word)
    return tagsDict, phraseDict
def updateVocab(words, vocab):
    """Add the lower-cased occurrences of ``words`` to ``vocab``.

    Args:
        words: iterable of strings.
        vocab: dict mapping a lower-cased word to its count; updated in place.

    Returns:
        The same ``vocab`` object, after the update.
    """
    for token in words:
        key = token.lower()
        vocab[key] = vocab.get(key, 0) + 1
    return vocab
def _cleanQuery(query):
    """Replace punctuation in a caption so it splits into plain words."""
    # Applied in this exact order; '&' becomes the word "and" and the
    # typographic apostrophe (U+2019) becomes a plain "'".
    for old, new in (
            (',', ' '), ('.', ' '), (':', ' '), (';', ' '), ('!', ' '),
            ('?', ' '), ('"', ' '), ('&', ' and '), ('@', ' '), ('(', ' '),
            (')', ' '), ('[', ' '), (']', ' '), ('<', ' '), ('>', ' '),
            ('`', ' '), ('#', ' '), (u'\u2019', "'")):
        query = query.replace(old, new)
    return query


def parseQueries(queryFile):
    """Compute POS-tag and vocabulary statistics for a captions JSON file.

    The file must contain a top-level ``'results'`` object whose values each
    have a ``'sentences'`` list of caption strings.

    Args:
        queryFile: path of the JSON file to read.

    Returns:
        A ``(tagsDict, query2TagDict, vocab)`` tuple where
        ``tagsDict`` maps POS tag -> {lower-cased word: count} over all
        sentences, ``query2TagDict`` maps each key of ``'results'`` to a list
        of ``{'query': cleaned sentence, 'phraseDict': words per tag}``
        entries, and ``vocab`` maps each lower-cased whitespace-separated
        word to its count.
    """
    tagsDict = {t: {} for t in nltk_tags}  # words per POS tag
    vocab = {}  # all words
    query2TagDict = {}  # words per video

    # Close the input deterministically and decode it as UTF-8 regardless of
    # the platform's default encoding.
    with io.open(queryFile, 'r', encoding='utf-8') as infile:
        queries = json.load(infile)['results']

    for vid_id, entry in queries.items():
        query2TagDict[vid_id] = []
        for query in entry['sentences']:
            # NOTE(review): an empty sentence stops processing of the
            # remaining sentences of this entry (break, not continue); kept
            # as in the original -- confirm this is intended.
            if query == "":
                break
            query = _cleanQuery(query)

            # The vocabulary uses plain whitespace splitting, while POS
            # tagging uses NLTK's tokenizer, so the two may segment
            # differently (e.g. around apostrophes).
            vocab = updateVocab(query.split(), vocab)
            pos = nltk.pos_tag(nltk.word_tokenize(query))
            tagsDict, phraseDict = parsePos(pos, tagsDict)
            query2TagDict[vid_id].append(
                {'query': query, 'phraseDict': phraseDict})
    return tagsDict, query2TagDict, vocab
if __name__ == '__main__':
    # Usage: python compute_vocab_pos.py <captions.json>
    #
    # may need: nltk.download('averaged_perceptron_tagger')
    # (nltk.word_tokenize may additionally need the 'punkt' tokenizer data,
    # depending on the installed NLTK version -- TODO confirm.)
    if len(sys.argv) < 2:
        sys.exit('usage: %s <captions.json>' % sys.argv[0])

    # Prefix of the JSON output file; note it is fixed and does not depend on
    # the input path given on the command line.
    fileName = 'val_1.json'

    print(sys.argv[1])
    tagsDict, query2TagDict, vocab = parseQueries(sys.argv[1])

    pos_stats = {
        'tagsDict': tagsDict,
        'query2TagDict': query2TagDict,
        'vocab': vocab,
    }
    with open(fileName + '-pos.json', 'w') as fout:
        json.dump(pos_stats, fout)

    # save some tags
    # Files are opened as UTF-8 text and written with text strings, so the
    # words come out verbatim on both Python 2 and Python 3 (formatting an
    # encoded bytes object with %s would write "b'...'" on Python 3).
    for tag in ['NN', 'NNS', 'PRP', 'VB']:
        with io.open('vocab.%s.tsv' % tag, 'w', encoding='utf-8') as fout:
            for word, count in tagsDict[tag].items():
                fout.write(u'%s\t%d\n' % (word, count))

    # save full vocab
    with io.open('vocab.tsv', 'w', encoding='utf-8') as fout:
        for word, count in vocab.items():
            fout.write(u'%s\t%d\n' % (word, count))