-
Notifications
You must be signed in to change notification settings - Fork 67
/
preprocess.py
79 lines (74 loc) · 3.19 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/python
import numpy as np
"""Functions to do the following:
* Create vocabulary
* Create dictionary mapping from word to word_id
* Map words in captions to word_ids"""
def build_vocab(word_count_thresh):
    """Function to count caption words over the train/val/test splits and drop rare words.

    Every line of the three sentence files is split on tabs and the last field is
    used as the caption. The caption is wrapped in '<BOS>' / '<EOS>' and then split
    on single spaces to obtain the words that are counted.

    Input:
            word_count_thresh: Words whose total count is below this threshold are removed
    Output:
            word_counts: Dictionary mapping every kept word to its count
            unk_required: True if at least one word was removed (so an '<UNK>' token
                          is needed to represent it), False otherwise"""
    sentence_files = ['text_files/sents_train_lc_nopunc.txt',
                      'text_files/sents_val_lc_nopunc.txt',
                      'text_files/sents_test_lc_nopunc.txt']
    word_counts = {}
    for path in sentence_files:
        # 'with' guarantees the file handle is closed once the lines are read
        with open(path, 'r') as sents_file:
            sentences = sents_file.read().splitlines()
        for sent in sentences:
            caption = '<BOS> ' + sent.split('\t')[-1] + ' <EOS>'
            for word in caption.split(' '):
                word_counts[word] = word_counts.get(word, 0) + 1
    unk_required = False
    # Iterate over a snapshot of the keys: deleting from a dict while iterating
    # over the dict itself raises RuntimeError on Python 3.
    for word in list(word_counts):
        if word_counts[word] < word_count_thresh:
            del word_counts[word]
            unk_required = True
    return word_counts, unk_required
def word_to_word_ids(word_counts,unk_required):
    """Function to assign consecutive integer id's (starting at 0) to the vocabulary words.

    Input:
            word_counts: Dictionary with words mapped to their counts
            unk_required: If true, '<UNK>' is given id 0 and the words follow from id 1
    Output:
            word_to_id: Dictionary with words mapped to their id's
            id_to_word: Dictionary with id's mapped back to their words"""
    # Lay the tokens out in id order first, then number them in a single pass.
    tokens = ['<UNK>'] if unk_required else []
    tokens.extend(word_counts.keys())
    word_to_id = {}
    id_to_word = {}
    for index, token in enumerate(tokens):
        word_to_id[token] = index
        id_to_word[index] = token
    return word_to_id, id_to_word
def convert_caption(caption,word_to_id,max_caption_length):
    """Function to map each word in a caption to its respective id and to build caption masks.

    Each caption is split on single spaces. Captions with fewer than
    max_caption_length words are padded with '<EOS>' tokens; the mask holds 1.0 for
    the original words and 0.0 for the padding. Captions that already have
    max_caption_length words or more are neither padded nor truncated, so their
    rows keep their original length.

    Input:
            caption: A single caption string or a list of caption strings
            word_to_id: Dictionary mapping words to their respective id's
            max_caption_length: Number of words every shorter caption is padded to
    Output:
            caps: numpy array of captions with words mapped to word id's
            cap_masks: numpy array of caption masks with 1's at positions of words
                       and 0's at pad locations
    Raises:
            KeyError: if a word is missing from word_to_id and word_to_id has no
                      '<UNK>' entry to fall back on"""
    # (str, type(u'')) covers byte and unicode strings on Python 2 and str on Python 3.
    if isinstance(caption, (str, type(u''))):
        caption = [caption] # if single caption, make it a list of captions of length one
    caps, cap_masks = [], []
    for cap in caption:
        n_words = cap.count(' ') + 1
        n_pad = max_caption_length - n_words  # a negative value yields no padding below
        padded = cap + ' <EOS>' * n_pad
        cap_masks.append([1.0] * n_words + [0.0] * n_pad)
        curr_cap = []
        for word in padded.split(' '):
            if word in word_to_id:
                curr_cap.append(word_to_id[word]) # word is present in chosen vocabulary
            else:
                curr_cap.append(word_to_id['<UNK>']) # word not present in chosen vocabulary
        caps.append(curr_cap)
    return np.array(caps), np.array(cap_masks)