-
Notifications
You must be signed in to change notification settings - Fork 2
/
vocab_file_writer.py
42 lines (31 loc) · 1.24 KB
/
vocab_file_writer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
"""Create vocabulary file from text corpus.
"""
from collections import Counter

INPUT_FILE = 'train.txt'
OUTPUT_FILE = 'vocab-train.txt'
# Vocab max token size (number of words)
TOKEN_TO_PICK = 800000
# Special tokens that always open the vocab file, in this order.
RESERVED_TOKENS = ('<S>', '</S>', '<UNK>')

# Kept at module level for backward compatibility; filled in when the file
# is executed as a script.
vocab_dict = {}
token_count = 0


def count_tokens(lines):
    """Count whitespace-separated tokens over an iterable of text lines.

    Args:
        lines: Iterable of strings (an open text file works).

    Returns:
        A ``(counts, total)`` tuple: ``counts`` is a ``Counter`` mapping
        each distinct token to its number of occurrences, ``total`` is the
        number of tokens seen, duplicates included.
    """
    counts = Counter()
    total = 0
    for line in lines:
        tokens = line.split()
        total += len(tokens)
        counts.update(tokens)
    return counts, total


def select_top_tokens(counts, limit):
    """Return at most ``limit`` tokens, most frequent first.

    Ties in frequency are broken by the token string in descending order,
    i.e. the order obtained by reverse-sorting ``(count, token)`` pairs.
    Tokens listed in ``RESERVED_TOKENS`` are skipped because the vocab
    header already contains them; emitting them again would put the same
    token on two lines of the output.

    Args:
        counts: Mapping of token -> occurrence count.
        limit: Maximum number of tokens to return; must be >= 0.

    Returns:
        List of token strings.

    Raises:
        ValueError: If ``limit`` is negative (a negative slice bound would
            silently drop tokens from the tail instead of limiting).
    """
    if limit < 0:
        raise ValueError("limit must be non-negative, got %r" % (limit,))
    ranked = sorted(
        ((count, token) for token, count in counts.items()
         if token not in RESERVED_TOKENS),
        reverse=True,
    )
    return [token for _, token in ranked[:limit]]


def format_vocab(tokens):
    """Render the vocab file content: reserved header, then one token per line.

    No trailing newline is emitted, so the number of lines equals the
    number of vocab entries.
    """
    return "\n".join(list(RESERVED_TOKENS) + list(tokens))


def main(input_path=INPUT_FILE, output_path=OUTPUT_FILE, limit=TOKEN_TO_PICK):
    """Build the vocab file for ``input_path`` and write it to ``output_path``.

    The corpus is read completely before the output file is opened, so a
    failure while reading does not leave a truncated vocab file behind.
    Both files are handled as UTF-8 instead of the platform default.

    Args:
        input_path: Path of the text corpus to read.
        output_path: Path of the vocab file to write.
        limit: Maximum number of corpus tokens to keep.

    Returns:
        The ``(counts, total)`` tuple produced by ``count_tokens``.
    """
    with open(input_path, "r", encoding="utf-8") as input_file:
        counts, total = count_tokens(input_file)

    found = len(counts)
    print("Found so many token:", found)
    print("Selecting so many of the top token:", limit)
    selected = select_top_tokens(counts, limit)
    print("New number of token:", len(selected))
    # Reserved tokens present in the corpus are counted here as removed,
    # since they are left out of the ranked section.
    print("Removed so many token:", found - len(selected))

    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write(format_vocab(selected))

    print("Token count of input file:", total)
    return counts, total


if __name__ == '__main__':
    vocab_dict, token_count = main()