data_utilsV2.py

import os
import pickle
import re
from collections import Counter

import config

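# NOTE: the config module is assumed to define two hyperparameters; the names
# come from their usage below, and the example values are illustrative only:
#   config.max_s - maximum number of sentences kept per document (e.g. 20)
#   config.max_w - maximum number of words kept per sentence (e.g. 50)
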
def read_data(base_paths):
    """Collect file paths, labels, and cleaned review texts for both splits."""
    train_path_and_labels = {}
    test_path_and_labels = {}
    all_files = []
    all_data = []
    label_encode = {"neg": 0, "pos": 1}
    for base_path in base_paths:
        # The last path component ("neg" or "pos") names the class.
        label = label_encode[os.path.basename(base_path.rstrip("/"))]
        for file in os.listdir(base_path):
            path = os.path.join(base_path, file)
            if "train" in path:
                train_path_and_labels[path] = label
            else:
                test_path_and_labels[path] = label
            all_files.append(path)
    for file in all_files:
        with open(file, "r", encoding="utf-8") as f:
            all_data.append(f.read())
    # Replace the HTML line breaks in the raw reviews with a space so that
    # the surrounding words are not glued together.
    all_data = [data.replace("<br /><br />", " ") for data in all_data]
    return all_data, train_path_and_labels, test_path_and_labels
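
# For example, a review stored at "./data/train/neg/123_4.txt" (file name
# illustrative) lands in train_path_and_labels with label 0, and its cleaned
# text is appended to all_data in the same order the paths were collected.
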

def make_vocab(all_data):
    """Build word<->index maps from the 1000 most frequent whitespace tokens."""
    text = " ".join(all_data)
    counter_words = Counter(text.split(" "))
    most_common_words = counter_words.most_common(1000)
    word2index = {"PAD": 0}  # index 0 is reserved for sentence/word padding
    for index, word in enumerate(most_common_words, start=1):
        word2index[word[0]] = index
    word2index["UNK"] = len(word2index)  # fallback index for unknown words
    index2word = {index: word for word, index in word2index.items()}
    return word2index, index2word
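
# Example with a hypothetical two-review corpus:
#   all_data = ["good movie", "bad movie"]  ->  "movie" occurs twice, so
#   word2index == {"PAD": 0, "movie": 1, "good": 2, "bad": 3, "UNK": 4}
#   and index2word is the inverse mapping.
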

def encode_doc(path, word2index):
    """Encode one review as a config.max_s x config.max_w matrix of word
    indices, plus the true (pre-padding) length of every sentence."""
    temp_text = []
    temp_length = []
    with open(path, "r", encoding="utf-8") as f:  # read doc
        content = f.read()
    # Split the review into sentences on "." or "!" and keep at most max_s.
    sentences = re.split(r"[.!]", content)[:config.max_s]
    for s in sentences:
        # Drop empty tokens, then keep at most max_w words per sentence.
        words = [w for w in s.split(" ") if w][:config.max_w]
        if len(words) == 0:
            continue
        # Map each word to its index, falling back to UNK for words
        # outside the 1000-word vocabulary.
        word_index = [word2index.get(word, word2index["UNK"]) for word in words]
        temp_length.append(len(word_index))
        # Right-pad the sentence with PAD (index 0) up to max_w entries.
        word_index = word_index + [0] * (config.max_w - len(word_index))
        temp_text.append(word_index)
    # Pad the document with all-PAD sentences up to max_s entries.
    while len(temp_text) < config.max_s:
        temp_text.append([0] * config.max_w)
        temp_length.append(0)
    return temp_text, temp_length


def make_dataset(base_paths):
    all_data, train_path_and_labels, test_path_and_labels = read_data(base_paths)
    word2index, index2word = make_vocab(all_data)
    # Encode the train and test splits with the same vocabulary and pickle each.
    splits = [
        (train_path_and_labels, "./data/train.pkl"),
        (test_path_and_labels, "./data/test.pkl"),
    ]
    for path_and_labels, out_path in splits:
        data = {"text": [], "label": [], "sentence_length": []}
        for path, label in path_and_labels.items():
            temp_text, temp_length = encode_doc(path, word2index)
            data["text"].append(temp_text)
            data["label"].append(label)
            data["sentence_length"].append(temp_length)
        with open(out_path, "wb") as f:
            pickle.dump(data, f)
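
# Worked example of encode_doc for a single toy sentence (illustrative vocab,
# config.max_w = 5): the sentence "a good movie" with
#   word2index = {"PAD": 0, "a": 1, "good": 2, "movie": 3, "UNK": 4}
# encodes to [1, 2, 3, 0, 0] with a recorded sentence_length of 3; any word
# missing from word2index would map to 4 ("UNK").
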

if __name__ == "__main__":
    train_neg_base = "./data/train/neg/"
    train_pos_base = "./data/train/pos"
    test_neg_base = "./data/test/neg"
    test_pos_base = "./data/test/pos"
    base_paths = [train_neg_base, train_pos_base, test_neg_base, test_pos_base]
    make_dataset(base_paths)
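    # Quick sanity check of the generated pickles (a sketch; the key layout
    # matches what make_dataset writes above, and shapes depend on config).
    with open("./data/train.pkl", "rb") as f:
        train_data = pickle.load(f)
    print(len(train_data["text"]))        # number of training documents
    print(len(train_data["text"][0]))     # config.max_s sentences per document
    print(len(train_data["text"][0][0]))  # config.max_w word indices per sentence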