-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataset.py
84 lines (75 loc) · 2.76 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import random
def create_vocab(name):
path = "./alphabet/" + name
file = open(path + "_dict.txt", mode="r", encoding="utf8")
lines = file.readlines()
list = []
dict = {}
for i in lines:
value, index = i.split("\t")
list.append(value)
dict[value] = int(index)
return list, dict
def get_items(path, word_dict, intent_dict, slot_dict):
file = open(path, mode="r", encoding="utf8")
item = []
lines = file.readlines()
tmp_word = []
tmp_slot = []
for i in lines:
if i == "\n":
continue
i = i.replace("\n", "", -1)
line = i.split(" ")
if (len(line)) == 1:
item.append((tmp_word, [1 for j in range(len(tmp_word))], tmp_slot,
[intent_dict[line[0]] for j in range(len(tmp_word))]))
tmp_slot = []
tmp_word = []
else:
if line[0] in word_dict:
tmp_word.append(word_dict[line[0]])
else:
tmp_word.append(1)
tmp_slot.append(slot_dict[line[1]])
return item
class DataProcessor:
def __init__(self):
self.intent_list, self.intent_dict = create_vocab("intent")
self.word_list, self.word_dict = create_vocab("word")
self.slot_list, self.slot_dict = create_vocab("slot")
self.batch_size = 1
self.data = {'train': get_items(
"./data/snips/train.txt",
self.word_dict,
self.intent_dict,
self.slot_dict,
), 'test': get_items(
"./data/snips/test.txt",
self.word_dict,
self.intent_dict,
self.slot_dict,
), 'dev': get_items(
"./data/snips/dev.txt",
self.word_dict,
self.intent_dict,
self.slot_dict,
)}
def get_data(self, name="train", batch_size=32):
data = self.data[name]
random.shuffle(data)
word, seq, slot, intent = ([data[i][j] for i in range(len(data))]for j in range(4))
batches = [(word[i:i + batch_size], seq[i:i + batch_size], slot[i:i + batch_size], intent[i:i + batch_size]) for i in
range(0, len(word), batch_size)]
all_batch = []
for batch in batches:
max_len = max([len(item) for item in batch[0]])
length = [len(item) for item in batch[0]]
all_batch.append(
([[item + [0 for i in range(max_len - len(item))] for item in batch[j]] for j in range(len(batch))]+[length])
)
return all_batch
if __name__ == "__main__":
# datasets = get_dataset(8, 73, 16, 10)
data_processor = DataProcessor()
data_processor.get_data('train')