main.py
# Keras 2.x-style imports (keras.preprocessing.* and keras.utils.vis_utils have moved in newer Keras releases)
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
# Load a cleaned dataset: a pickled list of [English, Korean] sentence pairs
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# Fit a tokenizer, mapping words to integers
def create_tokenizer(lines):
    print(len(lines))  # debug: number of sentences used to fit the tokenizer
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# Length of the longest sequence in a list of phrases
def max_length(lines):
    return max(len(line.split()) for line in lines)

# Integer-encode and pad sequences to a fixed length
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# One-hot encode target sequences: (samples, timesteps) -> (samples, timesteps, vocab_size)
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y
# <=======MAIN LOGIC=======>
# Load datasets (each row is an [English, Korean] pair)
dataset = load_clean_sentences('english-korean-both.pkl')
train = load_clean_sentences('english-korean-train.pkl')
test = load_clean_sentences('english-korean-test.pkl')

# Prepare the English tokenizer (column 0 of the dataset)
eng_tokenizer = create_tokenizer(array(dataset)[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(array(dataset)[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % eng_length)

# Prepare the Korean tokenizer (column 1 of the dataset)
ko_tokenizer = create_tokenizer(array(dataset)[:, 1])
ko_vocab_size = len(ko_tokenizer.word_index) + 1
ko_length = max_length(array(dataset)[:, 1])
print('Korean Vocabulary Size: %d' % ko_vocab_size)
print('Korean Max Length: %d' % ko_length)

# Prepare training data: Korean is the source (X), English is the target (Y)
trainX = encode_sequences(ko_tokenizer, ko_length, array(train)[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, array(train)[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

# Prepare testing data the same way
testX = encode_sequences(ko_tokenizer, ko_length, array(test)[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, array(test)[:, 0])
testY = encode_output(testY, eng_vocab_size)

# Sanity check: inspect one encoded source/target sample
print(testX[5])
print(testY[5])
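
# --- Illustrative sketch (not part of the original script) ---
# The layers imported above (Sequential, Embedding, LSTM, RepeatVector,
# TimeDistributed, Dense) and ModelCheckpoint are never used in this file.
# The code below is a minimal sketch of how the prepared arrays could feed
# a Korean-to-English encoder-decoder model; the hidden size (256), epochs,
# batch size, and the checkpoint filename 'model.h5' are assumptions chosen
# for illustration, not values taken from this repository.
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    # Encoder: embed the padded Korean sequence and compress it into a fixed-size vector
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    # Repeat the encoded context once per target timestep
    model.add(RepeatVector(tar_timesteps))
    # Decoder: emit a probability distribution over the English vocabulary at each timestep
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

model = define_model(ko_vocab_size, eng_vocab_size, ko_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# Keep only the weights with the lowest validation loss
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True, mode='min', verbose=1)
model.fit(trainX, trainY, epochs=30, batch_size=64,
          validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)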