-
Notifications
You must be signed in to change notification settings - Fork 12
/
datautilsbca.py
118 lines (96 loc) · 3.32 KB
/
datautilsbca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import os.path
import pandas as pd
from io import StringIO
import io
import unicodedata
import re
import numpy as np
from numpy import array
from numpy import argmax
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from nltk import sent_tokenize
def read_data(raw_text):
    """Split raw text on whitespace and return the tokens as a 1-D numpy array."""
    tokens = raw_text.split()  # default split: any run of whitespace
    return np.reshape(np.array(tokens), [-1, ])
def clean_(t_):
    """Normalize a raw text string for tokenization.

    Collapses whitespace, removes hyphenated line-break artifacts and
    URL-like tokens, pads punctuation with spaces, splits English
    contractions into separate tokens, lowercases, and finally converts
    British spellings to American ones via the us_gb.txt lexicon.

    Fix: the original regex patterns were non-raw strings ('\\s+' etc.),
    which are invalid escape sequences and emit SyntaxWarning on modern
    Python; all patterns are now raw strings with identical behavior.

    Args:
        t_: input text.

    Returns:
        The cleaned, lowercased text.
    """
    t_ = re.sub(r'\s+', ' ', t_)            # collapse all whitespace runs
    t_ = re.sub(r'- ', '', t_)              # rejoin words hyphenated across breaks
    url_reg = r'[a-z]*[:.]+\S+'
    t_ = re.sub(url_reg, '', t_)            # strip URL/host-like tokens
    t_ = re.sub(r'([.,!?()])', r' \1 ', t_)  # pad punctuation with spaces
    # Split contractions so e.g. "don't" -> "do n't", "it's" -> "it 's".
    t_ = re.sub(r'\'s', ' \'s', t_)
    t_ = re.sub(r'\'re', ' \'re', t_)
    t_ = re.sub(r'\'ll', ' \'ll', t_)
    t_ = re.sub(r'\'m', ' \'m', t_)
    t_ = re.sub(r'\'d', ' \'d', t_)
    t_ = re.sub(r'can\'t', 'can n\'t', t_)
    t_ = re.sub(r'n\'t', ' n\'t', t_)
    t_ = re.sub(r'sn\'t', 's n\'t', t_)
    t_ = re.sub(r'\s{2,}', ' ', t_)         # re-collapse spaces added above
    t_ = t_.lower()
    mydict = us_gb_dict()                   # GB -> US spelling map (reads us_gb.txt)
    t_ = replace_all(t_, mydict)
    return(t_)
def us_gb_dict(filepath='us_gb.txt'):
    """Load a GB->US spelling map from a two-section word list file.

    The file lists all British spellings first, then a line reading exactly
    "US", then the American spellings in the same order, e.g.::

        colour
        favourite
        US
        color
        favorite

    Generalization: the path was hard-coded; it is now a parameter with the
    original value as default, so callers are unaffected.

    Args:
        filepath: path to the word-list file.

    Returns:
        dict mapping each GB spelling to its US counterpart.
    """
    with open(filepath, 'r') as fp:
        read = fp.read()
    us = []
    gb = []
    gb_f = True  # True while still reading the GB section (before the "US" marker)
    for i in read.splitlines():
        line = i.strip()
        if line == "US":
            gb_f = False  # switch to collecting US spellings
        elif gb_f == True:
            gb.append(line)
        else:
            us.append(line)
    # NOTE(review): the name says us2gb but keys are GB words and values are
    # US words — matching how clean_/replace_all consume it (GB -> US).
    us2gb = dict(zip(gb, us))
    return us2gb
def replace_all(text, mydict):
    """Apply every substring replacement in mydict (key -> value) to text."""
    result = text
    for source, target in mydict.items():
        result = result.replace(source, target)
    return result
def zero_pad(X, seq_len):
    """Force every sequence in X to exactly seq_len entries.

    Long sequences are truncated to seq_len - 1 items, and every row is
    zero-padded so it always ends with at least one 0 (the pad index).
    """
    rows = []
    for seq in X:
        head = seq[:seq_len - 1]
        rows.append(head + [0] * max(seq_len - len(seq), 1))
    return np.array(rows, dtype=int)
def zero_pad_test(X, seq_len_div):
    """Pad X with all-zero rows so its row count is a multiple of seq_len_div.

    Fix: the original computed diff = seq_len_div - len(X) % seq_len_div,
    which appended a full extra block of seq_len_div zero rows when len(X)
    was already an exact multiple; the outer modulo makes that case a no-op.

    Args:
        X: sequence of equal-length int rows (all rows must match len(X[0])).
        seq_len_div: the divisor the padded row count must satisfy.

    Returns:
        int numpy array of shape (padded_len, len(X[0])).
    """
    diff = (seq_len_div - len(X) % seq_len_div) % seq_len_div  # 0 when already a multiple
    data = np.array([x for x in X], dtype=int)
    pad = np.zeros((diff, len(X[0])), dtype=int)
    return np.concatenate((data, pad), axis=0)
def read_test_set(df_test, dictionary, SEQUENCE_LEN_D = 40, SEQUENCE_LEN = 65, BATCH_SIZE = 10):
    """Encode a test dataframe into word-index sequences and one-hot labels.

    Each document in df_test['text1'] is sentence-tokenized and emitted as a
    fixed block of SEQUENCE_LEN_D rows: a [START_SENT] row, up to
    SEQUENCE_LEN_D - 2 sentences (each a list of word indices wrapped in
    START/END markers, unknown words mapped to UNK), an [END_SENT] row, and
    [0] pad rows to fill the block.

    Fix: count_iv_train / count_oov_train were incremented below without ever
    being initialized, raising UnboundLocalError on the first word processed.

    Args:
        df_test: DataFrame with 'text1' (str documents) and 'label' columns.
        dictionary: word -> int index map; must contain the special tokens
            'START_SENT', 'END_SENT', 'START', 'END', 'UNK'.
        SEQUENCE_LEN_D: number of rows emitted per document.
        SEQUENCE_LEN: unused here; kept for interface compatibility.
        BATCH_SIZE: only used for the diagnostic length printout.

    Returns:
        (X_train, y_test): list of index lists, and a one-hot label matrix.
    """
    X_train = []
    count_iv_train = 0   # words found in the vocabulary
    count_oov_train = 0  # words mapped to UNK
    for i in df_test['text1']:
        i = sent_tokenize(i)
        X_train.append([dictionary['START_SENT']])
        for j in i[:SEQUENCE_LEN_D-2]:  # reserve 2 rows for START_SENT/END_SENT
            x = read_data(str(j).lower())
            data = [dictionary['START']]
            for word in x:
                if word in dictionary:
                    index = dictionary[word]
                    count_iv_train += 1
                else:
                    index = dictionary['UNK']
                    count_oov_train += 1
                data.append(index)
            data.append(dictionary['END'])
            X_train.append(data)
        X_train.append([dictionary['END_SENT']])
        for k in range(max(SEQUENCE_LEN_D - (len(i)+2), 0)):
            X_train.append([0]) # pad token maps to 0
    print('len of test set: ', len(X_train)//BATCH_SIZE)
    rank_val = df_test['label']
    target_val = np.array(rank_val)
    # NOTE(review): `sparse=False` was renamed `sparse_output` in scikit-learn
    # 1.2 and removed in 1.4 — update when the project's sklearn is upgraded.
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = target_val.reshape(len(target_val), 1)
    y_test = onehot_encoder.fit_transform(integer_encoded)
    return X_train, y_test