import random
import os
import math
import collections
import pickle

import numpy as np
import torch

class ClassifierEvaluationDataset(torch.utils.data.Dataset):
    """Wraps tokenizer encodings (e.g. a HuggingFace BatchEncoding) for evaluation."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

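# Example usage (a sketch; `tokenizer` is an assumed HuggingFace-style tokenizer,
# not defined in this file):
#   encodings = tokenizer(["some sentence"], truncation=True, padding=True)
#   loader = torch.utils.data.DataLoader(ClassifierEvaluationDataset(encodings),
#                                        batch_size=32)
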
class AverageMeter:
    """Tracks the count, sum, and running average of a scalar metric."""

    def __init__(self):
        self.clear()

    def clear(self):
        self.cnt = 0
        self.sum = 0
        self.avg = 0

    def update(self, val, n=1):
        self.cnt += n
        self.sum += val * n
        self.avg = self.sum / self.cnt

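# Example: averaging a per-batch loss weighted by batch size.
#   meter = AverageMeter()
#   meter.update(2.0, n=3)   # batch of 3 with mean loss 2.0
#   meter.update(4.0)        # single example with loss 4.0
#   meter.avg                # (2.0 * 3 + 4.0) / 4 = 2.5
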
def evaluate(model, batches, args):
    # Average each loss term reported by model.autoenc over all batches.
    model.eval()
    meters = collections.defaultdict(AverageMeter)
    with torch.no_grad():
        for inputs, targets in batches:
            losses = model.autoenc(inputs, targets, args)
            for k, v in losses.items():
                meters[k].update(v.item(), inputs.size(1))
    loss = model.loss({k: meter.avg for k, meter in meters.items()})
    meters['loss'].update(loss)
    return meters

def shuffle_in_unison(a, b):
    # Shuffle two parallel sequences with the same permutation (returns tuples).
    c = list(zip(a, b))
    random.shuffle(c)
    a, b = zip(*c)
    return a, b

def get_anneal_weight(step, args):
    # Annealing schedule for a loss weight, selected by args.fn.
    assert args.fn != 'none'
    if args.fn == 'logistic':
        return 1 - float(1 / (1 + np.exp(-args.k * (step - args.tot_steps))))
    elif args.fn == 'sigmoid':
        return 1 - (math.tanh((step - args.tot_steps * 1.5) / (args.tot_steps / 3)) + 1)
    elif args.fn == 'linear':
        return min(1, step / args.tot_steps)
    else:
        raise ValueError("unknown option for args.fn: {}".format(args.fn))

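# Example (a sketch; SimpleNamespace stands in for the real argparse namespace):
#   from types import SimpleNamespace
#   args = SimpleNamespace(fn='linear', k=0.0025, tot_steps=1000)
#   get_anneal_weight(0, args)     # 0.0
#   get_anneal_weight(500, args)   # 0.5
#   get_anneal_weight(2000, args)  # 1.0 (clamped by min)
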
def clean_line(line):
    # Lowercase, drop punctuation, and turn hyphens and slashes into spaces.
    line = line.lower()
    for ch in ",.'!();%?\"":
        line = line.replace(ch, '')
    for ch in '-/':
        line = line.replace(ch, ' ')
    return line

def set_seed(seed):
    # Set the random seeds for reproducibility.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

def get_cbow(vocab, paths=["data/lexicon/neg_words.txt", "data/lexicon/pos_words.txt"]):
    # Re-index the vocabulary with sentiment-lexicon words removed.
    count = 0
    cbow = dict(vocab.word2idx)
    for path in paths:
        with open(path) as f:
            for line in f:
                word = line.rstrip('\n')
                count += 1
                if word in cbow:
                    del cbow[word]
    # print("Found {} sentiment words in {}".format(count, path))
    # print("Original w2i size: {}".format(len(vocab.word2idx)))
    return {w: it for it, (w, i) in enumerate(cbow.items())}

def create_glove(embd_dim=300):
    # Parse a raw GloVe text file and cache the vectors as a pickle.
    path = 'glove/glove.6B.{}d.txt'.format(embd_dim)
    print("Creating glove embds from {}".format(path))
    glove = {}
    with open(path, 'rb') as f:
        for idx, l in enumerate(f):
            if idx % 100000 == 0:
                print(idx)
            line = l.decode().split()
            glove[line[0]] = np.array(line[1:], dtype=float)
    with open('glove/glove-{}.pkl'.format(embd_dim), 'wb') as out:
        pickle.dump(glove, out)

def load_glove(embd_dim=300):
    path = 'glove/glove-{}.pkl'.format(embd_dim)
    if not os.path.exists(path):
        raise FileNotFoundError("did not find glove at {}; run create_glove first".format(path))
    with open(path, 'rb') as f:
        return pickle.load(f)

def get_glove(vocab, embd_dim=300):
    # Build an embedding matrix aligned with vocab.word2idx; words missing from
    # GloVe get a random normal vector.
    glove = load_glove(embd_dim)
    dim = glove[next(iter(glove))].size
    glove_mat = np.zeros((len(vocab.word2idx), dim))
    for word, idx in vocab.word2idx.items():
        glove_mat[idx] = glove[word] if word in glove else np.random.normal(scale=0.6, size=(dim,))
    return glove_mat

def strip_eos(sents):
    # Truncate each token list at the first '<eos>' marker, if present.
    return [sent[:sent.index('<eos>')] if '<eos>' in sent else sent
            for sent in sents]

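# Example:
#   strip_eos([['a', 'b', '<eos>', 'c'], ['d', 'e']])
#   # -> [['a', 'b'], ['d', 'e']]
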
def load_sent(path, split=True):
    # Read a file of sentences, optionally tokenized on whitespace.
    sents = []
    with open(path) as f:
        for line in f:
            sents.append(line.split() if split else line)
    return sents

def write_sent(sents, path):
    with open(path, 'w') as f:
        for s in sents:
            l = ' '.join(s) + '\n'
            # Write empty sentences as a <blank> placeholder line.
            f.write('<blank> \n' if l == '\n' else l)

def write_doc(docs, path):
    # One sentence per line; documents separated by a blank line.
    with open(path, 'w') as f:
        for d in docs:
            for s in d:
                f.write(' '.join(s) + '\n')
            f.write('\n')

def write_z(z, path):
    # One latent vector per line, as space-separated floats.
    with open(path, 'w') as f:
        for zi in z:
            for zij in zi:
                f.write('%f ' % zij)
            f.write('\n')

def logging(s, path, print_=True):
    # Print a message and/or append it to a log file.
    if print_:
        print(s)
    if path:
        with open(path, 'a+') as f:
            f.write(s + '\n')

def lerp(t, p, q):
    # Linear interpolation between points p and q.
    return (1 - t) * p + t * q

# spherical interpolation
# https://github.com/soumith/dcgan.torch/issues/14#issuecomment-199171316
def slerp(t, p, q):
    o = np.arccos(np.dot(p / np.linalg.norm(p), q / np.linalg.norm(q)))
    so = np.sin(o)
    return np.sin((1 - t) * o) / so * p + np.sin(t * o) / so * q

def interpolate(z1, z2, n):
    # n evenly spaced points from z1 to z2 inclusive, stacked along axis 0.
    z = []
    for i in range(n):
        zi = lerp(1.0 * i / (n - 1), z1, z2)
        z.append(np.expand_dims(zi, axis=0))
    return np.concatenate(z, axis=0)

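# Example:
#   interpolate(np.zeros(2), np.ones(2), 3)
#   # -> array([[0. , 0. ],
#   #           [0.5, 0.5],
#   #           [1. , 1. ]])
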
def shuffle_two_lists(list1, list2):
    # Same behavior as shuffle_in_unison above.
    temp = list(zip(list1, list2))
    random.shuffle(temp)
    res1, res2 = zip(*temp)
    return res1, res2

def flip(p=0.008):
    # Return True with probability p.
    return random.random() < p

if __name__ == "__main__":
    pass
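    # Minimal smoke test (a sketch; exercises only helpers that need no data files):
    set_seed(0)
    meter = AverageMeter()
    meter.update(2.0, n=3)
    meter.update(4.0)
    print(meter.avg)  # 2.5
    print(strip_eos([['a', 'b', '<eos>', 'c']]))  # [['a', 'b']]
    print(interpolate(np.zeros(2), np.ones(2), 3))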