"""
Script to generate sparse bag-of-word vectors for the Sent140 dataset, to be
used after downloading and preprocessing data from LEAF. See README.
"""
import pickle
import numpy as np
import json
from collections import defaultdict
from progressbar import progressbar
from scipy.sparse import coo_matrix


def generate_bags(user_x, user_y, VOCAB_SIZE, vocab):
    """
    Create sparse bag-of-words vectors and targets from text samples.

    Args:
        - user_x: {list} of LEAF samples (the post text is at index 4)
        - user_y: {list} of integer targets
        - VOCAB_SIZE: {int} size of the bag-of-words vectors
        - vocab: {dict} mapping from word token to index

    Returns:
        - bags: {coo_matrix} of normalized bag-of-words vectors
        - y_numpy: {np.array} of targets
    """
    y_numpy = np.array(user_y, dtype=np.int32)
    bags = np.zeros((len(user_x), VOCAB_SIZE), dtype=np.float32)

    # split posts and map tokens to indexes
    for (i, post) in enumerate(user_x):
        words = post[4].split(' ')
        for word in words:
            if word in vocab:
                bags[i, vocab[word]] += 1

    # discard any posts with no tokens in the top VOCAB_SIZE most frequent
    total_words_in_vocab = np.sum(bags, axis=1)
    bad_samples = np.where(total_words_in_vocab == 0)[0]
    bags = np.delete(bags, bad_samples, axis=0)
    y_numpy = np.delete(y_numpy, bad_samples)
    total_words_in_vocab = np.delete(total_words_in_vocab, bad_samples)

    # normalize each row so its word counts sum to 1
    bags = bags / total_words_in_vocab[:, None]
    return coo_matrix(bags), y_numpy
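

# Illustrative note (a worked example, not from the original script): with
# vocab = {'good': 0, 'bad': 1} and VOCAB_SIZE = 2, a LEAF sample whose text
# field (index 4) reads 'good good movie' yields the normalized row [1.0, 0.0];
# a post containing no in-vocabulary tokens is dropped together with its target.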


def main():
    VOCAB_SIZE = 5000  # size of bag-of-words vector

    train_fname = 'all_data_niid_0_keep_10_train_8.json'  # generated by LEAF
    test_fname = 'all_data_niid_0_keep_10_test_8.json'    # generated by LEAF

    # path to training data file generated by LEAF tool
    with open(train_fname, 'r') as f:
        train = json.load(f)['user_data']
    users = train.keys()

    # path to test data file generated by LEAF tool
    with open(test_fname, 'r') as f:
        test = json.load(f)['user_data']

    # calculate the frequency of each word token in the entire dataset
    print('Calculating word frequencies...')
    words_freqs = defaultdict(int)
    for user in progressbar(users):
        for x in train[user]['x']:
            words = x[4].split(' ')
            for word in words:
                words_freqs[word] += 1

    # order the words by total usage to get the top VOCAB_SIZE tokens
    sorted_word_freqs = sorted(words_freqs.items(), key=lambda kv: kv[1], reverse=True)
    vocab = sorted_word_freqs[:VOCAB_SIZE]
    vocab = dict([(v[0], i) for (i, v) in enumerate(vocab)])
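    # vocab now maps each of the VOCAB_SIZE most frequent tokens to its
    # frequency rank (0 = most frequent); generate_bags uses these ranks as
    # column indexes into the bag-of-words matrix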

    # gen and save training vectors as sparse arrays
    print('Generating training bag-of-words...')
    train_x = []
    train_y = []
    for user in progressbar(users):
        bags, targets = generate_bags(train[user]['x'],
                                      train[user]['y'],
                                      VOCAB_SIZE,
                                      vocab)
        train_x.append(bags)
        train_y.append(targets)
    with open('./datasets/sent140/train_data_sparse_10k.pkl', 'wb') as f:
        pickle.dump((train_x, train_y), f)

    # gen and save test vectors as sparse arrays
    print('Generating testing bag-of-words...')
    test_x = []
    test_y = []
    for user in progressbar(users):
        bags, targets = generate_bags(test[user]['x'],
                                      test[user]['y'],
                                      VOCAB_SIZE,
                                      vocab)
        test_x.append(bags)
        test_y.append(targets)
    with open('./datasets/sent140/test_data_sparse_10k.pkl', 'wb') as f:
        pickle.dump((test_x, test_y), f)


if __name__ == '__main__':
    main()
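

# A minimal sketch of how the pickles written above might be loaded downstream
# (assumed usage, not part of LEAF or a documented pipeline):
#
#   import pickle
#
#   with open('./datasets/sent140/train_data_sparse_10k.pkl', 'rb') as f:
#       train_x, train_y = pickle.load(f)   # per-user lists
#   user0_dense = train_x[0].toarray()      # shape: (num_samples, VOCAB_SIZE)
#   user0_targets = train_y[0]              # shape: (num_samples,)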