vectorize_documents.py
import pickle

import numpy as np

# Path to the pre-trained GloVe Common Crawl 840B vectors (300-d).
w2v_file = "data/glove.840B.300d.txt"
# Module-level cache for the loaded word vectors.
w2v = None

def vectorize_tagged_items(item_features, item_feature_labels, w2v_pickle=None):
    """
    Returns dense 300-d vector representations of each item by averaging the
    word embeddings of its tags.

    Params
    ------
    item_features: sparse scipy coo or csr matrix of shape (num_items, num_tags),
        where each row is a bag-of-words vector
    item_feature_labels: array of strings of length num_tags; string labels for
        each tag, corresponding to the column indices of item_features
    w2v_pickle: optional path to a pickled dict mapping words to vectors; if
        given, it is loaded instead of scanning the GloVe text file
    """
    global w2v
    items, vocab = get_item_objects(item_features, item_feature_labels)
    if w2v_pickle:
        with open(w2v_pickle, 'rb') as f:
            w2v = pickle.load(f)
    if not w2v:
        w2v = load_bin_vec(w2v_file, vocab)
    W, word_idx_map = get_W(w2v)
    item_embeddings = make_document_embeddings(items, W, word_idx_map)
    return items, item_embeddings
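
# Hypothetical convenience sketch (not part of the original module): builds the
# kind of pickle that the w2v_pickle parameter above can consume, so repeated
# runs skip the slow scan of the GloVe text file. The helper name and cache
# path are illustrative assumptions.
def build_w2v_pickle(vocab, pickle_path="data/w2v_cache.pkl"):
    vecs = load_bin_vec(w2v_file, vocab)  # filter GloVe down to this vocab
    with open(pickle_path, 'wb') as f:
        pickle.dump(vecs, f)
    return vecs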

def get_item_objects(item_features, item_feature_labels):
    """
    Groups the nonzero entries of the item-tag matrix into one document per
    item. Each document holds the item's raw tags, the hyphen-split words of
    those tags ('indie-pop' contributes 'indie' and 'pop'), and a unit weight
    per word.
    """
    item_features = item_features.tocoo()
    # The vocab is built from the hyphen-split pieces, since those are the
    # words that are actually looked up when embedding the documents.
    vocab = set()
    for label in item_feature_labels:
        vocab.update(label.split("-"))
    items = [None] * item_features.shape[0]
    for item_idx, col_idx in zip(item_features.row, item_features.col):
        tag = item_feature_labels[col_idx]
        split_tags = tag.split("-")
        if items[item_idx] is None:
            # First tag seen for this item: start its document.
            items[item_idx] = {'text': [], 'tags': [], 'weights': []}
        items[item_idx]['tags'].append(tag)
        items[item_idx]['text'].extend(split_tags)
        items[item_idx]['weights'].extend([1] * len(split_tags))
    return items, vocab
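
# Illustrative example (assumed data, not from the original file): for labels
# ["rock", "indie-pop"] and a 1x2 matrix with both entries set, the single
# item's document comes out as
#   {'text': ['rock', 'indie', 'pop'],
#    'tags': ['rock', 'indie-pop'],
#    'weights': [1, 1, 1]}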

def sparse_vectorize_tagged_items(item_features, item_feature_labels):
    """
    Sparse counterpart of vectorize_tagged_items: returns the item documents
    alongside the dense bag-of-words matrix instead of averaged embeddings.
    """
    items, _ = get_item_objects(item_features, item_feature_labels)
    return items, item_features.toarray()

def make_document_embeddings(documents, word_vecs, word_idx_map):
    """
    Transforms each document into a weighted average of the embeddings of its
    words. Out-of-vocabulary words and empty documents fall back to random
    300-d vectors.
    """
    data = []
    total = 0
    for doc in documents:
        if doc:
            weights = doc["weights"]
            words = doc["text"]
            word_embeddings = []
            for word in words:
                total += 1
                if word in word_idx_map:
                    word_embeddings.append(word_vecs[word_idx_map[word]])
                else:
                    # Unknown word: substitute a random vector.
                    word_embeddings.append(np.random.normal(0, 1, (300,)))
            matrix = np.array(word_embeddings, dtype='float32')
            data.append(np.average(matrix, axis=0, weights=weights))
        else:
            # Item with no tags: give it a random embedding.
            data.append(np.random.normal(0, 1, (300,)))
    print(f"{total} words embedded.")
    return np.array(data)
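
# Worked example (illustrative values): for word vectors v1 and v2 with weights
# [1, 1], np.average returns (v1 + v2) / 2; with weights [2, 1] it returns
# (2*v1 + v2) / 3, so repeated tag words could be upweighted via 'weights'.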

def load_bin_vec(fname, vocab):
    """
    Loads 300-d word vectors for the given vocab from the GloVe Common Crawl
    840B text file (https://nlp.stanford.edu/projects/glove/).
    """
    word_vecs = {}
    with open(fname, "r", encoding='utf-8') as f:
        for line in f:
            vec_line = line.rstrip().split(' ')
            word = vec_line[0]
            if word in vocab:
                # Parse the floats only for words we need; taking the last 300
                # fields also guards against the few 840B tokens that contain
                # embedded spaces.
                word_vecs[word] = np.array(vec_line[-300:], dtype=float)
            if len(word_vecs) == len(vocab):
                # Every vocab word has been found; stop scanning early.
                break
    return word_vecs
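
# Format note (illustrative values, abbreviated): each row of the GloVe text
# file is a token followed by its 300 space-separated floats, e.g.
#   the 0.04656 0.21318 -0.00740 ...   (300 values in total)
# so splitting on spaces yields the word at index 0 and its vector after it.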

def get_W(word_vecs, k=300):
    """
    Builds the word matrix W, where W[i] is the vector for the word indexed by
    i, plus the word -> index map. Row 0 is left as an all-zero vector so that
    index 0 stays unassigned.
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size + 1, k))
    for i, word in enumerate(word_vecs, start=1):
        W[i] = word_vecs[word]
        word_idx_map[word] = i
    return W, word_idx_map
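
if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original file): exercises the
    # full pipeline on a toy tag matrix with fake embeddings, so no GloVe
    # download is required. All names and values here are illustrative.
    from scipy.sparse import csr_matrix

    labels = np.array(["rock", "indie-pop", "jazz"])
    features = csr_matrix(np.array([[1, 1, 0],
                                    [0, 0, 1]]))
    # Pre-seed the module-level cache with tiny random "embeddings" so
    # vectorize_tagged_items skips loading the real GloVe file.
    rng = np.random.default_rng(0)
    w2v = {word: rng.normal(size=300) for word in ["rock", "indie", "pop", "jazz"]}
    items, embeddings = vectorize_tagged_items(features, labels)
    print(items[0]['text'])   # expected: ['rock', 'indie', 'pop']
    print(embeddings.shape)   # expected: (2, 300)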