w2v.py
from __future__ import print_function

import os
from os.path import join, exists, split

import numpy as np
from gensim.models import word2vec


def train_word2vec(sentence_matrix, vocabulary_inv,
                   num_features=300, min_word_count=1, context=10):
    """
    Train a Word2Vec model (or load a previously saved one) and return
    initial weights for an embedding layer.

    Inputs:
    sentence_matrix  # int matrix: num_sentences x max_sentence_len
    vocabulary_inv   # dict {int: str}
    num_features     # Word vector dimensionality
    min_word_count   # Minimum word count
    context          # Context window size
    """
    model_dir = 'models'
    model_name = "{:d}features_{:d}minwords_{:d}context".format(
        num_features, min_word_count, context)
    model_name = join(model_dir, model_name)
    if exists(model_name):
        embedding_model = word2vec.Word2Vec.load(model_name)
        print('Loading existing Word2Vec model \'%s\'' % split(model_name)[-1])
    else:
        # Set values for various parameters
        num_workers = 2       # Number of threads to run in parallel
        downsampling = 1e-3   # Downsample setting for frequent words

        # Initialize and train the model
        print('Training Word2Vec model...')
        sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
        # NOTE: `size=` is the gensim < 4.0 keyword; gensim 4 renamed it to
        # `vector_size=` and moved word lookups to `model.wv`.
        embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                            size=num_features, min_count=min_word_count,
                                            window=context, sample=downsampling)

        # If we don't plan to train the model any further, calling
        # init_sims will make the model much more memory-efficient.
        embedding_model.init_sims(replace=True)

        # Save the model for later use; it can be reloaded with Word2Vec.load()
        if not exists(model_dir):
            os.mkdir(model_dir)
        print('Saving Word2Vec model \'%s\'' % split(model_name)[-1])
        embedding_model.save(model_name)

    # Map each vocabulary index to its word vector; words unseen by the
    # model get a random vector drawn uniformly from [-0.25, 0.25).
    embedding_weights = {key: embedding_model[word] if word in embedding_model
                         else np.random.uniform(-0.25, 0.25, embedding_model.vector_size)
                         for key, word in vocabulary_inv.items()}
    return embedding_weights
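

# --- Illustrative sketch (not from the original module) --------------------
# train_word2vec returns a {vocabulary index: vector} dict, while an
# embedding layer (e.g. keras.layers.Embedding with weights=[W]) expects a
# single dense matrix whose row i is the vector for vocabulary index i.
# A minimal conversion, assuming the indices run 0..vocab_size-1:
def embedding_weights_to_matrix(embedding_weights):
    """Stack the {index: vector} dict into a (vocab_size, num_features) array."""
    return np.vstack([embedding_weights[i]
                      for i in range(len(embedding_weights))])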


if __name__ == '__main__':
    import data_helpers

    print("Loading data...")
    x, _, _, vocabulary_inv_list = data_helpers.load_data()
    vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
    w = train_word2vec(x, vocabulary_inv)
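    # Sketch using the hypothetical helper above: stack the returned dict
    # into a dense (vocab_size, 300) matrix, e.g. as initial embedding weights.
    W = embedding_weights_to_matrix(w)
    print('Embedding matrix shape:', W.shape)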