-
Notifications
You must be signed in to change notification settings - Fork 0
/
doc2vec.py
53 lines (42 loc) · 1.75 KB
/
doc2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from gensim import models
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pathlib
import json
class Doc2VecModel:
    """Doc2Vec model over movie reviews, used to look up similar movies.

    On construction the model is loaded from ``MODEL_PATH`` when that file
    exists; otherwise it is trained from the movies in ``DB_PATH`` and saved
    to ``MODEL_PATH``.
    """

    # Where the trained model is persisted (relative to the working directory).
    MODEL_PATH = "my_model.doc2vec"
    # JSON list of movies; each entry is read for its 'title' and 'reviews'.
    DB_PATH = "./db.json"

    def __init__(self):
        """Load the saved model if present, otherwise train a new one."""
        if pathlib.Path(self.MODEL_PATH).is_file():
            self.model = models.Doc2Vec.load(self.MODEL_PATH)
        else:
            self.model = self.create_model()

    @staticmethod
    def _tokenize_reviews(reviews):
        """Return the lowercased, whitespace-separated words of *reviews*."""
        # Join with a space so the last word of one review does not fuse with
        # the first word of the next. split() without arguments also drops
        # the empty tokens that repeated spaces or newlines would produce.
        return " ".join(reviews).lower().split()

    def _doc_vectors(self):
        """Return the model's document-vector store."""
        # gensim 4 renamed ``docvecs`` to ``dv``; prefer the new attribute and
        # fall back to the old one so both major versions keep working.
        vectors = getattr(self.model, "dv", None)
        if vectors is None:
            vectors = self.model.docvecs
        return vectors

    def create_model(self, size=1000, window=5, min_count=5, iter=20):
        """Train a Doc2Vec model from the movie database and save it.

        Every movie becomes one tagged document: the lowercased words of all
        of its reviews, tagged with the movie's title.

        Args:
            size: Forwarded to ``Doc2Vec`` as ``vector_size``.
            window: Forwarded to ``Doc2Vec`` as ``window``.
            min_count: Forwarded to ``Doc2Vec`` as ``min_count``.
            iter: Forwarded to ``Doc2Vec`` as ``epochs``. The name shadows
                the builtin but is kept so existing keyword callers work.

        Returns:
            The trained model, which is also stored on ``self.model``.
        """
        with open(self.DB_PATH, "r", encoding="utf-8") as in_file:
            movies = json.load(in_file)

        docs = [
            TaggedDocument(
                words=self._tokenize_reviews(movie["reviews"]),
                tags=[movie["title"]],
            )
            for movie in movies
        ]
        self.model = Doc2Vec(
            docs,
            vector_size=size,
            window=window,
            min_count=min_count,
            workers=8,
            epochs=iter,
        )
        self.model.save(self.MODEL_PATH)
        return self.model

    def recommendation(self, movie_title, topn=None):
        """Return the documents most similar to *movie_title*.

        Args:
            movie_title: Tag (movie title) to search from.
            topn: Maximum number of results; ``None`` asks for as many
                results as there are document vectors in the model.

        Returns:
            Whatever the vector store's ``most_similar`` returns for the
            given title and ``topn``.
        """
        vectors = self._doc_vectors()
        if topn is None:
            topn = len(vectors)
        return vectors.most_similar([movie_title], topn=topn)

    def main(self):
        """Placeholder entry point; currently does nothing."""
if __name__ == "__main__":
    # Script entry point: constructing the model loads the saved file or
    # trains a new one, then the instance's main() hook is invoked.
    Doc2VecModel().main()