-
Notifications
You must be signed in to change notification settings - Fork 0
/
optimizer.py
200 lines (163 loc) · 5.83 KB
/
optimizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import numpy as np
from textwrap import shorten
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib2tikz import save as tikz_save
from recommender import Recommender
from itertools import product
from tfidf_lsa import calculate_corpus_var
class defaultlist(list):
    """List that creates default values when an unset index is accessed.

    Reading or writing an index at or beyond the current length first appends
    values produced by the factory ``fx`` until that index exists.  Slice
    access is delegated unchanged to ``list`` (no filling is attempted).

    Original implementation: http://stackoverflow.com/a/8719940/315168

    Args:
        fx: Zero-argument callable returning the default value (e.g. ``int``).
    """

    def __init__(self, fx):
        super().__init__()
        self._fx = fx

    def _fill(self, index):
        """Append defaults until ``index`` is a valid position."""
        while len(self) <= index:
            self.append(self._fx())

    def __setitem__(self, index, value):
        # A slice cannot be compared with len(); let list handle it directly.
        if not isinstance(index, slice):
            self._fill(index)
        list.__setitem__(self, index, value)

    def __getitem__(self, index):
        """Allows self.dlist[0] style access before value is initialized."""
        if not isinstance(index, slice):
            self._fill(index)
        return list.__getitem__(self, index)
def calculate_variation(movie, director, genres, imdb_keyword, lsa, rec,
                        targets=None):
    """Return the rank of each expected title in the recommendations.

    Args:
        movie: Title passed to ``rec.recommend`` as the query.
        director: Weight stored under the ``'director'`` ratio key.
        genres: Weight stored under the ``'genres'`` ratio key.
        imdb_keyword: Weight stored under the ``'imdb_keywords'`` ratio key.
        lsa: Weight stored under the ``'lsa'`` ratio key.
        rec: Object providing ``film_quantity()`` and
            ``recommend(movie, ratio, quantity)``; the latter must return a
            sequence of mappings that each have a ``'title'`` entry.
        targets: Optional iterable of expected titles.  When omitted, the
            module-level ``objective[movie]`` list is used, as before.

    Returns:
        A list with one integer per expected title: the zero-based position
        of its first occurrence in the recommendation list, or 0 when the
        title is absent.
    """
    if targets is None:
        targets = objective[movie]

    ratio = {'director': director,
             'genres': genres,
             'imdb_keywords': imdb_keyword,
             'lsa': lsa}
    film_quantity = rec.film_quantity()
    res = rec.recommend(movie, ratio, film_quantity)

    # Map each title to the position of its first occurrence so every lookup
    # is a dictionary access instead of a scan of the whole list.
    positions = {}
    for position, r in enumerate(res):
        positions.setdefault(r['title'], position)

    scores = []
    for mov in targets:
        if mov in positions:
            scores.append(positions[mov])
        else:
            print("{!r} is not in list".format(mov))
            # NOTE(review): 0 is also the rank of the top recommendation, so
            # a missing title is indistinguishable from the best possible
            # result - confirm this is the intended penalty.
            scores.append(0)  # Movie not found!
    return scores
def get_cmap(n, name='jet'):
    """Return the colormap called *name*, resampled to *n* entries.

    ``pyplot.get_cmap`` is used instead of ``matplotlib.cm.get_cmap`` because
    the latter was deprecated in Matplotlib 3.7 and removed in 3.9.

    Args:
        n: Number of entries requested in the colormap lookup table.
        name: Name of a registered Matplotlib colormap.

    Returns:
        The colormap object; calling it with an index yields a colour.
    """
    return plt.get_cmap(name, n)
def rand_jitter(arr):
    """Return *arr* with a small amount of Gaussian noise added per element.

    The standard deviation of the noise is 1% of the spread (maximum minus
    minimum) of *arr*, so a sequence whose values are all equal is returned
    without any visible displacement.  Noise is drawn from NumPy's global
    random state.

    Args:
        arr: Non-empty sequence of numbers.

    Returns:
        A NumPy array with the same length as *arr*.
    """
    spread = max(arr) - min(arr)
    stdev = .01 * spread
    noise = np.random.randn(len(arr)) * stdev
    return arr + noise
def calculate(movie):
    """Chart where each model variation ranks the expected recommendations.

    For every combination of TF-IDF/LSA hyper-parameters the corpus model is
    rebuilt with ``calculate_corpus_var`` and a new ``Recommender`` is
    created.  Every accepted weight combination is scored with
    ``calculate_variation``; the scores are then ordered by their per-title
    totals, drawn as a jittered scatter plot with a degree-1 fit line per
    variation, written to ``./<movie>.tex`` through ``tikz_save`` and the
    current pyplot figure is cleared.

    Args:
        movie: Key of the module-level ``objective`` mapping; its value is
            the list of titles used as x-axis labels.

    Returns:
        None.  The function prints intermediate results and writes a file.
    """
    # Hyper-parameter grid.  An earlier single-configuration run used
    # max_df=[50], min_df=[5], max_features=[None], n_components=[1000].
    max_df_v = [100]
    min_df_v = [2, 5, 10, 25]
    max_features_v = [None]  # TF-IDF features
    n_components_v = [1000]  # LSA components

    # Weights forwarded to calculate_variation for each feature.
    director_v = [0]
    genres_v = [0]
    imdb_keywords_v = [0]
    lsa_v = [1]

    # One x position per expected title.
    expected = objective[movie]
    x = list(range(len(expected)))
    my_xticks = expected  # labels in x axis

    tfidf_lsa_variations = product(max_df_v, min_df_v,
                                   n_components_v, max_features_v)

    # Raw string: "\%" must reach the TikZ output verbatim, and in a normal
    # string literal it is an invalid escape sequence.
    label_template = (r"feat {} cmpn {} mndf {} mxdf {} "
                      r"keyw {}\% lsa {}\% tot {}")

    results = []
    labels = []

    plt.ylabel("Posición de la recomendación")
    plt.xlabel("Películas")

    for max_df, min_df, n_components, max_features in tfidf_lsa_variations:
        # Train a new model
        calculate_corpus_var(max_df, min_df, n_components, max_features)
        rec = Recommender()

        movie_weights = product(director_v, genres_v, imdb_keywords_v, lsa_v)
        for director, genres, imdb_keywords, lsa in movie_weights:
            # Skip the weight combinations excluded by the original script.
            if lsa == 0 and imdb_keywords == 0:
                continue
            if imdb_keywords == 0 and lsa == 6:
                continue
            if imdb_keywords == 0 and lsa == 12:
                continue

            result = calculate_variation(movie, director, genres,
                                         imdb_keywords, lsa, rec)
            results.append(result)
            print(result)

            # Share of the keyword weight relative to keyword + LSA weight.
            try:
                ratiop = int((imdb_keywords / (imdb_keywords + lsa)) * 100)
            except ZeroDivisionError:
                ratiop = 0.0

            label = label_template.format(max_features, n_components,
                                          min_df, max_df, ratiop,
                                          100 - ratiop, sum(result))
            labels.append(label)
            print(label)

    # Sum the scores of every variation per title; the sums define the
    # left-to-right order of the titles in the chart.
    totals = defaultlist(int)
    for result in results:
        for idx, score in enumerate(result):
            totals[idx] += score
    print(totals)

    # Reorder every result row by ascending total.
    for idx, result in enumerate(results):
        results[idx] = [score for (_, score)
                        in sorted(zip(totals, result),
                                  key=lambda pair: pair[0])]
    print(results)

    # Reorder the tick labels the same way and truncate long titles to at
    # most 15 characters.
    my_xticks = [
        shorten(title, width=15, placeholder=".")
        for (_, title)
        in sorted(zip(totals, my_xticks),
                  key=lambda pair: pair[0])
    ]

    plt.xticks(x, my_xticks, rotation=45)

    cmap = get_cmap(len(results))  # Number of variations
    for idx, (result, label) in enumerate(zip(results, labels)):
        plt.scatter(rand_jitter(x), result, color=cmap(idx), label=label)
        fit = np.polyfit(x, result, deg=1)
        plt.plot(np.unique(x), np.poly1d(fit)(np.unique(x)),
                 color=cmap(idx), alpha=0.5)

    # Save figure and clean - One chart per film with all variations
    plt.gca().set_ylim(bottom=0)
    fontP = FontProperties()
    fontP.set_size('small')
    plt.legend(loc='best', prop=fontP)

    tikz_save("./{}.tex".format(''.join(movie)),
              figureheight='\\figureheight',
              figurewidth='\\figurewidth')

    plt.clf()  # Clean figure
# Evaluation set: maps each query movie to the titles that are looked up in
# its recommendation list.  It lives at module level because calculate() and
# calculate_variation() read it as a global; defining it only under the
# __main__ guard made those functions fail with NameError when imported.
objective = {
    "2001: A Space Odyssey": [
        "Alien",
        "Distrito 9",
        "Interstellar",
        "Solaris",
        "The Martian",
        "Gravity",
        "Planet of the Apes",
    ],
    "Apocalypse Now": [
        "Platoon",
        "Full Metal Jacket",
        "Paths of Glory",
        "Saving Private Ryan",
        "The Deer Hunter",
    ],
    "Ratatouille": [
        "The Incredibles",
        "Toy Story 2",
        "Monsters, Inc.",
        "Shrek",
        "Ponyo",
        "My Neighbor Totoro",
    ],
}


if __name__ == '__main__':
    # Produce one chart per query movie.
    for movie in objective:
        calculate(movie)