-
Notifications
You must be signed in to change notification settings - Fork 227
/
Copy pathknn_recommender.py
257 lines (219 loc) · 8.65 KB
/
knn_recommender.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
import os
import time
import gc
import argparse
# data science imports
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
# utils import
from fuzzywuzzy import fuzz
class KnnRecommender:
    """
    Item-based collaborative filtering recommender with
    KNN implemented by sklearn.

    The movie-user rating matrix is rebuilt from the two csv files on every
    call to ``make_recommendations``; nothing is cached between calls.
    """

    def __init__(self, path_movies, path_ratings):
        """
        Recommender requires path to data: movies data and ratings data

        Parameters
        ----------
        path_movies: str, movies data file path

        path_ratings: str, ratings data file path
        """
        self.path_movies = path_movies
        self.path_ratings = path_ratings
        # a threshold of 0 keeps every movie and every user
        self.movie_rating_thres = 0
        self.user_rating_thres = 0
        self.model = NearestNeighbors()

    def set_filter_params(self, movie_rating_thres, user_rating_thres):
        """
        set rating frequency threshold to filter less-known movies and
        less active users

        Parameters
        ----------
        movie_rating_thres: int, minimum number of ratings a movie received

        user_rating_thres: int, minimum number of ratings a user gives
        """
        self.movie_rating_thres = movie_rating_thres
        self.user_rating_thres = user_rating_thres

    def set_model_params(self, n_neighbors, algorithm, metric, n_jobs=None):
        """
        set model params for sklearn.neighbors.NearestNeighbors

        Parameters
        ----------
        n_neighbors: int, optional (default = 5)

        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional

        metric: string or callable, default 'minkowski', or one of
            ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']

        n_jobs: int or None, optional (default=None)
        """
        # when more than one job is requested, point joblib's temp folder
        # at /tmp (note: this mutates the process environment)
        if n_jobs and (n_jobs > 1 or n_jobs == -1):
            os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
        self.model.set_params(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            metric=metric,
            n_jobs=n_jobs)

    def _prep_data(self):
        """
        prepare data for recommender

        1. movie-user scipy sparse matrix
        2. hashmap of movie to row index in movie-user scipy sparse matrix

        Return
        ------
        tuple (scipy.sparse.csr_matrix, dict): the movie-user rating matrix
        (one row per kept movie, one column per kept user, 0 where there is
        no rating) and a dict mapping movie title to its row index
        """
        # read data
        df_movies = pd.read_csv(
            self.path_movies,
            usecols=['movieId', 'title'],
            dtype={'movieId': 'int32', 'title': 'str'})
        df_ratings = pd.read_csv(
            self.path_ratings,
            usecols=['userId', 'movieId', 'rating'],
            dtype={'userId': 'int32', 'movieId': 'int32',
                   'rating': 'float32'})

        # keep only ratings of movies / by users with enough ratings
        movie_counts = df_ratings.groupby('movieId').size()
        popular_movies = movie_counts[
            movie_counts >= self.movie_rating_thres].index
        user_counts = df_ratings.groupby('userId').size()
        active_users = user_counts[
            user_counts >= self.user_rating_thres].index
        keep = (df_ratings['movieId'].isin(popular_movies)
                & df_ratings['userId'].isin(active_users))
        df_ratings_filtered = df_ratings[keep]

        # pivot and create movie-user matrix (missing ratings become 0)
        movie_user_mat = df_ratings_filtered.pivot(
            index='movieId', columns='userId', values='rating').fillna(0)

        # create mapper from movie title to row index; titles are looked up
        # in the same order as the rows of the matrix. If two kept movies
        # share a title, the later row wins.
        titles = df_movies.set_index('movieId').loc[
            movie_user_mat.index, 'title']
        hashmap = {title: i for i, title in enumerate(titles)}

        # transform matrix to scipy sparse matrix
        movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

        # clean up the large intermediates before returning
        del df_movies, movie_counts, user_counts, titles
        del df_ratings, df_ratings_filtered, movie_user_mat, keep
        gc.collect()
        return movie_user_mat_sparse, hashmap

    def _fuzzy_matching(self, hashmap, fav_movie):
        """
        return the closest match via fuzzy ratio.
        If no match found, return None

        Parameters
        ----------
        hashmap: dict, map movie title name to index of the movie in data

        fav_movie: str, name of user input movie

        Return
        ------
        index of the closest match, or None when no title reaches a
        fuzzy ratio of 60
        """
        query = fav_movie.lower()
        match_tuple = []
        # get match
        for title, idx in hashmap.items():
            ratio = fuzz.ratio(title.lower(), query)
            if ratio >= 60:
                match_tuple.append((title, idx, ratio))
        if not match_tuple:
            print('Oops! No match is found')
            return None
        # sort by ratio, best match first
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        print('Found possible matches in our database: '
              '{0}\n'.format([x[0] for x in match_tuple]))
        return match_tuple[0][1]

    def _inference(self, model, data, hashmap,
                   fav_movie, n_recommendations):
        """
        return top n similar movie recommendations based on user's input movie

        Parameters
        ----------
        model: sklearn model, knn model

        data: movie-user matrix

        hashmap: dict, map movie title name to index of the movie in data

        fav_movie: str, name of user input movie

        n_recommendations: int, top n recommendations

        Return
        ------
        list of (row index, distance) tuples for the top n similar movies,
        ordered from the largest distance to the smallest; an empty list
        when no title matches fav_movie
        """
        # get input movie index first, so nothing is fitted when the
        # title cannot be matched
        print('You have input movie:', fav_movie)
        idx = self._fuzzy_matching(hashmap, fav_movie)
        if idx is None:
            return []
        # fit
        model.fit(data)
        # inference
        print('Recommendation system start to make inference')
        print('......\n')
        t0 = time.time()
        # one extra neighbor is requested because the closest one is
        # dropped below; never ask for more neighbors than there are rows
        n_neighbors = min(n_recommendations + 1, data.shape[0])
        distances, indices = model.kneighbors(
            data[idx],
            n_neighbors=n_neighbors)
        # a single row was queried, so row 0 holds all of its neighbors
        neighbors = sorted(
            zip(indices[0].tolist(), distances[0].tolist()),
            key=lambda x: x[1])
        # reverse the ascending list and drop its first (closest) entry,
        # presumably the query movie itself
        raw_recommends = neighbors[:0:-1]
        print('It took my system {:.2f}s to make inference \n'.format(
            time.time() - t0))
        # return recommendation (row index, distance)
        return raw_recommends

    def make_recommendations(self, fav_movie, n_recommendations):
        """
        make top n movie recommendations

        Prints the recommendations; prints no recommendation list when the
        movie cannot be matched or there is nothing to recommend.

        Parameters
        ----------
        fav_movie: str, name of user input movie

        n_recommendations: int, top n recommendations
        """
        # get data
        movie_user_mat_sparse, hashmap = self._prep_data()
        # get recommendations
        raw_recommends = self._inference(
            self.model, movie_user_mat_sparse, hashmap,
            fav_movie, n_recommendations)
        if not raw_recommends:
            return
        # print results
        reverse_hashmap = {v: k for k, v in hashmap.items()}
        print('Recommendations for {}:'.format(fav_movie))
        for i, (idx, dist) in enumerate(raw_recommends, start=1):
            print('{0}: {1}, with distance '
                  'of {2}'.format(i, reverse_hashmap[idx], dist))
def parse_args(argv=None):
    """
    Parse the command line options of the KNN movie recommender.

    Parameters
    ----------
    argv: list of str or None, arguments to parse; with None (default)
        argparse reads them from sys.argv

    Return
    ------
    argparse.Namespace with attributes path, movies_filename,
    ratings_filename, movie_name and top_n
    """
    parser = argparse.ArgumentParser(
        prog="Movie Recommender",
        description="Run KNN Movie Recommender")
    parser.add_argument('--path', nargs='?', default='../data/MovieLens',
                        help='input data path')
    parser.add_argument('--movies_filename', nargs='?', default='movies.csv',
                        help='provide movies filename')
    parser.add_argument('--ratings_filename', nargs='?',
                        default='ratings.csv',
                        help='provide ratings filename')
    parser.add_argument('--movie_name', nargs='?', default='',
                        help='provide your favorite movie name')
    parser.add_argument('--top_n', type=int, default=10,
                        help='top n movie recommendations')
    return parser.parse_args(argv)
if __name__ == '__main__':
    # Script entry point: read the command line options, build the
    # recommender from the two csv files and print the recommendations.
    cli = parse_args()
    movies_csv = os.path.join(cli.path, cli.movies_filename)
    ratings_csv = os.path.join(cli.path, cli.ratings_filename)
    knn = KnnRecommender(movies_csv, ratings_csv)
    # only movies and users with at least 50 ratings are kept
    knn.set_filter_params(50, 50)
    # 20 neighbors, brute-force search, cosine metric, n_jobs=-1
    knn.set_model_params(20, 'brute', 'cosine', -1)
    knn.make_recommendations(cli.movie_name, cli.top_n)