-
Notifications
You must be signed in to change notification settings - Fork 2
/
recommend.py
247 lines (199 loc) · 8.42 KB
/
recommend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
"""A Yelp-powered Restaurant Recommendation Program"""
from abstractions import *
from data import ALL_RESTAURANTS, CATEGORIES, USER_FILES, load_user_file
from ucb import main, trace, interact
from utils import distance, mean, zip, enumerate, sample
from visualize import draw_map
##################################
# Phase 2: Unsupervised Learning #
##################################
def find_closest(location, centroids):
"""Return the centroid in centroids that is closest to location.
If multiple centroids are equally close, return the first one.
>>> find_closest([3.0, 4.0], [[0.0, 0.0], [2.0, 3.0], [4.0, 3.0], [5.0, 5.0]])
[2.0, 3.0]
"""
# BEGIN Question 3
return min(centroids, key = lambda centroid: distance(location, centroid))
# END Question 3
def group_by_first(pairs):
"""Return a list of pairs that relates each unique key in the [key, value]
pairs to a list of all values that appear paired with that key.
Arguments:
pairs -- a sequence of pairs
>>> example = [ [1, 2], [3, 2], [2, 4], [1, 3], [3, 1], [1, 2] ]
>>> group_by_first(example)
[[2, 3, 2], [2, 1], [4]]
"""
keys = []
for key, _ in pairs:
if key not in keys:
keys.append(key)
return [[y for x, y in pairs if x == key] for key in keys]
def group_by_centroid(restaurants, centroids):
"""Return a list of clusters, where each cluster contains all restaurants
nearest to a corresponding centroid in centroids. Each item in
restaurants should appear once in the result, along with the other
restaurants closest to the same centroid.
"""
# BEGIN Question 4
closest_centroid = [find_closest(restaurant_location(x), centroids) for x in restaurants]
return group_by_first([[closest_centroid[i], restaurants[i]] for i in range(len(restaurants))])
# END Question 4
def find_centroid(cluster):
"""Return the centroid of the locations of the restaurants in cluster."""
# BEGIN Question 5
location_list = [restaurant_location(x) for x in cluster]
total_x, total_y, i = 0, 0, 0
while i < len(location_list):
total_x += location_list[i][0]
total_y += location_list[i][1]
i += 1
return [total_x / len(location_list), total_y / len(location_list)]
# END Question 5
def k_means(restaurants, k, max_updates=100):
"""Use k-means to group restaurants by location into k clusters."""
assert len(restaurants) >= k, 'Not enough restaurants to cluster'
old_centroids, n = [], 0
# Select initial centroids randomly by choosing k different restaurants
centroids = [restaurant_location(r) for r in sample(restaurants, k)]
while old_centroids != centroids and n < max_updates:
old_centroids = centroids
# BEGIN Question 6
clusters = group_by_centroid(restaurants, centroids)
centroids = [find_centroid(i) for i in clusters]
# END Question 6
n += 1
return centroids
################################
# Phase 3: Supervised Learning #
################################
def find_predictor(user, restaurants, feature_fn):
"""Return a rating predictor (a function from restaurants to ratings),
for a user by performing least-squares linear regression using feature_fn
on the items in restaurants. Also, return the R^2 value of this model.
Arguments:
user -- A user
restaurants -- A sequence of restaurants
feature_fn -- A function that takes a restaurant and returns a number
"""
reviews_by_user = {review_restaurant_name(review): review_rating(review)
for review in user_reviews(user).values()}
xs = [feature_fn(r) for r in restaurants]
ys = [reviews_by_user[restaurant_name(r)] for r in restaurants]
# BEGIN Question 7
list_sxx = [(xs[i] - mean(xs))**2 for i in range(len(xs))]
list_syy = [(ys[i] - mean(ys)) ** 2 for i in range(len(ys))]
list_sxy = [((xs[i] - mean(xs)) * (ys[i] - mean(ys))) for i in range(len(xs))]
sxx = 0
syy = 0
sxy = 0
for i in range(len(xs)):
sxx += list_sxx[i]
syy += list_syy[i]
sxy += list_sxy[i]
b = sxy / sxx
a = mean(ys) - (b * mean(xs))
r_squared = (sxy ** 2) / (sxx * syy)
# END Question 7
def predictor(restaurant):
return b * feature_fn(restaurant) + a
return predictor, r_squared
def best_predictor(user, restaurants, feature_fns):
"""Find the feature within feature_fns that gives the highest R^2 value
for predicting ratings by the user; return a predictor using that feature.
Arguments:
user -- A user
restaurants -- A list of restaurants
feature_fns -- A sequence of functions that each takes a restaurant
"""
reviewed = user_reviewed_restaurants(user, restaurants)
# BEGIN Question 8
sequence_of_tuples = [find_predictor(user, reviewed, feature_fns[i]) for i in range(len(feature_fns))]
return max(sequence_of_tuples, key = lambda x: x[1])[0]
# END Question 8
def rate_all(user, restaurants, feature_fns):
"""Return the predicted ratings of restaurants by user using the best
predictor based on a function from feature_fns.
Arguments:
user -- A user
restaurants -- A list of restaurants
feature_fns -- A sequence of feature functions
"""
predictor = best_predictor(user, ALL_RESTAURANTS, feature_fns)
reviewed = user_reviewed_restaurants(user, restaurants)
# BEGIN Question 9
ratings_dictionary = {}
for r in restaurants:
name = restaurant_name(r)
if r in reviewed:
ratings_dictionary[name] = user_rating(user, name)
else:
ratings_dictionary[name] = predictor(r)
return ratings_dictionary
# END Question 9
def search(query, restaurants):
"""Return each restaurant in restaurants that has query as a category.
Arguments:
query -- A string
restaurants -- A sequence of restaurants
"""
# BEGIN Question 10
return [restaurants[i] for i in range(len(restaurants)) if query in restaurant_categories(restaurants[i])]
# END Question 10
def feature_set():
"""Return a sequence of feature functions."""
return [lambda r: mean(restaurant_ratings(r)),
restaurant_price,
lambda r: len(restaurant_ratings(r)),
lambda r: restaurant_location(r)[0],
lambda r: restaurant_location(r)[1]]
@main
def main(*args):
import argparse
parser = argparse.ArgumentParser(
description='Run Recommendations',
formatter_class=argparse.RawTextHelpFormatter
)
parser.add_argument('-u', '--user', type=str, choices=USER_FILES,
default='test_user',
metavar='USER',
help='user file, e.g.\n' +
'{{{}}}'.format(','.join(sample(USER_FILES, 3))))
parser.add_argument('-k', '--k', type=int, help='for k-means')
parser.add_argument('-q', '--query', choices=CATEGORIES,
metavar='QUERY',
help='search for restaurants by category e.g.\n'
'{{{}}}'.format(','.join(sample(CATEGORIES, 3))))
parser.add_argument('-p', '--predict', action='store_true',
help='predict ratings for all restaurants')
parser.add_argument('-r', '--restaurants', action='store_true',
help='outputs a list of restaurant names')
args = parser.parse_args()
# Output a list of restaurant names
if args.restaurants:
print('Restaurant names:')
for restaurant in sorted(ALL_RESTAURANTS, key=restaurant_name):
print(repr(restaurant_name(restaurant)))
exit(0)
# Select restaurants using a category query
if args.query:
restaurants = search(args.query, ALL_RESTAURANTS)
else:
restaurants = ALL_RESTAURANTS
# Load a user
assert args.user, 'A --user is required to draw a map'
user = load_user_file('{}.dat'.format(args.user))
# Collect ratings
if args.predict:
ratings = rate_all(user, restaurants, feature_set())
else:
restaurants = user_reviewed_restaurants(user, restaurants)
names = [restaurant_name(r) for r in restaurants]
ratings = {name: user_rating(user, name) for name in names}
# Draw the visualization
if args.k:
centroids = k_means(restaurants, min(args.k, len(restaurants)))
else:
centroids = [restaurant_location(r) for r in restaurants]
draw_map(centroids, restaurants, ratings)