top-sites.py
#!/usr/bin/env python
# built in
from __future__ import division
import argparse
import cProfile
import logging
import os
import pstats
from datetime import datetime
from functools import wraps
import StringIO
import multiprocessing
import re
# external
import requests
# local imports
from objs.site import Website, mapreduce, map_function, \
reduce_function, partition_data, MapReduceSite
from objs.top_sites import AlexaTopSites
logger = logging.getLogger(__name__)
def timed(f):
"""
Decorator to do a simple time measure for the length of the method call.
Args:
f: Function to be timed.
Returns:
Prints out the time it takes to run
"""
@wraps(f)
def wrapper(*args, **kwargs):
start = datetime.now()
result = f(*args, **kwargs)
elapsed = datetime.now() - start
time_ms = int(elapsed.total_seconds() * 1000) # milliseconds
# TODO: should dump this somewhere more useful
print "%s took %d ms to finish" % (f.__name__, time_ms)
print "with args: %s kwargs: %s" % (args, kwargs)
return result
return wrapper
@timed
def fill_site_data(url):
"""
Makes a request to the URL and then runs a map reduce method to
count the words on the site and the number of times they appear.
Args:
url: HTTP URL to call and run analysis on.
Returns:
Website containing calculated values as well as the content of the
response to the URL.
None if there was an error reading the site.
"""
try:
site = MapReduceSite(url=url)
site.request_homepage()
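        # request_homepage() is assumed to fetch the page and store the
        # response body and headers on the site object (see objs/site.py);
        # word counting is deferred until all pages have been fetched.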
# site.calculate_word_count()
return site
except requests.exceptions.ConnectionError as e:
logger.exception('Could not connect!')
return None
    except Exception:
logger.exception('Error connecting to site!')
return None
@timed
def find_average_word_count(sites):
"""
Find the average of the total word count for all the sites read.
Args:
sites (list Website): List of all the websites analyzed.
Returns:
(float): Average number of words per site.
"""
    if not sites:
        return 0.0
    total = sum(site.word_count_size for site in sites)
    return total / len(sites)
@timed
def find_top_20_headers(sites):
"""
Find the top 20 headers returned from the website requests and the
percentage of sites that returned that header.
Args:
sites (list Website): List of websites that have their word counts
analyzed.
    Returns:
        (list): The 20 most common headers as (header, percentage) tuples,
        where the percentage is the share of sites that returned that header,
        sorted by percentage in descending order.
"""
header_dict = {}
for site in sites:
for header in site.headers:
if header not in header_dict:
header_dict[header] = []
header_dict[header].append(site.name)
headers_with_percent = {}
for header in header_dict:
headers_with_percent[header] = (len(header_dict[header]) / len(
sites)) * 100.0
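    # e.g. a header returned by 80 of 100 sites now maps to a value of 80.0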
sorted_headers = sorted(headers_with_percent.items(), key=lambda x: x[1],
reverse=True)
# logging.info('Headers with pct: %s', sorted_headers)
return sorted_headers[:20]
@timed
def find_top_20_headers_map_reduce(sites, worker_count):
"""
Find the top 20 headers returned from the website requests and the
percentage of sites that returned that header. This method uses a
map-reduce algorithm to count the headers.
Args:
sites (list Website): List of websites that have their word counts
analyzed.
worker_count (int): Number of sub processes to use to run map-reduce
header count.
    Returns:
        (list): The 20 most common headers as (header, percentage) tuples,
        where the percentage is the share of sites that returned that header,
        sorted by percentage in descending order.
"""
all_headers = []
for site in sites:
all_headers.extend(site.headers)
    logger.debug('all headers: %s', all_headers)
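    # Assumption about objs.site.mapreduce: partition_func splits all_headers
    # across worker_count processes, map_func counts the items in each chunk,
    # and reduce_func merges the partial counts into a {header: occurrences}
    # dict; only that final dict is used below.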
header_count = mapreduce(
all_items=all_headers,
worker_count=worker_count,
partition_func=partition_data,
reduce_func=reduce_function,
map_func=map_function
)
    logger.debug('header count: %s', header_count)
headers_with_percent = {}
for header in header_count:
        logger.debug('Header: %s', header)
headers_with_percent[header] = (header_count[header] / len(
sites)) * 100.0
sorted_headers = sorted(headers_with_percent.items(), key=lambda x: x[1],
reverse=True)
# logging.info('Headers with pct: %s', sorted_headers)
return sorted_headers[:20]
def parse_s3_url(url):
"""Parses the given URL to extract S3 bucket name and key name. URL must
match one of the following formats for S3 urls:
* http(s)://bucket.s3.amazonaws.com/key
* http(s)://bucket.s3-aws-region.amazonaws.com/key
* http(s)://s3.amazonaws.com/bucket/key
* http(s)://s3-aws-region.amazonaws.com/bucket/key
* s3://bucket/key
Args:
url (str): the url to be parsed
Returns:
(str, str): tuple of (bucket name, key)
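
    Example (hypothetical bucket and key names):
        >>> parse_s3_url('https://my-bucket.s3.amazonaws.com/data/top-sites.xml')
        ('my-bucket', 'data/top-sites.xml')
        >>> parse_s3_url('s3://my-bucket/data/top-sites.xml')
        ('my-bucket', 'data/top-sites.xml')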
"""
    match = re.search(r'^https?://([^.]+)\.s3\.amazonaws\.com/(.*)', url)
    if match:
        return match.group(1), match.group(2)
    match = re.search(r'^https?://([^.]+)\.s3-[^.]+\.amazonaws\.com/(.*)', url)
    if match:
        return match.group(1), match.group(2)
    match = re.search(r'^https?://s3\.amazonaws\.com/([^/]+)/(.*)', url)
    if match:
        return match.group(1), match.group(2)
    match = re.search(r'^https?://s3-[^.]+\.amazonaws\.com/([^/]+)/(.*)', url)
    if match:
        return match.group(1), match.group(2)
    match = re.search(r'^s3://([^/]+)/(.*)', url)
    if match:
        return match.group(1), match.group(2)
    return None, None
def main(
aws_access_key_id=None,
aws_secret_access_key=None,
local_file_location=None,
s3_file_location=None,
worker_processes=1
):
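    # Fall back to the standard AWS credential environment variables; this
    # raises KeyError if the credentials are neither passed in nor exported.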
if not aws_access_key_id and not aws_secret_access_key:
aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']
aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']
top_sites = AlexaTopSites(
aws_secret_access_key=aws_secret_access_key,
aws_access_key_id=aws_access_key_id
)
# top_sites.request_top_sites()
# print top_sites.get_site_urls()
if local_file_location:
top_sites.load_from_local_file(local_file_location)
elif s3_file_location:
# parse the S3 URL into a bucket and object key
bucket, key = parse_s3_url(s3_file_location)
top_sites.load_from_s3(bucket, key)
else:
top_sites.request_top_sites()
sites = top_sites.get_site_urls()
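    # Fan out one fill_site_data call per URL to the worker pool; .get()
    # blocks until each async result is ready, so full_sites comes back in
    # submission order.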
pool = multiprocessing.Pool(processes=worker_processes)
results = [pool.apply_async(fill_site_data, args=(site,)) for site in sites]
full_sites = [p.get() for p in results]
# remove site with no return result
full_sites = [site for site in full_sites if site is not None]
# do separate calculation here
for site in full_sites:
site.calculate_word_count()
average_word_count = find_average_word_count(full_sites)
sorted_by_word_count = sorted(full_sites, key=lambda x: x.word_count_size,
reverse=True)
top_headers = find_top_20_headers_map_reduce(full_sites, worker_processes)
    logger.debug('Sorted by word count: %s', sorted_by_word_count)
    logger.info('Websites sorted by their word count')
    for index, site in enumerate(sorted_by_word_count):
        logger.info('Site: %s - Rank: %d', site.url, index + 1)
    logger.info('Average word count: %s', average_word_count)
    logger.debug('Top 20 headers: %s', top_headers)
    logger.info('Top 20 headers and the percentage of sites that returned '
                'them')
    for header in top_headers:
        logger.info('Header: %s - Pct: %05.2f', header[0], header[1])
if __name__ == '__main__':
# TODO: add proper log configuration
logging.basicConfig(level=logging.INFO)
    # many URL requests hang or fail; urllib3 is set to DEBUG below so those
    # requests are visible in the log output
logging.getLogger("requests.packages.urllib3").setLevel(logging.DEBUG)
parser = argparse.ArgumentParser(
description='Calculate word count and header count for Alexa top 100 '
'sites')
parser.add_argument(
'--access-key-id',
dest='access_key_id',
default=None,
help='AWS access key ID to use for S3 and AWS Top Sites API calls'
)
parser.add_argument(
'--secret-access-key',
dest='secret_access_key',
default=None,
help='AWS secret access key to use for S3 and AWS Top Sites API calls'
)
parser.add_argument(
'--local-file',
dest='local_file_location',
default=None,
help='Location of the local file containing the top sites data'
)
parser.add_argument(
'--s3-location',
dest='s3_location',
default=None,
help='Location of the top sites data file in S3.'
)
parser.add_argument(
'--worker-processes',
dest='worker_count',
default=4,
type=int,
help='Number of sub processes to use for requesting site home pages'
)
args = parser.parse_args()
pr = cProfile.Profile()
pr.enable()
main(
aws_access_key_id=args.access_key_id,
aws_secret_access_key=args.secret_access_key,
local_file_location=args.local_file_location,
s3_file_location=args.s3_location,
worker_processes=args.worker_count
)
pr.disable()
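    # Render the cumulative-time profile; each print_stats(pattern) call below
    # restricts the report to functions whose name matches that pattern.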
s = StringIO.StringIO()
sortby = 'cumulative'
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats('get_site_urls')
ps.print_stats('fill_site_data')
ps.print_stats('calculate_word_count')
ps.print_stats('find_average_word_count')
ps.print_stats('find_top_20_headers_map_reduce')
print(s.getvalue())