-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcounting_cars.py
117 lines (89 loc) · 3.94 KB
/
counting_cars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python
import nltk, re, urllib2, os
from bs4 import BeautifulSoup, SoupStrainer
def clean_wikilist(filename):
    """Extract a de-duplicated list of bulleted entries from a saved HTML page.

    Only ``<li>`` elements are parsed. Their markup is stripped with
    ``nltk.clean_html``, the text is split on newlines, leading whitespace
    is removed from each line, and lines matching the discard pattern
    (bracketed numbers, an opening parenthesis followed by text,
    whitespace-only lines, or lines made only of whitespace, brackets,
    parentheses and digits) are dropped.

    filename -- path of the saved HTML file to read.

    Returns a list of unique strings in arbitrary order (the values pass
    through a set). Every alternative of the discard pattern needs at least
    one character, so an empty string is not filtered out.
    """
    # read the saved html file and release the handle right away
    with open(filename) as html_file:
        html = html_file.read()
    # collect bulleted items only
    bullets = SoupStrainer("li")
    # make soup out of the bulleted items
    soup = BeautifulSoup(html, 'lxml', parse_only=bullets).prettify()
    # remove html from soup
    # NOTE(review): nltk.clean_html only works on NLTK 2.x; NLTK 3 raises
    # NotImplementedError for it -- confirm the installed version.
    raw = nltk.clean_html(soup)
    # collapse runs of space-padded blank lines (longer run first)
    raw = re.sub(r'\n \n \n \n \n', r'\n', raw)
    raw = re.sub(r'\n \n \n', r'\n', raw)
    # create and clean tokens
    tokens = raw.split('\n')
    tokens = [re.sub(r'^\s+(?=[\S]+)', r'', token) for token in tokens]
    # only "does it match at all" matters here, so search() is enough
    discard = r'\[[0-9]+\]|\([\S\s]+[\(\)]?|^\s+$|^[\s\[\]\(\)0-9]+$'
    tokens = [token for token in tokens if not re.search(discard, token)]
    return list(set(tokens))
# Sample brand names.
# NOTE(review): no function visible in this file reads `cars`; it looks like
# scaffolding for the unfinished loop stub below -- confirm before removing.
cars = ['bmw', 'chevy', 'benz']
# for car in cars:
def autos_ge():
    """Return the unique manufacturer names scraped from ``autos-ge.html``.

    Major manufacturers come from ``span.mw-headline`` elements and minor
    ones from ``li`` elements. In both cases an element is kept only when
    the first child of ``parent.parent.previous_sibling`` carries the
    matching ``id`` attribute.

    Returns a list of unique strings in arbitrary order (the values pass
    through a set).
    """
    # read the saved html file and release the handle right away
    with open('autos-ge.html') as html_file:
        html = html_file.read()
    # create soup object
    soup = BeautifulSoup(html)

    # NOTE(review): the sibling traversal below is applied to every selected
    # element and assumes each one has a tag with an 'id' at that position;
    # presumably that holds for the saved page -- confirm.

    # select current major manufacturers
    majors = soup.select('span.mw-headline')
    majors = [w for w in majors
              if w.parent.parent.previous_sibling.contents[0]['id'] == 'Current_major_manufacturers']
    major_tokens = [nltk.clean_html(str(w)) for w in majors]
    # drop a bracketed single character padded by whitespace
    major_tokens = [re.sub(r'\[\s\S\s\]', r'', token) for token in major_tokens]

    # select current minor manufacturers
    minors = soup.select('li')
    minors = [w for w in minors
              if w.parent.parent.previous_sibling.contents[0]['id'] == 'Current_minor_manufacturers']
    minor_tokens = [nltk.clean_html(str(w)) for w in minors]
    # drop a parenthesised, space-free suffix such as " (xyz)"
    minor_tokens = [re.sub(r'\s\(\S+\)', r'', token) for token in minor_tokens]

    # combine lists
    return list(set(minor_tokens + major_tokens))
def autos_uk():
    """Return the manufacturer names scraped from ``autos-uk.html``.

    An ``li`` element is kept when the node two siblings before its parent
    has the string ``'Current manufacturers:'``. Markup is stripped with
    ``nltk.clean_html`` and a whitespace-prefixed parenthesised part is
    removed from each name.

    Returns a list of strings in document order; duplicates are kept.
    """
    # read the saved html file and release the handle right away
    with open('autos-uk.html') as html_file:
        html = html_file.read()
    soup = BeautifulSoup(html)
    mfrs = soup.select('li')
    # NOTE(review): assumes every li's parent has two preceding siblings;
    # presumably true for the saved page -- confirm.
    mfrs = [w for w in mfrs
            if w.parent.previous_sibling.previous_sibling.string == 'Current manufacturers:']
    mfrs = [nltk.clean_html(str(w)) for w in mfrs]
    # greedy: removes from the first " (" through the last ")" of the token
    mfrs = [re.sub(r'\s\([\S\s]+\)', r'', token) for token in mfrs]
    return mfrs
def autos_us():
    """Return the unique manufacturer names scraped from ``autos-us.html``.

    The "major" entries are the first child of the first ``li`` in the
    document and of the two ``li`` nodes reached by stepping two siblings
    forward each time. The "minor" entries are every element matching
    ``ul li ul li``. Only the minor names are cleaned with the regex.

    Returns a list of unique strings in arbitrary order (the values pass
    through a set).
    """
    # read the saved html file and release the handle right away
    with open('autos-us.html') as html_file:
        html = html_file.read()
    soup = BeautifulSoup(html)

    # NOTE(review): stepping next_sibling twice presumably skips a
    # whitespace text node between consecutive <li> tags -- confirm
    # against the saved page.
    first = soup.find('li').contents[0]
    second = first.parent.next_sibling.next_sibling.contents[0]
    third = second.parent.next_sibling.next_sibling.contents[0]
    majors = [first, second, third]
    minors = soup.select('ul li ul li')

    major_tokens = [nltk.clean_html(str(w)) for w in majors]
    minor_tokens = [nltk.clean_html(str(w)) for w in minors]
    # strip parenthesised text, a bracketed padded single character, and a
    # newline + whitespace + letters run
    minor_tokens = [re.sub(r'\s\([\S\s]+\)|\[\s\S\s\]|\n\s[A-Za-z]+', r'', token)
                    for token in minor_tokens]
    return list(set(major_tokens + minor_tokens))
def write_autos(auto_list):
    """Append each entry of *auto_list* to ``autos-clean.txt``, one per line.

    auto_list -- iterable of values; each is formatted with ``%s`` and
    followed by a newline.
    """
    with open('autos-clean.txt', 'a') as sink:
        sink.writelines("%s\n" % entry for entry in auto_list)
def rap_search(auto_list):
    """Download the lyrics-search results page for every brand name.

    For each brand in *auto_list* the 1989-2009 search URL is fetched and
    the response body is written to ``<brand>.html`` in the current
    working directory, replacing any existing file of that name.

    auto_list -- iterable of brand-name strings.

    Errors raised by ``urllib2.urlopen`` or by the file write propagate to
    the caller.
    """
    base_url = 'http://research.blackyouthproject.com/raplyrics/results/?all/1989-2009/'
    # search for each brand name
    for brand in auto_list:
        # NOTE(review): the brand is appended verbatim; names containing
        # spaces or slashes are neither URL-quoted nor made path-safe --
        # confirm the inputs never contain them.
        url = base_url + brand
        # fetch the search results page, always closing the connection
        response = urllib2.urlopen(url)
        try:
            results_html = response.read()
        finally:
            response.close()
        # save it as a file named after the brand
        results = brand + '.html'
        with open(results, 'w') as results_file:
            results_file.write(results_html)
def count_rap_results():
    """Append a song count per saved results page to ``count_rap_autos.txt``.

    Every file in the current working directory whose name ends in
    ``.html`` is parsed, the elements matching the CSS selector ``.title``
    are counted, and one line holding the file name without its extension
    followed by the count (right-aligned in 15 columns) is appended.
    """
    # NOTE(review): this picks up every .html file in the directory,
    # including pages such as 'autos-ge.html' that other functions here read
    # from the same place -- confirm those are kept elsewhere when this runs.
    for filename in os.listdir('.'):
        # require the dot so the 5-character slice below removes exactly
        # the '.html' extension
        if not filename.endswith('.html'):
            continue
        # read the page and release the handle right away
        with open(filename) as results_file:
            html = results_file.read()
        # select song titles
        soup = BeautifulSoup(html)
        songs = soup.select('.title')
        # count number of song titles
        count = len(songs)
        # write brand name and number of songs into a text file
        with open('count_rap_autos.txt', 'a') as counter_file:
            counter_file.write('%s%15d\n' % (filename[:-5], count))