Commit 92c81c1

Update boingboing_comments.py
1 parent 5ec30dd commit 92c81c1

1 file changed (+66, -63 lines)

Diff for: BoingBoing/boingboing_comments.py

@@ -1,11 +1,11 @@
 """
-https://stackoverflow.com/questions/41706274/beautifulsoup-returns-incomplete-html
-The page setup is such that, by default,
+URL: https://stackoverflow.com/questions/41706274/beautifulsoup-returns-incomplete-html
+The page setup is such that, by default,
 approx. 19 <div class="topic-post clearfix regular"></div> tags on boingboing comments are
-loaded at startup and the remaining get loaded once the page is manually scrolled down.
-We use selenium web-driver to achieve the manual scrolling.
-We need either the geckodriver(https://github.com/mozilla/geckodriver/releases) for firefox
-or the chromedriver (latest release 2.36 -->
+loaded at startup and the remaining get loaded once the page is manually scrolled down.
+We use selenium web-driver to achieve manual scrolling.
+We need either the geckodriver(https://github.com/mozilla/geckodriver/releases) for firefox
+or the chromedriver (latest release 2.36 -->
 https://chromedriver.storage.googleapis.com/index.html?path=2.36/)
 Note: the executable driver file must be in the same directory as the .py file.
 """
@@ -17,7 +17,7 @@

 from bs4 import BeautifulSoup
 from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support import expected_conditions as ec
 from selenium.webdriver.common.by import By

 SCREEN_HEIGHT_IN_PIXELS = 1080
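
The EC-to-ec change is presumably a linter-driven rename of the module alias; the explicit-wait pattern built on it is unchanged. A standalone sketch of that pattern under the new alias (the topic URL is illustrative):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

browser = webdriver.Firefox()
browser.get("https://bbs.boingboing.net/t/example-topic")  # illustrative URL
# Block for up to 100 seconds until the comment container is in the DOM.
WebDriverWait(browser, 100).until(
    ec.presence_of_element_located((By.CLASS_NAME, "container"))
)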
@@ -27,34 +27,35 @@
 # Fixing the 'IncompleteRead' bug using http
 # https://stackoverflow.com/questions/14149100/incompleteread-using-httplib
 http.client.HTTPConnection._http_vsn = 10
-http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
+http.client.HTTPConnection._http_vsn_str = "HTTP/1.0"
+

 def fetch_comment_info(browser, url, postno, cur, delay=100):
     """
     Fetches user comments in 'url'.
     """
-
+
     comments = {}
     # indicates presence of div_class_share but no a_class_bbs
     try:
-        # Added timeout for the error: http.client.RemoteDisconnected:
+        # Added timeout for the error: http.client.RemoteDisconnected:
         # Remote end closed connection without response
         browser.set_page_load_timeout(200)
         browser.get(url)
     except http.client.RemoteDisconnected:
         return comments
-
-    WebDriverWait(browser, delay).until(EC.presence_of_element_located\
-                                        ((By.CLASS_NAME, "container")))
+
+    WebDriverWait(browser, delay).until(
+        ec.presence_of_element_located((By.CLASS_NAME, "container"))
+    )

     soup = BeautifulSoup(browser.page_source, "html.parser")
-
+
     # Replies, Views, Users, Likes and Links
-    num = 0
     topic_str = ["replies", "view", "user", "like", "link"]
     topic_map = [0] * len(topic_str)
-
-    div_class_topicmap = soup.find("div", attrs={"class":"topic-map"})
+
+    div_class_topicmap = soup.find("div", attrs={"class": "topic-map"})
     if div_class_topicmap:
         li_all = div_class_topicmap.find_all("li")
         for li_tag in li_all:
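
The next hunk keeps the conversion of Discourse's abbreviated counts ("1.2k" in the span the code stores in span_class_number) into integers. Pulled out as a standalone sketch, with a hypothetical helper name:

import re

def parse_count(text):
    # Mirrors the span_class_number handling below:
    # "1.2k" -> 1200, "14k" -> 14000, "875" -> 875.
    if "k" in text:
        if "." in text:
            tmp = re.findall(r"\d+\.\d+", text)[0]
        else:
            tmp = re.findall(r"\d+", text)[0]
        return int(float(tmp) * 1000)
    return int(text)

print(parse_count("1.2k"))  # 1200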
@@ -65,87 +66,89 @@ def fetch_comment_info(browser, url, postno, cur, delay=100):
                 if i in li_text:
                     str_found = True
                     break
-
+
             if str_found and span_class_number:
                 if "k" in span_class_number.text:
                     if "." in span_class_number.text:
                         tmp = re.findall(r"\d+\.\d+", span_class_number.text)[0]
                     else:
                         tmp = re.findall(r"\d+", span_class_number.text)[0]
-
+
                     num = int(float(tmp) * 1000)
                 else:
                     num = int(span_class_number.text)
-
+
             for i, _ in enumerate(topic_str):
                 if topic_str[i] in li_text:
                     topic_map[i] = num
-
+
     # Replies, Views, Users, Likes and Links
-
-    tmp = 0
+
+    tmp = 0
     query = "UPDATE posts SET c_page_url = %s, replies = %s, views = %s, \
         users = %s, likes = %s, links = %s WHERE postno = %s;"
     if topic_map[0] >= 1:
         tmp = topic_map[0] - 1
     data = (url, tmp, topic_map[1], topic_map[2], topic_map[3], topic_map[4], postno)
-
+
     cur.execute(query, data)
-
-    scrolls = math.ceil(topic_map[0]/COMMENTS_SCREEN_SIZE)
+
+    scrolls = math.ceil(topic_map[0] / COMMENTS_SCREEN_SIZE)

     for i in range(scrolls):
         soup = BeautifulSoup(browser.page_source, "html.parser")
-        div_class_comment = soup.find_all("div", \
-                                          attrs={"class":\
-                                                 "topic-post clearfix regular"}) + \
-                            soup.find_all("div", \
-                                          attrs={"class":\
-                                                 "topic-post clearfix topic-owner \
-                                                  group-editors regular"})
-
+        div_class_comment = soup.find_all(
+            "div", attrs={"class": "topic-post clearfix regular"}
+        ) + soup.find_all(
+            "div",
+            attrs={
+                "class": "topic-post clearfix topic-owner \
+group-editors regular"
+            },
+        )
+
         comm_no = 1
         for dc_comment in div_class_comment:
-            div_class_user_card = dc_comment.find("div", \
-                                                  attrs={"class":"names trigger-user-card"})
-            postedby = None
+            div_class_user_card = dc_comment.find("div", attrs={"class": "names trigger-user-card"})
             if div_class_user_card:
                 span_class_firstusername = dc_comment.find("span")
                 if span_class_firstusername:
                     postedby = span_class_firstusername.find("a").text
-
-            post_date = dc_comment.find("div", attrs={"class":"post-info post-date"})
-            a_class_post_date = post_date.find("a", attrs={"class":"post-date"})
-            posteddate = a_class_post_date.find("span")['title']
-            div_class_cooked = dc_comment.find("div", attrs={"class":"cooked"})
-            comm_text = div_class_cooked.text.strip().replace('\n', '').replace('\r', '')
-
-            dict_primary_key = postedby + ' ' + posteddate + ' ' + comm_text
-
+
+            post_date = dc_comment.find("div", attrs={"class": "post-info post-date"})
+            a_class_post_date = post_date.find("a", attrs={"class": "post-date"})
+            posteddate = a_class_post_date.find("span")["title"]
+            div_class_cooked = dc_comment.find("div", attrs={"class": "cooked"})
+            comm_text = div_class_cooked.text.strip().replace("\n", "").replace("\r", "")
+
+            dict_primary_key = postedby + " " + posteddate + " " + comm_text
+
             if dict_primary_key not in comments:
                 comments[dict_primary_key] = {}
-                comments[dict_primary_key]['postedby'] = postedby
-                comments[dict_primary_key]['date'] = datetime.strptime\
-                                                     (posteddate, "%b %d, %Y %I:%M %p").date()
-
-                comments[dict_primary_key]['comm_no'] = comm_no
-
-                div_class_cooked = dc_comment.find("div", attrs={"class":"cooked"})
-                comments[dict_primary_key]['comm_text'] = comm_text
-
-                div_class_actions = dc_comment.find("div", attrs={"class":"actions"})
+                comments[dict_primary_key]["postedby"] = postedby
+                comments[dict_primary_key]["date"] = datetime.strptime(
+                    posteddate, "%b %d, %Y %I:%M %p"
+                ).date()
+
+                comments[dict_primary_key]["comm_no"] = comm_no
+                comments[dict_primary_key]["comm_text"] = comm_text
+
+                div_class_actions = dc_comment.find("div", attrs={"class": "actions"})
                 comment_like_list = re.findall(r"\d+", div_class_actions.text.strip())
-
+
                 if comment_like_list:
                     comment_likes = int(comment_like_list[0])
                 else:
                     comment_likes = 0
-                comments[dict_primary_key]['likes'] = comment_likes
-
+                comments[dict_primary_key]["likes"] = comment_likes
+
             comm_no += 1
-
-        browser.execute_script("window.scrollTo({}, {});".format(i*SCREEN_HEIGHT_IN_PIXELS, \
-                                                                 (i+1)*SCREEN_HEIGHT_IN_PIXELS))
+
+        browser.execute_script(
+            "window.scrollTo({}, {});".format(
+                i * SCREEN_HEIGHT_IN_PIXELS, (i + 1) * SCREEN_HEIGHT_IN_PIXELS
+            )
+        )
         time.sleep(SCROLL_WAIT_TIME)
-
+
     return comments
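
For reference, the scroll-and-rescrape pattern the whole function relies on, reduced to a skeleton (the URL, reply count, and wait time are illustrative stand-ins; in the script the count comes from the topic-map parse and the constants are module-level):

import math
import time

from selenium import webdriver

SCREEN_HEIGHT_IN_PIXELS = 1080
COMMENTS_SCREEN_SIZE = 19  # approx. posts rendered per screenful
SCROLL_WAIT_TIME = 2  # illustrative pause for lazy-loaded posts

browser = webdriver.Firefox()
browser.get("https://bbs.boingboing.net/t/example-topic")  # illustrative URL

replies = 190  # illustrative; the script reads this from the topic map
scrolls = math.ceil(replies / COMMENTS_SCREEN_SIZE)
for i in range(scrolls):
    # Parse browser.page_source here (e.g. with BeautifulSoup) before scrolling on.
    browser.execute_script(
        "window.scrollTo({}, {});".format(
            i * SCREEN_HEIGHT_IN_PIXELS, (i + 1) * SCREEN_HEIGHT_IN_PIXELS
        )
    )
    time.sleep(SCROLL_WAIT_TIME)  # give newly loaded posts time to render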
