"""
URL: https://stackoverflow.com/questions/41706274/beautifulsoup-returns-incomplete-html

The page setup is such that, by default, approx. 19
<div class="topic-post clearfix regular"></div> tags on boingboing comments are
loaded at startup and the remaining get loaded once the page is manually scrolled down.
We use the selenium web-driver to achieve the manual scrolling.
We need either the geckodriver (https://github.com/mozilla/geckodriver/releases) for Firefox
or the chromedriver (latest release 2.36 -->
https://chromedriver.storage.googleapis.com/index.html?path=2.36/) for Chrome.

Note: the executable driver file must be in the same directory as the .py file.
"""
17
17
18
18
from bs4 import BeautifulSoup
19
19
from selenium .webdriver .support .ui import WebDriverWait
20
- from selenium .webdriver .support import expected_conditions as EC
20
+ from selenium .webdriver .support import expected_conditions as ec
21
21
from selenium .webdriver .common .by import By
# Height of one browser viewport in pixels; used to compute the scroll
# offsets passed to window.scrollTo() when paging comments into the DOM.
SCREEN_HEIGHT_IN_PIXELS = 1080

# Work around 'http.client.IncompleteRead' raised while fetching pages:
# force HTTP/1.0 so the server cannot use chunked transfer encoding.
# See: https://stackoverflow.com/questions/14149100/incompleteread-using-httplib
http.client.HTTPConnection._http_vsn = 10
http.client.HTTPConnection._http_vsn_str = "HTTP/1.0"
32
32
33
def fetch_comment_info (browser , url , postno , cur , delay = 100 ):
33
34
"""
34
35
Fetches user comments in 'url'.
35
36
"""
36
-
37
+
37
38
comments = {}
38
39
# indicates presence of div_class_share but no a_class_bbs
39
40
try :
40
- # Added timeout for the error: http.client.RemoteDisconnected:
41
+ # Added timeout for the error: http.client.RemoteDisconnected:
41
42
# Remote end closed connection without response
42
43
browser .set_page_load_timeout (200 )
43
44
browser .get (url )
44
45
except http .client .RemoteDisconnected :
45
46
return comments
46
-
47
- WebDriverWait (browser , delay ).until (EC .presence_of_element_located \
48
- ((By .CLASS_NAME , "container" )))
47
+
48
+ WebDriverWait (browser , delay ).until (
49
+ ec .presence_of_element_located ((By .CLASS_NAME , "container" ))
50
+ )
49
51
50
52
soup = BeautifulSoup (browser .page_source , "html.parser" )
51
-
53
+
52
54
# Replies, Views, Users, Likes and Links
53
- num = 0
54
55
topic_str = ["replies" , "view" , "user" , "like" , "link" ]
55
56
topic_map = [0 ] * len (topic_str )
56
-
57
- div_class_topicmap = soup .find ("div" , attrs = {"class" :"topic-map" })
57
+
58
+ div_class_topicmap = soup .find ("div" , attrs = {"class" : "topic-map" })
58
59
if div_class_topicmap :
59
60
li_all = div_class_topicmap .find_all ("li" )
60
61
for li_tag in li_all :
@@ -65,87 +66,89 @@ def fetch_comment_info(browser, url, postno, cur, delay=100):
65
66
if i in li_text :
66
67
str_found = True
67
68
break
68
-
69
+
69
70
if str_found and span_class_number :
70
71
if "k" in span_class_number .text :
71
72
if "." in span_class_number .text :
72
73
tmp = re .findall (r"\d+\.\d+" , span_class_number .text )[0 ]
73
74
else :
74
75
tmp = re .findall (r"\d+" , span_class_number .text )[0 ]
75
-
76
+
76
77
num = int (float (tmp ) * 1000 )
77
78
else :
78
79
num = int (span_class_number .text )
79
-
80
+
80
81
for i , _ in enumerate (topic_str ):
81
82
if topic_str [i ] in li_text :
82
83
topic_map [i ] = num
83
-
84
+
84
85
# Replies, Views, Users, Likes and Links
85
-
86
- tmp = 0
86
+
87
+ tmp = 0
87
88
query = "UPDATE posts SET c_page_url = %s, replies = %s, views = %s, \
88
89
users = %s, likes = %s, links = %s WHERE postno = %s;"
89
90
if topic_map [0 ] >= 1 :
90
91
tmp = topic_map [0 ] - 1
91
92
data = (url , tmp , topic_map [1 ], topic_map [2 ], topic_map [3 ], topic_map [4 ], postno )
92
-
93
+
93
94
cur .execute (query , data )
94
-
95
- scrolls = math .ceil (topic_map [0 ]/ COMMENTS_SCREEN_SIZE )
95
+
96
+ scrolls = math .ceil (topic_map [0 ] / COMMENTS_SCREEN_SIZE )
96
97
97
98
for i in range (scrolls ):
98
99
soup = BeautifulSoup (browser .page_source , "html.parser" )
99
- div_class_comment = soup .find_all ("div" , \
100
- attrs = {"class" :\
101
- "topic-post clearfix regular" }) + \
102
- soup .find_all ("div" , \
103
- attrs = {"class" :\
104
- "topic-post clearfix topic-owner \
105
- group-editors regular" })
106
-
100
+ div_class_comment = soup .find_all (
101
+ "div" , attrs = {"class" : "topic-post clearfix regular" }
102
+ ) + soup .find_all (
103
+ "div" ,
104
+ attrs = {
105
+ "class" : "topic-post clearfix topic-owner \
106
+ group-editors regular"
107
+ },
108
+ )
109
+
107
110
comm_no = 1
108
111
for dc_comment in div_class_comment :
109
- div_class_user_card = dc_comment .find ("div" , \
110
- attrs = {"class" :"names trigger-user-card" })
111
- postedby = None
112
+ div_class_user_card = dc_comment .find ("div" , attrs = {"class" : "names trigger-user-card" })
112
113
if div_class_user_card :
113
114
span_class_firstusername = dc_comment .find ("span" )
114
115
if span_class_firstusername :
115
116
postedby = span_class_firstusername .find ("a" ).text
116
-
117
- post_date = dc_comment .find ("div" , attrs = {"class" :"post-info post-date" })
118
- a_class_post_date = post_date .find ("a" , attrs = {"class" :"post-date" })
119
- posteddate = a_class_post_date .find ("span" )[' title' ]
120
- div_class_cooked = dc_comment .find ("div" , attrs = {"class" :"cooked" })
121
- comm_text = div_class_cooked .text .strip ().replace (' \n ' , '' ).replace (' \r ' , '' )
122
-
123
- dict_primary_key = postedby + ' ' + posteddate + ' ' + comm_text
124
-
117
+
118
+ post_date = dc_comment .find ("div" , attrs = {"class" : "post-info post-date" })
119
+ a_class_post_date = post_date .find ("a" , attrs = {"class" : "post-date" })
120
+ posteddate = a_class_post_date .find ("span" )[" title" ]
121
+ div_class_cooked = dc_comment .find ("div" , attrs = {"class" : "cooked" })
122
+ comm_text = div_class_cooked .text .strip ().replace (" \n " , "" ).replace (" \r " , "" )
123
+
124
+ dict_primary_key = postedby + " " + posteddate + " " + comm_text
125
+
125
126
if dict_primary_key not in comments :
126
127
comments [dict_primary_key ] = {}
127
- comments [dict_primary_key ]['postedby' ] = postedby
128
- comments [dict_primary_key ]['date' ] = datetime .strptime \
129
- (posteddate , "%b %d, %Y %I:%M %p" ).date ()
130
-
131
- comments [dict_primary_key ]['comm_no' ] = comm_no
132
-
133
- div_class_cooked = dc_comment .find ("div" , attrs = {"class" :"cooked" })
134
- comments [dict_primary_key ]['comm_text' ] = comm_text
135
-
136
- div_class_actions = dc_comment .find ("div" , attrs = {"class" :"actions" })
128
+ comments [dict_primary_key ]["postedby" ] = postedby
129
+ comments [dict_primary_key ]["date" ] = datetime .strptime (
130
+ posteddate , "%b %d, %Y %I:%M %p"
131
+ ).date ()
132
+
133
+ comments [dict_primary_key ]["comm_no" ] = comm_no
134
+ comments [dict_primary_key ]["comm_text" ] = comm_text
135
+
136
+ div_class_actions = dc_comment .find ("div" , attrs = {"class" : "actions" })
137
137
comment_like_list = re .findall (r"\d+" , div_class_actions .text .strip ())
138
-
138
+
139
139
if comment_like_list :
140
140
comment_likes = int (comment_like_list [0 ])
141
141
else :
142
142
comment_likes = 0
143
- comments [dict_primary_key ][' likes' ] = comment_likes
144
-
143
+ comments [dict_primary_key ][" likes" ] = comment_likes
144
+
145
145
comm_no += 1
146
-
147
- browser .execute_script ("window.scrollTo({}, {});" .format (i * SCREEN_HEIGHT_IN_PIXELS , \
148
- (i + 1 )* SCREEN_HEIGHT_IN_PIXELS ))
146
+
147
+ browser .execute_script (
148
+ "window.scrollTo({}, {});" .format (
149
+ i * SCREEN_HEIGHT_IN_PIXELS , (i + 1 ) * SCREEN_HEIGHT_IN_PIXELS
150
+ )
151
+ )
149
152
time .sleep (SCROLL_WAIT_TIME )
150
-
153
+
151
154
return comments
0 commit comments