@@ -1,13 +1,12 @@
 from http.client import RemoteDisconnected
 import time
-import json
 import re
 from bs4 import BeautifulSoup
 from datetime import datetime
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.support.ui import WebDriverWait, Select
-from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as ec
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import TimeoutException
 import psycopg2
@@ -23,32 +22,33 @@ def get_browser(headless=False, extensions=False, notifications=False, incognito
     chrome_options = Options()
     if headless:
         chrome_options.add_argument("--headless")
-
+
     if not extensions:
         chrome_options.add_argument("--disable-extensions")
-
+
     if not notifications:
-        chrome_options.add_argument('--disable-notifications')
-
+        chrome_options.add_argument("--disable-notifications")
+
     if incognito:
-        chrome_options.add_argument('--incognito')
+        chrome_options.add_argument("--incognito")
 
-    driver = webdriver.Chrome(executable_path='C:\\Aptana Workspace\\chromedriver.exe',
-                              options=chrome_options)
+    driver = webdriver.Chrome(
+        executable_path="C:\\Aptana Workspace\\chromedriver.exe", options=chrome_options
+    )
 
     return driver
 
 
 def main():
     conn = psycopg2.connect(host=HOST, database=DATABASE, user=USER, password=PASSWORD)
     cur = conn.cursor()
     driver = get_browser(headless=False, incognito=True)
-
+
     page_url = "https://www.schneier.com/"
     idx = 1
 
     while True:
         print(f"Processing page no. {idx}...")
-
+
         try:
             driver.set_page_load_timeout(200)
             driver.get(page_url)
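Note: the executable_path keyword, which this commit merely reformats, was removed outright in Selenium 4; on a current Selenium the driver has to be built through a Service object. A minimal sketch under that assumption, reusing the same chromedriver path:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

chrome_options = Options()
chrome_options.add_argument("--incognito")
# Selenium 4 takes the driver location via Service instead of executable_path.
service = Service("C:\\Aptana Workspace\\chromedriver.exe")
driver = webdriver.Chrome(service=service, options=chrome_options)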
@@ -58,50 +58,56 @@ def main():
         except RemoteDisconnected:
             print(f"\tError 404: {page_url} not found.")
             continue
-
-        WebDriverWait(driver, timeout=40).until(EC.presence_of_element_located((By.CLASS_NAME, "stepthrough")))
+
+        WebDriverWait(driver, timeout=40).until(
+            ec.presence_of_element_located((By.CLASS_NAME, "stepthrough"))
+        )
         soup = BeautifulSoup(driver.page_source, "html.parser")
 
-        ealier_entry = soup.find("div", attrs={"class": "stepthrough"}).find("a", attrs={"class": "earlier"})
-
+        ealier_entry = soup.find("div", attrs={"class": "stepthrough"}).find(
+            "a", attrs={"class": "earlier"}
+        )
+
         if not ealier_entry:
             break
 
         articles = soup.find("div", attrs={"id": "content"}).find_all("article")
 
         for article in articles:
             h2_tag = article.find("h2", attrs={"class": "entry"})
-            id = h2_tag['id']
+            id_ = h2_tag["id"]
 
             a_tag = h2_tag.find("a")
-            url = a_tag['href'] if a_tag else None
+            url = a_tag["href"] if a_tag else None
             title = a_tag.text.strip() if a_tag else None
-
-            body_tags = article.find_all(re.compile("[p|strong|i|ul]"), attrs={"class": None, "id": None, "type": None})
+
+            body_tags = article.find_all(
+                re.compile("[p|strong|i|ul]"), attrs={"class": None, "id": None, "type": None}
+            )
             body = " ".join([k.text.strip() for k in body_tags])
-
+
             entry_tag = article.find("p", attrs={"class": "entry-tags"})
             tag_arr = [k.text for k in entry_tag.find_all("a")] if entry_tag else [""]
-            tags = ', '.join(tag_arr)
-
+            tags = ", ".join(tag_arr)
+
             posted_tag = article.find("p", attrs={"class": "posted"})
             date_obj = None
             if posted_tag:
                 datetime_tag = posted_tag.find("a").text.strip()
                 date_obj = datetime.strptime(datetime_tag, "Posted on %B %d, %Y at %I:%M %p")
-
+
             query = """
                 INSERT INTO article(id, url, title, body, tags, posted_datetime)
                 SELECT sub_query.* FROM
                     (SELECT %s AS id, %s, %s, %s, %s, %s) sub_query
                 LEFT JOIN article a ON sub_query.id = a.id
                 WHERE a.id IS NULL;
             """
-
-            data = (id, url, title, body, tags, date_obj)
+
+            data = (id_, url, title, body, tags, date_obj)
             cur.execute(query, data)
-
-            comment_arr = [k['href'] for k in posted_tag.find_all("a")]
+
+            comment_arr = [k["href"] for k in posted_tag.find_all("a")]
             if len(comment_arr) != 2:
                 print(f"\tNo comments found for this article - {url}")
                 continue
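Note: re.compile("[p|strong|i|ul]"), which the reformatting above keeps verbatim, is a character class rather than an alternation; it matches any tag name containing one of the letters p, s, t, r, o, n, g, i, u, l (or a literal |), so it also catches div, span, and most other tags. Assuming the intent is to match exactly p, strong, i, and ul, a corrected pattern would look like this (an assumption about intent, not part of this commit):

import re

# Anchored alternation: matches only the tag names p, strong, i and ul.
body_tag_pattern = re.compile(r"^(p|strong|i|ul)$")
body_tags = article.find_all(body_tag_pattern, attrs={"class": None, "id": None, "type": None})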
@@ -118,21 +124,25 @@ def main():
             except RemoteDisconnected:
                 print(f"\tError 404: {comment_url} not found.")
                 continue
-
-            WebDriverWait(driver, timeout=40).until(EC.presence_of_element_located((By.CLASS_NAME, "subscribe-comments")))
+
+            WebDriverWait(driver, timeout=40).until(
+                ec.presence_of_element_located((By.CLASS_NAME, "subscribe-comments"))
+            )
             soup = BeautifulSoup(driver.page_source, "html.parser")
-
+
             comment_tags = soup.find_all("article")[1:]
-
+
             for comment in comment_tags:
-                cid = comment.find("div", attrs={"class": re.compile("comment by-")})['id']
-
+                cid = comment.find("div", attrs={"class": re.compile("comment by-")})["id"]
+
                 comment_credit = comment.find("p", attrs={"class": "commentcredit"})
                 commented_by = comment_credit.find("span").text.strip()
-
-                comment_body_tags = comment.find_all(re.compile("[p|strong|i|ul]"), attrs={"class": None, "id": None, "type": None})
+
+                comment_body_tags = comment.find_all(
+                    re.compile("[p|strong|i|ul]"), attrs={"class": None, "id": None, "type": None}
+                )
                 comment_body = " ".join([k.text.strip() for k in comment_body_tags])
-
+
                 posted_tag = comment_credit.find_all("a")[-1]
                 date_obj = None
                 if posted_tag:
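Note: TimeoutException is imported at the top but none of the WebDriverWait calls is guarded, so a page that never renders the awaited element aborts the whole crawl after 40 seconds. A sketch of a guarded wait for the comment page, assuming skipping that page is the desired fallback:

try:
    WebDriverWait(driver, timeout=40).until(
        ec.presence_of_element_located((By.CLASS_NAME, "subscribe-comments"))
    )
except TimeoutException:
    print(f"\tTimed out waiting for comments - {comment_url}")
    continue  # move on to the next article instead of crashing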
@@ -150,13 +160,13 @@ def main():
                     WHERE c.id IS NULL;
                 """
 
-                data = (cid, id, comment_body, commented_by, date_obj)
+                data = (cid, id_, comment_body, commented_by, date_obj)
                 cur.execute(query, data)
 
-        page_url = ealier_entry['href']
+        page_url = ealier_entry["href"]
         idx += 1
         time.sleep(3)
-
+
     driver.quit()
     conn.commit()
     cur.close()
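Note: both INSERT statements use the LEFT JOIN ... WHERE ... IS NULL idiom to skip ids that already exist. On PostgreSQL 9.5+ the same insert-if-absent behavior is usually expressed with ON CONFLICT; a sketch, assuming article.id carries a primary-key or unique constraint:

query = """
    INSERT INTO article(id, url, title, body, tags, posted_datetime)
    VALUES (%s, %s, %s, %s, %s, %s)
    ON CONFLICT (id) DO NOTHING;
"""
cur.execute(query, (id_, url, title, body, tags, date_obj))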
@@ -166,4 +176,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()