
Commit da41de4

Update schneier.py

Drop the unused json and Select imports, rename the expected_conditions alias EC to ec and the built-in-shadowing id variable to id_, standardize on double quotes, and re-wrap long statements.

1 parent afecb0c commit da41de4

File tree: 1 file changed, +51 −41 lines


Diff for: Schneier on Security/schneier.py

@@ -1,13 +1,12 @@
 from http.client import RemoteDisconnected
 import time
-import json
 import re
 from bs4 import BeautifulSoup
 from datetime import datetime
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.support.ui import WebDriverWait, Select
-from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as ec
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import TimeoutException
 import psycopg2
@@ -23,32 +22,33 @@ def get_browser(headless=False, extensions=False, notifications=False, incognito
     chrome_options = Options()
     if headless:
         chrome_options.add_argument("--headless")
-
+
     if not extensions:
         chrome_options.add_argument("--disable-extensions")
-
+
     if not notifications:
-        chrome_options.add_argument('--disable-notifications')
-
+        chrome_options.add_argument("--disable-notifications")
+
     if incognito:
-        chrome_options.add_argument('--incognito')
+        chrome_options.add_argument("--incognito")

-    driver = webdriver.Chrome(executable_path='C:\\Aptana Workspace\\chromedriver.exe',
-                              options=chrome_options)
+    driver = webdriver.Chrome(
+        executable_path="C:\\Aptana Workspace\\chromedriver.exe", options=chrome_options
+    )
     return driver


 def main():
     conn = psycopg2.connect(host=HOST, database=DATABASE, user=USER, password=PASSWORD)
     cur = conn.cursor()
     driver = get_browser(headless=False, incognito=True)
-
+
     page_url = "https://www.schneier.com/"
     idx = 1

     while True:
         print(f"Processing page no. {idx}...")
-
+
         try:
             driver.set_page_load_timeout(200)
             driver.get(page_url)
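
Note: the executable_path keyword kept by this hunk is the pre-Selenium-4 way of pointing at chromedriver; Selenium 4 deprecates it in favor of a Service object. A minimal sketch of the equivalent construction under Selenium 4, reusing the path hard-coded above:

    # Sketch, not part of this commit: Selenium 4 replaces executable_path
    # with a Service object; options are passed unchanged.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service

    chrome_options = Options()
    chrome_options.add_argument("--incognito")
    driver = webdriver.Chrome(
        service=Service("C:\\Aptana Workspace\\chromedriver.exe"),
        options=chrome_options,
    )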
@@ -58,50 +58,56 @@ def main():
         except RemoteDisconnected:
             print(f"\tError 404: {page_url} not found.")
             continue
-
-        WebDriverWait(driver, timeout=40).until(EC.presence_of_element_located((By.CLASS_NAME, "stepthrough")))
+
+        WebDriverWait(driver, timeout=40).until(
+            ec.presence_of_element_located((By.CLASS_NAME, "stepthrough"))
+        )
         soup = BeautifulSoup(driver.page_source, "html.parser")

-        ealier_entry = soup.find("div", attrs={"class": "stepthrough"}).find("a", attrs={"class": "earlier"})
-
+        ealier_entry = soup.find("div", attrs={"class": "stepthrough"}).find(
+            "a", attrs={"class": "earlier"}
+        )
+
         if not ealier_entry:
             break

         articles = soup.find("div", attrs={"id": "content"}).find_all("article")

         for article in articles:
             h2_tag = article.find("h2", attrs={"class": "entry"})
-            id = h2_tag['id']
+            id_ = h2_tag["id"]

             a_tag = h2_tag.find("a")
-            url = a_tag['href'] if a_tag else None
+            url = a_tag["href"] if a_tag else None
             title = a_tag.text.strip() if a_tag else None
-
-            body_tags = article.find_all(re.compile("[p|strong|i|ul]"), attrs={"class": None, "id": None, "type": None})
+
+            body_tags = article.find_all(
+                re.compile("[p|strong|i|ul]"), attrs={"class": None, "id": None, "type": None}
+            )
             body = " ".join([k.text.strip() for k in body_tags])
-
+
             entry_tag = article.find("p", attrs={"class": "entry-tags"})
             tag_arr = [k.text for k in entry_tag.find_all("a")] if entry_tag else [""]
-            tags = ', '.join(tag_arr)
-
+            tags = ", ".join(tag_arr)
+
             posted_tag = article.find("p", attrs={"class": "posted"})
             date_obj = None
             if posted_tag:
                 datetime_tag = posted_tag.find("a").text.strip()
                 date_obj = datetime.strptime(datetime_tag, "Posted on %B %d, %Y at %I:%M %p")
-
+
             query = """
                 INSERT INTO article(id, url, title, body, tags, posted_datetime)
                 SELECT sub_query.* FROM
                 (SELECT %s AS id, %s, %s, %s, %s, %s) sub_query
                 LEFT JOIN article a ON sub_query.id = a.id
                 WHERE a.id IS NULL;
            """
-
-            data = (id, url, title, body, tags, date_obj)
+
+            data = (id_, url, title, body, tags, date_obj)
             cur.execute(query, data)
-
-            comment_arr = [k['href'] for k in posted_tag.find_all("a")]
+
+            comment_arr = [k["href"] for k in posted_tag.find_all("a")]
             if len(comment_arr) != 2:
                 print(f"\tNo comments found for this article - {url}")
                 continue
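
The SELECT ... LEFT JOIN ... WHERE a.id IS NULL query above is an insert-if-absent: the row goes in only when no article with that id exists yet. On PostgreSQL 9.5+ the same effect is normally spelled with ON CONFLICT; a minimal sketch, assuming article.id carries a primary-key or unique constraint (not shown in this diff):

    # Sketch, not part of this commit: requires a unique constraint on article.id.
    query = """
        INSERT INTO article(id, url, title, body, tags, posted_datetime)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (id) DO NOTHING;
    """
    cur.execute(query, (id_, url, title, body, tags, date_obj))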
@@ -118,21 +124,25 @@ def main():
             except RemoteDisconnected:
                 print(f"\tError 404: {comment_url} not found.")
                 continue
-
-            WebDriverWait(driver, timeout=40).until(EC.presence_of_element_located((By.CLASS_NAME, "subscribe-comments")))
+
+            WebDriverWait(driver, timeout=40).until(
+                ec.presence_of_element_located((By.CLASS_NAME, "subscribe-comments"))
+            )
             soup = BeautifulSoup(driver.page_source, "html.parser")
-
+
             comment_tags = soup.find_all("article")[1:]
-
+
             for comment in comment_tags:
-                cid = comment.find("div", attrs={"class": re.compile("comment by-")})['id']
-
+                cid = comment.find("div", attrs={"class": re.compile("comment by-")})["id"]
+
                 comment_credit = comment.find("p", attrs={"class": "commentcredit"})
                 commented_by = comment_credit.find("span").text.strip()
-
-                comment_body_tags = comment.find_all(re.compile("[p|strong|i|ul]"), attrs={"class": None, "id": None, "type": None})
+
+                comment_body_tags = comment.find_all(
+                    re.compile("[p|strong|i|ul]"), attrs={"class": None, "id": None, "type": None}
+                )
                 comment_body = " ".join([k.text.strip() for k in comment_body_tags])
-
+
                 posted_tag = comment_credit.find_all("a")[-1]
                 date_obj = None
                 if posted_tag:
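
One thing this hunk only re-wraps, without changing: re.compile("[p|strong|i|ul]") is a character class, not an alternation. BeautifulSoup matches a tag-name regex with search(), so this pattern accepts any tag whose name contains one of the characters p, |, s, t, r, o, n, g, i, u, l, including span, table, li and others. If only the four tags were intended, an anchored alternation would express that; a sketch of a possible follow-up, not part of this commit:

    import re

    # Matches exactly the tag names p, strong, i and ul, nothing else.
    tag_name = re.compile(r"^(p|strong|i|ul)$")
    body_tags = article.find_all(tag_name, attrs={"class": None, "id": None, "type": None})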
@@ -150,13 +160,13 @@ def main():
                     WHERE c.id IS NULL;
                 """

-                data = (cid, id, comment_body, commented_by, date_obj)
+                data = (cid, id_, comment_body, commented_by, date_obj)
                 cur.execute(query, data)

-        page_url = ealier_entry['href']
+        page_url = ealier_entry["href"]
         idx += 1
         time.sleep(3)
-
+
     driver.quit()
     conn.commit()
     cur.close()
@@ -166,4 +176,4 @@ def main():


 if __name__ == "__main__":
-    main()
+    main()
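
For reference, the strptime format used for post timestamps in the article loop above parses the blog's "Posted on" lines; a quick check with an illustrative, made-up date:

    from datetime import datetime

    # Illustrative input only; the real string comes from the "posted" paragraph.
    date_obj = datetime.strptime(
        "Posted on June 15, 2018 at 6:22 AM", "Posted on %B %d, %Y at %I:%M %p"
    )
    # date_obj == datetime(2018, 6, 15, 6, 22)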

0 commit comments
