
Commit 8dc345c

committed
refactor
1 parent 8468cb8 commit 8dc345c

1 file changed: +32 −39 lines

minet/reddit/scraper.py

+32 −39

@@ -1,6 +1,6 @@
 import re
 from time import sleep
-from ural import get_domain_name, is_url
+from ural import is_url
 from urllib.parse import urljoin
 
 from minet.reddit.exceptions import RedditInvalidTargetError
@@ -156,7 +156,7 @@ def data_user_posts(
     link,
     error,
 ):
-    sub = post.scrape_one("a[class*='subreddit']", "href")
+    sub = post.scrape_one("a.subreddit", "href")
     data = RedditUserPost(
         title=title,
         url=get_new_url(url),
@@ -179,28 +179,28 @@ def __init__(self):
 
     def get_childs_l500(self, url, list_comments, parent_id):
         _, soup, _ = reddit_request(url, self.pool_manager)
-        comments = soup.select("div.commentarea>div>div[class*='comment']")
+        comments = soup.select("div.commentarea>div>div.comment")
         if parent_id is None:
             for com in comments:
                 list_comments.append((None, com))
-        else:
-            for com in comments:
-                child = com.find("div", class_="child")
-                if child.text != "":
-                    child = child.find("div")
-                    child_com = child.find_all(
-                        "div",
-                        class_=lambda x: x
-                        and (
-                            "comment" in x
-                            or "deleted comment" in x
-                            or "morerecursion" in x
-                            or "morechildren" in x
-                        ),
-                        recursive=False,
-                    )
-                    for ele in child_com:
-                        list_comments.append((parent_id, ele))
+            return list_comments
+        for com in comments:
+            child = com.find("div", class_="child")
+            if child.text != "":
+                child = child.find("div")
+                child_com = child.find_all(
+                    "div",
+                    class_=lambda x: x
+                    and (
+                        "comment" in x
+                        or "deleted comment" in x
+                        or "morerecursion" in x
+                        or "morechildren" in x
+                    ),
+                    recursive=False,
+                )
+                for ele in child_com:
+                    list_comments.append((parent_id, ele))
         return list_comments
 
     def get_comments(self, url: str, all):
@@ -220,9 +220,9 @@ def get_comments(self, url: str, all):
                 error=error,
             )
         else:
-            first_comments = soup.select("div.commentarea>div>div[class*='comment']")
+            first_comments = soup.select("div.commentarea>div>div.comment")
             if all:
-                more = soup.select("div.commentarea>div>div[class*='morechildren']")
+                more = soup.select("div.commentarea>div>div.morechildren")
                 for ele in more:
                     a = ele.select_one("a")
                     onclick = a["onclick"]
@@ -241,8 +241,7 @@ def get_comments(self, url: str, all):
                    points = None
                else:
                    comment_url = com.scrape_one("a.bylink", "href")
-                   try_author = com.select_one("div.entry.unvoted")
-                   author = try_author.scrape_one("a[class^='author']")
+                   author = com.scrape_one("div.entry.unvoted a.author")
                    if not author:
                        author = "[Deleted]"
                    points = get_points(com)
@@ -286,7 +285,7 @@ def get_comments(self, url: str, all):
                        m_comments.append((current_id, ele))
                data = RedditComment(
                    comment_url=get_new_url(resolve_relative_url(comment_url)),
-                   author=author,
+                   author=author if author else "[Deleted]",
                    id=current_id,
                    parent=parent,
                    points=points,
@@ -311,12 +310,10 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int):
                    break
                list_buttons = post.select_one("ul.flat-list.buttons")
                if len(list_buttons.scrape("span.promoted-span")) == 0:
-                   title = post.force_select_one("a[class*='title']").get_text()
-                   post_url = list_buttons.scrape_one(
-                       "a[class^='bylink comments']", "href"
-                   )
+                   title = post.force_select_one("a.title").get_text()
+                   post_url = list_buttons.scrape_one("a.bylink.comments", "href")
                    n_comments_scraped = list_buttons.select_one(
-                       "a[class^='bylink comments']"
+                       "a.bylink.comments"
                    ).get_text()
                    match = re.match(r"(\d+)\s+comment(s)?", n_comments_scraped)
                    if match:
@@ -325,9 +322,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int):
                        n_comments = 0
                    upvote = get_points(post)
                    published_date, edited_date = get_dates(post)
-                   link = resolve_relative_url(
-                       post.scrape_one("a[class*='title']", "href")
-                   )
+                   link = resolve_relative_url(post.scrape_one("a.title", "href"))
                    if link == post_url:
                        link = ""
                    if add_text:
@@ -363,9 +358,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int):
                            link,
                            text_error,
                        )
-                       try_content = text_soup.select_one(
-                           "div#siteTable div[class^='usertext']"
-                       )
+                       try_content = text_soup.select_one("div#siteTable div.usertext")
                        if try_content:
                            content = try_content.get_text()
                        else:
@@ -431,8 +424,8 @@ def get_user_comments(self, url: str, limit: int):
                    break
                post_title = comment.scrape_one("a.title")
                post_url = comment.scrape_one("a.bylink.may-blank", "href")
-               post_author = comment.scrape_one("p.parent>a[class^='author']")
-               post_subreddit = comment.scrape_one("a[class^='subreddit']", "href")
+               post_author = comment.scrape_one("p.parent>a.author")
+               post_subreddit = comment.scrape_one("a.subreddit", "href")
                points = get_points(comment)
                published_date, edited_date = get_dates(comment)
                text = comment.scrape_one("div.content div.md")
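
Most of this refactor swaps attribute-substring selectors such as div[class*='comment'] or a[class^='author'] for plain class selectors (div.comment, a.author), and collapses two-step lookups like select_one("div.entry.unvoted") followed by scrape_one("a[class^='author']") into a single descendant selector. The sketch below shows the behavioural difference the swap relies on; it uses plain bs4 on invented markup (the class names are made up for illustration, and minet's own soup wrapper is assumed here to follow the same soupsieve CSS semantics): the substring form matches any element whose class attribute merely contains the text, while the dotted form only matches elements carrying the exact class token.

# Minimal sketch, plain bs4 on invented markup (class names below are made up
# for illustration; they are not taken from Reddit's real HTML).
from bs4 import BeautifulSoup

html = """
<div class="commentarea"><div>
  <div class="comment noncollapsed">a real comment</div>
  <div class="deleted comment">a deleted comment</div>
  <div class="morechildren">load more comments</div>
  <div class="commentsignupbar">sign-up banner</div>
</div></div>
"""
soup = BeautifulSoup(html, "html.parser")

# Substring match on the whole class attribute: also catches the banner,
# whose class merely contains the string "comment".
print(len(soup.select("div.commentarea>div>div[class*='comment']")))  # 3

# Exact class-token match: only elements with "comment" in their class list.
print(len(soup.select("div.commentarea>div>div.comment")))  # 2

The class-token form is therefore slightly stricter: elements whose class only contains "comment" as part of a longer token are no longer picked up, which appears to be the intent of the change.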
