woopread.py
# -*- coding: utf-8 -*-
import logging

from lncrawl.core.crawler import Crawler

logger = logging.getLogger(__name__)
search_url = "https://woopread.com/?s=%s&post_type=wp-manga&author=&artist=&release="


class WoopReadCrawler(Crawler):
    base_url = "https://woopread.com/"

    def initialize(self):
        self.regex_novel_id = r'"manga_id"\s*:\s*"(?P<id>\d+)"'
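
    # Search goes through the site's search endpoint (post_type=wp-manga) and
    # parses each result card under ".c-tabs-item__content" for title, URL,
    # latest chapter, and rating.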
    def search_novel(self, query):
        query = query.lower().replace(" ", "+")
        soup = self.get_soup(search_url % query)

        results = []
        for tab in soup.select(".c-tabs-item__content"):
            a = tab.select_one(".post-title h3 a")
            latest = tab.select_one(".latest-chap .chapter a").text
            votes = tab.select_one(".rating .total_votes").text
            results.append(
                {
                    "title": a.text.strip(),
                    "url": self.absolute_url(a["href"]),
                    "info": "%s | Rating: %s" % (latest, votes),
                }
            )

        return results
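
    # The chapter list is not embedded in the novel page itself; it is fetched
    # by submitting a form to "<novel_url>/ajax/chapters" and chapters are then
    # grouped into volumes of 100 for the output.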
    def read_novel_info(self):
        """Get novel title, author, cover etc"""
        novel_webpage = self.get_soup(self.novel_url)

        # novel_id_string = novel_webpage.find(text=re.compile(self.regex_novel_id))
        # novel_id = re.search(self.regex_novel_id, novel_id_string).group("id")

        self.novel_title = novel_webpage.select_one(".post-title h1").text.strip()
        logger.info("Novel title: %s", self.novel_title)

        self.novel_author = novel_webpage.select_one(".author-content").text.strip()
        logger.info("Novel author: %s", self.novel_author)

        self.novel_cover = novel_webpage.select_one('meta[property="og:image"]')[
            "content"
        ]
        logger.info("Novel cover: %s", self.novel_cover)

        response = self.submit_form(self.novel_url.strip("/") + "/ajax/chapters")
        soup = self.make_soup(response)
        for a in reversed(soup.select(".wp-manga-chapter a")):
            chap_id = len(self.chapters) + 1
            vol_id = 1 + len(self.chapters) // 100
            if chap_id % 100 == 1:
                self.volumes.append({"id": vol_id})
            self.chapters.append(
                {
                    "id": chap_id,
                    "volume": vol_id,
                    "title": a.text.strip(),
                    "url": self.absolute_url(a["href"]),
                }
            )
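
    # Locked (premium) chapters are skipped by returning an empty body; for
    # regular chapters, paragraphs carrying "Translator:"/"Editor:" credits are
    # dropped before the cleaner extracts the content.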
    def download_chapter_body(self, chapter):
        """Download body of a single chapter and return as clean html format."""
        logger.info("Visiting %s", chapter["url"])
        chapter_page = self.get_soup(chapter["url"])

        # TODO: Add a login option to download premium content. Downloading of
        # premium chapters is not implemented until someone provides an active
        # account, so an empty body is returned to skip them.
        if "This chapter is locked!" in chapter_page.text:
            return ""

        contents = chapter_page.select_one(".container .reading-content div")
        for content in contents.select("p"):
            for bad in ["Translator:", "Editor:"]:
                if bad in content.text:
                    content.extract()

        return self.cleaner.extract_contents(contents)