# -*- coding: utf-8 -*-
import logging
import re
from typing import Generator, Union

# from urllib.parse import urljoin, quote_plus
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag

# NOTE: lncrawl's core module really is spelled "exeptions"
from lncrawl.core.exeptions import FallbackToBrowser, LNException
from lncrawl.models import Chapter, SearchResult, Volume
from lncrawl.templates.browser.searchable import SearchableBrowserTemplate

logger = logging.getLogger(__name__)
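
# Matches the page number in TOC pagination links like "?toc=12#content1".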
digit_regex = re.compile(r"\?toc=(\d+)#content1$")


class ScribbleHubCrawler(SearchableBrowserTemplate):
base_url = [
"https://www.scribblehub.com/",
"https://scribblehub.com/",
]
has_manga = False
has_mtl = False

    def initialize(self) -> None:
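        # Strip site chrome from chapter HTML: avatars, spoiler boxes,
        # per-chapter stats, footnotes, and author-note blocks.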
self.cleaner.bad_css.update(
[
".p-avatar-wrap",
".sp-head",
".spdiv",
".chp_stats_feature",
".modern-footnotes-footnote",
".modern-footnotes-footnote__note",
".wi_authornotes",
]
)
self.cleaner.whitelist_attributes.update(
[
"border",
"class",
]
)
self.cleaner.whitelist_css_property.update(
[
"text-align",
]
)
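
    # Search is effectively disabled for this source: the plain-HTTP hook
    # falls back to the browser, and the browser hook raises instead of
    # scraping results.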
def select_search_items_in_browser(self, query: str) -> Generator[Tag, None, None]:
# self.visit(
# urljoin(
# self.home_url, "/?s={}&post_type=fictionposts".format(quote_plus(query))
# )
# )
# self.browser.wait(".search")
# for elem in self.browser.soup.select(
# ".fic .search_main_box .search_body .search_title a"
# ):
# yield elem
        raise LNException("Browser search is not supported")

    def select_search_items(self, query: str) -> Generator[Tag, None, None]:
raise FallbackToBrowser()

    def parse_search_item(self, tag: Tag) -> SearchResult:
return SearchResult(
title=tag.text.strip(),
url=self.absolute_url(tag["href"]),
)

    def visit_novel_page_in_browser(self) -> None:
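        # Normalize the novel URL to https://<host>/series/<id>/ so relative
        # TOC links and ?toc= pagination resolve against the series root.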
        url_parts = self.novel_url.split("/")
        # "/".join keeps the empty segment after "https:", restoring the "//"
        # that the previous f-string dropped.
        self.novel_url = "/".join(url_parts[:5]) + "/"
logger.debug(self.novel_url)
self.visit(self.novel_url)
self.browser.wait(".fictionposts-template-default")

    def parse_title(self, soup: BeautifulSoup) -> str:
tag = soup.select_one(".fic_title")
assert tag
return tag.text.strip()

    def parse_cover(self, soup: BeautifulSoup) -> str:
tag = soup.select_one(".fic_image img")
assert tag
if tag.has_attr("data-src"):
return self.absolute_url(tag["data-src"])
elif tag.has_attr("src"):
return self.absolute_url(tag["src"])

    def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
for a in soup.select(".nauth_name_fic"):
yield a.text.strip()

    def parse_chapter_list_in_browser(
self,
) -> Generator[Union[Chapter, Volume], None, None]:
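        # The TOC is paginated; read the highest ?toc=N value from the
        # pagination links (page 1 is already loaded).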
        _pages = max(
            (
                int(match.group(1))
                for a in self.browser.soup.select(".simple-pagination a")
                if (match := digit_regex.search(a["href"])) is not None
            ),
            default=1,  # no pagination links: single-page TOC
        )
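        # Collect chapter links from page 1, then from each remaining TOC page.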
tags = self.browser.soup.select(".main .toc li a")
for i in range(2, _pages + 1):
self.browser.visit(urljoin(self.novel_url, f"?toc={i}#content1"))
self.browser.wait(".main")
tags += self.browser.soup.select(".main .toc li a")
        # Chapters are listed newest first; reverse them into reading order.
        for _id, _t in enumerate(reversed(tags), start=1):
            yield Chapter(
                id=_id, url=self.absolute_url(_t.get("href")), title=_t.text.strip()
            )

    def parse_chapter_list(
self, soup: BeautifulSoup
) -> Generator[Union[Chapter, Volume], None, None]:
        chapter_count = soup.find("span", {"class": "cnt_toc"})
        chapter_count = (
            int(chapter_count.text) if isinstance(chapter_count, Tag) else -1
        )
        logger.debug("Expected chapter count: %d", chapter_count)
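        # The hidden #mypostid input carries the series id expected by the
        # site's WordPress AJAX endpoint below.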
possible_mypostid = soup.select_one("input#mypostid")
assert isinstance(possible_mypostid, Tag)
mypostid = int(str(possible_mypostid["value"]))
logger.info("#mypostid = %d", mypostid)
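        # pagenum=-1 asks wi_getreleases_pagination for every release at once.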
response = self.submit_form(
f"{self.home_url}wp-admin/admin-ajax.php",
{
"action": "wi_getreleases_pagination",
"pagenum": -1,
"mypostid": mypostid,
},
)
soup = self.make_soup(response)
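        # Releases come back newest first; reverse them into reading order.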
for chapter in reversed(soup.select(".toc_ol a.toc_a")):
yield Chapter(
id=len(self.chapters) + 1,
url=self.absolute_url(str(chapter["href"])),
title=chapter.text.strip(),
)

    def visit_chapter_page_in_browser(self, chapter: Chapter) -> None:
self.visit(chapter.url)
self.browser.wait(".site-content-contain")

    def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
        body = soup.select_one("div#chp_raw")
        assert body  # the chapter text lives in div#chp_raw
        return body