# -*- coding: utf-8 -*-
import logging
from urllib.parse import urlparse

from lncrawl.core.crawler import Crawler

logger = logging.getLogger(__name__)
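
# wnmtl.org is a front end for the "story wave" backend; this crawler
# talks to its JSON API directly instead of scraping the HTML pages.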
SEARCH_URL = (
    "https://api.mystorywave.com/story-wave-backend/api/v1/content/books/search"
    + "?keyWord=%s&pageNumber=1&pageSize=50"
)
BOOK_INFO_URL = "https://api.mystorywave.com/story-wave-backend/api/v1/content/books/%s"
CHAPTER_LIST_URL = (
    "https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/page"
    + "?sortDirection=ASC&bookId=%s&pageNumber=%d&pageSize=100"
)
CHAPTER_CONTENT_URL = (
    "https://api.mystorywave.com/story-wave-backend/api/v1/content/chapters/%d"
)
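# For example, CHAPTER_LIST_URL % (123, 1) expands to:
#   .../api/v1/content/chapters/page?sortDirection=ASC&bookId=123&pageNumber=1&pageSize=100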


class WNMTLCrawler(Crawler):
    has_mtl = True

    base_url = [
        "https://www.wnmtl.org/",
        "https://wnmtl.org/",
        "http://www.wnmtl.org/",
        "http://wnmtl.org/",
    ]

    # NOTE: Disabled because the search endpoint takes too long to respond.
    # Re-enabling it also requires importing `quote` from urllib.parse.
    # def search_novel(self, query):
    #     url = SEARCH_URL % quote(query).lower()
    #     data = self.get_json(url)
    #     results = []
    #     for item in data['data']['list']:
    #         results.append({
    #             'title': item['title'],
    #             'url': 'https://wnmtl.org/book/%s' % item['id'],
    #             'info': 'Author: %s | %s | Last update: %s %s' % (
    #                 item['authorPseudonym'],
    #                 item['genreName'],
    #                 item['lastUpdateChapterOrder'],
    #                 item['lastUpdateChapterTitle'],
    #             ),
    #         })
    #     # end for
    #     return results
    # # end def

    def read_novel_info(self):
        logger.debug(self.home_url)
        self.set_header("site-domain", urlparse(self.novel_url).hostname or "")

        # The numeric book id leads the URL slug,
        # e.g. /book/12345-some-title -> 12345.
        self.novel_id = int(urlparse(self.novel_url).path.split("/")[2].split("-")[0])
        logger.info("Novel ID %d", self.novel_id)

        data = self.get_json(BOOK_INFO_URL % self.novel_id)
        logger.debug(data)
        self.novel_title = data["data"]["title"]
        self.novel_cover = data["data"]["coverImgUrl"]
        self.novel_author = data["data"]["authorPseudonym"]

        # Fetch the first chapter-list page synchronously to learn the
        # total page count, then fetch the remaining pages in parallel.
        chapter_data = []
        data = self.get_json(CHAPTER_LIST_URL % (self.novel_id, 1))
        chapter_data += data["data"]["list"]

        futures = []
        for page in range(2, data["data"]["totalPages"] + 1):
            url = CHAPTER_LIST_URL % (self.novel_id, page)
            futures.append(self.executor.submit(self.get_json, url))
        for f in futures:
            data = f.result()
            chapter_data += data["data"]["list"]

        # Keep only free, published chapters and group every 100 of
        # them into one volume (chapters 1-100 -> volume 1, and so on).
        for item in chapter_data:
            if item["paywallStatus"] != "free" or item["status"] != "published":
                continue
            chap_id = len(self.chapters) + 1
            vol_id = len(self.chapters) // 100 + 1
            if len(self.chapters) % 100 == 0:
                self.volumes.append({"id": vol_id})
            self.chapters.append(
                {
                    "id": chap_id,
                    "volume": vol_id,
                    "url": CHAPTER_CONTENT_URL % item["id"],
                    "title": "Chapter %d: %s" % (item["chapterOrder"], item["title"]),
                }
            )

    def download_chapter_body(self, chapter):
        # The API returns plain text with newline-separated paragraphs;
        # wrap each one in a <p> tag for the generated e-book.
        data = self.get_json(chapter["url"])
        contents = data["data"]["content"].split("\n")
        return "\n".join(["<p>" + x + "</p>" for x in contents])