-
-
Notifications
You must be signed in to change notification settings - Fork 300
/
foxteller.py
113 lines (95 loc) · 3.91 KB
/
foxteller.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# -*- coding: utf-8 -*-
import json
import base64
import logging
import re
from lncrawl.core.crawler import Crawler
logger = logging.getLogger(__name__)
search_url = "https://www.foxteller.com/search"
chapter_aux_url = "https://www.foxteller.com/aux_dem"
class FoxtellerCrawler(Crawler):
    """Crawler for novels hosted on https://www.foxteller.com/.

    Chapter bodies are served by an auxiliary JSON endpoint whose payload is
    base64 with a light letter-substitution obfuscation applied by the site's
    chapter JS; ``download_chapter_body`` reverses it.
    """

    base_url = "https://www.foxteller.com/"

    # NOTE: Disabled because it takes too long
    # def search_novel(self, query):
    #     self.get_response(self.home_url) # for cookies
    #     query = query.lower().replace(' ', '+')
    #     soup = self.post_soup(search_url, data=dict(query=query))
    #     results = []
    #     for a in soup.select('a[href*="/novel/"]'):
    #         results.append({
    #             'title': a.select_one('span .ellipsis-1').text.strip(),
    #             'url': self.absolute_url(a['href']),
    #             'info': a.select_one('span .text-brand').text.strip(),
    #         })
    #     # end for
    #     return results
    # # end def

    def read_novel_info(self):
        """Fetch the novel page and populate title, cover, volumes and chapters."""
        logger.debug("Visiting %s", self.novel_url)
        soup = self.get_soup(self.novel_url)

        possible_title = soup.select_one(".novel-title h2")
        assert possible_title, "No novel title"
        self.novel_title = possible_title.text.strip()
        logger.info("Novel title: %s", self.novel_title)

        possible_image = soup.select_one(".novel-featureimg img")
        if possible_image:
            self.novel_cover = self.absolute_url(possible_image["src"])
        logger.info("Novel cover: %s", self.novel_cover)

        # The site has no real volume structure; volumes are synthesized as
        # one per 100 chapters, keyed off the running chapter count.
        for card in soup.select("#myTabContent #accordion .card"):
            vol_id = len(self.chapters) // 100 + 1
            self.volumes.append({"id": vol_id})
            for a in card.select(".card-body a"):
                chap_id = len(self.chapters) + 1
                self.chapters.append(
                    {
                        "id": chap_id,
                        "volume": vol_id,
                        "title": a.text.strip(),
                        "url": self.absolute_url(a["href"]),
                    }
                )

    def download_chapter_body(self, chapter):
        """Download one chapter and return its cleaned HTML body.

        Returns None when the novel/chapter ids cannot be scraped from the
        chapter page's inline scripts.
        """
        soup = self.get_soup(chapter["url"])

        # The ids are embedded in an inline <head> script as quoted literals.
        novel_id = None
        chapter_id = None
        for script in soup.select("head script"):
            text = str(script)
            novel_ids = re.findall(r"'novel_id': '(\d+)'", text)
            chapter_ids = re.findall(r"'chapter_id': '(\d+)'", text)
            if novel_ids and chapter_ids:
                novel_id = novel_ids[0]
                chapter_id = chapter_ids[0]
                break

        if not (novel_id and chapter_id):
            # Previously failed silently; log so broken pages are diagnosable.
            logger.warning("No novel/chapter id found on %s", chapter["url"])
            return None

        payload = json.dumps(
            {
                "x1": novel_id,
                "x2": chapter_id,
            }
        )
        headers = {
            "Origin": self.home_url.strip("/"),
            "Referer": chapter["url"].strip("/"),
            "X-XSRF-TOKEN": self.cookies["XSRF-TOKEN"],
            "X-CSRF-TOKEN": soup.select_one('meta[name="csrf-token"]')["content"],
            "Accept": "application/json, text/plain, */*",
            "Content-Type": "application/json;charset=UTF-8",
        }
        logger.debug(
            "Request to %s with:\ndata = %s\nheaders=%s", chapter_aux_url, payload, headers
        )
        response = self.scraper.post(chapter_aux_url, data=payload, headers=headers)

        # Undo the site's letter-substitution, then base64-decode. The token
        # order (Ra->A, Rc->B, Rb->C, ...) mirrors the site's own JS.
        # From https://www.foxteller.com/js/chapter.js line 4397:4403
        # Search text for the code block: Cookies.get("gdb1")
        aux = response.json()["aux"]
        for token, letter in (
            ("%Ra&", "A"),
            ("%Rc&", "B"),
            ("%Rb&", "C"),
            ("%Rd&", "D"),
            ("%Rf&", "E"),
            ("%Re&", "F"),
        ):
            # Tokens are plain literals, so str.replace beats re.sub here.
            aux = aux.replace(token, letter)
        aux = base64.b64decode(aux)

        content = self.make_soup(aux)
        content = "\n".join(str(p) for p in content.select("p"))
        return self.clean_text(content)