From ebda6f58bac5b8593fe1e2b05053262a6a76de97 Mon Sep 17 00:00:00 2001
From: CryZFix
Date: Sun, 22 Oct 2023 01:47:51 +0400
Subject: [PATCH 1/7] Fix source [Novelupdates]

> novelupdates.py
- Added detection of the teaser page and a redirect to the real chapter URL.
  Crawling is slightly slower now, to avoid HTTP ERROR 503.

> relibrary.py
- Added a tag to the cleaner so book metadata is not collected into the
  chapter text.
---
 lncrawl/templates/novelupdates.py | 14 ++++++++++++++
 sources/en/r/relibrary.py         |  1 +
 2 files changed, 15 insertions(+)

diff --git a/lncrawl/templates/novelupdates.py b/lncrawl/templates/novelupdates.py
index 8474860ef..465ad9c4a 100644
--- a/lncrawl/templates/novelupdates.py
+++ b/lncrawl/templates/novelupdates.py
@@ -46,6 +46,18 @@ def cleanup_prompts(self):
         except Exception:
             pass
 
+    def relibrary_url_transformation(self, response, chapter: Chapter) -> str:
+        soup = BeautifulSoup(response.text, "lxml")
+        post_url = soup.select("div > p > a")[-1]["href"]
+        if "page_id" in post_url:
+            return post_url
+        novel_url = f"https://re-library.com/translations/{post_url.split('/')[4:5][0]}"
+        time.sleep(5)
+        response = self.get_soup(novel_url)
+        chapters = response.select(".page_item > a")
+        time.sleep(5)
+        return chapters[chapter.id - 1]["href"]
+
     def select_search_items(self, query: str):
         query = dict(sf=1, sh=query, sort="srank", order="asc", rl=1, mrl="min")
         soup = self.get_soup(
@@ -138,6 +150,8 @@ def parse_chapter_item(self, tag: Tag, id: int) -> Chapter:
     def download_chapter_body_in_scraper(self, chapter: Chapter) -> None:
         response = self.get_response(chapter.url, allow_redirects=True)
         logger.info("%s => %s", chapter.url, response.url)
+        if "re-library" in response.url and "translations" not in response.url:
+            response.url = self.relibrary_url_transformation(response, chapter)
         chapter.url = response.url
         return self.parse_chapter_body(chapter, response.text)
 
diff --git a/sources/en/r/relibrary.py b/sources/en/r/relibrary.py
index ad6f174b2..30ef1bd9b 100644
--- a/sources/en/r/relibrary.py
+++ b/sources/en/r/relibrary.py
@@ -18,6 +18,7 @@ def initialize(self) -> None:
         self.init_executor(1)
         self.cleaner.bad_css.update(
             [
+                "tr",
                 ".nextPageLink",
                 ".prevPageLink",
                 ".su-button",

From 0a116efe5f96e6c80cd4a0c4411812df6b8f698b Mon Sep 17 00:00:00 2001
From: CryZFix
Date: Sun, 22 Oct 2023 02:10:22 +0400
Subject: [PATCH 2/7] lint flake8

---
 sources/en/d/daotranslate.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sources/en/d/daotranslate.py b/sources/en/d/daotranslate.py
index 4b42680f4..8561596c4 100644
--- a/sources/en/d/daotranslate.py
+++ b/sources/en/d/daotranslate.py
@@ -11,7 +11,7 @@
 
 class DaoTranslateCrawler(Crawler):
     base_url = "https://daotranslate.com/"
-    has_mtl= True
+    has_mtl = True
 
     def initialize(self):
         self.init_executor(ratelimit=1.1)
@@ -57,7 +57,6 @@ def read_novel_info(self):
             self.novel_cover = possible_image["data-src"]
         logger.info("Novel cover: %s", self.novel_cover)
 
-
         possible_author = soup.select_one(
             ".info-content .spe span:nth-child(3) a"
         )

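The teaser detection added in PATCH 1/7 keys entirely off the final URL after
the Novelupdates redirect. A minimal standalone sketch of that rule (the
is_relibrary_teaser name and the example URLs are illustrative assumptions,
not taken from the patch):

    # A re-library URL that lacks "translations" is a teaser page rather
    # than a real chapter, and must be transformed before parsing.
    def is_relibrary_teaser(final_url: str) -> bool:
        return "re-library" in final_url and "translations" not in final_url

    # Hypothetical example URLs:
    assert is_relibrary_teaser("https://re-library.com/some-teaser-post/")
    assert not is_relibrary_teaser("https://re-library.com/translations/novel/chapter-1/")
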
From 5e6d9f84c544bc7a32c7378838de64ae34b896be Mon Sep 17 00:00:00 2001
From: CryZFix
Date: Fri, 27 Oct 2023 17:45:50 +0400
Subject: [PATCH 3/7] Added new sources

---
 sources/zh/shw5.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++
 sources/zh/trxs.py | 55 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644 sources/zh/shw5.py
 create mode 100644 sources/zh/trxs.py

diff --git a/sources/zh/shw5.py b/sources/zh/shw5.py
new file mode 100644
index 000000000..160d2f579
--- /dev/null
+++ b/sources/zh/shw5.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+import logging
+
+from lncrawl.core.crawler import Crawler
+
+logger = logging.getLogger(__name__)
+
+
+class Shw5Crawler(Crawler):
+    base_url = [
+        "https://www.shw5.cc/",
+        "https://www.bq99.cc/",
+        "https://www.p2wt.com/",
+    ]
+
+    def read_novel_info(self):
+        logger.debug("Visiting %s", self.novel_url)
+        soup = self.get_soup(self.novel_url)
+
+        possible_title = soup.select_one(".book h1")
+        assert possible_title, "No novel title"
+        self.novel_title = possible_title.text
+        logger.info("Novel title: %s", self.novel_title)
+
+        possible_novel_cover = soup.select_one('.book img')
+        if possible_novel_cover:
+            self.novel_cover = self.absolute_url(possible_novel_cover["src"])
+        logger.info("Novel cover: %s", self.novel_cover)
+
+        possible_synopsis = soup.select_one('.intro dd')
+        if possible_synopsis:
+            self.novel_synopsis = possible_synopsis.text
+        logger.info("Novel synopsis: %s", self.novel_synopsis)
+
+        possible_novel_author = soup.select_one('.book .small span')
+        if possible_novel_author:
+            self.novel_author = possible_novel_author.text
+        logger.info("Novel author: %s", self.novel_author)
+
+        volumes = set([])
+        chapters = soup.select_one('.listmain')
+        for a in chapters.find_all("a", rel=False):
+            ch_id = len(self.chapters) + 1
+            vol_id = 1 + len(self.chapters) // 100
+            volumes.add(vol_id)
+            self.chapters.append(
+                {
+                    "id": ch_id,
+                    "volume": vol_id,
+                    "title": a.text,
+                    "url": self.absolute_url(a["href"]),
+                }
+            )
+
+        self.volumes = [{"id": x, "title": ""} for x in volumes]
+
+    def download_chapter_body(self, chapter):
+        soup = self.get_soup(chapter["url"])
+        contents = soup.select_one("#chaptercontent")
+        return self.cleaner.extract_contents(contents)
diff --git a/sources/zh/trxs.py b/sources/zh/trxs.py
new file mode 100644
index 000000000..40a82c7c4
--- /dev/null
+++ b/sources/zh/trxs.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+import logging
+
+from lncrawl.core.crawler import Crawler
+
+logger = logging.getLogger(__name__)
+
+
+class TrxsCrawler(Crawler):
+    base_url = "https://trxs.cc/"
+
+    def read_novel_info(self):
+        logger.debug("Visiting %s", self.novel_url)
+        soup = self.get_soup(self.novel_url, encoding='gb2312')
+
+        possible_title = soup.select_one(".book_info h1")
+        assert possible_title, "No novel title"
+        self.novel_title = possible_title.text
+        logger.info("Novel title: %s", self.novel_title)
+
+        possible_novel_cover = soup.select_one('.book_info img')
+        if possible_novel_cover:
+            self.novel_cover = self.absolute_url(possible_novel_cover["src"])
+        logger.info("Novel cover: %s", self.novel_cover)
+
+        possible_synopsis = soup.select_one('.book_info p')
+        if possible_synopsis:
+            self.novel_synopsis = possible_synopsis.text
+        logger.info("Novel synopsis: %s", self.novel_synopsis)
+
+        possible_novel_author = soup.select_one('.book_info a')
+        if possible_novel_author:
+            self.novel_author = possible_novel_author.text
+        logger.info("Novel author: %s", self.novel_author)
+
+        volumes = set([])
+        for a in soup.select(".book_list a"):
+            ch_id = len(self.chapters) + 1
+            vol_id = 1 + len(self.chapters) // 100
+            volumes.add(vol_id)
+            self.chapters.append(
+                {
+                    "id": ch_id,
+                    "volume": vol_id,
+                    "title": a.text,
+                    "url": self.absolute_url(a["href"]),
+                }
+            )
+
+        self.volumes = [{"id": x, "title": ""} for x in volumes]
+
+    def download_chapter_body(self, chapter):
+        soup = self.get_soup(chapter["url"], encoding='gb2312')
+        contents = soup.select_one(".read_chapterDetail")
+        return self.cleaner.extract_contents(contents)

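Both new sources batch chapters into synthetic volumes of 100 with
vol_id = 1 + len(self.chapters) // 100. A quick standalone check of that
arithmetic:

    # len(self.chapters) is the count *before* appending, so chapter ids
    # are 1-based and every block of 100 chapters lands in one volume.
    for appended in (0, 99, 100, 199):
        ch_id = appended + 1
        vol_id = 1 + appended // 100
        print(ch_id, "->", vol_id)  # 1 -> 1, 100 -> 1, 101 -> 2, 200 -> 2
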
soup.select_one(".read_chapterDetail") + return self.cleaner.extract_contents(contents) From b040369cf674164b890248f0497116a33941413b Mon Sep 17 00:00:00 2001 From: CryZFix Date: Mon, 30 Oct 2023 14:40:19 +0400 Subject: [PATCH 4/7] Remarks fixed - Url transformation moved to func parse_chapter_body - The optimal values of self.workers and time.sleep are selected --- lncrawl/templates/novelupdates.py | 32 +++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/lncrawl/templates/novelupdates.py b/lncrawl/templates/novelupdates.py index 465ad9c4a..814bdf36c 100644 --- a/lncrawl/templates/novelupdates.py +++ b/lncrawl/templates/novelupdates.py @@ -28,6 +28,11 @@ class NovelupdatesTemplate(SearchableBrowserTemplate, ChapterOnlyBrowserTemplate _cached_crawlers: Mapping[str, Crawler] = {} _title_matcher = re.compile(r"^(c|ch|chap|chapter)?[^\w\d]*(\d+)$", flags=re.I) + def initialize(self): + self.init_executor( + workers=4, + ) + def wait_for_cloudflare(self): if "cf_clearance" in self.cookies: return @@ -46,18 +51,6 @@ def cleanup_prompts(self): except Exception: pass - def relibrary_url_transformation(self, response, chapter: Chapter) -> str: - soup = BeautifulSoup(response.text, "lxml") - post_url = soup.select("div > p > a")[-1]["href"] - if "page_id" in post_url: - return post_url - novel_url = f"https://re-library.com/translations/{post_url.split('/')[4:5][0]}" - time.sleep(5) - response = self.get_soup(novel_url) - chapters = response.select(".page_item > a") - time.sleep(5) - return chapters[chapter.id - 1]["href"] - def select_search_items(self, query: str): query = dict(sf=1, sh=query, sort="srank", order="asc", rl=1, mrl="min") soup = self.get_soup( @@ -150,8 +143,6 @@ def parse_chapter_item(self, tag: Tag, id: int) -> Chapter: def download_chapter_body_in_scraper(self, chapter: Chapter) -> None: response = self.get_response(chapter.url, allow_redirects=True) logger.info("%s => %s", chapter.url, response.url) - if "re-library" in response.url and "translations" not in response.url: - response.url = self.relibrary_url_transformation(response, chapter) chapter.url = response.url return self.parse_chapter_body(chapter, response.text) @@ -170,6 +161,19 @@ def select_chapter_body(self, soup: BeautifulSoup) -> Tag: return super().select_chapter_body(soup) def parse_chapter_body(self, chapter: Chapter, text: str) -> str: + if "re-library" in chapter.url and "translations" not in chapter.url: + soup = self.get_soup(chapter.url) + post_url = soup.select(".entry-content > p[style*='center'] a")[-1]['href'] + if "page_id" in post_url: + chapter.url = post_url + else: + time.sleep(2.5) + novel_url = f"https://re-library.com/translations/{post_url.split('/')[4:5][0]}" + response = self.get_soup(novel_url) + chapters = response.select(".page_item > a") + chapter.url = chapters[chapter.id - 1]["href"] + time.sleep(2.5) + crawler = self._find_original_crawler(chapter) if hasattr(crawler, "download_chapter_body_in_scraper"): return crawler.download_chapter_body_in_scraper(chapter) From a80006cb94363c48a0613e8c5eb51322ab2b75f3 Mon Sep 17 00:00:00 2001 From: CryZFix Date: Mon, 30 Oct 2023 14:50:24 +0400 Subject: [PATCH 5/7] fixed the capture of the ulrs element --- lncrawl/templates/novelupdates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lncrawl/templates/novelupdates.py b/lncrawl/templates/novelupdates.py index 814bdf36c..1a7a90fa5 100644 --- a/lncrawl/templates/novelupdates.py +++ b/lncrawl/templates/novelupdates.py @@ -163,7 +163,7 @@ 
From a643d5bc3d9fbecae8dfa9a0dc9f4a005ddf1121 Mon Sep 17 00:00:00 2001
From: CryZFix
Date: Tue, 31 Oct 2023 01:49:47 +0400
Subject: [PATCH 6/7] Changed login method

- A captcha was added to the login page, so the previous method stopped
  working. The token now has to be specified manually, by analogy with
  wuxiaworld.
---
 sources/en/m/mtlnation.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/sources/en/m/mtlnation.py b/sources/en/m/mtlnation.py
index 73e4692d8..c737e5866 100644
--- a/sources/en/m/mtlnation.py
+++ b/sources/en/m/mtlnation.py
@@ -19,18 +19,9 @@ def initialize(self):
         self.init_executor(1)
 
     def login(self, email: str, password: str) -> None:
-        self.post_json(
-            "https://api.mtlnation.com/api/v2/accounts/login",
-            data=json.dumps(
-                {
-                    "identity": email,
-                    "password": password,
-                }
-            ),
-        )
-        jwt = self.cookies.get("jwt")
-        self.set_header("authorization", f"JWT {jwt}")
-        logger.info("Logged in with jwt %s", jwt)
+        self.set_header("Authorization", f"{email} {password}")
+        response = self.get_json("https://api.mtlnation.com/api/v2/users/me")
+        logger.info("Logged in as %s" % response["data"]["name"])
 
     def search_novel(self, query):
         data = self.get_json(

From 2ba48aa1f7263bb4cece17b5aaaff8df145368f1 Mon Sep 17 00:00:00 2001
From: CryZFix
Date: Tue, 31 Oct 2023 01:57:07 +0400
Subject: [PATCH 7/7] remove unused imports

---
 sources/en/m/mtlnation.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sources/en/m/mtlnation.py b/sources/en/m/mtlnation.py
index c737e5866..1808efbed 100644
--- a/sources/en/m/mtlnation.py
+++ b/sources/en/m/mtlnation.py
@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-import json
 import logging
 from urllib.parse import urlencode, urlparse

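With the login change in PATCH 6/7, the email and password prompts now carry
the authorization scheme and the raw token. A hypothetical invocation; the
import path mirrors this repository's layout, while the class name, the JWT
scheme, and the token value are assumptions rather than values confirmed by
the patch:

    # Copy the real Authorization header value from the browser's dev
    # tools after logging in on the website, then split it into the two
    # login fields.
    from sources.en.m.mtlnation import MTLNationCrawler

    crawler = MTLNationCrawler()
    crawler.login("JWT", "<token copied from the browser>")
    # login() joins the two fields into "Authorization: JWT <token>" and
    # verifies it by fetching https://api.mtlnation.com/api/v2/users/me
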