
Commit

Remarks fixed
- URL transformation moved into parse_chapter_body
- Suitable values chosen for self.workers and time.sleep (see the throttling sketch below)
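
The values above feed the crawler's download throttling: a small worker pool plus a short pause around re-library requests. Below is a minimal sketch of that pattern, assuming plain requests and a hypothetical fetch_chapter helper; only workers=4 and the 2.5-second delay are taken from this commit, and the URLs are placeholders.

import time
from concurrent.futures import ThreadPoolExecutor

import requests

# Placeholder chapter URLs; the real crawler resolves these from the novel's index page.
chapter_urls = [
    "https://re-library.com/?page_id=101",
    "https://re-library.com/?page_id=102",
]

def fetch_chapter(url: str) -> str:
    # One GET per chapter, followed by a 2.5 s pause to stay under the site's rate limit.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    time.sleep(2.5)
    return resp.text

# workers=4 mirrors the value passed to init_executor in this commit.
with ThreadPoolExecutor(max_workers=4) as pool:
    bodies = list(pool.map(fetch_chapter, chapter_urls))

In the template itself the pool comes from init_executor and the delays sit inside parse_chapter_body, as the diff below shows.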
CryZFix committed Oct 30, 2023
1 parent 5e6d9f8 commit b040369
Showing 1 changed file with 18 additions and 14 deletions.
lncrawl/templates/novelupdates.py: 32 changes (18 additions, 14 deletions)
@@ -28,6 +28,11 @@ class NovelupdatesTemplate(SearchableBrowserTemplate, ChapterOnlyBrowserTemplate
     _cached_crawlers: Mapping[str, Crawler] = {}
     _title_matcher = re.compile(r"^(c|ch|chap|chapter)?[^\w\d]*(\d+)$", flags=re.I)

+    def initialize(self):
+        self.init_executor(
+            workers=4,
+        )
+
     def wait_for_cloudflare(self):
         if "cf_clearance" in self.cookies:
             return
@@ -46,18 +51,6 @@ def cleanup_prompts(self):
         except Exception:
             pass

-    def relibrary_url_transformation(self, response, chapter: Chapter) -> str:
-        soup = BeautifulSoup(response.text, "lxml")
-        post_url = soup.select("div > p > a")[-1]["href"]
-        if "page_id" in post_url:
-            return post_url
-        novel_url = f"https://re-library.com/translations/{post_url.split('/')[4:5][0]}"
-        time.sleep(5)
-        response = self.get_soup(novel_url)
-        chapters = response.select(".page_item > a")
-        time.sleep(5)
-        return chapters[chapter.id - 1]["href"]
-
     def select_search_items(self, query: str):
         query = dict(sf=1, sh=query, sort="srank", order="asc", rl=1, mrl="min")
         soup = self.get_soup(
@@ -150,8 +143,6 @@ def parse_chapter_item(self, tag: Tag, id: int) -> Chapter:
     def download_chapter_body_in_scraper(self, chapter: Chapter) -> None:
         response = self.get_response(chapter.url, allow_redirects=True)
         logger.info("%s => %s", chapter.url, response.url)
-        if "re-library" in response.url and "translations" not in response.url:
-            response.url = self.relibrary_url_transformation(response, chapter)
         chapter.url = response.url
         return self.parse_chapter_body(chapter, response.text)

@@ -170,6 +161,19 @@ def select_chapter_body(self, soup: BeautifulSoup) -> Tag:
         return super().select_chapter_body(soup)

     def parse_chapter_body(self, chapter: Chapter, text: str) -> str:
+        if "re-library" in chapter.url and "translations" not in chapter.url:
+            soup = self.get_soup(chapter.url)
+            post_url = soup.select(".entry-content > p[style*='center'] a")[-1]['href']
+            if "page_id" in post_url:
+                chapter.url = post_url
+            else:
+                time.sleep(2.5)
+                novel_url = f"https://re-library.com/translations/{post_url.split('/')[4:5][0]}"
+                response = self.get_soup(novel_url)
+                chapters = response.select(".page_item > a")
+                chapter.url = chapters[chapter.id - 1]["href"]
+                time.sleep(2.5)
+
         crawler = self._find_original_crawler(chapter)
         if hasattr(crawler, "download_chapter_body_in_scraper"):
             return crawler.download_chapter_body_in_scraper(chapter)

