
Commit

use Driver() instead of SB context manager to reduce impact on existing code
watzeedzad committed Apr 17, 2024
1 parent 5c47a8c commit ddc7306
Showing 8 changed files with 62 additions and 65 deletions.
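The theme of the commit: a seleniumbase `SB()` context manager starts a fresh browser on entry and quits it on exit, so wrapping every operation in `with SB(...)` pays repeated startup/teardown costs and forces an `sb` handle through every method signature. A `Driver()` instance is created once, reused across calls, and closed explicitly, which is why the `sb` parameters disappear throughout the diffs below. A minimal sketch of the two styles (the URL is a placeholder):

from seleniumbase import SB, Driver

# Before: the browser lives only inside the `with` block
with SB(uc=True, headless=True) as sb:
    sb.uc_open_with_reconnect("https://example.com/", reconnect_time=3)

# After: the caller owns the browser lifecycle
driver = Driver(uc=True, headless=True)
try:
    driver.uc_open_with_reconnect("https://example.com/", reconnect_time=3)
finally:
    driver.quit()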
2 changes: 1 addition & 1 deletion lncrawl/core/downloader.py
@@ -115,7 +115,7 @@ def _fetch_content_image(app, url, image_file: Path):
                 img.convert("RGBA").convert("RGB")
             else:
                 img = img.convert("RGB")
-            img.save(image_file.as_posix(), "JPEG", optimized=True)
+            img.save(image_file.as_posix(), "JPEG", optimized=True, quality=95)
             img.close()
             logger.debug("Saved image: %s", image_file)
         finally:
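The one-line change adds an explicit JPEG quality. For reference, a minimal Pillow save with the same options (note: Pillow's documented keyword is `optimize`; the `optimized` spelling kept here is not a recognized JPEG option and is silently ignored):

from PIL import Image

img = Image.open("cover.png").convert("RGB")
# Pillow's default JPEG quality is 75; quality=95 trades file size for fidelity
img.save("cover.jpg", "JPEG", optimize=True, quality=95)
img.close()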
61 changes: 29 additions & 32 deletions lncrawl/templates/browser/basic.py
@@ -4,7 +4,6 @@
 from typing import Generator, List, Optional
 
 from PIL import Image
-from seleniumbase import SB
 
 from ...core.browser import Browser, By
 from ...core.crawler import Crawler
@@ -85,18 +84,17 @@ def search_novel(self, query: str) -> List[SearchResult]:
             self.close_browser()
 
     def read_novel_info(self) -> None:
-        with SB(uc=True, test=True, headless=self.headless, headless2=self.headless, maximize=True) as sb:
-            try:
-                self.read_novel_info_in_scraper()
-            except ScraperErrorGroup as e:
-                if logger.isEnabledFor(logging.DEBUG):
-                    logger.exception("Failed in read novel info: %s", e)
-                self.init_browser()
-                self.volumes.clear()
-                self.chapters.clear()
-                self.read_novel_info_in_browser(sb)
-            finally:
-                self.close_browser()
+        try:
+            self.read_novel_info_in_scraper()
+        except ScraperErrorGroup as e:
+            if logger.isEnabledFor(logging.DEBUG):
+                logger.exception("Failed in read novel info: %s", e)
+            self.init_browser()
+            self.volumes.clear()
+            self.chapters.clear()
+            self.read_novel_info_in_browser()
+        finally:
+            self.close_browser()
 
     def download_chapters(
         self,
@@ -113,23 +111,22 @@ def download_chapters(
         if not self.browser:
             return
 
-        with SB(uc=True, test=True, headless=self.headless, headless2=self.headless, maximize=True) as sb:
-            for chapter in self.progress_bar(chapters, desc="Chapters", unit="item"):
-                if not isinstance(chapter, Chapter) or chapter.success:
-                    yield 1
-                    continue
-                try:
-                    chapter.body = self.download_chapter_body_in_browser(chapter, sb)
-                    self.extract_chapter_images(chapter)
-                    chapter.success = True
-                except Exception as e:
-                    logger.error("Failed to get chapter: %s", e)
-                    chapter.body = ""
-                    chapter.success = False
-                    if isinstance(e, KeyboardInterrupt):
-                        break
-                finally:
-                    yield 1
+        for chapter in self.progress_bar(chapters, desc="Chapters", unit="item"):
+            if not isinstance(chapter, Chapter) or chapter.success:
+                yield 1
+                continue
+            try:
+                chapter.body = self.download_chapter_body_in_browser(chapter)
+                self.extract_chapter_images(chapter)
+                chapter.success = True
+            except Exception as e:
+                logger.error("Failed to get chapter: %s", e)
+                chapter.body = ""
+                chapter.success = False
+                if isinstance(e, KeyboardInterrupt):
+                    break
+            finally:
+                yield 1
 
         self.close_browser()
 
@@ -162,7 +159,7 @@ def read_novel_info_in_scraper(self) -> None:
         raise FallbackToBrowser()
 
     @abstractmethod
-    def read_novel_info_in_browser(self, sb: SB = None) -> None:
+    def read_novel_info_in_browser(self) -> None:
         """Read novel info with `self.browser`"""
         raise NotImplementedError()
 
@@ -174,6 +171,6 @@ def download_chapter_body_in_scraper(self, chapter: Chapter) -> str:
         raise FallbackToBrowser()
 
     @abstractmethod
-    def download_chapter_body_in_browser(self, chapter: Chapter, sb: SB = None) -> str:
+    def download_chapter_body_in_browser(self, chapter: Chapter) -> str:
         """Download the chapter contents using the `self.browser`"""
         raise NotImplementedError()
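With the `sb` parameter gone, source crawlers implement the two abstract hooks against `self.browser` alone, and the template manages the lifecycle through `init_browser()` and `close_browser()`. A rough sketch of a conforming subclass, under stated assumptions: the URL and selectors are placeholders, and `self.browser.soup` plus the `wait()` call shape are assumed from how other files in this diff use the wrapper:

from lncrawl.core.browser import By
from lncrawl.models import Chapter
from lncrawl.templates.browser.basic import BasicBrowserTemplate


class ExampleCrawler(BasicBrowserTemplate):
    base_url = ["https://example.com/"]  # placeholder

    def read_novel_info_in_browser(self) -> None:
        self.visit(self.novel_url)  # navigates self.browser
        self.browser.wait(".post-title h1", By.CSS_SELECTOR)  # assumed wait signature
        self.novel_title = self.browser.soup.select_one(".post-title h1").text.strip()

    def download_chapter_body_in_browser(self, chapter: Chapter) -> str:
        self.visit(chapter["url"])
        self.browser.wait(".chapter-content", By.CSS_SELECTOR)
        body = self.browser.soup.select_one(".chapter-content")
        return self.cleaner.extract_contents(body)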
3 changes: 1 addition & 2 deletions lncrawl/templates/browser/general.py
@@ -2,7 +2,6 @@
 from typing import Generator, Union
 
 from bs4 import BeautifulSoup, Tag
-from seleniumbase import SB
 
 from ...core.exeptions import FallbackToBrowser
 from ...models import Chapter, Volume
@@ -89,7 +88,7 @@ def download_chapter_body_in_scraper(self, chapter: Chapter) -> str:
         body = self.select_chapter_body(soup)
         return self.parse_chapter_body(body)
 
-    def download_chapter_body_in_browser(self, chapter: Chapter, sb: SB = None) -> str:
+    def download_chapter_body_in_browser(self, chapter: Chapter) -> str:
         self.visit_chapter_page_in_browser(chapter)
         body = self.select_chapter_body_in_browser()
         return self.parse_chapter_body(body)
3 changes: 1 addition & 2 deletions lncrawl/templates/novelupdates.py
@@ -6,7 +6,6 @@
 
 from bs4 import BeautifulSoup, Tag
 from readability import Document
-from seleniumbase import SB
 
 from lncrawl.core.browser import EC
 from lncrawl.core.crawler import Crawler
@@ -147,7 +146,7 @@ def download_chapter_body_in_scraper(self, chapter: Chapter) -> None:
         chapter.url = response.url
         return self.parse_chapter_body(chapter, response.text)
 
-    def download_chapter_body_in_browser(self, chapter: Chapter, sb: SB = None) -> str:
+    def download_chapter_body_in_browser(self, chapter: Chapter) -> str:
         self.visit(chapter.url)
         for i in range(30):
             if not self.browser.current_url.startswith(chapter.url):
3 changes: 1 addition & 2 deletions sources/_examples/_09_basic_browser.py
@@ -11,7 +11,6 @@
 """
 import logging
 from typing import List
-from seleniumbase import SB
 
 from lncrawl.models.chapter import Chapter
 from lncrawl.models.search_result import SearchResult
@@ -103,7 +102,7 @@ def download_chapter_body_in_scraper(self, chapter: Chapter) -> str:
         pass
 
     # TODO: [REQUIRED] Download the chapter contents using the `self.browser`
-    def download_chapter_body_in_browser(self, chapter: Chapter, sb: SB = None) -> str:
+    def download_chapter_body_in_browser(self, chapter: Chapter) -> str:
         # Use the `chapter['url']` to get the chapter contents.
         # You can use `self.visit` to visit the chapter in browser tab.
         # There can be only one thread using the browser at a time.
3 changes: 1 addition & 2 deletions sources/en/w/webnovel.py
@@ -5,7 +5,6 @@
 from urllib.parse import urlencode, urlparse
 
 from bs4 import BeautifulSoup
-from seleniumbase import SB
 
 from lncrawl.core.exeptions import FallbackToBrowser
 from lncrawl.models import Chapter, SearchResult
@@ -137,7 +136,7 @@ def parse_chapter_catalog(self, soup: BeautifulSoup) -> None:
                 )
             self.chapters.append(chap)
 
-    def download_chapter_body_in_browser(self, chapter: Chapter, sb: SB = None) -> str:
+    def download_chapter_body_in_browser(self, chapter: Chapter) -> str:
         path = urlparse(chapter.url).path.strip("/")
         self.visit(f"{self.home_url}{path}")
         self.browser.wait(f"j_chapter_{chapter.cid}", By.CLASS_NAME)
3 changes: 1 addition & 2 deletions sources/en/w/wuxiacom.py
@@ -4,7 +4,6 @@
 import re
 
 from pyease_grpc import RpcSession
-from seleniumbase import SB
 
 from lncrawl.core.exeptions import FallbackToBrowser
 from lncrawl.models import Chapter, Volume
@@ -265,7 +264,7 @@ def read_novel_info_in_browser(self) -> None:
         # Close progress bar
         bar.close()
 
-    def download_chapter_body_in_browser(self, chapter: Chapter, sb: SB = None) -> str:
+    def download_chapter_body_in_browser(self, chapter: Chapter) -> str:
         # login
         if not self.start_download_chapter_body_in_browser:
             if self.bearer_token:
49 changes: 27 additions & 22 deletions sources/multi/foxaholic.py
@@ -1,8 +1,10 @@
 import logging
+import os
 from io import BytesIO
 
 from PIL import Image
-from seleniumbase import SB
+from bs4 import BeautifulSoup
+from seleniumbase import Driver
 
 from lncrawl.models import Chapter
 from lncrawl.templates.browser.basic import BasicBrowserTemplate
@@ -11,18 +13,21 @@
 
 
 def open_turnstile_page(base, url):
+    base.reconnect(timeout=3)
     # open web page using uc
-    base.driver.uc_open_with_reconnect(url, reconnect_time=5)
+    base.uc_open_with_reconnect(url, reconnect_time=3)
 
 
 def click_turnstile(base):
+    base.reconnect(timeout=3)
     # do turnstile challenge
     if base.is_element_visible('.captcha-prompt iframe'):
-        base.driver.switch_to_frame('.captcha-prompt iframe')
-        base.driver.uc_click('span.mark', reconnect_time=5)
+        base.switch_to_frame('.captcha-prompt iframe')
+        base.uc_click('span.mark', reconnect_time=3)
 
 
 class FoxaholicCrawler(BasicBrowserTemplate):
+    driver = None
     base_url = [
         "https://foxaholic.com/",
         "https://www.foxaholic.com/",
@@ -31,16 +36,17 @@ class FoxaholicCrawler(BasicBrowserTemplate):
     ]
 
     def initialize(self) -> None:
+        self.driver = Driver(uc=True, headless=self.headless, headless2=self.headless, chromium_arg='--start-maximized')
         self.init_executor(1)
 
-    def read_novel_info_in_browser(self, sb: SB = None) -> None:
-        open_turnstile_page(sb, self.novel_url)
-        click_turnstile(sb)
+    def read_novel_info_in_browser(self) -> None:
+        open_turnstile_page(self.driver, self.novel_url)
+        click_turnstile(self.driver)
 
         # verify that page is loaded
-        sb.assert_element('.wp-manga-chapter.free-chap a', timeout=60)
+        self.driver.assert_element('.wp-manga-chapter.free-chap a', timeout=60)
         # get bs4 from web page
-        soup = sb.get_beautiful_soup()
+        soup = BeautifulSoup(self.driver.get_page_source(), 'html.parser')
 
         self.novel_title = soup.select_one('.post-title h1').text.strip()
         logger.info("Novel title: %s", self.novel_title)
@@ -70,25 +76,24 @@ def read_novel_info_in_browser(self, sb: SB = None) -> None:
             }
         )
 
-    def download_chapter_body_in_browser(self, chapter: Chapter, sb: SB = None) -> str:
-        open_turnstile_page(sb, chapter['url'])
-        click_turnstile(sb)
+    def download_chapter_body_in_browser(self, chapter: Chapter) -> str:
+        open_turnstile_page(self.driver, chapter['url'])
+        click_turnstile(self.driver)
 
         # verify that page is loaded
-        sb.assert_element('.entry-content_wrap', timeout=60)
+        self.driver.assert_element('.entry-content_wrap', timeout=60)
         # get bs4 from web page
-        soup = sb.get_beautiful_soup()
+        soup = BeautifulSoup(self.driver.get_page_source(), 'html.parser')
 
         contents = soup.select_one('.entry-content_wrap')
         return self.cleaner.extract_contents(contents)
 
-    def download_image(self, url, **kwargs):
-        with SB(uc=True, test=True, headless=self.headless, headless2=self.headless, maximize=True) as sb:
-            open_turnstile_page(sb, url)
-            click_turnstile(sb)
+    def download_image(self, url, **kwargs) -> Image:
+        open_turnstile_page(self.driver, url)
+        click_turnstile(self.driver)
 
-            # verify that page is loaded
-            sb.assert_element('img', timeout=60)
+        # verify that page is loaded
+        self.driver.assert_element('img', timeout=60)
 
-            img = sb.find_element('img').screenshot_as_png
-            return Image.open(BytesIO(img))
+        img = self.driver.find_element('img').screenshot_as_png
+        return Image.open(BytesIO(img))
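Taken together, the Foxaholic changes replace per-call `SB()` sessions with one long-lived UC-mode driver stored on the crawler. In UC mode the driver deliberately detaches from the browser between actions to avoid bot detection, which is why both helpers now call `reconnect(timeout=3)` before touching the page. A rough usage sketch of the new flow (the URL is a placeholder; teardown is not shown in the hunks above, so the explicit `quit()` here is an assumption about cleanup):

from seleniumbase import Driver

driver = Driver(uc=True, headless=False)
try:
    open_turnstile_page(driver, "https://www.foxaholic.com/novel/example/")
    click_turnstile(driver)
    html = driver.get_page_source()  # parse with BeautifulSoup, as above
finally:
    driver.quit()  # assumed cleanup; not part of the diff

The reworked `download_image` is also worth noting: rather than re-fetching the image URL over HTTP, which would trip the Turnstile challenge again, it screenshots the rendered `img` element and rebuilds a PIL image from the PNG bytes, at the cost of re-encoding the original file.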
