diff --git a/lncrawl/core/crawler.py b/lncrawl/core/crawler.py index cfa3a2de4..fb8cb77e7 100644 --- a/lncrawl/core/crawler.py +++ b/lncrawl/core/crawler.py @@ -117,18 +117,19 @@ def extract_chapter_images(self, chapter: Chapter) -> None: if not chapter.body: return + has_changes = False chapter.setdefault("images", {}) soup = self.make_soup(chapter.body) for img in soup.select("img[src]"): full_url = self.absolute_url(img["src"], page_url=chapter["url"]) if not full_url.startswith("http"): continue - filename = hashlib.md5(full_url.encode()).hexdigest() + ".jpg" img.attrs = {"src": "images/" + filename, "alt": filename} chapter.images[filename] = full_url + has_changes = True - if soup.find("body") is not None: + if has_changes: chapter.body = soup.find("body").decode_contents() def download_chapters( diff --git a/sources/jp/s/syosetu.py b/sources/jp/s/syosetu.py index e69730877..a38cbf424 100644 --- a/sources/jp/s/syosetu.py +++ b/sources/jp/s/syosetu.py @@ -4,9 +4,7 @@ from lncrawl.core.crawler import Crawler logger = logging.getLogger(__name__) -search_url = ( - "https://yomou.syosetu.com/search.php?word=%s" -) +search_url = "https://yomou.syosetu.com/search.php?word=%s" class SyosetuCrawler(Crawler): @@ -18,8 +16,12 @@ def search_novel(self, query): results = [] for tab in soup.select(".searchkekka_box"): a = tab.select_one(".novel_h a") - latest = tab.select_one(".left").get_text(separator=" ").strip() # e.g.: 連載中 (全604部分) - votes = tab.select_one(".attention").text.strip() # e.g.: "総合ポイント: 625,717 pt" + latest = ( + tab.select_one(".left").get_text(separator=" ").strip() + ) # e.g.: 連載中 (全604部分) + votes = tab.select_one( + ".attention" + ).text.strip() # e.g.: "総合ポイント: 625,717 pt" results.append( { "title": a.text.strip(), @@ -30,9 +32,11 @@ def search_novel(self, query): return results def read_novel_info(self): + self.init_parser('xml') soup = self.get_soup(self.novel_url) self.novel_title = soup.select_one(".novel_title").text.strip() + logger.debug('Novel title: %s', self.novel_title) # No novel cover. @@ -41,29 +45,28 @@ def read_novel_info(self): self.novel_author = author_tag.text.strip() # Syosetu calls parts "chapters" - volume_id = 0 if len(soup.select(".chapter_title")) != 0 else 1 - chapter_id = 1 - for tag in soup.select(".chapter_title, .subtitle a"): - if tag.name == "a": + chapter_id = 0 + volume = {"id": 0} + self.volumes.append(volume) + for tag in soup.select(".index_box .chapter_title, .index_box .subtitle a"): + if 'chapter_title' in tag.attrs.get('class', ''): + # Part/volume (there might be none) + volume = { + "id": volume['id'] + 1, + "title": tag.text.strip(), + } + self.volumes.append(volume) + elif tag.name == "a": # Chapter + chapter_id += 1 self.chapters.append( { "id": chapter_id, - "volume": volume_id, + "volume": volume['id'], "title": tag.text.strip() or ("Chapter %d" % chapter_id), "url": self.absolute_url(tag["href"]), } ) - chapter_id += 1 - elif tag.name == "div": - # Part/volume (there might be none) - self.volumes.append( - { - "id": volume_id, - "title": tag.text.strip(), - } - ) - volume_id += 1 def download_chapter_body(self, chapter): soup = self.get_soup(chapter["url"])