Skip to content

Commit

Permalink
Add search for piaotian
Browse files Browse the repository at this point in the history
  • Loading branch information
Zokhoi authored and dipu-bd committed Sep 11, 2024
1 parent c0e4a4e commit 68ac8f0
Showing 1 changed file with 66 additions and 1 deletion.
67 changes: 66 additions & 1 deletion sources/zh/piaotian.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,28 @@
import logging

from lncrawl.core.crawler import Crawler
import urllib.parse

# Default browser-like request headers sent with the search form POST.
# "Origin" and "Referer" hold defaults for www.piaotia.com.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:101.0) Gecko/20100101 Firefox/101.0",
    # Assembled from its individual media ranges; the joined value is a
    # single comma-separated string with no spaces.
    "Accept": ",".join(
        [
            "text/html",
            "application/xhtml+xml",
            "application/xml;q=0.9",
            "image/avif",
            "image/webp",
            "image/apng",
            "*/*;q=0.8",
            "application/signed-exchange;v=b3;q=0.7",
        ]
    ),
    "Accept-Encoding": "gzip, deflate, utf-8",
    "Accept-Language": "en-US,en;q=0.9,de-CH;q=0.8,de;q=0.7",
    "Cache-Control": "no-cache",
    # The search endpoint is fed a url-encoded form body.
    "Content-Type": "application/x-www-form-urlencoded",
    "Origin": "https://www.piaotia.com",
    "DNT": "1",
    "Referer": "https://www.piaotia.com/modules/article/search.php",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    # Client-hint and fetch-metadata headers.
    "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Opera GX";v="106"',
    "Sec-Ch-Ua-Platform": "Windows",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
}

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

Expand All @@ -13,9 +35,52 @@ class PiaoTian(Crawler):
# Site mirrors handled by this crawler. Every entry needs its own trailing
# comma: adjacent string literals without one are silently concatenated by
# Python into a single, invalid URL.
base_url = [
    "https://www.piaotian.com",
    "https://www.ptwxz.com",
    "https://www.piaotia.com",
]

def search_novel(self, query):
    """Search the site for novels by title.

    The query is GBK-encoded, percent-quoted and POSTed as a url-encoded
    form body to the search endpoint under ``self.home_url``. The response
    is parsed as GBK.

    Args:
        query: The title text to search for.

    Returns:
        A list of dicts with the keys ``"title"``, ``"url"`` and ``"info"``
        (``"info"`` is ``"Author: <name>"``). The list is empty when nothing
        usable is found.

    Raises:
        UnicodeEncodeError: If ``query`` contains characters that cannot be
            encoded as GBK.
    """
    encoded_query = urllib.parse.quote(query.encode("gbk"))
    submit_label = urllib.parse.quote(" 搜 索 ".encode("gbk"))
    form_body = f"searchtype=articlename&searchkey={encoded_query}&Submit={submit_label}"
    search_url = novel_search_url % self.home_url

    # Build a per-request copy instead of writing into the shared
    # module-level dict, so one call cannot leak its Origin/Referer into
    # another crawler instance using a different mirror.
    request_headers = {
        **headers,
        "Origin": self.home_url,
        "Referer": search_url,
    }

    response = self.post_response(
        search_url,
        headers=request_headers,
        data=form_body,
    )
    soup = self.make_soup(response, "gbk")

    # If there is only one result, the search page redirects to the
    # bookinfo page of that result.
    if response.url.startswith("%sbookinfo/" % self.home_url):
        title_tag = soup.select_one("div#content table table table h1")
        if title_tag is None:
            # Page layout not recognised; report no results instead of crashing.
            return []

        info_cells = soup.select("div#content table tr td[width]")
        author = ""
        if len(info_cells) > 2:
            author = info_cells[2].get_text()
            author = author.replace("\xa0", "").replace("作 者:", "")

        return [
            {
                "title": title_tag.get_text(),
                "url": response.url,
                "info": f"Author: {author}",
            }
        ]

    results = []
    # The first row of the result table is skipped (header row).
    for row in soup.select("div#content table tr")[1:]:
        link = row.select_one("td a")
        cells = row.select("td")
        # Skip rows that are not result entries (no link, no href, or too
        # few cells) rather than aborting the whole search.
        if link is None or not link.get("href") or len(cells) < 3:
            continue

        results.append(
            {
                "title": link.get_text(),
                "url": link["href"],
                "info": f"Author: {cells[2].get_text()}",
            }
        )
    return results

def read_novel_info(self):
# Transform bookinfo page into chapter list page
# https://www.piaotia.com/bookinfo/8/8866.html -> https://www.piaotia.com/html/8/8866/
Expand Down

0 comments on commit 68ac8f0

Please sign in to comment.