Skip to content

Commit 110b64c

Browse files
committed
Splitting named scrapers into their own files
1 parent cf5be7d commit 110b64c

File tree

10 files changed

+373
-325
lines changed

10 files changed

+373
-325
lines changed

minet/scrape/classes/named.py

-325
This file was deleted (its 325 lines were split out into the new per-scraper modules below).
+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from typing import Dict, Type
2+
from .types import NamedScraper
3+
4+
from .canonical import CanonicalScraper
5+
from .europresse import EuropresseScraper
6+
from .images import ImagesScraper
7+
from .metas import MetasScraper
8+
from .rss import RssScraper
9+
from .title import TitleScraper
10+
from .urls import UrlsScraper
11+
12+
# Registry of built-in scrapers, keyed by each scraper class's `name`
# attribute, so callers can look a scraper up by its string identifier.
NAMED_SCRAPERS: Dict[str, Type[NamedScraper]] = {
    scraper_class.name: scraper_class
    for scraper_class in (
        TitleScraper,
        CanonicalScraper,
        UrlsScraper,
        ImagesScraper,
        MetasScraper,
        RssScraper,
        EuropresseScraper,
    )
}
+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from typing import Any, cast
2+
3+
from bs4 import SoupStrainer, BeautifulSoup
4+
5+
from .types import NamedScraper
6+
7+
8+
class CanonicalScraper(NamedScraper):
    """Named scraper extracting a page's canonical url from its
    ``<link rel="canonical">`` tag, if any."""

    name = "canonical"
    fieldnames = ["canonical_url"]
    plural = False
    output_type = "scalar"
    # Restrict parsing to the relevant <link> tags for speed.
    strainer = SoupStrainer(name="link", attrs={"rel": "canonical"})

    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
        """Return the stripped canonical url, or None when the tag is
        absent, has no href, or the href is blank."""

        element = soup.select_one("link[rel=canonical][href]")

        if element is None:
            return None

        href = element.get("href")

        if href is None:
            return None

        # An all-whitespace href is treated the same as a missing one.
        return cast(str, href).strip() or None

0 commit comments

Comments
 (0)