Skip to content

Commit

Permalink
Adding WonderfulSoup.force_scrape_one
Browse files Browse the repository at this point in the history
  • Loading branch information
Yomguithereal committed Sep 6, 2024
1 parent 5941cfa commit 6ba46a1
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 10 deletions.
3 changes: 2 additions & 1 deletion minet/scrape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Module exposing utilities related to minet's scraping DSL.
#
from minet.scrape.classes.definition import scrape, DefinitionScraper, validate
from minet.scrape.soup import WonderfulSoup, SelectionError
from minet.scrape.soup import WonderfulSoup, SelectionError, ExtractionError
from minet.scrape.regex import (
extract_encodings_from_xml,
extract_canonical_link,
Expand All @@ -19,6 +19,7 @@
"validate",
"WonderfulSoup",
"SelectionError",
"ExtractionError",
"extract_encodings_from_xml",
"extract_canonical_link",
"extract_javascript_relocation",
Expand Down
26 changes: 19 additions & 7 deletions minet/scrape/soup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ class SelectionError(Exception):
pass


class ExtractionError(Exception):
    """Signal that a selected element yielded no value for a target.

    Raised with the requested target as its sole argument when
    extraction returns ``None``.
    """


def extract(elem: "MinetTag", target: Optional[str]) -> Optional[str]:
if target is None or target == "text":
return elem.get_text()
Expand Down Expand Up @@ -52,15 +56,23 @@ def select(self, css: str, *args, **kwargs) -> List["MinetTag"]:

return cast(List["MinetTag"], super().select(css, *args, **kwargs))

def scrape_one(
self, css: str, target: Optional[str] = None, strict: bool = False
) -> Optional[str]:
def force_scrape_one(self, css: str, target: Optional[str] = None) -> str:
    """Scrape a single value, raising instead of ever returning ``None``.

    The element is looked up with ``self.select_one(css)`` and the value
    is then obtained by calling ``extract(elem, target)``.

    Args:
        css: CSS selector handed to ``select_one``.
        target: What to extract from the element; forwarded to ``extract``.

    Returns:
        The extracted string.

    Raises:
        SelectionError: If ``select_one`` returns ``None`` for ``css``.
        ExtractionError: If ``extract`` returns ``None`` for ``target``.
    """
    elem = self.select_one(css)

    # This method has no `strict` flag: a missing element is always an
    # error, so raise unconditionally.
    if elem is None:
        raise SelectionError(css)

    value = extract(elem, target)

    if value is None:
        raise ExtractionError(target)

    return value

def scrape_one(self, css: str, target: Optional[str] = None) -> Optional[str]:
    """Scrape a single value, returning ``None`` when nothing is selected.

    Args:
        css: CSS selector handed to ``select_one``.
        target: What to extract from the element; forwarded to ``extract``.

    Returns:
        ``None`` if ``select_one`` finds no element, otherwise whatever
        ``extract`` returns for that element and ``target``.
    """
    node = self.select_one(css)
    return None if node is None else extract(node, target)
Expand Down Expand Up @@ -130,10 +142,10 @@ def __init__(
super().__init__(
markup,
features,
element_classes=WONDERFUL_ELEMENT_CLASSES,
element_classes=WONDERFUL_ELEMENT_CLASSES, # type: ignore
parse_only=parse_only,
multi_valued_attributes=None,
) # type: ignore
)


@contextmanager
Expand Down
7 changes: 5 additions & 2 deletions test/soup_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from minet.scrape.soup import WonderfulSoup, SelectionError
from minet.scrape.soup import WonderfulSoup, SelectionError, ExtractionError

HTML = """
<div>
Expand Down Expand Up @@ -50,7 +50,10 @@ def test_scrape_one(self):
assert soup.scrape_one("link") is None

with pytest.raises(SelectionError):
assert soup.scrape_one("link", strict=True)
assert soup.force_scrape_one("link")

with pytest.raises(ExtractionError):
assert soup.force_scrape_one("h1", "blabla")

def test_get(self):
soup = WonderfulSoup('<div class=" a b ">Ok</div>')
Expand Down

0 comments on commit 6ba46a1

Please sign in to comment.