diff --git a/minet/scrape/__init__.py b/minet/scrape/__init__.py
index 0c04b47eeb..cd87e95129 100644
--- a/minet/scrape/__init__.py
+++ b/minet/scrape/__init__.py
@@ -5,7 +5,7 @@
 # Module exposing utilities related to minet's scraping DSL.
 #
 from minet.scrape.classes.definition import scrape, DefinitionScraper, validate
-from minet.scrape.soup import WonderfulSoup, SelectionError
+from minet.scrape.soup import WonderfulSoup, SelectionError, ExtractionError
 from minet.scrape.regex import (
     extract_encodings_from_xml,
     extract_canonical_link,
@@ -19,6 +19,7 @@
     "validate",
     "WonderfulSoup",
     "SelectionError",
+    "ExtractionError",
     "extract_encodings_from_xml",
     "extract_canonical_link",
     "extract_javascript_relocation",
diff --git a/minet/scrape/soup.py b/minet/scrape/soup.py
index 420ee25f0a..d035495719 100644
--- a/minet/scrape/soup.py
+++ b/minet/scrape/soup.py
@@ -19,6 +19,10 @@ class SelectionError(Exception):
     pass
 
 
+class ExtractionError(Exception):
+    pass
+
+
 def extract(elem: "MinetTag", target: Optional[str]) -> Optional[str]:
     if target is None or target == "text":
         return elem.get_text()
@@ -52,15 +56,23 @@
     def select(self, css: str, *args, **kwargs) -> List["MinetTag"]:
         return cast(List["MinetTag"], super().select(css, *args, **kwargs))
 
-    def scrape_one(
-        self, css: str, target: Optional[str] = None, strict: bool = False
-    ) -> Optional[str]:
+    def force_scrape_one(self, css: str, target: Optional[str] = None) -> str:
         elem = self.select_one(css)
 
         if elem is None:
-            if strict:
-                raise SelectionError(css)
+            raise SelectionError(css)
+
+        value = extract(elem, target)
 
+        if value is None:
+            raise ExtractionError(target)
+
+        return value
+
+    def scrape_one(self, css: str, target: Optional[str] = None) -> Optional[str]:
+        elem = self.select_one(css)
+
+        if elem is None:
             return None
 
         return extract(elem, target)
@@ -130,10 +142,10 @@ def __init__(
         super().__init__(
             markup,
             features,
-            element_classes=WONDERFUL_ELEMENT_CLASSES,
+            element_classes=WONDERFUL_ELEMENT_CLASSES,  # type: ignore
             parse_only=parse_only,
             multi_valued_attributes=None,
-        )  # type: ignore
+        )
 
 
 @contextmanager
diff --git a/test/soup_test.py b/test/soup_test.py
index c4b3132301..2c05154f1d 100644
--- a/test/soup_test.py
+++ b/test/soup_test.py
@@ -1,6 +1,6 @@
 import pytest
 
-from minet.scrape.soup import WonderfulSoup, SelectionError
+from minet.scrape.soup import WonderfulSoup, SelectionError, ExtractionError
 
 HTML = """