From 949832e231404837c87d698be81aa32dddc12d6f Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Mon, 11 Dec 2023 15:20:47 +0100 Subject: [PATCH] Working of scrape cmd -m Related to #763 --- docs/cli.md | 24 ++- ftest/ftest-array.sh | 2 + ftest/scrapers/title.py | 10 ++ minet/cli/scrape/__init__.py | 21 ++- minet/cli/scrape/scrape.py | 143 +++++++++++------- minet/scrape/__init__.py | 4 +- minet/scrape/classes/__init__.py | 4 + minet/scrape/classes/base.py | 44 ++++++ .../{scraper.py => classes/definition.py} | 11 +- minet/scrape/classes/function.py | 41 +++++ minet/scrape/{typical.py => classes/named.py} | 12 +- minet/scrape/exceptions.py | 4 - minet/scrape/types.py | 108 +------------ test/scraper_test.py | 57 +------ 14 files changed, 241 insertions(+), 244 deletions(-) create mode 100644 ftest/scrapers/title.py create mode 100644 minet/scrape/classes/__init__.py create mode 100644 minet/scrape/classes/base.py rename minet/scrape/{scraper.py => classes/definition.py} (89%) create mode 100644 minet/scrape/classes/function.py rename minet/scrape/{typical.py => classes/named.py} (95%) diff --git a/docs/cli.md b/docs/cli.md index 25e2f5f1f2..c9a4bef62f 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -1121,7 +1121,7 @@ For more documentation about minet's scraping DSL check this [page](../cookbook/ ``` Usage: minet scrape [-h] [--silent] [--refresh-per-second REFRESH_PER_SECOND] - [--simple-progress] [-g] [-I INPUT_DIR] [-p PROCESSES] + [--simple-progress] [-m] [-g] [-I INPUT_DIR] [-p PROCESSES] [--chunk-size CHUNK_SIZE] [--body-column BODY_COLUMN] [--url-column URL_COLUMN] [--error-column ERROR_COLUMN] [--status-column STATUS_COLUMN] @@ -1138,6 +1138,8 @@ Usage: minet scrape [-h] [--silent] [--refresh-per-second REFRESH_PER_SECOND] Use multiple processes to scrape data from a batch of HTML files using minet scraping DSL documented here: https://github.com/medialab/minet/blob/master/docs/cookbook/scraping_dsl.md +or a python function given using the -m/--module flag, or an already +implemented typical scraping routine (listed below). It will output the scraped items as a CSV or NDJSON file. @@ -1164,8 +1166,10 @@ an error occurred. Positional Arguments: scraper Path to a scraper definition file, or name of a - builtin scraper, e.g. "title". See the complete - list below. + builtin scraper, e.g. "title" (see the complete + list below), or a path to a python module and + function (e.g. scraper.py, + scraper.py:scrape_title). path_or_path_column Single path to process or name of the CSV column containing paths when using -i/--input. Defaults to "path". @@ -1190,6 +1194,8 @@ Optional Arguments: --mimetype-column MIMETYPE_COLUMN Name of the CSV column containing file mimetype. Defaults to `mimetype`. + -m, --module Whether given scraper is a python target to + import. --plural-separator PLURAL_SEPARATOR Separator use to join lists of values when serializing to CSV. Defaults to `|`. @@ -1263,6 +1269,15 @@ Examples: . Scraping a single url: $ minet fetch "https://lemonde.fr" | minet scrape scraper.yml -i - +. Using a builtin scraper: + $ minet scrape title -i report.csv > titles.csv + +. Using the `scrape` (default) function of target python module: + $ minet scrape scraper.py -i report.csv > titles.csv + +. Using the `scrape_title` function of target python module: + $ minet scrape scraper.py:scrape_title -i report.csv > titles.csv + . Indicating a custom path column (e.g. "file"): $ minet scrape scraper.yml file -i report.csv -I downloaded > scraped.csv @@ -1286,9 +1301,6 @@ Examples: . 
Keeping only some columns from input CSV file: $ minet scrape scraper.yml -i report.csv -s name,url > scraped.csv - -. Using a builtin scraper: - $ minet scrape title -i report.csv > titles.csv ``` ## screenshot diff --git a/ftest/ftest-array.sh b/ftest/ftest-array.sh index 8fbe11def4..1119ae7d39 100755 --- a/ftest/ftest-array.sh +++ b/ftest/ftest-array.sh @@ -15,6 +15,8 @@ echo echo "Scrape" echo " - Single HTML file" $MINET scrape -p 1 $EXTRACT_DIR/scraper.yml $EXTRACT_DIR/article.html | wc -l +echo " - Single HTML file, typical scraper" +$MINET scrape -p 1 title $EXTRACT_DIR/article.html | wc -l echo " - Single glob pattern" $MINET scrape -p 1 $EXTRACT_DIR/scraper.yml "$EXTRACT_DIR/*.html" -g | wc -l echo " - CSV input" diff --git a/ftest/scrapers/title.py b/ftest/scrapers/title.py new file mode 100644 index 0000000000..1829ec9503 --- /dev/null +++ b/ftest/scrapers/title.py @@ -0,0 +1,10 @@ +from minet.scrape import WonderfulSoup +from casanova import RowWrapper + + +def scrape(row: RowWrapper, soup: WonderfulSoup): + return {"url": row.url, "title": soup.scrape_one("title")} + + +def titles(row: RowWrapper, soup: WonderfulSoup): + yield soup.scrape_one("title") diff --git a/minet/cli/scrape/__init__.py b/minet/cli/scrape/__init__.py index 3182de8aef..696524576a 100644 --- a/minet/cli/scrape/__init__.py +++ b/minet/cli/scrape/__init__.py @@ -14,6 +14,8 @@ def resolve_arguments(cli_args): Use multiple processes to scrape data from a batch of HTML files using minet scraping DSL documented here: https://github.com/medialab/minet/blob/master/docs/cookbook/scraping_dsl.md + or a python function given using the -m/--module flag, or an already + implemented typical scraping routine (listed below). It will output the scraped items as a CSV or NDJSON file. @@ -61,6 +63,15 @@ def resolve_arguments(cli_args): . Scraping a single url: $ minet fetch "https://lemonde.fr" | minet scrape scraper.yml -i - + . Using a builtin scraper: + $ minet scrape title -i report.csv > titles.csv + + . Using the `scrape` (default) function of target python module: + $ minet scrape scraper.py -i report.csv > titles.csv + + . Using the `scrape_title` function of target python module: + $ minet scrape scraper.py:scrape_title -i report.csv > titles.csv + . Indicating a custom path column (e.g. "file"): $ minet scrape scraper.yml file -i report.csv -I downloaded > scraped.csv @@ -84,16 +95,18 @@ def resolve_arguments(cli_args): . Keeping only some columns from input CSV file: $ minet scrape scraper.yml -i report.csv -s name,url > scraped.csv - - . Using a builtin scraper: - $ minet scrape title -i report.csv > titles.csv """, resolve=resolve_arguments, variadic_input={"dummy_column": "path", "optional": True, "no_help": True}, arguments=[ { "name": "scraper", - "help": 'Path to a scraper definition file, or name of a builtin scraper, e.g. "title". See the complete list below.', + "help": 'Path to a scraper definition file, or name of a builtin scraper, e.g. "title" (see the complete list below), or a path to a python module and function (e.g. 
scraper.py, scraper.py:scrape_title).', + }, + { + "flags": ["-m", "--module"], + "help": "Whether given scraper is a python target to import.", + "action": "store_true", }, { "flags": ["-g", "--glob"], diff --git a/minet/cli/scrape/scrape.py b/minet/cli/scrape/scrape.py index d13af2e685..c287079d3f 100644 --- a/minet/cli/scrape/scrape.py +++ b/minet/cli/scrape/scrape.py @@ -13,12 +13,18 @@ from threading import Lock from os.path import basename, isdir -from minet.scrape import Scraper -from minet.scrape.typical import TYPICAL_SCRAPERS -from minet.scrape.types import ScraperBase +from minet.utils import import_target +from minet.scrape.classes import ( + NAMED_SCRAPERS, + ScraperBase, + DefinitionScraper, + FunctionScraper, +) from minet.multiprocessing import LazyPool from minet.exceptions import ( DefinitionInvalidFormatError, + GenericModuleNotFoundError, + TargetInGenericModuleNotFoundError, ) from minet.scrape.exceptions import ( InvalidScraperError, @@ -57,30 +63,19 @@ class ScrapeResult: SCRAPER: Optional[ScraperBase] = None -FORMAT: Optional[str] = None -PLURAL_SEPARATOR: Optional[str] = None HEADERS: Optional[casanova.headers] = None -def init_process(options): +def init_process(scraper: ScraperBase, fieldnames: List[str]): global SCRAPER - global FORMAT - global PLURAL_SEPARATOR global HEADERS - if options["name"] is not None: - SCRAPER = TYPICAL_SCRAPERS[options["name"]]() - else: - SCRAPER = Scraper(options["definition"], strain=options["strain"]) - - FORMAT = options["format"] - PLURAL_SEPARATOR = options["plural_separator"] - HEADERS = casanova.headers(options["fieldnames"]) + SCRAPER = scraper + HEADERS = casanova.headers(fieldnames) def worker(payload: ScrapeWorkerPayload) -> ScrapeResult: assert SCRAPER is not None - assert PLURAL_SEPARATOR is not None assert HEADERS is not None text = payload.text @@ -109,12 +104,7 @@ def worker(payload: ScrapeWorkerPayload) -> ScrapeResult: context["basename"] = basename(payload.path) # Attempting to scrape - if FORMAT == "csv": - items = SCRAPER.as_csv_rows( - text, context=context, plural_separator=PLURAL_SEPARATOR - ) - else: - items = SCRAPER.as_records(text, context=context) + items = SCRAPER.items(text, context=context) # NOTE: errors might be raised when we consume the generators created above try: @@ -129,19 +119,39 @@ def worker(payload: ScrapeWorkerPayload) -> ScrapeResult: def action(cli_args): - using_typical_scraper = False - # Parsing scraper definition try: - if cli_args.scraper in TYPICAL_SCRAPERS: - using_typical_scraper = True - scraper = TYPICAL_SCRAPERS[cli_args.scraper]() + if cli_args.module: + fn = import_target(cli_args.scraper, default="scrape") + scraper = FunctionScraper(fn, strain=cli_args.strain) + elif cli_args.scraper in NAMED_SCRAPERS: + scraper = NAMED_SCRAPERS[cli_args.scraper]() else: - scraper = Scraper(cli_args.scraper, strain=cli_args.strain) + scraper = DefinitionScraper(cli_args.scraper, strain=cli_args.strain) + + except GenericModuleNotFoundError: + raise FatalError( + [ + "Could not import %s!" % cli_args.scraper, + "Are you sure the module exists?", + ] + ) + + except TargetInGenericModuleNotFoundError as e: + raise FatalError( + [ + "Could not find the %s target in the %s module!" 
% (e.name, e.path), + "Are you sure this class/function/variable exists in the module?", + ] + ) except DefinitionInvalidFormatError: raise FatalError( - ["Unknown scraper format!", "It should be a JSON or YAML file."] + [ + "Unknown scraper format!", + "It should be a JSON or YAML file.", + "Or did you forget the -m/--module flag?", + ] ) except FileNotFoundError: @@ -165,7 +175,7 @@ def action(cli_args): ] ) - if scraper.fieldnames is None and cli_args.format == "csv": + if not scraper.tabular and cli_args.format == "csv": raise FatalError( [ "Your scraper does not yield tabular data.", @@ -183,26 +193,54 @@ def action(cli_args): writer_lock = Lock() if cli_args.format == "csv": - assert scraper.fieldnames is not None + if isinstance(scraper, FunctionScraper): + reader = casanova.reader(cli_args.input, total=cli_args.total) - output_fieldnames = scraper.fieldnames + # TODO: support for inferring_enricher + # TODO: support forwarding cases that will yield None + writer = casanova.inferring_writer( + cli_args.output, plural_separator=cli_args.plural_separator + ) - if cli_args.scraped_column_prefix is not None: - output_fieldnames = [ - cli_args.scraped_column_prefix + h for h in output_fieldnames - ] + def writerow(row, item): + writer.writerow(item) - enricher = casanova.enricher( - cli_args.input, - cli_args.output, - total=cli_args.total, - select=cli_args.select, - add=output_fieldnames, - ) - reader = enricher + else: + assert scraper.fieldnames is not None - def writerow(row, item): - enricher.writerow(row, item) + serializer = casanova.CSVSerializer( + plural_separator=cli_args.plural_separator + ) + + output_fieldnames = scraper.fieldnames + + if cli_args.scraped_column_prefix is not None: + output_fieldnames = [ + cli_args.scraped_column_prefix + h for h in output_fieldnames + ] + + enricher = casanova.enricher( + cli_args.input, + cli_args.output, + total=cli_args.total, + select=cli_args.select, + add=output_fieldnames, + ) + reader = enricher + + def writerow(row, item): + assert scraper.fieldnames is not None + + if item is None: + enricher.writerow(row) + return + + if isinstance(item, dict): + item = [item.get(f) for f in scraper.fieldnames] + else: + item = [item] + + enricher.writerow(row, (serializer(v) for v in item)) # type: ignore else: # TODO: casanova should probably expose some ndjson enricher @@ -254,16 +292,7 @@ def payloads() -> Iterator[ScrapeWorkerPayload]: pool = LazyPool( cli_args.processes, initializer=init_process, - initargs=( - { - "name": cli_args.scraper if using_typical_scraper else None, - "definition": getattr(scraper, "definition", None), - "strain": cli_args.strain if not using_typical_scraper else None, - "format": cli_args.format, - "plural_separator": cli_args.plural_separator, - "fieldnames": reader.fieldnames, - }, - ), + initargs=(scraper, reader.fieldnames), ) loading_bar.append_to_title(" (p=%i)" % pool.processes) diff --git a/minet/scrape/__init__.py b/minet/scrape/__init__.py index 697c72b55b..77a8be8689 100644 --- a/minet/scrape/__init__.py +++ b/minet/scrape/__init__.py @@ -4,7 +4,7 @@ # # Module exposing utilities related to minet's scraping DSL. 
# -from minet.scrape.scraper import scrape, Scraper, validate +from minet.scrape.classes.definition import scrape, DefinitionScraper, validate from minet.scrape.soup import WonderfulSoup from minet.scrape.regex import ( extract_encodings_from_xml, @@ -15,7 +15,7 @@ __all__ = [ "scrape", - "Scraper", + "DefinitionScraper", "validate", "WonderfulSoup", "extract_encodings_from_xml", diff --git a/minet/scrape/classes/__init__.py b/minet/scrape/classes/__init__.py new file mode 100644 index 0000000000..bf9a248cbf --- /dev/null +++ b/minet/scrape/classes/__init__.py @@ -0,0 +1,4 @@ +from minet.scrape.classes.base import ScraperBase +from minet.scrape.classes.definition import DefinitionScraper, validate, scrape +from minet.scrape.classes.function import FunctionScraper +from minet.scrape.classes.named import NamedScraper, NAMED_SCRAPERS diff --git a/minet/scrape/classes/base.py b/minet/scrape/classes/base.py new file mode 100644 index 0000000000..889d283b71 --- /dev/null +++ b/minet/scrape/classes/base.py @@ -0,0 +1,44 @@ +from typing import Union, Optional, List, Dict, Any + +from bs4 import BeautifulSoup, SoupStrainer + +from minet.scrape.soup import WonderfulSoup +from minet.scrape.analysis import ScraperAnalysisOutputType + +AnyScrapableTarget = Union[str, WonderfulSoup, BeautifulSoup] + + +class ScraperBase(object): + fieldnames: Optional[List[str]] + plural: bool + tabular: bool + output_type: Optional[ScraperAnalysisOutputType] + strainer: Optional[SoupStrainer] + + @property + def singular(self) -> bool: + return not self.plural + + def __repr__(self): + return "<{name} plural={plural} output_type={output_type} strain={strain} fieldnames={fieldnames!r}>".format( + name=self.__class__.__name__, + plural=self.plural, + strain=getattr(self.strainer, "css", None) if self.strainer else None, + output_type=self.output_type, + fieldnames=self.fieldnames, + ) + + def __call__(self, html: AnyScrapableTarget, context: Optional[Dict] = None) -> Any: + raise NotImplementedError + + def items(self, html: AnyScrapableTarget, context: Optional[Dict] = None): + result = self.__call__(html, context=context) + + if result is None: + return + + if not self.plural: + yield result + return + + yield from result diff --git a/minet/scrape/scraper.py b/minet/scrape/classes/definition.py similarity index 89% rename from minet/scrape/scraper.py rename to minet/scrape/classes/definition.py index c39d76aadb..6ee4687e2a 100644 --- a/minet/scrape/scraper.py +++ b/minet/scrape/classes/definition.py @@ -1,7 +1,6 @@ from typing import Dict, Optional, Union, List from bs4 import SoupStrainer -from casanova import CSVSerializer from minet.types import AnyFileTarget from minet.fs import load_definition @@ -10,7 +9,8 @@ from minet.scrape.straining import strainer_from_css from minet.scrape.exceptions import InvalidScraperError from minet.scrape.utils import ensure_soup -from minet.scrape.types import AnyScrapableTarget, ScraperBase +from minet.scrape.types import AnyScrapableTarget +from minet.scrape.classes.base import ScraperBase def scrape( @@ -25,12 +25,11 @@ def scrape( return interpret_scraper(scraper, soup, root=soup, context=context) -class Scraper(ScraperBase): +class DefinitionScraper(ScraperBase): definition: Dict fieldnames: Optional[List[str]] plural: bool output_type: ScraperAnalysisOutputType - serializer: CSVSerializer strainer: Optional[SoupStrainer] def __init__( @@ -51,12 +50,10 @@ def __init__( analysis = analyse(definition) self.fieldnames = analysis.fieldnames + self.tabular = self.fieldnames is not 
None self.plural = analysis.plural self.output_type = analysis.output_type - # Serializer - self.serializer = CSVSerializer() - # Strainer self.strainer = None diff --git a/minet/scrape/classes/function.py b/minet/scrape/classes/function.py new file mode 100644 index 0000000000..26c4e1e27f --- /dev/null +++ b/minet/scrape/classes/function.py @@ -0,0 +1,41 @@ +from typing import Optional, Callable, Any, cast, Dict + +import inspect +from casanova import RowWrapper +from bs4 import SoupStrainer + +from minet.scrape.classes.base import ScraperBase +from minet.scrape.soup import WonderfulSoup +from minet.scrape.straining import strainer_from_css +from minet.scrape.utils import ensure_soup +from minet.scrape.types import AnyScrapableTarget + + +class FunctionScraper(ScraperBase): + fn: Callable[[RowWrapper, WonderfulSoup], Any] + fieldnames = None + plural: bool + tabular = True + output_type = None + strainer: Optional[SoupStrainer] + + def __init__( + self, + fn: Callable[[RowWrapper, WonderfulSoup], Any], + strain: Optional[str] = None, + ): + self.fn = fn + self.plural = inspect.isgeneratorfunction(fn) + + self.strainer = None + + if strain is not None: + self.strainer = strainer_from_css(strain) + + def __call__(self, html: AnyScrapableTarget, context: Optional[Dict] = None): + assert context is not None + + row = context["row"] + soup = cast(WonderfulSoup, ensure_soup(html, strainer=self.strainer)) + + return self.fn(row, soup) diff --git a/minet/scrape/typical.py b/minet/scrape/classes/named.py similarity index 95% rename from minet/scrape/typical.py rename to minet/scrape/classes/named.py index 6d1458a396..a6f6d6fe88 100644 --- a/minet/scrape/typical.py +++ b/minet/scrape/classes/named.py @@ -1,22 +1,22 @@ from typing import Optional, List, Any, Dict, Type, cast from bs4 import SoupStrainer, BeautifulSoup -from casanova import CSVSerializer from urllib.parse import urljoin from ural import should_follow_href, could_be_rss from minet.scrape.analysis import ScraperAnalysisOutputType from minet.scrape.utils import ensure_soup -from minet.scrape.types import AnyScrapableTarget, ScraperBase +from minet.scrape.types import AnyScrapableTarget +from minet.scrape.classes.base import ScraperBase class NamedScraper(ScraperBase): name: str fieldnames: List[str] plural: bool + tabular = True output_type: ScraperAnalysisOutputType strainer: Optional[SoupStrainer] - serializer = CSVSerializer() def scrape(self, soup: BeautifulSoup, context=None) -> Any: raise NotImplementedError @@ -28,7 +28,7 @@ def __call__(self, html: AnyScrapableTarget, context=None) -> Any: class TitleScraper(NamedScraper): name = "title" - fieldnames = ["page_title"] + fieldnames = ["title"] plural = False output_type = "scalar" strainer = SoupStrainer(name="title") @@ -105,7 +105,7 @@ def scrape(self, soup: BeautifulSoup, context=None) -> Any: class ImagesScraper(NamedScraper): name = "images" - fieldnames = ["image_url"] + fieldnames = ["src"] plural = True output_type = "list" strainer = SoupStrainer(name="img") @@ -193,7 +193,7 @@ def scrape(self, soup: BeautifulSoup, context=None): return rss_urls -TYPICAL_SCRAPERS: Dict[str, Type[NamedScraper]] = { +NAMED_SCRAPERS: Dict[str, Type[NamedScraper]] = { s.name: s for s in [ TitleScraper, diff --git a/minet/scrape/exceptions.py b/minet/scrape/exceptions.py index ba4367b97a..2d03c3b2d5 100644 --- a/minet/scrape/exceptions.py +++ b/minet/scrape/exceptions.py @@ -19,10 +19,6 @@ class CSSSelectorTooComplex(ScrapeError): pass -class ScraperNotTabularError(ScrapeError): - pass - - 
class ScraperRuntimeError(ScrapeError): def __init__(self, msg=None, reason=None, expression=None, path=None): super().__init__(msg) diff --git a/minet/scrape/types.py b/minet/scrape/types.py index 504d9b69cc..3fd1138a36 100644 --- a/minet/scrape/types.py +++ b/minet/scrape/types.py @@ -1,111 +1,7 @@ -from typing import Union, Optional, List, Dict, Any +from typing import Union -from bs4 import BeautifulSoup, SoupStrainer -from casanova import CSVSerializer +from bs4 import BeautifulSoup from minet.scrape.soup import WonderfulSoup -from minet.scrape.analysis import ScraperAnalysisOutputType -from minet.scrape.exceptions import ScraperNotTabularError AnyScrapableTarget = Union[str, WonderfulSoup, BeautifulSoup] - - -class ScraperBase(object): - fieldnames: Optional[List[str]] - plural: bool - output_type: ScraperAnalysisOutputType - serializer: CSVSerializer - strainer: Optional[SoupStrainer] - - @property - def singular(self) -> bool: - return not self.plural - - def __repr__(self): - return "<{name} plural={plural} output_type={output_type} strain={strain} fieldnames={fieldnames!r}>".format( - name=self.__class__.__name__, - plural=self.plural, - strain=getattr(self.strainer, "css", None) if self.strainer else None, - output_type=self.output_type, - fieldnames=self.fieldnames, - ) - - def __call__(self, html: AnyScrapableTarget, context: Optional[Dict] = None) -> Any: - raise NotImplementedError - - def as_csv_rows( - self, - html: AnyScrapableTarget, - context: Optional[Dict] = None, - plural_separator="|", - ): - if self.fieldnames is None: - raise ScraperNotTabularError - - def generator(): - result = self.__call__(html, context=context) - - if result is None: - return - - if not self.plural: - result = [result] - - for item in result: - if isinstance(item, dict): - assert self.fieldnames - - item = self.serializer.serialize_dict_row( - item, self.fieldnames, plural_separator=plural_separator - ) - else: - item = [self.serializer(item, plural_separator=plural_separator)] - - yield item - - return generator() - - def as_csv_dict_rows( - self, - html: AnyScrapableTarget, - context: Optional[Dict] = None, - plural_separator="|", - ): - if self.fieldnames is None: - raise ScraperNotTabularError - - def generator(): - result = self.__call__(html, context=context) - - if result is None: - return - - if not self.plural: - result = [result] - - for item in result: - if isinstance(item, dict): - for k, v in item.items(): - item[k] = self.serializer(v, plural_separator=plural_separator) - else: - item = { - "value": self.serializer( - item, plural_separator=plural_separator - ) - } - - yield item - - return generator() - - def as_records(self, html: AnyScrapableTarget, context: Optional[Dict] = None): - result = self.__call__(html, context=context) - - if result is None: - return - - if not self.plural: - yield result - return - - yield from result diff --git a/test/scraper_test.py b/test/scraper_test.py index 56963e3e19..ea896e22d3 100644 --- a/test/scraper_test.py +++ b/test/scraper_test.py @@ -5,7 +5,7 @@ from bs4 import BeautifulSoup, Tag, SoupStrainer from textwrap import dedent -from minet.scrape import scrape, Scraper +from minet.scrape import scrape, DefinitionScraper from minet.scrape.analysis import ( fieldnames_from_definition, validate, @@ -30,7 +30,6 @@ ScraperValidationInvalidExtractorError, ScraperValidationMixedConcernError, ScraperValidationUnknownKeyError, - ScraperNotTabularError, ) BASIC_HTML = """ @@ -161,7 +160,7 @@ """ -class TestScraper(object): +class 
TestDefinitionScraper(object): def test_basics(self): result = scrape({"iterator": "li"}, BASIC_HTML) @@ -523,7 +522,7 @@ def test_fieldnames(self): fieldnames = fieldnames_from_definition({"sel": "table", "tabulate": True}) - scraper = Scraper({"iterator": "li", "fields": {"id": "id"}}) + scraper = DefinitionScraper({"iterator": "li", "fields": {"id": "id"}}) assert scraper.fieldnames == ["id"] @@ -780,7 +779,7 @@ def key(t): assert errors == expecting with pytest.raises(InvalidScraperError) as info: - Scraper(bad_definition) + DefinitionScraper(bad_definition) errors = sorted( [(e.path, type(e)) for e in info.value.validation_errors], key=key @@ -950,58 +949,12 @@ def test_strainer(css, input_html, output_html, **kwargs): 'One', ) - scraper = Scraper({"iterator": "li"}, strain="div") + scraper = DefinitionScraper({"iterator": "li"}, strain="div") html = "
Hello
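
To illustrate the new -m/--module flag end to end: a minimal module scraper, mirroring ftest/scrapers/title.py above. The file name, function name and the `url` column are illustrative; the default target is the function named `scrape`, and a `scraper.py:scrape_title` spec selects another function, as the `import_target(cli_args.scraper, default="scrape")` call in scrape.py implies.

```python
# scraper.py — a hypothetical module target for the -m/--module flag.
from casanova import RowWrapper
from minet.scrape import WonderfulSoup


def scrape(row: RowWrapper, soup: WonderfulSoup):
    # `row` wraps the current CSV line; `soup` is the parsed HTML document.
    # A plain function returning a dict produces one output row per file.
    return {"url": row.url, "title": soup.scrape_one("title")}
```

Note that the flag is required for this to resolve as a module, e.g. `minet scrape -m scraper.py -i report.csv -I downloaded > titles.csv`; without it the positional argument is read as a definition file, which is what the "Or did you forget the -m/--module flag?" hint added above is there to catch.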
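
FunctionScraper decides plurality by introspection: a generator function is treated as plural, so each yielded value becomes its own output record. A minimal sketch (the `links` function is illustrative):

```python
import inspect

from minet.scrape.classes import FunctionScraper


def links(row, soup):
    # Yielding makes this a generator function, hence a plural scraper.
    for a in soup.select("a[href]"):
        yield a.get("href")


scraper = FunctionScraper(links)

assert inspect.isgeneratorfunction(links)
assert scraper.plural and not scraper.singular
```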
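
ScraperBase.items is what the worker now relies on instead of the old as_csv_rows/as_records pair: it normalizes the result of `__call__` into a stream, yielding nothing for None, the value itself for singular scrapers, and each element for plural ones. A sketch with a DSL scraper (the HTML snippet is illustrative):

```python
from minet.scrape import DefinitionScraper

scraper = DefinitionScraper({"iterator": "li"})
html = "<div><ul><li>Hello</li><li>World</li></ul></div>"

# items() wraps __call__ and yields uniformly, plural or not.
for item in scraper.items(html):
    print(item)  # should print "Hello", then "World"
```

FunctionScraper goes through the same path, except its `__call__` asserts a non-None context carrying the current row, which the worker supplies as `context["row"]`.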
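
The renamed NAMED_SCRAPERS registry maps each builtin's name to its class, which is how the CLI resolves e.g. `minet scrape title`. The same lookup works from Python (a single scalar comes back here, since TitleScraper is singular with output_type "scalar"):

```python
from minet.scrape.classes import NAMED_SCRAPERS

scraper = NAMED_SCRAPERS["title"]()  # TitleScraper, fieldnames == ["title"]
html = "<html><head><title>Hello</title></head><body></body></html>"

print(scraper(html))  # should print the page title, i.e. "Hello"
```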
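
On the writing side, the CSV branch now serializes cell by cell with casanova's CSVSerializer rather than through a serializer owned by the scraper. The `writerow` logic above amounts to something like this sketch (values are illustrative; the exact serialized form of a list is an assumption based on plural_separator's role):

```python
from casanova import CSVSerializer

serializer = CSVSerializer(plural_separator="|")

fieldnames = ["title", "tags"]
item = {"title": "Hello", "tags": ["a", "b"]}

# One serialized cell per fieldname, in fieldname order.
cells = [serializer(item.get(f)) for f in fieldnames]
# expected: ["Hello", "a|b"]
```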