From 949832e231404837c87d698be81aa32dddc12d6f Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Mon, 11 Dec 2023 15:20:47 +0100 Subject: [PATCH] Working of scrape cmd -m Related to #763 --- docs/cli.md | 24 ++- ftest/ftest-array.sh | 2 + ftest/scrapers/title.py | 10 ++ minet/cli/scrape/__init__.py | 21 ++- minet/cli/scrape/scrape.py | 143 +++++++++++------- minet/scrape/__init__.py | 4 +- minet/scrape/classes/__init__.py | 4 + minet/scrape/classes/base.py | 44 ++++++ .../{scraper.py => classes/definition.py} | 11 +- minet/scrape/classes/function.py | 41 +++++ minet/scrape/{typical.py => classes/named.py} | 12 +- minet/scrape/exceptions.py | 4 - minet/scrape/types.py | 108 +------------ test/scraper_test.py | 57 +------ 14 files changed, 241 insertions(+), 244 deletions(-) create mode 100644 ftest/scrapers/title.py create mode 100644 minet/scrape/classes/__init__.py create mode 100644 minet/scrape/classes/base.py rename minet/scrape/{scraper.py => classes/definition.py} (89%) create mode 100644 minet/scrape/classes/function.py rename minet/scrape/{typical.py => classes/named.py} (95%) diff --git a/docs/cli.md b/docs/cli.md index 25e2f5f1f2..c9a4bef62f 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -1121,7 +1121,7 @@ For more documentation about minet's scraping DSL check this [page](../cookbook/ ``` Usage: minet scrape [-h] [--silent] [--refresh-per-second REFRESH_PER_SECOND] - [--simple-progress] [-g] [-I INPUT_DIR] [-p PROCESSES] + [--simple-progress] [-m] [-g] [-I INPUT_DIR] [-p PROCESSES] [--chunk-size CHUNK_SIZE] [--body-column BODY_COLUMN] [--url-column URL_COLUMN] [--error-column ERROR_COLUMN] [--status-column STATUS_COLUMN] @@ -1138,6 +1138,8 @@ Usage: minet scrape [-h] [--silent] [--refresh-per-second REFRESH_PER_SECOND] Use multiple processes to scrape data from a batch of HTML files using minet scraping DSL documented here: https://github.com/medialab/minet/blob/master/docs/cookbook/scraping_dsl.md +or a python function given using the -m/--module flag, or an already +implemented typical scraping routine (listed below). It will output the scraped items as a CSV or NDJSON file. @@ -1164,8 +1166,10 @@ an error occurred. Positional Arguments: scraper Path to a scraper definition file, or name of a - builtin scraper, e.g. "title". See the complete - list below. + builtin scraper, e.g. "title" (see the complete + list below), or a path to a python module and + function (e.g. scraper.py, + scraper.py:scrape_title). path_or_path_column Single path to process or name of the CSV column containing paths when using -i/--input. Defaults to "path". @@ -1190,6 +1194,8 @@ Optional Arguments: --mimetype-column MIMETYPE_COLUMN Name of the CSV column containing file mimetype. Defaults to `mimetype`. + -m, --module Whether given scraper is a python target to + import. --plural-separator PLURAL_SEPARATOR Separator use to join lists of values when serializing to CSV. Defaults to `|`. @@ -1263,6 +1269,15 @@ Examples: . Scraping a single url: $ minet fetch "https://lemonde.fr" | minet scrape scraper.yml -i - +. Using a builtin scraper: + $ minet scrape title -i report.csv > titles.csv + +. Using the `scrape` (default) function of target python module: + $ minet scrape scraper.py -i report.csv > titles.csv + +. Using the `scrape_title` function of target python module: + $ minet scrape scraper.py:scrape_title -i report.csv > titles.csv + . Indicating a custom path column (e.g. "file"): $ minet scrape scraper.yml file -i report.csv -I downloaded > scraped.csv @@ -1286,9 +1301,6 @@ Examples: . 
Keeping only some columns from input CSV file: $ minet scrape scraper.yml -i report.csv -s name,url > scraped.csv - -. Using a builtin scraper: - $ minet scrape title -i report.csv > titles.csv ``` ## screenshot diff --git a/ftest/ftest-array.sh b/ftest/ftest-array.sh index 8fbe11def4..1119ae7d39 100755 --- a/ftest/ftest-array.sh +++ b/ftest/ftest-array.sh @@ -15,6 +15,8 @@ echo echo "Scrape" echo " - Single HTML file" $MINET scrape -p 1 $EXTRACT_DIR/scraper.yml $EXTRACT_DIR/article.html | wc -l +echo " - Single HTML file, typical scraper" +$MINET scrape -p 1 title $EXTRACT_DIR/article.html | wc -l echo " - Single glob pattern" $MINET scrape -p 1 $EXTRACT_DIR/scraper.yml "$EXTRACT_DIR/*.html" -g | wc -l echo " - CSV input" diff --git a/ftest/scrapers/title.py b/ftest/scrapers/title.py new file mode 100644 index 0000000000..1829ec9503 --- /dev/null +++ b/ftest/scrapers/title.py @@ -0,0 +1,10 @@ +from minet.scrape import WonderfulSoup +from casanova import RowWrapper + + +def scrape(row: RowWrapper, soup: WonderfulSoup): + return {"url": row.url, "title": soup.scrape_one("title")} + + +def titles(row: RowWrapper, soup: WonderfulSoup): + yield soup.scrape_one("title") diff --git a/minet/cli/scrape/__init__.py b/minet/cli/scrape/__init__.py index 3182de8aef..696524576a 100644 --- a/minet/cli/scrape/__init__.py +++ b/minet/cli/scrape/__init__.py @@ -14,6 +14,8 @@ def resolve_arguments(cli_args): Use multiple processes to scrape data from a batch of HTML files using minet scraping DSL documented here: https://github.com/medialab/minet/blob/master/docs/cookbook/scraping_dsl.md + or a python function given using the -m/--module flag, or an already + implemented typical scraping routine (listed below). It will output the scraped items as a CSV or NDJSON file. @@ -61,6 +63,15 @@ def resolve_arguments(cli_args): . Scraping a single url: $ minet fetch "https://lemonde.fr" | minet scrape scraper.yml -i - + . Using a builtin scraper: + $ minet scrape title -i report.csv > titles.csv + + . Using the `scrape` (default) function of target python module: + $ minet scrape scraper.py -i report.csv > titles.csv + + . Using the `scrape_title` function of target python module: + $ minet scrape scraper.py:scrape_title -i report.csv > titles.csv + . Indicating a custom path column (e.g. "file"): $ minet scrape scraper.yml file -i report.csv -I downloaded > scraped.csv @@ -84,16 +95,18 @@ def resolve_arguments(cli_args): . Keeping only some columns from input CSV file: $ minet scrape scraper.yml -i report.csv -s name,url > scraped.csv - - . Using a builtin scraper: - $ minet scrape title -i report.csv > titles.csv """, resolve=resolve_arguments, variadic_input={"dummy_column": "path", "optional": True, "no_help": True}, arguments=[ { "name": "scraper", - "help": 'Path to a scraper definition file, or name of a builtin scraper, e.g. "title". See the complete list below.', + "help": 'Path to a scraper definition file, or name of a builtin scraper, e.g. "title" (see the complete list below), or a path to a python module and function (e.g. 
scraper.py, scraper.py:scrape_title).', + }, + { + "flags": ["-m", "--module"], + "help": "Whether given scraper is a python target to import.", + "action": "store_true", }, { "flags": ["-g", "--glob"], diff --git a/minet/cli/scrape/scrape.py b/minet/cli/scrape/scrape.py index d13af2e685..c287079d3f 100644 --- a/minet/cli/scrape/scrape.py +++ b/minet/cli/scrape/scrape.py @@ -13,12 +13,18 @@ from threading import Lock from os.path import basename, isdir -from minet.scrape import Scraper -from minet.scrape.typical import TYPICAL_SCRAPERS -from minet.scrape.types import ScraperBase +from minet.utils import import_target +from minet.scrape.classes import ( + NAMED_SCRAPERS, + ScraperBase, + DefinitionScraper, + FunctionScraper, +) from minet.multiprocessing import LazyPool from minet.exceptions import ( DefinitionInvalidFormatError, + GenericModuleNotFoundError, + TargetInGenericModuleNotFoundError, ) from minet.scrape.exceptions import ( InvalidScraperError, @@ -57,30 +63,19 @@ class ScrapeResult: SCRAPER: Optional[ScraperBase] = None -FORMAT: Optional[str] = None -PLURAL_SEPARATOR: Optional[str] = None HEADERS: Optional[casanova.headers] = None -def init_process(options): +def init_process(scraper: ScraperBase, fieldnames: List[str]): global SCRAPER - global FORMAT - global PLURAL_SEPARATOR global HEADERS - if options["name"] is not None: - SCRAPER = TYPICAL_SCRAPERS[options["name"]]() - else: - SCRAPER = Scraper(options["definition"], strain=options["strain"]) - - FORMAT = options["format"] - PLURAL_SEPARATOR = options["plural_separator"] - HEADERS = casanova.headers(options["fieldnames"]) + SCRAPER = scraper + HEADERS = casanova.headers(fieldnames) def worker(payload: ScrapeWorkerPayload) -> ScrapeResult: assert SCRAPER is not None - assert PLURAL_SEPARATOR is not None assert HEADERS is not None text = payload.text @@ -109,12 +104,7 @@ def worker(payload: ScrapeWorkerPayload) -> ScrapeResult: context["basename"] = basename(payload.path) # Attempting to scrape - if FORMAT == "csv": - items = SCRAPER.as_csv_rows( - text, context=context, plural_separator=PLURAL_SEPARATOR - ) - else: - items = SCRAPER.as_records(text, context=context) + items = SCRAPER.items(text, context=context) # NOTE: errors might be raised when we consume the generators created above try: @@ -129,19 +119,39 @@ def worker(payload: ScrapeWorkerPayload) -> ScrapeResult: def action(cli_args): - using_typical_scraper = False - # Parsing scraper definition try: - if cli_args.scraper in TYPICAL_SCRAPERS: - using_typical_scraper = True - scraper = TYPICAL_SCRAPERS[cli_args.scraper]() + if cli_args.module: + fn = import_target(cli_args.scraper, default="scrape") + scraper = FunctionScraper(fn, strain=cli_args.strain) + elif cli_args.scraper in NAMED_SCRAPERS: + scraper = NAMED_SCRAPERS[cli_args.scraper]() else: - scraper = Scraper(cli_args.scraper, strain=cli_args.strain) + scraper = DefinitionScraper(cli_args.scraper, strain=cli_args.strain) + + except GenericModuleNotFoundError: + raise FatalError( + [ + "Could not import %s!" % cli_args.scraper, + "Are you sure the module exists?", + ] + ) + + except TargetInGenericModuleNotFoundError as e: + raise FatalError( + [ + "Could not find the %s target in the %s module!" 
% (e.name, e.path), + "Are you sure this class/function/variable exists in the module?", + ] + ) except DefinitionInvalidFormatError: raise FatalError( - ["Unknown scraper format!", "It should be a JSON or YAML file."] + [ + "Unknown scraper format!", + "It should be a JSON or YAML file.", + "Or did you forget the -m/--module flag?", + ] ) except FileNotFoundError: @@ -165,7 +175,7 @@ def action(cli_args): ] ) - if scraper.fieldnames is None and cli_args.format == "csv": + if not scraper.tabular and cli_args.format == "csv": raise FatalError( [ "Your scraper does not yield tabular data.", @@ -183,26 +193,54 @@ def action(cli_args): writer_lock = Lock() if cli_args.format == "csv": - assert scraper.fieldnames is not None + if isinstance(scraper, FunctionScraper): + reader = casanova.reader(cli_args.input, total=cli_args.total) - output_fieldnames = scraper.fieldnames + # TODO: support for inferring_enricher + # TODO: support forwarding cases that will yield None + writer = casanova.inferring_writer( + cli_args.output, plural_separator=cli_args.plural_separator + ) - if cli_args.scraped_column_prefix is not None: - output_fieldnames = [ - cli_args.scraped_column_prefix + h for h in output_fieldnames - ] + def writerow(row, item): + writer.writerow(item) - enricher = casanova.enricher( - cli_args.input, - cli_args.output, - total=cli_args.total, - select=cli_args.select, - add=output_fieldnames, - ) - reader = enricher + else: + assert scraper.fieldnames is not None - def writerow(row, item): - enricher.writerow(row, item) + serializer = casanova.CSVSerializer( + plural_separator=cli_args.plural_separator + ) + + output_fieldnames = scraper.fieldnames + + if cli_args.scraped_column_prefix is not None: + output_fieldnames = [ + cli_args.scraped_column_prefix + h for h in output_fieldnames + ] + + enricher = casanova.enricher( + cli_args.input, + cli_args.output, + total=cli_args.total, + select=cli_args.select, + add=output_fieldnames, + ) + reader = enricher + + def writerow(row, item): + assert scraper.fieldnames is not None + + if item is None: + enricher.writerow(row) + return + + if isinstance(item, dict): + item = [item.get(f) for f in scraper.fieldnames] + else: + item = [item] + + enricher.writerow(row, (serializer(v) for v in item)) # type: ignore else: # TODO: casanova should probably expose some ndjson enricher @@ -254,16 +292,7 @@ def payloads() -> Iterator[ScrapeWorkerPayload]: pool = LazyPool( cli_args.processes, initializer=init_process, - initargs=( - { - "name": cli_args.scraper if using_typical_scraper else None, - "definition": getattr(scraper, "definition", None), - "strain": cli_args.strain if not using_typical_scraper else None, - "format": cli_args.format, - "plural_separator": cli_args.plural_separator, - "fieldnames": reader.fieldnames, - }, - ), + initargs=(scraper, reader.fieldnames), ) loading_bar.append_to_title(" (p=%i)" % pool.processes) diff --git a/minet/scrape/__init__.py b/minet/scrape/__init__.py index 697c72b55b..77a8be8689 100644 --- a/minet/scrape/__init__.py +++ b/minet/scrape/__init__.py @@ -4,7 +4,7 @@ # # Module exposing utilities related to minet's scraping DSL. 
# -from minet.scrape.scraper import scrape, Scraper, validate +from minet.scrape.classes.definition import scrape, DefinitionScraper, validate from minet.scrape.soup import WonderfulSoup from minet.scrape.regex import ( extract_encodings_from_xml, @@ -15,7 +15,7 @@ __all__ = [ "scrape", - "Scraper", + "DefinitionScraper", "validate", "WonderfulSoup", "extract_encodings_from_xml", diff --git a/minet/scrape/classes/__init__.py b/minet/scrape/classes/__init__.py new file mode 100644 index 0000000000..bf9a248cbf --- /dev/null +++ b/minet/scrape/classes/__init__.py @@ -0,0 +1,4 @@ +from minet.scrape.classes.base import ScraperBase +from minet.scrape.classes.definition import DefinitionScraper, validate, scrape +from minet.scrape.classes.function import FunctionScraper +from minet.scrape.classes.named import NamedScraper, NAMED_SCRAPERS diff --git a/minet/scrape/classes/base.py b/minet/scrape/classes/base.py new file mode 100644 index 0000000000..889d283b71 --- /dev/null +++ b/minet/scrape/classes/base.py @@ -0,0 +1,44 @@ +from typing import Union, Optional, List, Dict, Any + +from bs4 import BeautifulSoup, SoupStrainer + +from minet.scrape.soup import WonderfulSoup +from minet.scrape.analysis import ScraperAnalysisOutputType + +AnyScrapableTarget = Union[str, WonderfulSoup, BeautifulSoup] + + +class ScraperBase(object): + fieldnames: Optional[List[str]] + plural: bool + tabular: bool + output_type: Optional[ScraperAnalysisOutputType] + strainer: Optional[SoupStrainer] + + @property + def singular(self) -> bool: + return not self.plural + + def __repr__(self): + return "<{name} plural={plural} output_type={output_type} strain={strain} fieldnames={fieldnames!r}>".format( + name=self.__class__.__name__, + plural=self.plural, + strain=getattr(self.strainer, "css", None) if self.strainer else None, + output_type=self.output_type, + fieldnames=self.fieldnames, + ) + + def __call__(self, html: AnyScrapableTarget, context: Optional[Dict] = None) -> Any: + raise NotImplementedError + + def items(self, html: AnyScrapableTarget, context: Optional[Dict] = None): + result = self.__call__(html, context=context) + + if result is None: + return + + if not self.plural: + yield result + return + + yield from result diff --git a/minet/scrape/scraper.py b/minet/scrape/classes/definition.py similarity index 89% rename from minet/scrape/scraper.py rename to minet/scrape/classes/definition.py index c39d76aadb..6ee4687e2a 100644 --- a/minet/scrape/scraper.py +++ b/minet/scrape/classes/definition.py @@ -1,7 +1,6 @@ from typing import Dict, Optional, Union, List from bs4 import SoupStrainer -from casanova import CSVSerializer from minet.types import AnyFileTarget from minet.fs import load_definition @@ -10,7 +9,8 @@ from minet.scrape.straining import strainer_from_css from minet.scrape.exceptions import InvalidScraperError from minet.scrape.utils import ensure_soup -from minet.scrape.types import AnyScrapableTarget, ScraperBase +from minet.scrape.types import AnyScrapableTarget +from minet.scrape.classes.base import ScraperBase def scrape( @@ -25,12 +25,11 @@ def scrape( return interpret_scraper(scraper, soup, root=soup, context=context) -class Scraper(ScraperBase): +class DefinitionScraper(ScraperBase): definition: Dict fieldnames: Optional[List[str]] plural: bool output_type: ScraperAnalysisOutputType - serializer: CSVSerializer strainer: Optional[SoupStrainer] def __init__( @@ -51,12 +50,10 @@ def __init__( analysis = analyse(definition) self.fieldnames = analysis.fieldnames + self.tabular = self.fieldnames is not 
None self.plural = analysis.plural self.output_type = analysis.output_type - # Serializer - self.serializer = CSVSerializer() - # Strainer self.strainer = None diff --git a/minet/scrape/classes/function.py b/minet/scrape/classes/function.py new file mode 100644 index 0000000000..26c4e1e27f --- /dev/null +++ b/minet/scrape/classes/function.py @@ -0,0 +1,41 @@ +from typing import Optional, Callable, Any, cast, Dict + +import inspect +from casanova import RowWrapper +from bs4 import SoupStrainer + +from minet.scrape.classes.base import ScraperBase +from minet.scrape.soup import WonderfulSoup +from minet.scrape.straining import strainer_from_css +from minet.scrape.utils import ensure_soup +from minet.scrape.types import AnyScrapableTarget + + +class FunctionScraper(ScraperBase): + fn: Callable[[RowWrapper, WonderfulSoup], Any] + fieldnames = None + plural: bool + tabular = True + output_type = None + strainer: Optional[SoupStrainer] + + def __init__( + self, + fn: Callable[[RowWrapper, WonderfulSoup], Any], + strain: Optional[str] = None, + ): + self.fn = fn + self.plural = inspect.isgeneratorfunction(fn) + + self.strainer = None + + if strain is not None: + self.strainer = strainer_from_css(strain) + + def __call__(self, html: AnyScrapableTarget, context: Optional[Dict] = None): + assert context is not None + + row = context["row"] + soup = cast(WonderfulSoup, ensure_soup(html, strainer=self.strainer)) + + return self.fn(row, soup) diff --git a/minet/scrape/typical.py b/minet/scrape/classes/named.py similarity index 95% rename from minet/scrape/typical.py rename to minet/scrape/classes/named.py index 6d1458a396..a6f6d6fe88 100644 --- a/minet/scrape/typical.py +++ b/minet/scrape/classes/named.py @@ -1,22 +1,22 @@ from typing import Optional, List, Any, Dict, Type, cast from bs4 import SoupStrainer, BeautifulSoup -from casanova import CSVSerializer from urllib.parse import urljoin from ural import should_follow_href, could_be_rss from minet.scrape.analysis import ScraperAnalysisOutputType from minet.scrape.utils import ensure_soup -from minet.scrape.types import AnyScrapableTarget, ScraperBase +from minet.scrape.types import AnyScrapableTarget +from minet.scrape.classes.base import ScraperBase class NamedScraper(ScraperBase): name: str fieldnames: List[str] plural: bool + tabular = True output_type: ScraperAnalysisOutputType strainer: Optional[SoupStrainer] - serializer = CSVSerializer() def scrape(self, soup: BeautifulSoup, context=None) -> Any: raise NotImplementedError @@ -28,7 +28,7 @@ def __call__(self, html: AnyScrapableTarget, context=None) -> Any: class TitleScraper(NamedScraper): name = "title" - fieldnames = ["page_title"] + fieldnames = ["title"] plural = False output_type = "scalar" strainer = SoupStrainer(name="title") @@ -105,7 +105,7 @@ def scrape(self, soup: BeautifulSoup, context=None) -> Any: class ImagesScraper(NamedScraper): name = "images" - fieldnames = ["image_url"] + fieldnames = ["src"] plural = True output_type = "list" strainer = SoupStrainer(name="img") @@ -193,7 +193,7 @@ def scrape(self, soup: BeautifulSoup, context=None): return rss_urls -TYPICAL_SCRAPERS: Dict[str, Type[NamedScraper]] = { +NAMED_SCRAPERS: Dict[str, Type[NamedScraper]] = { s.name: s for s in [ TitleScraper, diff --git a/minet/scrape/exceptions.py b/minet/scrape/exceptions.py index ba4367b97a..2d03c3b2d5 100644 --- a/minet/scrape/exceptions.py +++ b/minet/scrape/exceptions.py @@ -19,10 +19,6 @@ class CSSSelectorTooComplex(ScrapeError): pass -class ScraperNotTabularError(ScrapeError): - pass - - 
class ScraperRuntimeError(ScrapeError): def __init__(self, msg=None, reason=None, expression=None, path=None): super().__init__(msg) diff --git a/minet/scrape/types.py b/minet/scrape/types.py index 504d9b69cc..3fd1138a36 100644 --- a/minet/scrape/types.py +++ b/minet/scrape/types.py @@ -1,111 +1,7 @@ -from typing import Union, Optional, List, Dict, Any +from typing import Union -from bs4 import BeautifulSoup, SoupStrainer -from casanova import CSVSerializer +from bs4 import BeautifulSoup from minet.scrape.soup import WonderfulSoup -from minet.scrape.analysis import ScraperAnalysisOutputType -from minet.scrape.exceptions import ScraperNotTabularError AnyScrapableTarget = Union[str, WonderfulSoup, BeautifulSoup] - - -class ScraperBase(object): - fieldnames: Optional[List[str]] - plural: bool - output_type: ScraperAnalysisOutputType - serializer: CSVSerializer - strainer: Optional[SoupStrainer] - - @property - def singular(self) -> bool: - return not self.plural - - def __repr__(self): - return "<{name} plural={plural} output_type={output_type} strain={strain} fieldnames={fieldnames!r}>".format( - name=self.__class__.__name__, - plural=self.plural, - strain=getattr(self.strainer, "css", None) if self.strainer else None, - output_type=self.output_type, - fieldnames=self.fieldnames, - ) - - def __call__(self, html: AnyScrapableTarget, context: Optional[Dict] = None) -> Any: - raise NotImplementedError - - def as_csv_rows( - self, - html: AnyScrapableTarget, - context: Optional[Dict] = None, - plural_separator="|", - ): - if self.fieldnames is None: - raise ScraperNotTabularError - - def generator(): - result = self.__call__(html, context=context) - - if result is None: - return - - if not self.plural: - result = [result] - - for item in result: - if isinstance(item, dict): - assert self.fieldnames - - item = self.serializer.serialize_dict_row( - item, self.fieldnames, plural_separator=plural_separator - ) - else: - item = [self.serializer(item, plural_separator=plural_separator)] - - yield item - - return generator() - - def as_csv_dict_rows( - self, - html: AnyScrapableTarget, - context: Optional[Dict] = None, - plural_separator="|", - ): - if self.fieldnames is None: - raise ScraperNotTabularError - - def generator(): - result = self.__call__(html, context=context) - - if result is None: - return - - if not self.plural: - result = [result] - - for item in result: - if isinstance(item, dict): - for k, v in item.items(): - item[k] = self.serializer(v, plural_separator=plural_separator) - else: - item = { - "value": self.serializer( - item, plural_separator=plural_separator - ) - } - - yield item - - return generator() - - def as_records(self, html: AnyScrapableTarget, context: Optional[Dict] = None): - result = self.__call__(html, context=context) - - if result is None: - return - - if not self.plural: - yield result - return - - yield from result diff --git a/test/scraper_test.py b/test/scraper_test.py index 56963e3e19..ea896e22d3 100644 --- a/test/scraper_test.py +++ b/test/scraper_test.py @@ -5,7 +5,7 @@ from bs4 import BeautifulSoup, Tag, SoupStrainer from textwrap import dedent -from minet.scrape import scrape, Scraper +from minet.scrape import scrape, DefinitionScraper from minet.scrape.analysis import ( fieldnames_from_definition, validate, @@ -30,7 +30,6 @@ ScraperValidationInvalidExtractorError, ScraperValidationMixedConcernError, ScraperValidationUnknownKeyError, - ScraperNotTabularError, ) BASIC_HTML = """ @@ -161,7 +160,7 @@ """ -class TestScraper(object): +class 
TestDefinitionScraper(object): def test_basics(self): result = scrape({"iterator": "li"}, BASIC_HTML) @@ -523,7 +522,7 @@ def test_fieldnames(self): fieldnames = fieldnames_from_definition({"sel": "table", "tabulate": True}) - scraper = Scraper({"iterator": "li", "fields": {"id": "id"}}) + scraper = DefinitionScraper({"iterator": "li", "fields": {"id": "id"}}) assert scraper.fieldnames == ["id"] @@ -780,7 +779,7 @@ def key(t): assert errors == expecting with pytest.raises(InvalidScraperError) as info: - Scraper(bad_definition) + DefinitionScraper(bad_definition) errors = sorted( [(e.path, type(e)) for e in info.value.validation_errors], key=key @@ -950,58 +949,12 @@ def test_strainer(css, input_html, output_html, **kwargs): 'One', ) - scraper = Scraper({"iterator": "li"}, strain="div") + scraper = DefinitionScraper({"iterator": "li"}, strain="div") html = "
Hello
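
To illustrate the new -m/--module flag end to end: a minimal module scraper, mirroring ftest/scrapers/title.py above. The file name, function name and the `url` column are illustrative; the default target is the function named `scrape`, and a `scraper.py:scrape_title` spec selects another function, as the `import_target(cli_args.scraper, default="scrape")` call in scrape.py implies.

```python
# scraper.py — a hypothetical module target for the -m/--module flag.
from casanova import RowWrapper
from minet.scrape import WonderfulSoup


def scrape(row: RowWrapper, soup: WonderfulSoup):
    # `row` wraps the current CSV line; `soup` is the parsed HTML document.
    # A plain function returning a dict produces one output row per file.
    return {"url": row.url, "title": soup.scrape_one("title")}
```

Note that the flag is required for this to resolve as a module, e.g. `minet scrape -m scraper.py -i report.csv -I downloaded > titles.csv`; without it the positional argument is read as a definition file, which is what the "Or did you forget the -m/--module flag?" hint added above is there to catch.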
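
FunctionScraper decides plurality by introspection: a generator function is treated as plural, so each yielded value becomes its own output record. A minimal sketch (the `links` function is illustrative):

```python
import inspect

from minet.scrape.classes import FunctionScraper


def links(row, soup):
    # Yielding makes this a generator function, hence a plural scraper.
    for a in soup.select("a[href]"):
        yield a.get("href")


scraper = FunctionScraper(links)

assert inspect.isgeneratorfunction(links)
assert scraper.plural and not scraper.singular
```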
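
ScraperBase.items is what the worker now relies on instead of the old as_csv_rows/as_records pair: it normalizes the result of `__call__` into a stream, yielding nothing for None, the value itself for singular scrapers, and each element for plural ones. A sketch with a DSL scraper (the HTML snippet is illustrative):

```python
from minet.scrape import DefinitionScraper

scraper = DefinitionScraper({"iterator": "li"})
html = "<div><ul><li>Hello</li><li>World</li></ul></div>"

# items() wraps __call__ and yields uniformly, plural or not.
for item in scraper.items(html):
    print(item)  # should print "Hello", then "World"
```

FunctionScraper goes through the same path, except its `__call__` asserts a non-None context carrying the current row, which the worker supplies as `context["row"]`.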
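
The renamed NAMED_SCRAPERS registry maps each builtin's name to its class, which is how the CLI resolves e.g. `minet scrape title`. The same lookup works from Python (a single scalar comes back here, since TitleScraper is singular with output_type "scalar"):

```python
from minet.scrape.classes import NAMED_SCRAPERS

scraper = NAMED_SCRAPERS["title"]()  # TitleScraper, fieldnames == ["title"]
html = "<html><head><title>Hello</title></head><body></body></html>"

print(scraper(html))  # should print the page title, i.e. "Hello"
```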
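
On the writing side, the CSV branch now serializes cell by cell with casanova's CSVSerializer rather than through a serializer owned by the scraper. The `writerow` logic above amounts to something like this sketch (values are illustrative; the exact serialized form of a list is an assumption based on plural_separator's role):

```python
from casanova import CSVSerializer

serializer = CSVSerializer(plural_separator="|")

fieldnames = ["title", "tags"]
item = {"title": "Hello", "tags": ["a", "b"]}

# One serialized cell per fieldname, in fieldname order.
cells = [serializer(item.get(f)) for f in fieldnames]
# expected: ["Hello", "a|b"]
```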