Upgrading lxml, bs4, trafilatura

Fix #985
medialab · Sep 6, 2024 · b464bfd
1 parent 6ba46a1
commit b464bfd
Show file tree

Hide file tree

Showing 11 changed files with 35 additions and 34 deletions.
diff --git a/docs/cli.md b/docs/cli.md
@@ -867,8 +867,8 @@ Optional Arguments:
                                 Defaults to `fetch_error`.
   -g, --glob                    Will interpret given paths as glob patterns to
                                 resolve if given.
-  -I, --input-dir INPUT_DIR     Directory where the HTML files are stored.
-                                Defaults to `downloaded`.
+  -I, --input-dir INPUT_DIR     Directory where the HTML files are stored. Will
+                                default to fetch default output directory.
   --mimetype-column MIMETYPE_COLUMN
                                 Name of the CSV column containing file mimetype.
                                 Defaults to `mimetype`.
@@ -1197,8 +1197,8 @@ Optional Arguments:
                                 Output format. Defaults to `csv`.
   -g, --glob                    Will interpret given paths as glob patterns to
                                 resolve if given.
-  -I, --input-dir INPUT_DIR     Directory where the HTML files are stored.
-                                Defaults to `downloaded`.
+  -I, --input-dir INPUT_DIR     Directory where the HTML files are stored. Will
+                                default to fetch default output directory.
   --mimetype-column MIMETYPE_COLUMN
                                 Name of the CSV column containing file mimetype.
                                 Defaults to `mimetype`.

diff --git a/hooks/hook-minet.cli.py b/hooks/hook-minet.cli.py
@@ -30,8 +30,9 @@
     (join(dirname(trafilatura.__file__), "settings.cfg"), "trafilatura"),
 ]
 
-for p in iglob(join(dirname(trafilatura.__file__), "data", "*.lzma")):
-    datas.append((p, "trafilatura/data"))
+# NOTE: I don't think we need DTD files for our purpose
+# for p in iglob(join(dirname(trafilatura.__file__), "data", "*.dtd")):
+#     datas.append((p, "trafilatura/data"))
 
 for p in iglob(join(dirname(playwright_stealth.__file__), "js", "*.js")):
     datas.append((p, "playwright_stealth/js"))

diff --git a/minet/cli/extract/__init__.py b/minet/cli/extract/__init__.py
@@ -1,7 +1,6 @@
 from casanova import IndexedResumer
 
 from minet.cli.argparse import command
-from minet.cli.constants import DEFAULT_CONTENT_FOLDER
 
 
 def resolve_arguments(cli_args):
@@ -101,8 +100,7 @@ def resolve_arguments(cli_args):
         },
         {
             "flags": ["-I", "--input-dir"],
-            "help": "Directory where the HTML files are stored.",
-            "default": DEFAULT_CONTENT_FOLDER
+            "help": "Directory where the HTML files are stored. Will default to fetch default output directory.",
         },
         {
             "flags": ["-p", "--processes"],

diff --git a/minet/cli/scrape/__init__.py b/minet/cli/scrape/__init__.py
@@ -1,5 +1,4 @@
 from minet.cli.argparse import command
-from minet.cli.constants import DEFAULT_CONTENT_FOLDER
 
 
 def resolve_arguments(cli_args):
@@ -126,8 +125,7 @@ def resolve_arguments(cli_args):
         },
         {
             "flags": ["-I", "--input-dir"],
-            "help": "Directory where the HTML files are stored.",
-            "default": DEFAULT_CONTENT_FOLDER
+            "help": "Directory where the HTML files are stored. Will default to fetch default output directory.",
         },
         {
             "flags": ["-p", "--processes"],

diff --git a/minet/cli/utils.py b/minet/cli/utils.py
@@ -26,6 +26,7 @@
 
 from minet.crawl import CrawlerState
 from minet.encodings import is_supported_encoding
+from minet.cli.constants import DEFAULT_CONTENT_FOLDER
 from minet.cli.console import console
 from minet.cli.loading_bar import LoadingBar, StatsItem
 from minet.cli.exceptions import FatalError
@@ -189,7 +190,10 @@ def create_fetch_like_report_iterator(
     cli_args: SimpleNamespace, reader: casanova.Reader
 ) -> Iterator[FetchReportLikeItem]:
     headers = reader.headers
-    input_dir = cli_args.input_dir or ""
+    input_dir = cli_args.input_dir
+
+    if input_dir is None:
+        input_dir = DEFAULT_CONTENT_FOLDER
 
     # TODO: deal with no_headers
     assert headers is not None

diff --git a/minet/extraction.py b/minet/extraction.py
@@ -2,16 +2,7 @@
 
 from dataclasses import dataclass, field
 from casanova import TabularRecord
-
-try:
-    from trafilatura.core import bare_extraction
-except ModuleNotFoundError as e:
-    if "lzma" in str(e):
-        raise ImportError(
-            "cannot import trafilatura because your version of python was not compiled with lzma.\nSee https://stackoverflow.com/questions/57743230/userwarning-could-not-import-the-lzma-module-your-installed-python-is-incomple for potential solutions."
-        )
-
-    raise
+from trafilatura.core import bare_extraction
 
 from minet.exceptions import TrafilaturaError
 from minet.encodings import fix_surrogates
@@ -97,7 +88,7 @@ def extract(text: str) -> Optional[TrafilaturaResult]:
     # Attempting extraction
     try:
         # https://trafilatura.readthedocs.io/en/latest/corefunctions.html
-        trafilatura_bare_result = bare_extraction(text)
+        trafilatura_bare_result = bare_extraction(text, with_metadata=True)
     except Exception as e:
         raise TrafilaturaError(reason=e)
 

diff --git a/minet/scrape/classes/definition.py b/minet/scrape/classes/definition.py
@@ -16,7 +16,7 @@
 def scrape(
     scraper: Dict,
     html: AnyScrapableTarget,
-    engine: str = "lxml",
+    engine: str = "html.parser",
     context: Optional[Dict] = None,
     strainer: Optional[SoupStrainer] = None,
 ):

diff --git a/minet/scrape/soup.py b/minet/scrape/soup.py
@@ -23,6 +23,10 @@ class ExtractionError(Exception):
     pass
 
 
+def normalize_css(css: str) -> str:
+    return css.replace(":contains(", ":-soup-contains(")
+
+
 def extract(elem: "MinetTag", target: Optional[str]) -> Optional[str]:
     if target is None or target == "text":
         return elem.get_text()
@@ -41,6 +45,8 @@ def extract(elem: "MinetTag", target: Optional[str]) -> Optional[str]:
 
 class MinetTag(Tag):
     def force_select_one(self, css: str, *args, **kwargs) -> "MinetTag":
+        css = normalize_css(css)
+
         elem = super().select_one(css, *args, **kwargs)
 
         if elem is None:
@@ -49,11 +55,11 @@ def force_select_one(self, css: str, *args, **kwargs) -> "MinetTag":
         return cast(MinetTag, elem)
 
     def select_one(self, css: str, *args, **kwargs) -> Optional["MinetTag"]:
+        css = normalize_css(css)
         return cast(Optional["MinetTag"], super().select_one(css, *args, **kwargs))
 
     def select(self, css: str, *args, **kwargs) -> List["MinetTag"]:
-        css = css.replace(":contains(", ":-soup-contains(")
-
+        css = normalize_css(css)
         return cast(List["MinetTag"], super().select(css, *args, **kwargs))
 
     def force_scrape_one(self, css: str, target: Optional[str] = None) -> str:
@@ -136,7 +142,7 @@ class WonderfulSoup(BeautifulSoup, MinetTag):
     def __init__(
         self,
         markup: str,
-        features: str = "lxml",
+        features: str = "html.parser",
         parse_only: Optional[SoupStrainer] = None,
     ) -> None:
         super().__init__(

diff --git a/requirements.txt b/requirements.txt
@@ -1,22 +1,23 @@
 # Dev dependencies
 ipywidgets
 jupyterlab
-PyInstaller==6.6.0
+PyInstaller==6.10.0
 pytest==7.2.1
 ruff
 twine
 wheel
 
 # Dependencies
 about-time==4.2.1
-beautifulsoup4==4.11.1
+beautifulsoup4==4.12.3
 browser-cookie3==0.19.1
 casanova==2.0.1
 charset-normalizer==3.3.2
 dateparser==1.1.6
 ebbe==1.13.2
 json5==0.9.11
-lxml>=4.9.2,<5.2
+lxml == 4.9.2; platform_system == 'Darwin' and python_version <= '3.8'
+lxml >= 5.2.2; platform_system != 'Darwin' or python_version > '3.8'
 nanoid==2.0.0
 playwright==1.46.0
 playwright-stealth==1.0.6
@@ -26,7 +27,7 @@ rich==13.8.0
 rich-argparse==1.5.2
 soupsieve>=2.1,<3
 tenacity==8.2.1
-trafilatura==1.8.1
+trafilatura==1.12.1
 typing_extensions>=4.3; python_version < '3.11'
 twitwi==0.19.2
 ural==1.3.2

diff --git a/setup.py b/setup.py
@@ -33,7 +33,8 @@
         "dateparser>=1.1.1",
         "ebbe>=1.13,<2",
         "json5>=0.8.5",
-        "lxml>=4.3.0,<5.2",
+        "lxml == 4.9.2; platform_system == 'Darwin' and python_version <= '3.8'",
+        "lxml >= 5.2.2; platform_system != 'Darwin' or python_version > '3.8'",
         "nanoid>=2,<3",
         "playwright>=1.46,<1.47",
         "playwright_stealth>=1.0.6,<2",
@@ -43,7 +44,7 @@
         "rich-argparse>=1,<2",
         "soupsieve>=2.1,<3",
         "tenacity>=8,<9",
-        "trafilatura>=1.8.1,<1.9",
+        "trafilatura>=1.12.1,<1.13",
         "twitwi>=0.19.2,<0.20",
         "ural>=1.3.2,<2",
         "urllib3>=1.26.16,<2",

diff --git a/test/scraper_test.py b/test/scraper_test.py
@@ -163,6 +163,7 @@
 """
 
 
+@pytest.mark.filterwarnings("ignore")
 class TestDefinitionScraper:
     def test_basics(self):
         result = scrape({"iterator": "li"}, BASIC_HTML)