From b464bfd9e170bb2b3cda2e94ce630c50ebcd04c9 Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Fri, 6 Sep 2024 11:30:46 +0200 Subject: [PATCH] Upgrading lxml, bs4, trafilatura Fix #985 --- docs/cli.md | 8 ++++---- hooks/hook-minet.cli.py | 5 +++-- minet/cli/extract/__init__.py | 4 +--- minet/cli/scrape/__init__.py | 4 +--- minet/cli/utils.py | 6 +++++- minet/extraction.py | 13 ++----------- minet/scrape/classes/definition.py | 2 +- minet/scrape/soup.py | 12 +++++++++--- requirements.txt | 9 +++++---- setup.py | 5 +++-- test/scraper_test.py | 1 + 11 files changed, 35 insertions(+), 34 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 02ac98050c..f4f4fae318 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -867,8 +867,8 @@ Optional Arguments: Defaults to `fetch_error`. -g, --glob Will interpret given paths as glob patterns to resolve if given. - -I, --input-dir INPUT_DIR Directory where the HTML files are stored. - Defaults to `downloaded`. + -I, --input-dir INPUT_DIR Directory where the HTML files are stored. Will + default to fetch default output directory. --mimetype-column MIMETYPE_COLUMN Name of the CSV column containing file mimetype. Defaults to `mimetype`. @@ -1197,8 +1197,8 @@ Optional Arguments: Output format. Defaults to `csv`. -g, --glob Will interpret given paths as glob patterns to resolve if given. - -I, --input-dir INPUT_DIR Directory where the HTML files are stored. - Defaults to `downloaded`. + -I, --input-dir INPUT_DIR Directory where the HTML files are stored. Will + default to fetch default output directory. --mimetype-column MIMETYPE_COLUMN Name of the CSV column containing file mimetype. Defaults to `mimetype`. diff --git a/hooks/hook-minet.cli.py b/hooks/hook-minet.cli.py index 8c81153d1a..156cafabfc 100644 --- a/hooks/hook-minet.cli.py +++ b/hooks/hook-minet.cli.py @@ -30,8 +30,9 @@ (join(dirname(trafilatura.__file__), "settings.cfg"), "trafilatura"), ] -for p in iglob(join(dirname(trafilatura.__file__), "data", "*.lzma")): - datas.append((p, "trafilatura/data")) +# NOTE: I don't think we need DTD files for our purpose +# for p in iglob(join(dirname(trafilatura.__file__), "data", "*.dtd")): +# datas.append((p, "trafilatura/data")) for p in iglob(join(dirname(playwright_stealth.__file__), "js", "*.js")): datas.append((p, "playwright_stealth/js")) diff --git a/minet/cli/extract/__init__.py b/minet/cli/extract/__init__.py index 65ff2b2cad..91df1d35e6 100644 --- a/minet/cli/extract/__init__.py +++ b/minet/cli/extract/__init__.py @@ -1,7 +1,6 @@ from casanova import IndexedResumer from minet.cli.argparse import command -from minet.cli.constants import DEFAULT_CONTENT_FOLDER def resolve_arguments(cli_args): @@ -101,8 +100,7 @@ def resolve_arguments(cli_args): }, { "flags": ["-I", "--input-dir"], - "help": "Directory where the HTML files are stored.", - "default": DEFAULT_CONTENT_FOLDER + "help": "Directory where the HTML files are stored. Will default to fetch default output directory.", }, { "flags": ["-p", "--processes"], diff --git a/minet/cli/scrape/__init__.py b/minet/cli/scrape/__init__.py index 7a746fe259..4564ab2874 100644 --- a/minet/cli/scrape/__init__.py +++ b/minet/cli/scrape/__init__.py @@ -1,5 +1,4 @@ from minet.cli.argparse import command -from minet.cli.constants import DEFAULT_CONTENT_FOLDER def resolve_arguments(cli_args): @@ -126,8 +125,7 @@ def resolve_arguments(cli_args): }, { "flags": ["-I", "--input-dir"], - "help": "Directory where the HTML files are stored.", - "default": DEFAULT_CONTENT_FOLDER + "help": "Directory where the HTML files are stored. Will default to fetch default output directory.", }, { "flags": ["-p", "--processes"], diff --git a/minet/cli/utils.py b/minet/cli/utils.py index cdce031925..7e864f55c3 100644 --- a/minet/cli/utils.py +++ b/minet/cli/utils.py @@ -26,6 +26,7 @@ from minet.crawl import CrawlerState from minet.encodings import is_supported_encoding +from minet.cli.constants import DEFAULT_CONTENT_FOLDER from minet.cli.console import console from minet.cli.loading_bar import LoadingBar, StatsItem from minet.cli.exceptions import FatalError @@ -189,7 +190,10 @@ def create_fetch_like_report_iterator( cli_args: SimpleNamespace, reader: casanova.Reader ) -> Iterator[FetchReportLikeItem]: headers = reader.headers - input_dir = cli_args.input_dir or "" + input_dir = cli_args.input_dir + + if input_dir is None: + input_dir = DEFAULT_CONTENT_FOLDER # TODO: deal with no_headers assert headers is not None diff --git a/minet/extraction.py b/minet/extraction.py index 516f4ab82a..72d9889ab7 100644 --- a/minet/extraction.py +++ b/minet/extraction.py @@ -2,16 +2,7 @@ from dataclasses import dataclass, field from casanova import TabularRecord - -try: - from trafilatura.core import bare_extraction -except ModuleNotFoundError as e: - if "lzma" in str(e): - raise ImportError( - "cannot import trafilatura because your version of python was not compiled with lzma.\nSee https://stackoverflow.com/questions/57743230/userwarning-could-not-import-the-lzma-module-your-installed-python-is-incomple for potential solutions." - ) - - raise +from trafilatura.core import bare_extraction from minet.exceptions import TrafilaturaError from minet.encodings import fix_surrogates @@ -97,7 +88,7 @@ def extract(text: str) -> Optional[TrafilaturaResult]: # Attempting extraction try: # https://trafilatura.readthedocs.io/en/latest/corefunctions.html - trafilatura_bare_result = bare_extraction(text) + trafilatura_bare_result = bare_extraction(text, with_metadata=True) except Exception as e: raise TrafilaturaError(reason=e) diff --git a/minet/scrape/classes/definition.py b/minet/scrape/classes/definition.py index 6ee4687e2a..cc3fabce2c 100644 --- a/minet/scrape/classes/definition.py +++ b/minet/scrape/classes/definition.py @@ -16,7 +16,7 @@ def scrape( scraper: Dict, html: AnyScrapableTarget, - engine: str = "lxml", + engine: str = "html.parser", context: Optional[Dict] = None, strainer: Optional[SoupStrainer] = None, ): diff --git a/minet/scrape/soup.py b/minet/scrape/soup.py index d035495719..2d53f1c9c0 100644 --- a/minet/scrape/soup.py +++ b/minet/scrape/soup.py @@ -23,6 +23,10 @@ class ExtractionError(Exception): pass +def normalize_css(css: str) -> str: + return css.replace(":contains(", ":-soup-contains(") + + def extract(elem: "MinetTag", target: Optional[str]) -> Optional[str]: if target is None or target == "text": return elem.get_text() @@ -41,6 +45,8 @@ def extract(elem: "MinetTag", target: Optional[str]) -> Optional[str]: class MinetTag(Tag): def force_select_one(self, css: str, *args, **kwargs) -> "MinetTag": + css = normalize_css(css) + elem = super().select_one(css, *args, **kwargs) if elem is None: @@ -49,11 +55,11 @@ def force_select_one(self, css: str, *args, **kwargs) -> "MinetTag": return cast(MinetTag, elem) def select_one(self, css: str, *args, **kwargs) -> Optional["MinetTag"]: + css = normalize_css(css) return cast(Optional["MinetTag"], super().select_one(css, *args, **kwargs)) def select(self, css: str, *args, **kwargs) -> List["MinetTag"]: - css = css.replace(":contains(", ":-soup-contains(") - + css = normalize_css(css) return cast(List["MinetTag"], super().select(css, *args, **kwargs)) def force_scrape_one(self, css: str, target: Optional[str] = None) -> str: @@ -136,7 +142,7 @@ class WonderfulSoup(BeautifulSoup, MinetTag): def __init__( self, markup: str, - features: str = "lxml", + features: str = "html.parser", parse_only: Optional[SoupStrainer] = None, ) -> None: super().__init__( diff --git a/requirements.txt b/requirements.txt index 35a5f0b08c..b087c07e03 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Dev dependencies ipywidgets jupyterlab -PyInstaller==6.6.0 +PyInstaller==6.10.0 pytest==7.2.1 ruff twine @@ -9,14 +9,15 @@ wheel # Dependencies about-time==4.2.1 -beautifulsoup4==4.11.1 +beautifulsoup4==4.12.3 browser-cookie3==0.19.1 casanova==2.0.1 charset-normalizer==3.3.2 dateparser==1.1.6 ebbe==1.13.2 json5==0.9.11 -lxml>=4.9.2,<5.2 +lxml == 4.9.2; platform_system == 'Darwin' and python_version <= '3.8' +lxml >= 5.2.2; platform_system != 'Darwin' or python_version > '3.8' nanoid==2.0.0 playwright==1.46.0 playwright-stealth==1.0.6 @@ -26,7 +27,7 @@ rich==13.8.0 rich-argparse==1.5.2 soupsieve>=2.1,<3 tenacity==8.2.1 -trafilatura==1.8.1 +trafilatura==1.12.1 typing_extensions>=4.3; python_version < '3.11' twitwi==0.19.2 ural==1.3.2 diff --git a/setup.py b/setup.py index 744f1e917f..6c4c8293f7 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,8 @@ "dateparser>=1.1.1", "ebbe>=1.13,<2", "json5>=0.8.5", - "lxml>=4.3.0,<5.2", + "lxml == 4.9.2; platform_system == 'Darwin' and python_version <= '3.8'", + "lxml >= 5.2.2; platform_system != 'Darwin' or python_version > '3.8'", "nanoid>=2,<3", "playwright>=1.46,<1.47", "playwright_stealth>=1.0.6,<2", @@ -43,7 +44,7 @@ "rich-argparse>=1,<2", "soupsieve>=2.1,<3", "tenacity>=8,<9", - "trafilatura>=1.8.1,<1.9", + "trafilatura>=1.12.1,<1.13", "twitwi>=0.19.2,<0.20", "ural>=1.3.2,<2", "urllib3>=1.26.16,<2", diff --git a/test/scraper_test.py b/test/scraper_test.py index 2d8ae6c40f..11fb816715 100644 --- a/test/scraper_test.py +++ b/test/scraper_test.py @@ -163,6 +163,7 @@ """ +@pytest.mark.filterwarnings("ignore") class TestDefinitionScraper: def test_basics(self): result = scrape({"iterator": "li"}, BASIC_HTML)