Skip to content

Commit

Permalink
Upgrading lxml, bs4, trafilatura
Browse files Browse the repository at this point in the history
Fix #985
  • Loading branch information
Yomguithereal committed Sep 6, 2024
1 parent 6ba46a1 commit b464bfd
Show file tree
Hide file tree
Showing 11 changed files with 35 additions and 34 deletions.
8 changes: 4 additions & 4 deletions docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -867,8 +867,8 @@ Optional Arguments:
Defaults to `fetch_error`.
-g, --glob Will interpret given paths as glob patterns to
resolve if given.
-I, --input-dir INPUT_DIR Directory where the HTML files are stored.
Defaults to `downloaded`.
-I, --input-dir INPUT_DIR Directory where the HTML files are stored. Will
default to fetch default output directory.
--mimetype-column MIMETYPE_COLUMN
Name of the CSV column containing file mimetype.
Defaults to `mimetype`.
Expand Down Expand Up @@ -1197,8 +1197,8 @@ Optional Arguments:
Output format. Defaults to `csv`.
-g, --glob Will interpret given paths as glob patterns to
resolve if given.
-I, --input-dir INPUT_DIR Directory where the HTML files are stored.
Defaults to `downloaded`.
-I, --input-dir INPUT_DIR Directory where the HTML files are stored. Will
default to fetch default output directory.
--mimetype-column MIMETYPE_COLUMN
Name of the CSV column containing file mimetype.
Defaults to `mimetype`.
Expand Down
5 changes: 3 additions & 2 deletions hooks/hook-minet.cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@
(join(dirname(trafilatura.__file__), "settings.cfg"), "trafilatura"),
]

for p in iglob(join(dirname(trafilatura.__file__), "data", "*.lzma")):
datas.append((p, "trafilatura/data"))
# NOTE: I don't think we need DTD files for our purpose
# for p in iglob(join(dirname(trafilatura.__file__), "data", "*.dtd")):
# datas.append((p, "trafilatura/data"))

for p in iglob(join(dirname(playwright_stealth.__file__), "js", "*.js")):
datas.append((p, "playwright_stealth/js"))
Expand Down
4 changes: 1 addition & 3 deletions minet/cli/extract/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from casanova import IndexedResumer

from minet.cli.argparse import command
from minet.cli.constants import DEFAULT_CONTENT_FOLDER


def resolve_arguments(cli_args):
Expand Down Expand Up @@ -101,8 +100,7 @@ def resolve_arguments(cli_args):
},
{
"flags": ["-I", "--input-dir"],
"help": "Directory where the HTML files are stored.",
"default": DEFAULT_CONTENT_FOLDER
"help": "Directory where the HTML files are stored. Will default to fetch default output directory.",
},
{
"flags": ["-p", "--processes"],
Expand Down
4 changes: 1 addition & 3 deletions minet/cli/scrape/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from minet.cli.argparse import command
from minet.cli.constants import DEFAULT_CONTENT_FOLDER


def resolve_arguments(cli_args):
Expand Down Expand Up @@ -126,8 +125,7 @@ def resolve_arguments(cli_args):
},
{
"flags": ["-I", "--input-dir"],
"help": "Directory where the HTML files are stored.",
"default": DEFAULT_CONTENT_FOLDER
"help": "Directory where the HTML files are stored. Will default to fetch default output directory.",
},
{
"flags": ["-p", "--processes"],
Expand Down
6 changes: 5 additions & 1 deletion minet/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

from minet.crawl import CrawlerState
from minet.encodings import is_supported_encoding
from minet.cli.constants import DEFAULT_CONTENT_FOLDER
from minet.cli.console import console
from minet.cli.loading_bar import LoadingBar, StatsItem
from minet.cli.exceptions import FatalError
Expand Down Expand Up @@ -189,7 +190,10 @@ def create_fetch_like_report_iterator(
cli_args: SimpleNamespace, reader: casanova.Reader
) -> Iterator[FetchReportLikeItem]:
headers = reader.headers
input_dir = cli_args.input_dir or ""
input_dir = cli_args.input_dir

if input_dir is None:
input_dir = DEFAULT_CONTENT_FOLDER

# TODO: deal with no_headers
assert headers is not None
Expand Down
13 changes: 2 additions & 11 deletions minet/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,7 @@

from dataclasses import dataclass, field
from casanova import TabularRecord

try:
from trafilatura.core import bare_extraction
except ModuleNotFoundError as e:
if "lzma" in str(e):
raise ImportError(
"cannot import trafilatura because your version of python was not compiled with lzma.\nSee https://stackoverflow.com/questions/57743230/userwarning-could-not-import-the-lzma-module-your-installed-python-is-incomple for potential solutions."
)

raise
from trafilatura.core import bare_extraction

from minet.exceptions import TrafilaturaError
from minet.encodings import fix_surrogates
Expand Down Expand Up @@ -97,7 +88,7 @@ def extract(text: str) -> Optional[TrafilaturaResult]:
# Attempting extraction
try:
# https://trafilatura.readthedocs.io/en/latest/corefunctions.html
trafilatura_bare_result = bare_extraction(text)
trafilatura_bare_result = bare_extraction(text, with_metadata=True)
except Exception as e:
raise TrafilaturaError(reason=e)

Expand Down
2 changes: 1 addition & 1 deletion minet/scrape/classes/definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def scrape(
scraper: Dict,
html: AnyScrapableTarget,
engine: str = "lxml",
engine: str = "html.parser",
context: Optional[Dict] = None,
strainer: Optional[SoupStrainer] = None,
):
Expand Down
12 changes: 9 additions & 3 deletions minet/scrape/soup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ class ExtractionError(Exception):
pass


def normalize_css(css: str) -> str:
    """Return *css* with every `:contains(` rewritten as `:-soup-contains(`.

    Only the exact substring `:contains(` is substituted; the rest of the
    selector is returned unchanged.
    """
    legacy_prefix = ":contains("
    current_prefix = ":-soup-contains("

    # Splitting on the legacy spelling and re-joining with the current one
    # substitutes every non-overlapping occurrence, left to right.
    return current_prefix.join(css.split(legacy_prefix))


def extract(elem: "MinetTag", target: Optional[str]) -> Optional[str]:
if target is None or target == "text":
return elem.get_text()
Expand All @@ -41,6 +45,8 @@ def extract(elem: "MinetTag", target: Optional[str]) -> Optional[str]:

class MinetTag(Tag):
def force_select_one(self, css: str, *args, **kwargs) -> "MinetTag":
css = normalize_css(css)

elem = super().select_one(css, *args, **kwargs)

if elem is None:
Expand All @@ -49,11 +55,11 @@ def force_select_one(self, css: str, *args, **kwargs) -> "MinetTag":
return cast(MinetTag, elem)

def select_one(self, css: str, *args, **kwargs) -> Optional["MinetTag"]:
    """Return the result of the parent `select_one` for *css*.

    The selector is passed through `normalize_css` before being handed to
    the parent implementation; extra positional and keyword arguments are
    forwarded untouched. The result is only re-typed for the type checker.
    """
    match = super().select_one(normalize_css(css), *args, **kwargs)
    return cast(Optional["MinetTag"], match)

def select(self, css: str, *args, **kwargs) -> List["MinetTag"]:
css = css.replace(":contains(", ":-soup-contains(")

css = normalize_css(css)
return cast(List["MinetTag"], super().select(css, *args, **kwargs))

def force_scrape_one(self, css: str, target: Optional[str] = None) -> str:
Expand Down Expand Up @@ -136,7 +142,7 @@ class WonderfulSoup(BeautifulSoup, MinetTag):
def __init__(
self,
markup: str,
features: str = "lxml",
features: str = "html.parser",
parse_only: Optional[SoupStrainer] = None,
) -> None:
super().__init__(
Expand Down
9 changes: 5 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
# Dev dependencies
ipywidgets
jupyterlab
PyInstaller==6.6.0
PyInstaller==6.10.0
pytest==7.2.1
ruff
twine
wheel

# Dependencies
about-time==4.2.1
beautifulsoup4==4.11.1
beautifulsoup4==4.12.3
browser-cookie3==0.19.1
casanova==2.0.1
charset-normalizer==3.3.2
dateparser==1.1.6
ebbe==1.13.2
json5==0.9.11
lxml>=4.9.2,<5.2
lxml == 4.9.2; platform_system == 'Darwin' and python_version <= '3.8'
lxml >= 5.2.2; platform_system != 'Darwin' or python_version > '3.8'
nanoid==2.0.0
playwright==1.46.0
playwright-stealth==1.0.6
Expand All @@ -26,7 +27,7 @@ rich==13.8.0
rich-argparse==1.5.2
soupsieve>=2.1,<3
tenacity==8.2.1
trafilatura==1.8.1
trafilatura==1.12.1
typing_extensions>=4.3; python_version < '3.11'
twitwi==0.19.2
ural==1.3.2
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
"dateparser>=1.1.1",
"ebbe>=1.13,<2",
"json5>=0.8.5",
"lxml>=4.3.0,<5.2",
"lxml == 4.9.2; platform_system == 'Darwin' and python_version <= '3.8'",
"lxml >= 5.2.2; platform_system != 'Darwin' or python_version > '3.8'",
"nanoid>=2,<3",
"playwright>=1.46,<1.47",
"playwright_stealth>=1.0.6,<2",
Expand All @@ -43,7 +44,7 @@
"rich-argparse>=1,<2",
"soupsieve>=2.1,<3",
"tenacity>=8,<9",
"trafilatura>=1.8.1,<1.9",
"trafilatura>=1.12.1,<1.13",
"twitwi>=0.19.2,<0.20",
"ural>=1.3.2,<2",
"urllib3>=1.26.16,<2",
Expand Down
1 change: 1 addition & 0 deletions test/scraper_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@
"""


@pytest.mark.filterwarnings("ignore")
class TestDefinitionScraper:
def test_basics(self):
result = scrape({"iterator": "li"}, BASIC_HTML)
Expand Down

0 comments on commit b464bfd

Please sign in to comment.