Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework attributes #155

Merged
merged 24 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 35 additions & 14 deletions src/parser/html_parser/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Optional,
Tuple,
Type,
TypeVar,
)

import lxml.html
Expand All @@ -24,6 +25,8 @@
from src.parser.html_parser.data import LinkedDataMapping
from src.parser.html_parser.utility import get_meta_content

RegisteredFunctionT_co = TypeVar("RegisteredFunctionT_co", covariant=True, bound="RegisteredFunction")


class RegisteredFunction(ABC):
__wrapped__: Callable[[object], Any]
Expand All @@ -32,7 +35,7 @@ class RegisteredFunction(ABC):
__self__: Optional["BaseParser"]

# TODO: ensure uint for priority instead of int
def __init__(self, func: Callable[[object], Any], priority: Optional[int] = None):
def __init__(self, func: Callable[[object], Any], priority: Optional[int]):
self.__self__ = None
self.__func__ = func
self.__finite__: bool = False
Expand Down Expand Up @@ -69,18 +72,19 @@ def __repr__(self):


class Attribute(RegisteredFunction):
def __init__(self, func: Callable[[object], Any], priority: Optional[int] = None):
def __init__(self, func: Callable[[object], Any], priority: Optional[int], supported: bool):
self.supported = supported
super(Attribute, self).__init__(func=func, priority=priority)


class Function(RegisteredFunction):
def __init__(self, func: Callable[[object], Any], priority: Optional[int] = None):
def __init__(self, func: Callable[[object], Any], priority: Optional[int]):
super(Function, self).__init__(func=func, priority=priority)


def _register(cls, factory: Type[RegisteredFunction], priority):
def _register(cls, factory: Type[RegisteredFunction], **kwargs):
def wrapper(func):
return functools.update_wrapper(factory(func, priority), func)
return functools.update_wrapper(factory(func, **kwargs), func)

# _register was called with parenthesis
if cls is None:
Expand All @@ -90,16 +94,16 @@ def wrapper(func):
return wrapper(cls)


def attribute(cls=None, /, *, priority: Optional[int] = None):
return _register(cls, factory=Attribute, priority=priority)
def attribute(cls=None, /, *, priority: Optional[int] = None, supported: bool = True):
return _register(cls, factory=Attribute, priority=priority, supported=supported)


def function(cls=None, /, *, priority: Optional[int] = None):
    """Decorator registering a callable as a ``Function``.

    Delegates to ``_register`` with ``Function`` as the factory, so it can be
    applied bare (``@function``) or called with keyword arguments
    (``@function(priority=...)``).

    Args:
        cls: The decorated callable when used without parentheses, else ``None``.
        priority: Optional priority forwarded to the ``Function`` constructor.
    """
    return _register(cls, factory=Function, priority=priority)


class RegisteredFunctionCollection(Collection[RegisteredFunction]):
def __init__(self, *functions: RegisteredFunction):
class RegisteredFunctionCollection(Collection[RegisteredFunctionT_co]):
def __init__(self, *functions: RegisteredFunctionT_co):
self.functions = tuple(functions)

@property
Expand All @@ -109,12 +113,29 @@ def names(self) -> List[str]:
def __len__(self) -> int:
return len(self.functions)

def __iter__(self) -> Iterator[RegisteredFunction]:
def __iter__(self) -> Iterator[RegisteredFunctionT_co]:
return iter(self.functions)

def __contains__(self, item) -> bool:
return self.functions.__contains__(item)

def __eq__(self, other) -> bool:
MaxDall marked this conversation as resolved.
Show resolved Hide resolved
return self.functions == other.functions if isinstance(other, RegisteredFunctionCollection) else False


class AttributeCollection(RegisteredFunctionCollection[Attribute]):
    """Collection of ``Attribute`` objects that can be split by their ``supported`` flag."""

    @property
    def supported(self) -> List[Attribute]:
        """Return the contained attributes whose ``supported`` flag is truthy."""
        return [attr for attr in self.functions if attr.supported]

    @property
    def unsupported(self) -> List[Attribute]:
        """Return the contained attributes whose ``supported`` flag is falsy."""
        return [attr for attr in self.functions if not attr.supported]
dobbersc marked this conversation as resolved.
Show resolved Hide resolved


class FunctionCollection(RegisteredFunctionCollection[Function]):
    """Collection specialised to ``Function`` objects; adds no behaviour of its own."""


@dataclass
class Precomputed:
Expand All @@ -140,14 +161,14 @@ def _search_members(cls, obj_type: type) -> List[Tuple[str, Any]]:
return members

@classmethod
def attributes(cls) -> RegisteredFunctionCollection:
def attributes(cls) -> AttributeCollection:
attrs = [func for _, func in cls._search_members(Attribute) if func.__name__ not in ["__ld", "__meta"]]
return RegisteredFunctionCollection(*attrs)
return AttributeCollection(*attrs)

@classmethod
def functions(cls) -> RegisteredFunctionCollection:
def functions(cls) -> FunctionCollection:
funcs = [func for _, func in cls._search_members(Function)]
return RegisteredFunctionCollection(*funcs)
return FunctionCollection(*funcs)

@property
def cache(self) -> Optional[Dict[str, Any]]:
Expand Down
97 changes: 26 additions & 71 deletions src/scraping/article.py
Original file line number Diff line number Diff line change
@@ -1,91 +1,46 @@
import json
from abc import ABC
from dataclasses import InitVar, dataclass, field
from dataclasses import dataclass, field
from datetime import datetime
from textwrap import TextWrapper, dedent
from typing import Any, Callable, Dict, List, Optional
from typing import Any, Dict, List, Optional

import more_itertools
from colorama import Fore, Style

from src.parser.html_parser import ArticleBody, LinkedDataMapping
from src.parser.html_parser import ArticleBody
from src.scraping.source import ArticleSource


@dataclass(frozen=True)
class BaseArticle(ABC):
url: str
html: str
crawl_date: datetime
publisher: Optional[str] = None
crawler_ref: InitVar[object] = None

def __post_init__(self, crawler_ref: object):
object.__setattr__(self, "_crawler_ref", crawler_ref)
class Article:
source: ArticleSource
exception: Optional[Exception] = None

def serialize(self) -> Dict[str, Any]:
attrs = self.__dict__
attrs["crawler_ref"] = attrs.pop("_crawler_ref")
return attrs
# supported attributes as defined in the guidelines
title: Optional[str] = None
author: List[str] = field(default_factory=list)
body: Optional[ArticleBody] = None
publishing_date: Optional[datetime] = None
topics: List[str] = field(default_factory=list)

@classmethod
def deserialize(cls, serialized: Dict[str, Any]):
return cls(**serialized)

def pprint(
self,
indent: int = 4,
ensure_ascii: bool = False,
default: Callable[[Any], Any] = str,
exclude: Optional[List[str]] = None,
) -> str:
to_serialize: Dict[str, Any] = self.__dict__.copy()
if not exclude:
exclude = []
for key in exclude:
if not hasattr(self, key):
raise AttributeError(f"Tried to exclude key '{key} which isn't present in this'{self}' instance")
to_serialize.pop(key)
return json.dumps(to_serialize, indent=indent, ensure_ascii=ensure_ascii, default=default)


@dataclass(frozen=True)
class ArticleSource(BaseArticle):
pass

def from_extracted(cls, source: ArticleSource, extracted: Dict[str, Any], exception: Optional[Exception] = None):
dobbersc marked this conversation as resolved.
Show resolved Hide resolved
MaxDall marked this conversation as resolved.
Show resolved Hide resolved
unsupported, supported = more_itertools.partition(
lambda view: view[0] in cls.__annotations__, extracted.items()
)

@dataclass(frozen=True)
class Article(BaseArticle):
extracted: Dict[str, Any] = field(default_factory=dict)
exception: Optional[Exception] = None
new = cls(source, exception, **dict(supported))
MaxDall marked this conversation as resolved.
Show resolved Hide resolved
for attr, value in unsupported:
object.__setattr__(new, attr, value)

@property
def complete(self) -> bool:
return all(not (isinstance(attr, Exception) or attr is None) for attr in self.extracted.values())
return new

# provide direct access for commonly used attributes in self.extracted
@property
def plaintext(self) -> Optional[str]:
body = self.body
return str(body) if body else None

@property
def title(self) -> Optional[str]:
return self.extracted.get("title") if self.extracted else None

@property
def body(self) -> Optional[ArticleBody]:
return self.extracted.get("body") if self.extracted else None

@property
def authors(self) -> List[str]:
return self.extracted.get("authors", []) if self.extracted else []

@property
def ld(self) -> Optional[LinkedDataMapping]:
return self.extracted.get("ld") if self.extracted else None

@property
def meta(self) -> Optional[Dict[str, Any]]:
return self.extracted.get("meta") if self.extracted else None
def __getattr__(self, item):
raise AttributeError(f"Article has no attribute '{item}'")
dobbersc marked this conversation as resolved.
Show resolved Hide resolved

def __str__(self):
dobbersc marked this conversation as resolved.
Show resolved Hide resolved
# the subsequent indent here is a bit wacky, but textwrapper.dedent won't work with tabs, so we have to use
Expand All @@ -103,8 +58,8 @@ def __str__(self):
f"Fundus-Article:"
f'\n- Title: "{wrapped_title}"'
f'\n- Text: "{wrapped_plaintext}"'
f"\n- URL: {self.url}"
f'\n- From: {self.publisher} ({self.crawl_date.strftime("%Y-%m-%d %H:%M")})'
f"\n- URL: {self.source.url}"
f'\n- From: {self.source.publisher} ({self.publishing_date.strftime("%Y-%m-%d %H:%M") if self.publishing_date else ""})'
)

return dedent(text)
4 changes: 2 additions & 2 deletions src/scraping/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ def scrape(self, error_handling: Literal["suppress", "catch", "raise"], batch_si
if error_handling == "raise":
raise err
elif error_handling == "catch":
yield Article(extracted={}, exception=err, **article_source.serialize())
yield Article(source=article_source, exception=err)
continue
elif error_handling == "suppress":
basic_logger.info(f"Skipped {article_source.url} because of: {err!r}")
continue
else:
raise ValueError(f"Unknown value '{error_handling}' for parameter <error_handling>'")

article = Article(extracted=data, **article_source.serialize())
article = Article.from_extracted(source=article_source, extracted=data)
yield article
11 changes: 10 additions & 1 deletion src/scraping/source.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import gzip
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from functools import cached_property
from multiprocessing.pool import ThreadPool
Expand All @@ -12,7 +13,15 @@
from requests import HTTPError

from src.logging.logger import basic_logger
from src.scraping.article import ArticleSource


@dataclass(frozen=True)
class ArticleSource:
    """Frozen dataclass bundling the URL, HTML, crawl date and optional publisher of one source."""

    url: str
    html: str
    crawl_date: datetime
    # Optional; defaults to None when the publisher is not given.
    publisher: Optional[str] = None
    # NOTE(review): presumably a back-reference to the crawler that produced this source — confirm.
    crawler_ref: object = None
dobbersc marked this conversation as resolved.
Show resolved Hide resolved


class Source(Iterable[str], ABC):
Expand Down
36 changes: 35 additions & 1 deletion tests/fixtures/fixture_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from src.parser.html_parser import BaseParser, attribute
from src.parser.html_parser import BaseParser, attribute, function


@pytest.fixture
Expand All @@ -11,6 +11,26 @@ class EmptyParser(BaseParser):
return EmptyParser


@pytest.fixture
def parser_with_static_method():
    """Provide a parser class whose only member is a plain, unregistered static method."""

    class ParserWithStaticMethod(BaseParser):
        @staticmethod
        def test():
            return "this is not an attribute"

    return ParserWithStaticMethod


@pytest.fixture
def parser_with_function_test():
    """Provide a parser class with a single method ``test`` registered via ``@function``."""

    class ParserWithFunctionTest(BaseParser):
        @function
        def test(self):
            pass

    return ParserWithFunctionTest


@pytest.fixture
def parser_with_attr_title():
class ParserWithAttrTitle(BaseParser):
Expand All @@ -19,3 +39,17 @@ def title(self) -> str:
return "This is a title"

return ParserWithAttrTitle


@pytest.fixture
def parser_with_supported_and_unsupported():
    """Provide a parser class with one default attribute and one declared ``supported=False``."""

    class ParserWithSupportedAndUnsupported(BaseParser):
        @attribute
        def supported(self):
            return "supported"

        @attribute(supported=False)
        def unsupported(self):
            return "unsupported"

    return ParserWithSupportedAndUnsupported
23 changes: 22 additions & 1 deletion tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from src.library.collection import PublisherCollection
from src.library.collection.base_objects import PublisherEnum
from src.parser.html_parser import BaseParser
from tests.resources import parse_annotations, parser_test_data_path


Expand Down Expand Up @@ -37,14 +38,34 @@ def load_data(publisher: PublisherEnum) -> Dict[str, Any]:
raise ValueError("Unknown json format")


class TestBaseParser:
    """Checks for the ``functions()`` / ``attributes()`` collections exposed by parser classes."""

    def test_functions_iter(self, parser_with_function_test, parser_with_static_method):
        """Only members registered via ``@function`` show up in ``functions()``."""
        assert len(BaseParser.functions()) == 0
        assert len(parser_with_static_method.functions()) == 0
        assert len(parser_with_function_test.functions()) == 1
        assert parser_with_function_test.functions().names == ["test"]

    def test_attributes_iter(self, parser_with_attr_title, parser_with_static_method):
        """A static method is not an attribute; the ``title`` attribute is listed by name."""
        assert len(BaseParser.attributes()) == 0
        assert len(parser_with_static_method.attributes()) == 0
        assert len(parser_with_attr_title.attributes()) == 1
        assert parser_with_attr_title.attributes().names == ["title"]

    def test_supported_unsupported(self, parser_with_supported_and_unsupported):
        """``supported`` / ``unsupported`` partition the attributes by their flag."""
        parser = parser_with_supported_and_unsupported
        assert len(parser.attributes()) == 2
        assert parser.attributes().supported == [parser.supported]
        assert parser.attributes().unsupported == [parser.unsupported]


@pytest.mark.parametrize(
"publisher", list(PublisherCollection), ids=[publisher.name for publisher in PublisherCollection]
)
class TestParser:
def test_annotations(self, publisher: PublisherEnum) -> None:
parser = publisher.parser
mapping = parse_annotations()
for attr in parser.attributes():
for attr in parser.attributes().supported:
if annotation := mapping[attr.__name__]:
assert (
attr.__annotations__.get("return") == annotation
Expand Down