flairNLP · dobbersc · Apr 12, 2023 · Apr 4, 2023 · Apr 4, 2023 · Apr 5, 2023
diff --git a/src/parser/html_parser/base_parser.py b/src/parser/html_parser/base_parser.py
@@ -16,6 +16,7 @@
     Optional,
     Tuple,
     Type,
+    TypeVar,
 )
 
 import lxml.html
@@ -24,6 +25,8 @@
 from src.parser.html_parser.data import LinkedDataMapping
 from src.parser.html_parser.utility import get_meta_content
 
+RegisteredFunctionT_co = TypeVar("RegisteredFunctionT_co", covariant=True, bound="RegisteredFunction")
+
 
 class RegisteredFunction(ABC):
     __wrapped__: Callable[[object], Any]
@@ -32,7 +35,7 @@ class RegisteredFunction(ABC):
     __self__: Optional["BaseParser"]
 
     # TODO: ensure uint for priority instead of int
-    def __init__(self, func: Callable[[object], Any], priority: Optional[int] = None):
+    def __init__(self, func: Callable[[object], Any], priority: Optional[int]):
         self.__self__ = None
         self.__func__ = func
         self.__finite__: bool = False
@@ -69,18 +72,19 @@ def __repr__(self):
 
 
 class Attribute(RegisteredFunction):
-    def __init__(self, func: Callable[[object], Any], priority: Optional[int] = None):
+    def __init__(self, func: Callable[[object], Any], priority: Optional[int], supported: bool):
+        self.supported = supported  # flag that AttributeCollection.supported / .unsupported filter on
         super(Attribute, self).__init__(func=func, priority=priority)
 
 
 class Function(RegisteredFunction):
-    def __init__(self, func: Callable[[object], Any], priority: Optional[int] = None):
+    def __init__(self, func: Callable[[object], Any], priority: Optional[int]):
         super(Function, self).__init__(func=func, priority=priority)
 
 
-def _register(cls, factory: Type[RegisteredFunction], priority):
+def _register(cls, factory: Type[RegisteredFunction], **kwargs):
     def wrapper(func):
-        return functools.update_wrapper(factory(func, priority), func)
+        return functools.update_wrapper(factory(func, **kwargs), func)
 
     # _register was called with parenthesis
     if cls is None:
@@ -90,16 +94,16 @@ def wrapper(func):
     return wrapper(cls)
 
 
-def attribute(cls=None, /, *, priority: Optional[int] = None):
-    return _register(cls, factory=Attribute, priority=priority)
+def attribute(cls=None, /, *, priority: Optional[int] = None, supported: bool = True):
+    return _register(cls, factory=Attribute, priority=priority, supported=supported)
 
 
 def function(cls=None, /, *, priority: Optional[int] = None):
     return _register(cls, factory=Function, priority=priority)
 
 
-class RegisteredFunctionCollection(Collection[RegisteredFunction]):
-    def __init__(self, *functions: RegisteredFunction):
+class RegisteredFunctionCollection(Collection[RegisteredFunctionT_co]):
+    def __init__(self, *functions: RegisteredFunctionT_co):
         self.functions = tuple(functions)
 
     @property
@@ -109,12 +113,43 @@ def names(self) -> List[str]:
     def __len__(self) -> int:
         return len(self.functions)
 
-    def __iter__(self) -> Iterator[RegisteredFunction]:
+    def __iter__(self) -> Iterator[RegisteredFunctionT_co]:
         return iter(self.functions)
 
     def __contains__(self, item) -> bool:
         return self.functions.__contains__(item)
 
+    def __eq__(self, other) -> bool:
+        # Collections are equal when they hold the same functions in the same order.
+        # Returning NotImplemented (instead of False) for foreign types lets Python try
+        # the reflected comparison before it falls back to an identity check.
+        if not isinstance(other, RegisteredFunctionCollection):
+            return NotImplemented
+        return self.functions == other.functions
+
+    def __hash__(self) -> int:
+        # Defining __eq__ sets __hash__ to None; hash the same tuple that __eq__ compares
+        # so that instances stay hashable and equal collections hash alike.
+        return hash(self.functions)
+
+
+class AttributeCollection(RegisteredFunctionCollection[Attribute]):
+    """Collection of Attribute objects that can be split by their ``supported`` flag."""
+
+    @property
+    def supported(self) -> List[Attribute]:
+        """Return the attributes whose ``supported`` flag is truthy, in stored order."""
+        return [attr for attr in self.functions if attr.supported]
+
+    @property
+    def unsupported(self) -> List[Attribute]:
+        """Return the attributes whose ``supported`` flag is falsy, in stored order."""
+        return [attr for attr in self.functions if not attr.supported]
+
+
+class FunctionCollection(RegisteredFunctionCollection[Function]):
+    """Collection of Function objects; adds nothing beyond the generic base class."""
+
 
 @dataclass
 class Precomputed:
@@ -140,14 +161,14 @@ def _search_members(cls, obj_type: type) -> List[Tuple[str, Any]]:
         return members
 
     @classmethod
-    def attributes(cls) -> RegisteredFunctionCollection:
+    def attributes(cls) -> AttributeCollection:
         attrs = [func for _, func in cls._search_members(Attribute) if func.__name__ not in ["__ld", "__meta"]]
-        return RegisteredFunctionCollection(*attrs)
+        return AttributeCollection(*attrs)
 
     @classmethod
-    def functions(cls) -> RegisteredFunctionCollection:
+    def functions(cls) -> FunctionCollection:
         funcs = [func for _, func in cls._search_members(Function)]
-        return RegisteredFunctionCollection(*funcs)
+        return FunctionCollection(*funcs)
 
     @property
     def cache(self) -> Optional[Dict[str, Any]]:

diff --git a/src/scraping/article.py b/src/scraping/article.py
@@ -1,91 +1,46 @@
-import json
-from abc import ABC
-from dataclasses import InitVar, dataclass, field
+from dataclasses import dataclass, field
 from datetime import datetime
 from textwrap import TextWrapper, dedent
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
+import more_itertools
 from colorama import Fore, Style
 
-from src.parser.html_parser import ArticleBody, LinkedDataMapping
+from src.parser.html_parser import ArticleBody
+from src.scraping.source import ArticleSource
 
 
 @dataclass(frozen=True)
-class BaseArticle(ABC):
-    url: str
-    html: str
-    crawl_date: datetime
-    publisher: Optional[str] = None
-    crawler_ref: InitVar[object] = None
-
-    def __post_init__(self, crawler_ref: object):
-        object.__setattr__(self, "_crawler_ref", crawler_ref)
+class Article:
+    source: ArticleSource
+    exception: Optional[Exception] = None
 
-    def serialize(self) -> Dict[str, Any]:
-        attrs = self.__dict__
-        attrs["crawler_ref"] = attrs.pop("_crawler_ref")
-        return attrs
+    # supported attributes as defined in the guidelines
+    title: Optional[str] = None
+    author: List[str] = field(default_factory=list)
+    body: Optional[ArticleBody] = None
+    publishing_date: Optional[datetime] = None
+    topics: List[str] = field(default_factory=list)
 
     @classmethod
-    def deserialize(cls, serialized: Dict[str, Any]):
-        return cls(**serialized)
-
-    def pprint(
-        self,
-        indent: int = 4,
-        ensure_ascii: bool = False,
-        default: Callable[[Any], Any] = str,
-        exclude: Optional[List[str]] = None,
-    ) -> str:
-        to_serialize: Dict[str, Any] = self.__dict__.copy()
-        if not exclude:
-            exclude = []
-        for key in exclude:
-            if not hasattr(self, key):
-                raise AttributeError(f"Tried to exclude key '{key} which isn't present in this'{self}' instance")
-            to_serialize.pop(key)
-        return json.dumps(to_serialize, indent=indent, ensure_ascii=ensure_ascii, default=default)
-
-
-@dataclass(frozen=True)
-class ArticleSource(BaseArticle):
-    pass
-
+    def from_extracted(cls, source: ArticleSource, extracted: Dict[str, Any], exception: Optional[Exception] = None):
+        unsupported, supported = more_itertools.partition(  # yields (predicate-false, predicate-true) items
+            lambda view: view[0] in cls.__annotations__, extracted.items()
+        )
 
-@dataclass(frozen=True)
-class Article(BaseArticle):
-    extracted: Dict[str, Any] = field(default_factory=dict)
-    exception: Optional[Exception] = None
+        new = cls(source, exception, **dict(supported))  # keys annotated on the class become constructor arguments
+        for attr, value in unsupported:
+            object.__setattr__(new, attr, value)  # the dataclass is frozen, so plain assignment would raise
 
-    @property
-    def complete(self) -> bool:
-        return all(not (isinstance(attr, Exception) or attr is None) for attr in self.extracted.values())
+        return new
 
-    # provide direct access for commonly used attributes in self.extracted
     @property
     def plaintext(self) -> Optional[str]:
         body = self.body
         return str(body) if body else None
 
-    @property
-    def title(self) -> Optional[str]:
-        return self.extracted.get("title") if self.extracted else None
-
-    @property
-    def body(self) -> Optional[ArticleBody]:
-        return self.extracted.get("body") if self.extracted else None
-
-    @property
-    def authors(self) -> List[str]:
-        return self.extracted.get("authors", []) if self.extracted else []
-
-    @property
-    def ld(self) -> Optional[LinkedDataMapping]:
-        return self.extracted.get("ld") if self.extracted else None
-
-    @property
-    def meta(self) -> Optional[Dict[str, Any]]:
-        return self.extracted.get("meta") if self.extracted else None
+    def __getattr__(self, item):
+        raise AttributeError(f"Article has no attribute '{item}'")
 
     def __str__(self):
         # the subsequent indent here is a bit wacky, but textwrapper.dedent won't work with tabs, so we have to use
@@ -103,8 +58,8 @@ def __str__(self):
             f"Fundus-Article:"
             f'\n- Title: "{wrapped_title}"'
             f'\n- Text:  "{wrapped_plaintext}"'
-            f"\n- URL:    {self.url}"
-            f'\n- From:   {self.publisher} ({self.crawl_date.strftime("%Y-%m-%d %H:%M")})'
+            f"\n- URL:    {self.source.url}"
+            f'\n- From:   {self.source.publisher} ({self.publishing_date.strftime("%Y-%m-%d %H:%M") if self.publishing_date else ""})'
         )
 
         return dedent(text)
diff --git a/src/scraping/scraper.py b/src/scraping/scraper.py
@@ -21,13 +21,13 @@ def scrape(self, error_handling: Literal["suppress", "catch", "raise"], batch_si
                     if error_handling == "raise":
                         raise err
                     elif error_handling == "catch":
-                        yield Article(extracted={}, exception=err, **article_source.serialize())
+                        yield Article(source=article_source, exception=err)
                         continue
                     elif error_handling == "suppress":
                         basic_logger.info(f"Skipped {article_source.url} because of: {err!r}")
                         continue
                     else:
-                        raise ValueError(f"Unknown value '{error_handling}' for parameter <error_handling>'")
+                        raise ValueError(f"Unknown value '{error_handling}' for parameter <error_handling>")
 
-                article = Article(extracted=data, **article_source.serialize())
+                article = Article.from_extracted(source=article_source, extracted=data)
                 yield article
diff --git a/src/scraping/source.py b/src/scraping/source.py
@@ -1,5 +1,6 @@
 import gzip
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from datetime import datetime
 from functools import cached_property
 from multiprocessing.pool import ThreadPool
@@ -12,7 +13,15 @@
 from requests import HTTPError
 
 from src.logging.logger import basic_logger
-from src.scraping.article import ArticleSource
+
+
+@dataclass(frozen=True)  # frozen: fields cannot be reassigned after construction
+class ArticleSource:
+    url: str  # logged by the scraper when an article is skipped; shown in Article.__str__
+    html: str  # presumably the raw HTML behind `url` — confirm against the crawler
+    crawl_date: datetime  # presumably the time the page was fetched — confirm
+    publisher: Optional[str] = None  # optional; shown in the "From" line of Article.__str__
+    crawler_ref: object = None  # opaque reference; NOTE(review): its purpose is not visible here
 
 
 class Source(Iterable[str], ABC):

diff --git a/tests/fixtures/fixture_parser.py b/tests/fixtures/fixture_parser.py
@@ -1,6 +1,6 @@
 import pytest
 
-from src.parser.html_parser import BaseParser, attribute
+from src.parser.html_parser import BaseParser, attribute, function
 
 
 @pytest.fixture
@@ -11,6 +11,26 @@ class EmptyParser(BaseParser):
     return EmptyParser
 
 
+@pytest.fixture
+def parser_with_static_method():
+    class ParserWithStaticMethod(BaseParser):
+        @staticmethod
+        def test():
+            return "this is not an attribute"
+
+    return ParserWithStaticMethod
+
+
+@pytest.fixture
+def parser_with_function_test():
+    class ParserWithFunctionTest(BaseParser):
+        @function
+        def test(self):
+            pass
+
+    return ParserWithFunctionTest
+
+
 @pytest.fixture
 def parser_with_attr_title():
     class ParserWithAttrTitle(BaseParser):
@@ -19,3 +39,17 @@ def title(self) -> str:
             return "This is a title"
 
     return ParserWithAttrTitle
+
+
+@pytest.fixture
+def parser_with_supported_and_unsupported():
+    class ParserWithSupportedAndUnsupported(BaseParser):
+        @attribute
+        def supported(self):
+            return "supported"
+
+        @attribute(supported=False)
+        def unsupported(self):
+            return "unsupported"
+
+    return ParserWithSupportedAndUnsupported
diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -8,6 +8,7 @@
 
 from src.library.collection import PublisherCollection
 from src.library.collection.base_objects import PublisherEnum
+from src.parser.html_parser import BaseParser
 from tests.resources import parse_annotations, parser_test_data_path
 
 
@@ -37,14 +38,34 @@ def load_data(publisher: PublisherEnum) -> Dict[str, Any]:
         raise ValueError("Unknown json format")
 
 
+class TestBaseParser:
+    def test_functions_iter(self, parser_with_function_test, parser_with_static_method):
+        assert len(BaseParser.functions()) == 0
+        assert len(parser_with_static_method.functions()) == 0
+        assert len(parser_with_function_test.functions()) == 1
+        assert parser_with_function_test.functions().names == ["test"]
+
+    def test_attributes_iter(self, parser_with_attr_title, parser_with_static_method):
+        assert len(BaseParser.attributes()) == 0
+        assert len(parser_with_static_method.attributes()) == 0
+        assert len(parser_with_attr_title.attributes()) == 1
+        assert parser_with_attr_title.attributes().names == ["title"]
+
+    def test_supported_unsupported(self, parser_with_supported_and_unsupported):
+        parser = parser_with_supported_and_unsupported
+        assert len(parser.attributes()) == 2
+        assert parser.attributes().supported == [parser.supported]
+        assert parser.attributes().unsupported == [parser.unsupported]
+
+
 @pytest.mark.parametrize(
     "publisher", list(PublisherCollection), ids=[publisher.name for publisher in PublisherCollection]
 )
 class TestParser:
     def test_annotations(self, publisher: PublisherEnum) -> None:
         parser = publisher.parser
         mapping = parse_annotations()
-        for attr in parser.attributes():
+        for attr in parser.attributes().supported:
             if annotation := mapping[attr.__name__]:
                 assert (
                     attr.__annotations__.get("return") == annotation