diff --git a/scripts/publisher_coverage.py b/scripts/publisher_coverage.py
index 6cc561fe..af033b0c 100644
--- a/scripts/publisher_coverage.py
+++ b/scripts/publisher_coverage.py
@@ -9,10 +9,11 @@
 from enum import EnumMeta
 from typing import List, Optional, cast
 
-from fundus import Crawler, NewsMap, PublisherCollection, RSSFeed
+from fundus import Crawler, PublisherCollection
 from fundus.publishers.base_objects import PublisherEnum
 from fundus.scraping.article import Article
 from fundus.scraping.filter import RequiresAll
+from scripts.utility import timeout
 
 
 def main() -> None:
@@ -31,23 +32,39 @@ def main() -> None:
         ):
             publisher_name: str = publisher.name  # type: ignore[attr-defined]
 
-            if not (publisher.source_mapping[RSSFeed] or publisher.source_mapping[NewsMap]):  # type: ignore[attr-defined]
-                # skip publishers providing no NewsMap or RSSFeed
-                print(f"⏩ SKIPPED: {publisher_name!r} - NO NewsMap or RSSFeed found")
+            if not any(publisher.source_mapping.values()):  # type: ignore[attr-defined]
+                # skip publishers providing no sources for forward crawling
+                print(f"⏩ SKIPPED: {publisher_name!r} - No sources defined")
                 continue
 
-            crawler: Crawler = Crawler(publisher, restrict_sources_to=[NewsMap, RSSFeed])
-            complete_article: Optional[Article] = next(
-                crawler.crawl(max_articles=1, only_complete=RequiresAll(), error_handling="catch"), None
+            crawler: Crawler = Crawler(publisher, delay=0.4)
+
+            timed_next = timeout(next, time=20, silent=True)
+
+            complete_article: Optional[Article] = timed_next(  # type: ignore[call-arg]
+                crawler.crawl(max_articles=1, only_complete=RequiresAll(), error_handling="suppress"), None
             )
 
             if complete_article is None:
-                incomplete_article: Optional[Article] = next(
-                    crawler.crawl(max_articles=1, only_complete=False, error_handling="suppress"), None
+                incomplete_article: Optional[Article] = timed_next(  # type: ignore[call-arg]
+                    crawler.crawl(max_articles=1, only_complete=False, error_handling="catch"), None
                 )
 
                 if incomplete_article is None:
                     print(f"❌ FAILED: {publisher_name!r} - No articles received")
+
+                elif incomplete_article.exception is not None:
+                    print(
+                        f"❌ FAILED: {publisher_name!r} - Encountered exception during crawling "
+                        f"(URL: {incomplete_article.html.requested_url})"
+                    )
+                    traceback.print_exception(
+                        etype=type(incomplete_article.exception),
+                        value=incomplete_article.exception,
+                        tb=incomplete_article.exception.__traceback__,
+                        file=sys.stdout,
+                    )
+
                 else:
                     print(
                         f"❌ FAILED: {publisher_name!r} - No complete articles received "
@@ -56,21 +73,6 @@ def main() -> None:
                 failed += 1
                 continue
 
-            if complete_article.exception is not None:
-                print(
-                    f"❌ FAILED: {publisher_name!r} - Encountered exception during crawling "
-                    f"(URL: {complete_article.html.requested_url})"
-                )
-                traceback.print_exception(
-                    etype=type(complete_article.exception),
-                    value=complete_article.exception,
-                    tb=complete_article.exception.__traceback__,
-                    file=sys.stdout,
-                )
-
-                failed += 1
-                continue
-
             print(f"✔️ PASSED: {publisher_name!r}")
 
     print()
diff --git a/scripts/utility.py b/scripts/utility.py
new file mode 100644
index 00000000..6e7a83a8
--- /dev/null
+++ b/scripts/utility.py
@@ -0,0 +1,44 @@
+import _thread as thread
+import threading
+from functools import wraps
+from typing import Callable, Literal, Optional, TypeVar, overload
+
+from typing_extensions import ParamSpec
+
+P = ParamSpec("P")
+T = TypeVar("T")
+
+
+def _interrupt_handler() -> None:
+    thread.interrupt_main()
+
+
+@overload
+def timeout(func: Callable[P, T], time: float, silent: Literal[False] = ...) -> Callable[P, T]:
+    ...
+
+
+@overload
+def timeout(func: Callable[P, T], time: float, silent: Literal[True]) -> Callable[P, Optional[T]]:
+    ...
+
+
+def timeout(func: Callable[P, T], time: float, silent: bool = False) -> Callable[P, Optional[T]]:
+    @wraps(func)
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> Optional[T]:
+        # register interrupt handler
+        timer = threading.Timer(time, _interrupt_handler)
+
+        try:
+            timer.start()
+            result = func(*args, **kwargs)
+        except KeyboardInterrupt as err:
+            if silent:
+                return None
+            else:
+                raise TimeoutError from err
+        finally:
+            timer.cancel()
+        return result
+
+    return wrapper
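
Note (not part of the patch): a minimal usage sketch of the new `timeout` wrapper, assuming a hypothetical `slow_lookup` function and a 5-second budget. Because the helper fires `_thread.interrupt_main()` from a `threading.Timer`, it can only cut off calls that run in the main thread:

    import time

    from scripts.utility import timeout


    def slow_lookup(n: int) -> int:
        time.sleep(n)  # stand-in for a blocking call, e.g. a slow network request
        return n * 2


    # silent=True: return None instead of raising TimeoutError when the budget is exceeded
    timed_lookup = timeout(slow_lookup, time=5, silent=True)

    print(timed_lookup(1))   # finishes within the budget -> prints 2
    print(timed_lookup(10))  # exceeds the 5-second budget -> prints None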