From a0b22cf40af60eae738fb0cea60404499723fc22 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 6 May 2024 17:29:44 +0200 Subject: [PATCH 1/7] add timeout to publisher_coverage.py --- scripts/publisher_coverage.py | 13 +++++----- scripts/utility.py | 49 +++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 7 deletions(-) create mode 100644 scripts/utility.py diff --git a/scripts/publisher_coverage.py b/scripts/publisher_coverage.py index 6cc561fe3..25b50cb88 100644 --- a/scripts/publisher_coverage.py +++ b/scripts/publisher_coverage.py @@ -13,6 +13,7 @@ from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article from fundus.scraping.filter import RequiresAll +from scripts.utility import timeout def main() -> None: @@ -31,18 +32,16 @@ def main() -> None: ): publisher_name: str = publisher.name # type: ignore[attr-defined] - if not (publisher.source_mapping[RSSFeed] or publisher.source_mapping[NewsMap]): # type: ignore[attr-defined] - # skip publishers providing no NewsMap or RSSFeed - print(f"⏩ SKIPPED: {publisher_name!r} - NO NewsMap or RSSFeed found") - continue + crawler: Crawler = Crawler(publisher, delay=0.4) + + timed_next = timeout(next, time=20, silent=True) - crawler: Crawler = Crawler(publisher, restrict_sources_to=[NewsMap, RSSFeed]) - complete_article: Optional[Article] = next( + complete_article: Optional[Article] = timed_next( # type: ignore[call-arg] crawler.crawl(max_articles=1, only_complete=RequiresAll(), error_handling="catch"), None ) if complete_article is None: - incomplete_article: Optional[Article] = next( + incomplete_article: Optional[Article] = timed_next( # type: ignore[call-arg] crawler.crawl(max_articles=1, only_complete=False, error_handling="suppress"), None ) diff --git a/scripts/utility.py b/scripts/utility.py new file mode 100644 index 000000000..333e7f29d --- /dev/null +++ b/scripts/utility.py @@ -0,0 +1,49 @@ +import threading +from functools import wraps +from typing import Callable, Optional, TypeVar, overload + +from typing_extensions import ParamSpec + +try: + import thread +except ImportError: + import _thread as thread + +P = ParamSpec("P") +T = TypeVar("T") + + +def _interrupt_handler() -> None: + thread.interrupt_main() + + +@overload +def timeout(func: Callable[P, T], time: int, silent: bool = False) -> Callable[P, T]: + ... + + +@overload +def timeout(func: Callable[P, T], time: int, silent: bool = True) -> Callable[P, Optional[T]]: + ... + + +def timeout(func: Callable[P, T], time: int, silent: bool = False) -> Callable[P, Optional[T]]: + @wraps(func) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> Optional[T]: + # register interrupt handler + timer = threading.Timer(time, _interrupt_handler) + timer.start() + + try: + result = func(*args, **kwargs) + except KeyboardInterrupt as err: + if silent: + return None + else: + raise TimeoutError from err + finally: + timer.cancel() + timer.join() + return result + + return wrapper From 7cf9e071592f8e3b34c86c156fc8775796d4a892 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Mon, 13 May 2024 13:13:48 +0200 Subject: [PATCH 2/7] catch error only if no complete articles were received --- scripts/publisher_coverage.py | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/scripts/publisher_coverage.py b/scripts/publisher_coverage.py index 25b50cb88..1e591f015 100644 --- a/scripts/publisher_coverage.py +++ b/scripts/publisher_coverage.py @@ -9,7 +9,7 @@ from enum import EnumMeta from typing import List, Optional, cast -from fundus import Crawler, NewsMap, PublisherCollection, RSSFeed +from fundus import Crawler, PublisherCollection from fundus.publishers.base_objects import PublisherEnum from fundus.scraping.article import Article from fundus.scraping.filter import RequiresAll @@ -37,16 +37,29 @@ def main() -> None: timed_next = timeout(next, time=20, silent=True) complete_article: Optional[Article] = timed_next( # type: ignore[call-arg] - crawler.crawl(max_articles=1, only_complete=RequiresAll(), error_handling="catch"), None + crawler.crawl(max_articles=1, only_complete=RequiresAll(), error_handling="suppress"), None ) if complete_article is None: incomplete_article: Optional[Article] = timed_next( # type: ignore[call-arg] - crawler.crawl(max_articles=1, only_complete=False, error_handling="suppress"), None + crawler.crawl(max_articles=1, only_complete=False, error_handling="catch"), None ) if incomplete_article is None: print(f"❌ FAILED: {publisher_name!r} - No articles received") + + elif incomplete_article.exception is not None: + print( + f"❌ FAILED: {publisher_name!r} - Encountered exception during crawling " + f"(URL: {incomplete_article.html.requested_url})" + ) + traceback.print_exception( + etype=type(incomplete_article.exception), + value=incomplete_article.exception, + tb=incomplete_article.exception.__traceback__, + file=sys.stdout, + ) + else: print( f"❌ FAILED: {publisher_name!r} - No complete articles received " @@ -55,21 +68,6 @@ def main() -> None: failed += 1 continue - if complete_article.exception is not None: - print( - f"❌ FAILED: {publisher_name!r} - Encountered exception during crawling " - f"(URL: {complete_article.html.requested_url})" - ) - traceback.print_exception( - etype=type(complete_article.exception), - value=complete_article.exception, - tb=complete_article.exception.__traceback__, - file=sys.stdout, - ) - - failed += 1 - continue - print(f"✔️ PASSED: {publisher_name!r}") print() From f765e3c1c1650f72ae07927847383388fdf456e7 Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 16 May 2024 13:54:29 +0200 Subject: [PATCH 3/7] remove Python 2 related import --- scripts/utility.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/scripts/utility.py b/scripts/utility.py index 333e7f29d..e556943bf 100644 --- a/scripts/utility.py +++ b/scripts/utility.py @@ -1,14 +1,10 @@ +import _thread as thread import threading from functools import wraps from typing import Callable, Optional, TypeVar, overload from typing_extensions import ParamSpec -try: - import thread -except ImportError: - import _thread as thread - P = ParamSpec("P") T = TypeVar("T") From 69059840798fc1f0a725d84a3b4fc532e89c108d Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 16 May 2024 14:26:14 +0200 Subject: [PATCH 4/7] fix bug and type hint --- scripts/utility.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/utility.py b/scripts/utility.py index e556943bf..44221554a 100644 --- a/scripts/utility.py +++ b/scripts/utility.py @@ -14,23 +14,23 @@ def _interrupt_handler() -> None: @overload -def timeout(func: Callable[P, T], time: int, silent: bool = False) -> Callable[P, T]: +def timeout(func: Callable[P, T], time: float, silent: bool = False) -> Callable[P, T]: ... @overload -def timeout(func: Callable[P, T], time: int, silent: bool = True) -> Callable[P, Optional[T]]: +def timeout(func: Callable[P, T], time: float, silent: bool = True) -> Callable[P, Optional[T]]: ... -def timeout(func: Callable[P, T], time: int, silent: bool = False) -> Callable[P, Optional[T]]: +def timeout(func: Callable[P, T], time: float, silent: bool = False) -> Callable[P, Optional[T]]: @wraps(func) def wrapper(*args: P.args, **kwargs: P.kwargs) -> Optional[T]: # register interrupt handler timer = threading.Timer(time, _interrupt_handler) - timer.start() try: + timer.start() result = func(*args, **kwargs) except KeyboardInterrupt as err: if silent: From a9131d44de6857d8f47c6a8d89fbdf75c820fcbb Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 16 May 2024 14:30:35 +0200 Subject: [PATCH 5/7] remove `.join` in `timeout` --- scripts/utility.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/utility.py b/scripts/utility.py index 44221554a..c5178a1e6 100644 --- a/scripts/utility.py +++ b/scripts/utility.py @@ -39,7 +39,6 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> Optional[T]: raise TimeoutError from err finally: timer.cancel() - timer.join() return result return wrapper From 9c8d65660696a5f684b560f8c909ce76ac64b06b Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 16 May 2024 14:40:25 +0200 Subject: [PATCH 6/7] fix type overload --- scripts/utility.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/utility.py b/scripts/utility.py index c5178a1e6..c5d234b58 100644 --- a/scripts/utility.py +++ b/scripts/utility.py @@ -1,7 +1,7 @@ import _thread as thread import threading from functools import wraps -from typing import Callable, Optional, TypeVar, overload +from typing import Callable, Literal, Optional, TypeVar, overload from typing_extensions import ParamSpec @@ -14,16 +14,16 @@ def _interrupt_handler() -> None: @overload -def timeout(func: Callable[P, T], time: float, silent: bool = False) -> Callable[P, T]: +def timeout(func: Callable[P, T], time: int, silent: Literal[False] = ...) -> Callable[P, T]: ... @overload -def timeout(func: Callable[P, T], time: float, silent: bool = True) -> Callable[P, Optional[T]]: +def timeout(func: Callable[P, T], time: int, silent: Literal[True]) -> Callable[P, Optional[T]]: ... -def timeout(func: Callable[P, T], time: float, silent: bool = False) -> Callable[P, Optional[T]]: +def timeout(func: Callable[P, T], time: int, silent: bool = False) -> Callable[P, Optional[T]]: @wraps(func) def wrapper(*args: P.args, **kwargs: P.kwargs) -> Optional[T]: # register interrupt handler From d8e6c1a588a3e84296e0abb94dd2452b697a064b Mon Sep 17 00:00:00 2001 From: Max Dallabetta Date: Thu, 16 May 2024 14:43:31 +0200 Subject: [PATCH 7/7] skip publishers without sources --- scripts/publisher_coverage.py | 5 +++++ scripts/utility.py | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/publisher_coverage.py b/scripts/publisher_coverage.py index 1e591f015..af033b0ca 100644 --- a/scripts/publisher_coverage.py +++ b/scripts/publisher_coverage.py @@ -32,6 +32,11 @@ def main() -> None: ): publisher_name: str = publisher.name # type: ignore[attr-defined] + if not any(publisher.source_mapping.values()): # type: ignore[attr-defined] + # skip publishers providing no sources for forward crawling + print(f"⏩ SKIPPED: {publisher_name!r} - No sources defined") + continue + crawler: Crawler = Crawler(publisher, delay=0.4) timed_next = timeout(next, time=20, silent=True) diff --git a/scripts/utility.py b/scripts/utility.py index c5d234b58..6e7a83a8b 100644 --- a/scripts/utility.py +++ b/scripts/utility.py @@ -14,16 +14,16 @@ def _interrupt_handler() -> None: @overload -def timeout(func: Callable[P, T], time: int, silent: Literal[False] = ...) -> Callable[P, T]: +def timeout(func: Callable[P, T], time: float, silent: Literal[False] = ...) -> Callable[P, T]: ... @overload -def timeout(func: Callable[P, T], time: int, silent: Literal[True]) -> Callable[P, Optional[T]]: +def timeout(func: Callable[P, T], time: float, silent: Literal[True]) -> Callable[P, Optional[T]]: ... -def timeout(func: Callable[P, T], time: int, silent: bool = False) -> Callable[P, Optional[T]]: +def timeout(func: Callable[P, T], time: float, silent: bool = False) -> Callable[P, Optional[T]]: @wraps(func) def wrapper(*args: P.args, **kwargs: P.kwargs) -> Optional[T]: # register interrupt handler