Merge pull request #508 from flairNLP/add-timeout-to-publisher-coverage
Add timeout to publisher_coverage.py
MaxDall authored May 16, 2024
2 parents cd75d00 + d8e6c1a commit 2ed6d31
Showing 2 changed files with 70 additions and 24 deletions.
50 changes: 26 additions & 24 deletions scripts/publisher_coverage.py
@@ -9,10 +9,11 @@
 from enum import EnumMeta
 from typing import List, Optional, cast
 
-from fundus import Crawler, NewsMap, PublisherCollection, RSSFeed
+from fundus import Crawler, PublisherCollection
 from fundus.publishers.base_objects import PublisherEnum
 from fundus.scraping.article import Article
 from fundus.scraping.filter import RequiresAll
+from scripts.utility import timeout
 
 
 def main() -> None:
@@ -31,23 +32,39 @@ def main() -> None:
         ):
             publisher_name: str = publisher.name  # type: ignore[attr-defined]
 
-            if not (publisher.source_mapping[RSSFeed] or publisher.source_mapping[NewsMap]):  # type: ignore[attr-defined]
-                # skip publishers providing no NewsMap or RSSFeed
-                print(f"⏩ SKIPPED: {publisher_name!r} - NO NewsMap or RSSFeed found")
+            if not any(publisher.source_mapping.values()):  # type: ignore[attr-defined]
+                # skip publishers providing no sources for forward crawling
+                print(f"⏩ SKIPPED: {publisher_name!r} - No sources defined")
                 continue
 
-            crawler: Crawler = Crawler(publisher, restrict_sources_to=[NewsMap, RSSFeed])
-            complete_article: Optional[Article] = next(
-                crawler.crawl(max_articles=1, only_complete=RequiresAll(), error_handling="catch"), None
+            crawler: Crawler = Crawler(publisher, delay=0.4)
+
+            timed_next = timeout(next, time=20, silent=True)
+
+            complete_article: Optional[Article] = timed_next(  # type: ignore[call-arg]
+                crawler.crawl(max_articles=1, only_complete=RequiresAll(), error_handling="suppress"), None
             )
 
             if complete_article is None:
-                incomplete_article: Optional[Article] = next(
-                    crawler.crawl(max_articles=1, only_complete=False, error_handling="suppress"), None
+                incomplete_article: Optional[Article] = timed_next(  # type: ignore[call-arg]
+                    crawler.crawl(max_articles=1, only_complete=False, error_handling="catch"), None
                 )
 
                 if incomplete_article is None:
                     print(f"❌ FAILED: {publisher_name!r} - No articles received")
+
+                elif incomplete_article.exception is not None:
+                    print(
+                        f"❌ FAILED: {publisher_name!r} - Encountered exception during crawling "
+                        f"(URL: {incomplete_article.html.requested_url})"
+                    )
+                    traceback.print_exception(
+                        etype=type(incomplete_article.exception),
+                        value=incomplete_article.exception,
+                        tb=incomplete_article.exception.__traceback__,
+                        file=sys.stdout,
+                    )
+
                 else:
                     print(
                         f"❌ FAILED: {publisher_name!r} - No complete articles received "
@@ -56,21 +73,6 @@ def main() -> None:
                 failed += 1
                 continue
 
-            if complete_article.exception is not None:
-                print(
-                    f"❌ FAILED: {publisher_name!r} - Encountered exception during crawling "
-                    f"(URL: {complete_article.html.requested_url})"
-                )
-                traceback.print_exception(
-                    etype=type(complete_article.exception),
-                    value=complete_article.exception,
-                    tb=complete_article.exception.__traceback__,
-                    file=sys.stdout,
-                )
-
-                failed += 1
-                continue
-
             print(f"✔️ PASSED: {publisher_name!r}")
             print()
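For context, a minimal sketch of what the timeout-wrapped next() above does when a source stalls. This is not part of the commit; stalled_crawl is a hypothetical stand-in for crawler.crawl(...), and it assumes the script is run from the repository root so that scripts.utility is importable:

import time
from typing import Iterator, Optional

from scripts.utility import timeout


def stalled_crawl() -> Iterator[str]:
    # Stand-in for crawler.crawl(): simulates a publisher that never responds.
    time.sleep(60)
    yield "article"


# Same pattern as the script: wrap the builtin next() with a time budget.
timed_next = timeout(next, time=2, silent=True)

# After 2 seconds the timer raises KeyboardInterrupt in the main thread;
# with silent=True the wrapper swallows it and returns None, so the
# coverage loop falls through to its FAILED branches instead of hanging.
article: Optional[str] = timed_next(stalled_crawl(), None)
assert article is None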
44 changes: 44 additions & 0 deletions scripts/utility.py
@@ -0,0 +1,44 @@
+import _thread as thread
+import threading
+from functools import wraps
+from typing import Callable, Literal, Optional, TypeVar, overload
+
+from typing_extensions import ParamSpec
+
+P = ParamSpec("P")
+T = TypeVar("T")
+
+
+def _interrupt_handler() -> None:
+    thread.interrupt_main()
+
+
+@overload
+def timeout(func: Callable[P, T], time: float, silent: Literal[False] = ...) -> Callable[P, T]:
+    ...
+
+
+@overload
+def timeout(func: Callable[P, T], time: float, silent: Literal[True]) -> Callable[P, Optional[T]]:
+    ...
+
+
+def timeout(func: Callable[P, T], time: float, silent: bool = False) -> Callable[P, Optional[T]]:
+    @wraps(func)
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> Optional[T]:
+        # register interrupt handler
+        timer = threading.Timer(time, _interrupt_handler)
+
+        try:
+            timer.start()
+            result = func(*args, **kwargs)
+        except KeyboardInterrupt as err:
+            if silent:
+                return None
+            else:
+                raise TimeoutError from err
+        finally:
+            timer.cancel()
+        return result
+
+    return wrapper
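
A minimal usage sketch of the helper (not part of the commit; slow_add is a hypothetical example function), showing both the silent and the raising mode:

import time

from scripts.utility import timeout


def slow_add(a: int, b: int) -> int:
    time.sleep(5)
    return a + b


# silent=True: a timeout is swallowed and the wrapper returns None.
assert timeout(slow_add, time=1, silent=True)(1, 2) is None

# silent=False (the default): a timeout surfaces as TimeoutError.
try:
    timeout(slow_add, time=1)(1, 2)
except TimeoutError:
    print("timed out as expected")

# Calls that finish within the budget are unaffected; the timer is cancelled.
assert timeout(slow_add, time=10)(1, 2) == 3

The helper relies on _thread.interrupt_main(), which simulates Ctrl-C, so it can only time out code running on the main thread; unlike signal.alarm, it also works on Windows.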
