Skip to content

Commit

Permalink
feat: log Kingfisher Collect crawl warnings using scrapyloganalyzer
Browse files (browse the repository at this point in the history)
  • Loading branch information
yolile committed Jan 3, 2025
1 parent 6821f8b commit e90fcc9
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 9 deletions.
8 changes: 8 additions & 0 deletions data_registry/process_manager/task/collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import requests
from django.conf import settings
from scrapyloganalyzer import ScrapyLogFile

from data_registry.exceptions import ConfigurationError, RecoverableError, UnexpectedError
from data_registry.models import Task
Expand Down Expand Up @@ -105,6 +106,13 @@ def get_status(self):
if "process_id" not in self.job.context or "data_version" not in self.job.context:
raise UnexpectedError("Unable to retrieve collection ID and data version from Scrapy log")

scrapy_log = ScrapyLogFile(scrapy_log_url)
for key in scrapy_log.logparser["log_categories"]:
if scrapy_log.logparser["log_categories"][key]["count"] > 0:
logger.warning("%s: %s", self, {scrapy_log.logparser["log_categories"][key]["details"]})
if scrapy_log.error_rate:
logger.warning("%s: crawl error rate was %s", self, {scrapy_log.error_rate})

return Task.Status.COMPLETED

raise RecoverableError(f"Unable to find status of Scrapyd job {scrapyd_job_id}")
Expand Down
1 change: 1 addition & 0 deletions requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ gunicorn[setproctitle]
markdown-it-py
psycopg2
requests
scrapyloganalyzer
sentry-sdk
yapw[perf]
16 changes: 13 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ django-modeltranslation==0.18.10
# via -r requirements.in
flatterer==0.20.1
# via -r requirements.in
gunicorn[setproctitle]==22.0.0
gunicorn==22.0.0
# via -r requirements.in
idna==3.7
# via requests
ijson==3.1.4
# via flatterer
logparser==0.8.3
# via scrapyloganalyzer
markdown-it-py==2.2.0
# via -r requirements.in
mdurl==0.1.2
Expand All @@ -41,22 +43,30 @@ packaging==24.0
# via gunicorn
pandas==1.5.0
# via flatterer
pexpect==4.9.0
# via logparser
pika==1.3.2
# via yapw
psycopg2==2.9.6
# via -r requirements.in
ptyprocess==0.7.0
# via pexpect
python-dateutil==2.8.2
# via pandas
pytz==2021.1
# via pandas
requests==2.32.3
# via -r requirements.in
scrapyloganalyzer==0.0.1
# via -r requirements.in
sentry-sdk==2.8.0
# via -r requirements.in
setproctitle==1.2.2
# via gunicorn
six==1.16.0
# via python-dateutil
# via
# logparser
# python-dateutil
sqlparse==0.5.0
# via django
typing-extensions==4.7.1
Expand All @@ -67,5 +77,5 @@ urllib3==2.2.2
# via
# requests
# sentry-sdk
yapw[perf]==0.1.4
yapw==0.1.4
# via -r requirements.in
24 changes: 18 additions & 6 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ django-modeltranslation==0.18.10
# via -r requirements.txt
flatterer==0.20.1
# via -r requirements.txt
gunicorn[setproctitle]==22.0.0
gunicorn==22.0.0
# via -r requirements.txt
idna==3.7
# via
Expand All @@ -40,6 +40,10 @@ ijson==3.1.4
# via
# -r requirements.txt
# flatterer
logparser==0.8.3
# via
# -r requirements.txt
# scrapyloganalyzer
markdown-it-py==2.2.0
# via -r requirements.txt
mdurl==0.1.2
Expand All @@ -54,7 +58,6 @@ orjson==3.9.15
# via
# -r requirements.txt
# flatterer
# yapw
packaging==24.0
# via
# -r requirements.txt
Expand All @@ -63,6 +66,10 @@ pandas==1.5.0
# via
# -r requirements.txt
# flatterer
pexpect==4.9.0
# via
# -r requirements.txt
# logparser
pika==1.3.2
# via
# -r requirements.txt
Expand All @@ -71,6 +78,10 @@ psycopg2==2.9.6
# via -r requirements.txt
psycopg2-binary==2.9.2
# via -r requirements_dev.in
ptyprocess==0.7.0
# via
# -r requirements.txt
# pexpect
python-dateutil==2.8.2
# via
# -r requirements.txt
Expand All @@ -81,15 +92,16 @@ pytz==2021.1
# pandas
requests==2.32.3
# via -r requirements.txt
scrapyloganalyzer==0.0.1
# via -r requirements.txt
sentry-sdk==2.8.0
# via -r requirements.txt
setproctitle==1.2.2
# via
# -r requirements.txt
# gunicorn
# via -r requirements.txt
six==1.16.0
# via
# -r requirements.txt
# logparser
# python-dateutil
sqlparse==0.5.0
# via
Expand All @@ -105,5 +117,5 @@ urllib3==2.2.2
# -r requirements.txt
# requests
# sentry-sdk
yapw[perf]==0.1.4
yapw==0.1.4
# via -r requirements.txt

0 comments on commit e90fcc9

Please sign in to comment.