Skip to content

Commit

Permalink
scrapy-spider-metadata support. (#75)
Browse files Browse the repository at this point in the history
* scrapy-spider-metadata support.

* Fixes.

* get_metadata_for_spider was renamed.

* Add tests for shub-image-info.

* Fix tests without scrapy-spider-metadata.

* Remove scrapy-spider-metadata from requirements.

* Cleanup.
  • Loading branch information
wRAR authored Sep 29, 2023
1 parent 71aacd8 commit 1cc4352
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 3 deletions.
17 changes: 16 additions & 1 deletion sh_scrapy/commands/shub_image_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,23 @@ def add_options(self, parser):
def run(self, args, opts):
result = {
'project_type': 'scrapy',
'spiders': sorted(self.crawler_process.spider_loader.list())
'spiders': sorted(self.crawler_process.spider_loader.list()),
}
try:
from scrapy_spider_metadata import get_spider_metadata
except ImportError:
pass
else:
result['metadata'] = {}
for spider_name in result['spiders']:
spider_cls = self.crawler_process.spider_loader.load(spider_name)
metadata_dict = get_spider_metadata(spider_cls)
try:
# make sure it's serializable
json.dumps(metadata_dict)
except (TypeError, ValueError):
continue
result['metadata'][spider_name] = metadata_dict
if opts.debug:
output = subprocess.check_output(
['bash', '-c', self.IMAGE_INFO_CMD],
Expand Down
66 changes: 64 additions & 2 deletions tests/test_crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
import json
import mock
import pytest
import warnings
from scrapy.settings import Settings
from scrapy.exceptions import ScrapyDeprecationWarning

import sh_scrapy.crawl
from sh_scrapy.crawl import _fatalerror
Expand All @@ -18,6 +16,14 @@
from sh_scrapy.crawl import list_spiders
from sh_scrapy.crawl import main
from sh_scrapy.log import HubstorageLogHandler
from tests.utils import create_project, call_command


# scrapy-spider-metadata is an optional dependency: the metadata-related
# assertions below are relaxed when it is not installed.
try:
    from scrapy_spider_metadata import get_spider_metadata
    SPIDER_METADATA_AVAILABLE = True
except ImportError:
    # Only ImportError is expected here; a bare `except:` would also swallow
    # SystemExit/KeyboardInterrupt and hide unrelated failures.
    SPIDER_METADATA_AVAILABLE = False


@mock.patch.dict(os.environ, {'HWORKER_SENTRY_DSN': 'hw-sentry-dsn',
Expand Down Expand Up @@ -281,3 +287,59 @@ def test_main(mocked_launch, pipe_writer):
    # This ensures that pipe is writable even if main program is finished -
# e.g. for threads that are not closed yet.
assert not pipe_writer.close.called


def test_image_info(tmp_path):
    """A default project reports its type and spider list via shub-image-info."""
    project_dir = create_project(tmp_path)
    stdout, stderr = call_command(project_dir, "shub-image-info")
    # stderr can't be asserted as it contains a SHScrapyDeprecationWarning
    parsed = json.loads(stdout)
    expected = {
        "project_type": "scrapy",
        "spiders": ["myspider"],
    }
    if SPIDER_METADATA_AVAILABLE:
        # scrapy-spider-metadata adds an (empty) metadata mapping per spider.
        expected["metadata"] = {"myspider": {}}
    assert parsed == expected


def test_image_info_metadata(tmp_path):
    """A spider's serializable `metadata` dict is included in the output."""
    project_dir = create_project(tmp_path, spider_text="""
from scrapy import Spider
class MySpider(Spider):
    name = "myspider"
    metadata = {"foo": 42}
""")
    stdout, _ = call_command(project_dir, "shub-image-info")
    parsed = json.loads(stdout)
    expected = {
        "project_type": "scrapy",
        "spiders": ["myspider"],
    }
    if SPIDER_METADATA_AVAILABLE:
        expected["metadata"] = {"myspider": {"foo": 42}}
    assert parsed == expected


def test_image_info_metadata_skip_broken(tmp_path):
    """Metadata that can't be JSON-serialized is silently skipped per spider."""
    project_dir = create_project(tmp_path, spider_text="""
from scrapy import Spider
class MySpider(Spider):
    name = "myspider"
    metadata = {"foo": Spider}
""")
    stdout, _ = call_command(project_dir, "shub-image-info")
    parsed = json.loads(stdout)
    expected = {
        "project_type": "scrapy",
        "spiders": ["myspider"],
    }
    if SPIDER_METADATA_AVAILABLE:
        # the broken spider is dropped, leaving an empty metadata mapping
        expected["metadata"] = {}
    assert parsed == expected
36 changes: 36 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
import subprocess
import sys
from pathlib import Path
from typing import Tuple, Optional, Union


def call_command(cwd: Union[str, os.PathLike], *args: str) -> Tuple[str, str]:
    """Run *args* as a subprocess inside *cwd* and return (stdout, stderr).

    Fails the calling test (AssertionError carrying stderr) if the process
    exits with a non-zero status.
    """
    # universal_newlines (rather than text=) keeps Python 3.6 compatibility.
    proc = subprocess.run(
        args,
        cwd=str(cwd),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    stdout, stderr = proc.stdout, proc.stderr
    assert proc.returncode == 0, stderr
    return stdout, stderr


def call_scrapy_command(cwd: Union[str, os.PathLike], *args: str) -> Tuple[str, str]:
    """Run ``scrapy <args>`` (via the current interpreter) inside *cwd*."""
    cmdline = (sys.executable, "-m", "scrapy.cmdline") + args
    return call_command(cwd, *cmdline)


def create_project(topdir: Path, spider_text: Optional[str] = None) -> Path:
    """Create a minimal Scrapy project named "foo" under *topdir*.

    Writes *spider_text* (or a default single-spider module named
    "myspider") as the project's only spider and returns the project root.
    """
    project_name = "foo"
    call_scrapy_command(str(topdir), "startproject", project_name)
    project_dir = topdir / project_name
    spider_path = project_dir / project_name / "spiders" / "spider.py"
    spider_path.write_text(spider_text or """
from scrapy import Spider
class MySpider(Spider):
    name = "myspider"
""")
    return project_dir
2 changes: 2 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,7 @@ deps =
hubstorage
packaging
py36-scrapy16: Scrapy==1.6
scrapy-spider-metadata; python_version >= "3.8"

commands =
pytest --verbose --cov=sh_scrapy --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: sh_scrapy tests}

0 comments on commit 1cc4352

Please sign in to comment.