From ea646cb3b8b6d380b7e500158b9605c398431736 Mon Sep 17 00:00:00 2001 From: R Max Espinoza Date: Thu, 4 Jul 2024 21:37:37 +0200 Subject: [PATCH] maint: precommit format, fixes and update copyright (#298) maint: precommit format and update copyright --- .flake8 | 7 +- .github/workflows/checks.yml | 6 + .isort.cfg | 2 + .pre-commit-config.yaml | 36 ++++ AUTHORS.rst | 2 +- LICENSE | 2 +- docs/conf.py | 152 ++++++++-------- example-project/README.rst | 17 +- example-project/example/items.py | 4 +- example-project/example/pipelines.py | 2 +- example-project/example/settings.py | 12 +- example-project/example/spiders/dmoz.py | 23 +-- .../example/spiders/mycrawler_redis.py | 19 +- .../example/spiders/myspider_redis.py | 15 +- example-project/process_items.py | 47 +++-- setup.py | 54 +++--- src/scrapy_redis/__init__.py | 13 +- src/scrapy_redis/connection.py | 28 ++- src/scrapy_redis/defaults.py | 27 ++- src/scrapy_redis/dupefilter.py | 29 ++-- src/scrapy_redis/pipelines.py | 23 ++- src/scrapy_redis/queue.py | 18 +- src/scrapy_redis/scheduler.py | 54 +++--- src/scrapy_redis/spiders.py | 87 ++++++---- src/scrapy_redis/stats.py | 18 +- src/scrapy_redis/utils.py | 28 +-- tests/test_connection.py | 44 ++--- tests/test_dupefilter.py | 40 +++-- tests/test_picklecompat.py | 22 +-- tests/test_queue.py | 14 +- tests/test_scrapy_redis.py | 164 +++++++++--------- tests/test_spiders.py | 134 +++++++------- tests/test_utils.py | 4 +- 33 files changed, 621 insertions(+), 526 deletions(-) create mode 100644 .isort.cfg create mode 100644 .pre-commit-config.yaml diff --git a/.flake8 b/.flake8 index d472e2f7..7b8da1c0 100644 --- a/.flake8 +++ b/.flake8 @@ -2,8 +2,11 @@ [flake8] max-line-length = 119 -ignore = W503 +ignore = + W503 + P102 + P103 exclude = tests/test_spiders.py E731 - docs/conf.py E265 \ No newline at end of file + docs/conf.py E265 diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 11e7c918..a5c392ff 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -33,3 +33,9 @@ jobs: run: | pip install -r requirements-tests.txt tox + + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: pre-commit/action@v3.0.0 diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 00000000..f238bf7e --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +profile = black diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..2837d21d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,36 @@ +repos: +- repo: https://github.com/PyCQA/bandit + rev: 1.7.7 + hooks: + - id: bandit + args: [-r, -c, .bandit.yml] +- repo: https://github.com/PyCQA/flake8 + rev: 7.0.0 + hooks: + - id: flake8 + additional_dependencies: + - flake8-bugbear + - flake8-comprehensions + - flake8-debugger + #- flake8-docstrings + - flake8-string-format + - flake8-type-checking +- repo: https://github.com/psf/black.git + rev: 24.2.0 + hooks: + - id: black +- repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort +- repo: https://github.com/adamchainz/blacken-docs + rev: 1.16.0 + hooks: + - id: blacken-docs + additional_dependencies: + - black==24.2.0 +- repo: https://github.com/asottile/pyupgrade + rev: v3.15.2 + hooks: + - id: pyupgrade + args: [--py38-plus, --keep-runtime-typing] diff --git a/AUTHORS.rst b/AUTHORS.rst index 808f7673..43eaed81 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -5,7 +5,7 @@ Credits Development Lead ---------------- -* Rolando Espinoza +* R Max Espinoza Contributors 
------------ diff --git a/LICENSE b/LICENSE index 1ff8f3a9..68705984 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2022, Rolando Espinoza +Copyright (c) 2011-2024, R Max Espinoza Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/docs/conf.py b/docs/conf.py index 9840bfec..91b4ca71 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- # # scrapy-redis documentation build configuration file, created by # sphinx-quickstart on Tue Jul 9 22:26:36 2013. @@ -20,7 +19,7 @@ # directory, add these directories to sys.path here. If the directory is # relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) # Get the project root dir, which is the parent dir of this project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -28,206 +27,208 @@ # -- General configuration --------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'Scrapy-Redis' -copyright = '2011-2016, Rolando Espinoza' +project = "Scrapy-Redis" +copyright = "2011-2024, R Max Espinoza" # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout # the built documents. # # The full version, including alpha/beta/rc tags. -release = open(os.path.join(project_root, 'VERSION')).read().strip() +release = open(os.path.join(project_root, "VERSION")).read().strip() # The short X.Y version. -version = re.findall(r'\d+\.\d+\.\d+', release)[0] +version = re.findall(r"\d+\.\d+\.\d+", release)[0] # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to # some non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. 
-#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built # documents. -#keep_warnings = False +# keep_warnings = False # -- Options for HTML output ------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +html_theme = "default" # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as # html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the # top of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon # of the docs. This file should be a Windows icon file (.ico) being # 16x16 or 32x32 pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) # here, relative to this directory. They are copied after the builtin # static files, so a file named "default.css" will overwrite the builtin # "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # If not '', a 'Last updated on:' timestamp is inserted at every page # bottom, using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names # to template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. # Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. # Default is True. 
-#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages # will contain a tag referring to it. The value of this option # must be the base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'scrapy_redisdoc' +htmlhelp_basename = "scrapy_redisdoc" # -- Options for LaTeX output ------------------------------------------ latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - #'papersize': 'letterpaper', - + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). - #'pointsize': '10pt', - + # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. - #'preamble': '', + # 'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass # [howto/manual]). latex_documents = [ - ('index', 'scrapy_redis.tex', - 'Scrapy-Redis Documentation', - 'Rolando Espinoza', 'manual'), + ( + "index", + "scrapy_redis.tex", + "Scrapy-Redis Documentation", + "R Max Espinoza", + "manual", + ), ] # The name of an image file (relative to this directory) to place at # the top of the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings # are parts, not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output ------------------------------------ @@ -235,13 +236,11 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'scrapy_redis', - 'Scrapy-Redis Documentation', - ['Rolando Espinoza'], 1) + ("index", "scrapy_redis", "Scrapy-Redis Documentation", ["R Max Espinoza"], 1) ] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ---------------------------------------- @@ -250,22 +249,25 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'scrapy_redis', - 'Scrapy-Redis Documentation', - 'Rolando Espinoza', - 'scrapy-redis', - 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "scrapy_redis", + "Scrapy-Redis Documentation", + "R Max Espinoza", + "scrapy-redis", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. 
-#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False diff --git a/example-project/README.rst b/example-project/README.rst index 4fb8c94a..3a16a016 100644 --- a/example-project/README.rst +++ b/example-project/README.rst @@ -45,7 +45,7 @@ across multiple spider instances, highly suitable for broad crawls. 2. Run the crawler for first time then stop it -.. code-block:: python +.. code-block:: bash cd example-project scrapy crawl dmoz @@ -54,21 +54,21 @@ across multiple spider instances, highly suitable for broad crawls. 3. Run the crawler again to resume stopped crawling -.. code-block:: python +.. code-block:: bash scrapy crawl dmoz ... [dmoz] DEBUG: Resuming crawl (9019 requests scheduled) 4. Start one or more additional scrapy crawlers -.. code-block:: python +.. code-block:: bash scrapy crawl dmoz ... [dmoz] DEBUG: Resuming crawl (8712 requests scheduled) 5. Start one or more post-processing workers -.. code-block:: python +.. code-block:: bash python process_items.py dmoz:items -v ... @@ -91,8 +91,9 @@ For example, create a file ``myspider.py`` with the code below: from scrapy_redis.spiders import RedisSpider + class MySpider(RedisSpider): - name = 'myspider' + name = "myspider" def parse(self, response): # do stuff @@ -103,13 +104,13 @@ Then: 1. run the spider -.. code-block:: python +.. code-block:: bash scrapy runspider myspider.py 2. push json data to redis -.. code-block:: python +.. code-block:: bash redis-cli lpush myspider '{"url": "https://exaple.com", "meta": {"job-id":"123xsd", "start-date":"dd/mm/yy"}, "url_cookie_key":"fertxsas" }' @@ -128,6 +129,8 @@ Processing items The ``process_items.py`` provides an example of consuming the items queue:: +.. code-block:: bash + python process_items.py --help diff --git a/example-project/example/items.py b/example-project/example/items.py index f293427b..d8763fee 100644 --- a/example-project/example/items.py +++ b/example-project/example/items.py @@ -3,9 +3,9 @@ # See documentation in: # http://doc.scrapy.org/topics/items.html -from scrapy.item import Item, Field +from scrapy.item import Field, Item from scrapy.loader import ItemLoader -from scrapy.loader.processors import MapCompose, TakeFirst, Join +from scrapy.loader.processors import Join, MapCompose, TakeFirst class ExampleItem(Item): diff --git a/example-project/example/pipelines.py b/example-project/example/pipelines.py index caad2438..64ff72a6 100644 --- a/example-project/example/pipelines.py +++ b/example-project/example/pipelines.py @@ -5,7 +5,7 @@ from datetime import datetime -class ExamplePipeline(object): +class ExamplePipeline: def process_item(self, item, spider): item["crawled"] = datetime.utcnow() item["spider"] = spider.name diff --git a/example-project/example/settings.py b/example-project/example/settings.py index 19f87d8c..380e3ac0 100644 --- a/example-project/example/settings.py +++ b/example-project/example/settings.py @@ -5,10 +5,10 @@ # # http://doc.scrapy.org/topics/settings.html # -SPIDER_MODULES = ['example.spiders'] -NEWSPIDER_MODULE = 'example.spiders' +SPIDER_MODULES = ["example.spiders"] +NEWSPIDER_MODULE = "example.spiders" -USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)' +USER_AGENT = "scrapy-redis (+https://github.com/rolando/scrapy-redis)" DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" SCHEDULER = "scrapy_redis.scheduler.Scheduler" @@ -18,11 +18,11 @@ # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack" ITEM_PIPELINES = { - 'example.pipelines.ExamplePipeline': 300, - 
'scrapy_redis.pipelines.RedisPipeline': 400, + "example.pipelines.ExamplePipeline": 300, + "scrapy_redis.pipelines.RedisPipeline": 400, } -LOG_LEVEL = 'DEBUG' +LOG_LEVEL = "DEBUG" # Introduce an artifical delay to make use of parallelism. to speed up the # crawl. diff --git a/example-project/example/spiders/dmoz.py b/example-project/example/spiders/dmoz.py index 5bfb68c2..c00ef140 100644 --- a/example-project/example/spiders/dmoz.py +++ b/example-project/example/spiders/dmoz.py @@ -4,20 +4,23 @@ class DmozSpider(CrawlSpider): """Follow categories and extract links.""" - name = 'dmoz' - allowed_domains = ['dmoz-odp.org'] - start_urls = ['http://www.dmoz-odp.org/'] + + name = "dmoz" + allowed_domains = ["dmoz-odp.org"] + start_urls = ["http://www.dmoz-odp.org/"] rules = [ - Rule(LinkExtractor( - restrict_css=('.top-cat', '.sub-cat', '.cat-item') - ), callback='parse_directory', follow=True), + Rule( + LinkExtractor(restrict_css=(".top-cat", ".sub-cat", ".cat-item")), + callback="parse_directory", + follow=True, + ), ] def parse_directory(self, response): - for div in response.css('.title-and-desc'): + for div in response.css(".title-and-desc"): yield { - 'name': div.css('.site-title::text').extract_first(), - 'description': div.css('.site-descr::text').extract_first().strip(), - 'link': div.css('a::attr(href)').extract_first(), + "name": div.css(".site-title::text").extract_first(), + "description": div.css(".site-descr::text").extract_first().strip(), + "link": div.css("a::attr(href)").extract_first(), } diff --git a/example-project/example/spiders/mycrawler_redis.py b/example-project/example/spiders/mycrawler_redis.py index da62cde9..7b740f80 100644 --- a/example-project/example/spiders/mycrawler_redis.py +++ b/example-project/example/spiders/mycrawler_redis.py @@ -1,27 +1,28 @@ -from scrapy.spiders import Rule from scrapy.linkextractors import LinkExtractor +from scrapy.spiders import Rule from scrapy_redis.spiders import RedisCrawlSpider class MyCrawler(RedisCrawlSpider): """Spider that reads urls from redis queue (myspider:start_urls).""" - name = 'mycrawler_redis' - redis_key = 'mycrawler:start_urls' + + name = "mycrawler_redis" + redis_key = "mycrawler:start_urls" rules = ( # follow all links - Rule(LinkExtractor(), callback='parse_page', follow=True), + Rule(LinkExtractor(), callback="parse_page", follow=True), ) def __init__(self, *args, **kwargs): # Dynamically define the allowed domains list. - domain = kwargs.pop('domain', '') - self.allowed_domains = filter(None, domain.split(',')) - super(MyCrawler, self).__init__(*args, **kwargs) + domain = kwargs.pop("domain", "") + self.allowed_domains = filter(None, domain.split(",")) + super().__init__(*args, **kwargs) def parse_page(self, response): return { - 'name': response.css('title::text').extract_first(), - 'url': response.url, + "name": response.css("title::text").extract_first(), + "url": response.url, } diff --git a/example-project/example/spiders/myspider_redis.py b/example-project/example/spiders/myspider_redis.py index 4e912a01..661027f9 100644 --- a/example-project/example/spiders/myspider_redis.py +++ b/example-project/example/spiders/myspider_redis.py @@ -3,17 +3,18 @@ class MySpider(RedisSpider): """Spider that reads urls from redis queue (myspider:start_urls).""" - name = 'myspider_redis' - redis_key = 'myspider:start_urls' + + name = "myspider_redis" + redis_key = "myspider:start_urls" def __init__(self, *args, **kwargs): # Dynamically define the allowed domains list. 
- domain = kwargs.pop('domain', '') - self.allowed_domains = filter(None, domain.split(',')) - super(MySpider, self).__init__(*args, **kwargs) + domain = kwargs.pop("domain", "") + self.allowed_domains = filter(None, domain.split(",")) + super().__init__(*args, **kwargs) def parse(self, response): return { - 'name': response.css('title::text').extract_first(), - 'url': response.url, + "name": response.css("title::text").extract_first(), + "url": response.url, } diff --git a/example-project/process_items.py b/example-project/process_items.py index 54b01f3b..42819b73 100644 --- a/example-project/process_items.py +++ b/example-project/process_items.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- """A script to process items from a redis queue.""" -from __future__ import print_function, unicode_literals import argparse import json @@ -13,11 +12,10 @@ from scrapy_redis import get_redis +logger = logging.getLogger("process_items") -logger = logging.getLogger('process_items') - -def process_items(r, keys, timeout, limit=0, log_every=1000, wait=.1): +def process_items(r, keys, timeout, limit=0, log_every=1000, wait=0.1): """Process items from a redis queue. Parameters @@ -30,7 +28,7 @@ def process_items(r, keys, timeout, limit=0, log_every=1000, wait=.1): Read timeout. """ - limit = limit or float('inf') + limit = limit or float("inf") processed = 0 while processed < limit: # Change ``blpop`` to ``brpop`` to process as LIFO. @@ -48,12 +46,13 @@ def process_items(r, keys, timeout, limit=0, log_every=1000, wait=.1): continue try: - name = item.get('name') or item.get('title') - url = item.get('url') or item.get('link') + name = item.get("name") or item.get("title") + url = item.get("url") or item.get("link") logger.debug("[%s] Processing item: %s <%s>", source, name, url) except KeyError: - logger.exception("[%s] Failed to process item:\n%r", - source, pprint.pformat(item)) + logger.exception( + "[%s] Failed to process item:\n%r", source, pprint.pformat(item) + ) continue processed += 1 @@ -63,32 +62,32 @@ def process_items(r, keys, timeout, limit=0, log_every=1000, wait=.1): def main(): parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument('key', help="Redis key where items are stored") - parser.add_argument('--host') - parser.add_argument('--port') - parser.add_argument('--timeout', type=int, default=5) - parser.add_argument('--limit', type=int, default=0) - parser.add_argument('--progress-every', type=int, default=100) - parser.add_argument('-v', '--verbose', action='store_true') + parser.add_argument("key", help="Redis key where items are stored") + parser.add_argument("--host") + parser.add_argument("--port") + parser.add_argument("--timeout", type=int, default=5) + parser.add_argument("--limit", type=int, default=0) + parser.add_argument("--progress-every", type=int, default=100) + parser.add_argument("-v", "--verbose", action="store_true") args = parser.parse_args() params = {} if args.host: - params['host'] = args.host + params["host"] = args.host if args.port: - params['port'] = args.port + params["port"] = args.port logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) r = get_redis(**params) - host = r.connection_pool.get_connection('info').host + host = r.connection_pool.get_connection("info").host logger.info("Waiting for items in '%s' (server: %s)", args.key, host) kwargs = { - 'keys': [args.key], - 'timeout': args.timeout, - 'limit': args.limit, - 'log_every': args.progress_every, + "keys": [args.key], + "timeout": args.timeout, + "limit": 
args.limit, + "log_every": args.progress_every, } try: process_items(r, **kwargs) @@ -102,5 +101,5 @@ def main(): return retcode -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/setup.py b/setup.py index cbc08914..cc4df606 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import io from pkgutil import walk_packages + from setuptools import setup @@ -11,45 +11,49 @@ def find_packages(path): def read_file(filename): - with io.open(filename) as fp: + with open(filename) as fp: return fp.read().strip() def read_rst(filename): # Ignore unsupported directives by pypi. content = read_file(filename) - return ''.join(line for line in io.StringIO(content) - if not line.startswith('.. comment::')) + return "".join( + line for line in io.StringIO(content) if not line.startswith(".. comment::") + ) def read_requirements(filename): - return [line.strip() for line in read_file(filename).splitlines() - if not line.startswith('#')] + return [ + line.strip() + for line in read_file(filename).splitlines() + if not line.startswith("#") + ] setup( - name='scrapy-redis', - version=read_file('VERSION'), + name="scrapy-redis", + version=read_file("VERSION"), description="Redis-based components for Scrapy.", - long_description=read_rst('README.rst') + '\n\n' + read_rst('HISTORY.rst'), - author="Rolando Espinoza", - author_email='rolando@rmax.io', - url='https://github.com/rolando/scrapy-redis', - packages=list(find_packages('src')), - package_dir={'': 'src'}, - install_requires=read_requirements('requirements.txt'), + long_description=read_rst("README.rst") + "\n\n" + read_rst("HISTORY.rst"), + author="R Max Espinoza", + author_email="hey@rmax.dev", + url="https://github.com/rmax/scrapy-redis", + packages=list(find_packages("src")), + package_dir={"": "src"}, + install_requires=read_requirements("requirements.txt"), include_package_data=True, license="MIT", - keywords='scrapy-redis', + keywords="scrapy-redis", classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ], ) diff --git a/src/scrapy_redis/__init__.py b/src/scrapy_redis/__init__.py index c58d99dd..1822b7b0 100644 --- a/src/scrapy_redis/__init__.py +++ b/src/scrapy_redis/__init__.py @@ -1,10 +1,5 @@ -# -*- coding: utf-8 -*- -from .connection import ( # NOQA - get_redis, - get_redis_from_settings, -) +from .connection import get_redis, get_redis_from_settings # NOQA - -__author__ = 'Rolando Espinoza' -__email__ = 'rolando at rmax.io' -__version__ = '0.8.0' +__author__ = "R Max Espinoza" +__email__ = "hey at rmax.dev" +__version__ = "0.8.0" diff --git a/src/scrapy_redis/connection.py b/src/scrapy_redis/connection.py index 5783e72e..002ccaca 100644 --- a/src/scrapy_redis/connection.py +++ b/src/scrapy_redis/connection.py @@ -1,23 +1,17 @@ -import sys - -import six - from scrapy.utils.misc import 
load_object from . import defaults - # Shortcut maps 'setting name' -> 'parmater name'. SETTINGS_PARAMS_MAP = { - 'REDIS_URL': 'url', - 'REDIS_HOST': 'host', - 'REDIS_PORT': 'port', - 'REDIS_DB': 'db', - 'REDIS_ENCODING': 'encoding', + "REDIS_URL": "url", + "REDIS_HOST": "host", + "REDIS_PORT": "port", + "REDIS_DB": "db", + "REDIS_ENCODING": "encoding", } -if sys.version_info > (3,): - SETTINGS_PARAMS_MAP['REDIS_DECODE_RESPONSES'] = 'decode_responses' +SETTINGS_PARAMS_MAP["REDIS_DECODE_RESPONSES"] = "decode_responses" def get_redis_from_settings(settings): @@ -59,7 +53,7 @@ def get_redis_from_settings(settings): """ params = defaults.REDIS_PARAMS.copy() - params.update(settings.getdict('REDIS_PARAMS')) + params.update(settings.getdict("REDIS_PARAMS")) # XXX: Deprecate REDIS_* settings. for source, dest in SETTINGS_PARAMS_MAP.items(): val = settings.get(source) @@ -67,8 +61,8 @@ def get_redis_from_settings(settings): params[dest] = val # Allow ``redis_cls`` to be a path to a class. - if isinstance(params.get('redis_cls'), six.string_types): - params['redis_cls'] = load_object(params['redis_cls']) + if isinstance(params.get("redis_cls"), str): + params["redis_cls"] = load_object(params["redis_cls"]) return get_redis(**params) @@ -95,8 +89,8 @@ def get_redis(**kwargs): Redis client instance. """ - redis_cls = kwargs.pop('redis_cls', defaults.REDIS_CLS) - url = kwargs.pop('url', None) + redis_cls = kwargs.pop("redis_cls", defaults.REDIS_CLS) + url = kwargs.pop("url", None) if url: return redis_cls.from_url(url, **kwargs) else: diff --git a/src/scrapy_redis/defaults.py b/src/scrapy_redis/defaults.py index 7a30f7d2..ffe398da 100644 --- a/src/scrapy_redis/defaults.py +++ b/src/scrapy_redis/defaults.py @@ -1,30 +1,29 @@ import redis - # For standalone use. -DUPEFILTER_KEY = 'dupefilter:%(timestamp)s' +DUPEFILTER_KEY = "dupefilter:%(timestamp)s" -PIPELINE_KEY = '%(spider)s:items' +PIPELINE_KEY = "%(spider)s:items" -STATS_KEY = '%(spider)s:stats' +STATS_KEY = "%(spider)s:stats" REDIS_CLS = redis.StrictRedis -REDIS_ENCODING = 'utf-8' +REDIS_ENCODING = "utf-8" # Sane connection defaults. REDIS_PARAMS = { - 'socket_timeout': 30, - 'socket_connect_timeout': 30, - 'retry_on_timeout': True, - 'encoding': REDIS_ENCODING, + "socket_timeout": 30, + "socket_connect_timeout": 30, + "retry_on_timeout": True, + "encoding": REDIS_ENCODING, } REDIS_CONCURRENT_REQUESTS = 16 -SCHEDULER_QUEUE_KEY = '%(spider)s:requests' -SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue' -SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter' -SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter' +SCHEDULER_QUEUE_KEY = "%(spider)s:requests" +SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.PriorityQueue" +SCHEDULER_DUPEFILTER_KEY = "%(spider)s:dupefilter" +SCHEDULER_DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" SCHEDULER_PERSIST = False -START_URLS_KEY = '%(name)s:start_urls' +START_URLS_KEY = "%(name)s:start_urls" START_URLS_AS_SET = False START_URLS_AS_ZSET = False MAX_IDLE_TIME = 0 diff --git a/src/scrapy_redis/dupefilter.py b/src/scrapy_redis/dupefilter.py index dea88c8c..194880a5 100644 --- a/src/scrapy_redis/dupefilter.py +++ b/src/scrapy_redis/dupefilter.py @@ -1,6 +1,6 @@ -import logging import hashlib import json +import logging import time from scrapy.dupefilters import BaseDupeFilter @@ -10,7 +10,6 @@ from . 
import defaults from .connection import get_redis_from_settings - logger = logging.getLogger(__name__) @@ -66,8 +65,8 @@ def from_settings(cls, settings): # class as standalone dupefilter with scrapy's default scheduler # if scrapy passes spider on open() method this wouldn't be needed # TODO: Use SCRAPY_JOB env as default and fallback to timestamp. - key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())} - debug = settings.getbool('DUPEFILTER_DEBUG') + key = defaults.DUPEFILTER_KEY % {"timestamp": int(time.time())} + debug = settings.getbool("DUPEFILTER_DEBUG") return cls(server, key=key, debug=debug) @classmethod @@ -127,12 +126,14 @@ def request_fingerprint(self, request): def from_spider(cls, spider): settings = spider.settings server = get_redis_from_settings(settings) - dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY) - key = dupefilter_key % {'spider': spider.name} - debug = settings.getbool('DUPEFILTER_DEBUG') + dupefilter_key = settings.get( + "SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY + ) + key = dupefilter_key % {"spider": spider.name} + debug = settings.getbool("DUPEFILTER_DEBUG") return cls(server, key=key, debug=debug) - def close(self, reason=''): + def close(self, reason=""): """Delete data on close. Called by Scrapy's scheduler. Parameters @@ -157,10 +158,12 @@ def log(self, request, spider): """ if self.debug: msg = "Filtered duplicate request: %(request)s" - self.logger.debug(msg, {'request': request}, extra={'spider': spider}) + self.logger.debug(msg, {"request": request}, extra={"spider": spider}) elif self.logdupes: - msg = ("Filtered duplicate request %(request)s" - " - no more duplicates will be shown" - " (see DUPEFILTER_DEBUG to show all duplicates)") - self.logger.debug(msg, {'request': request}, extra={'spider': spider}) + msg = ( + "Filtered duplicate request %(request)s" + " - no more duplicates will be shown" + " (see DUPEFILTER_DEBUG to show all duplicates)" + ) + self.logger.debug(msg, {"request": request}, extra={"spider": spider}) self.logdupes = False diff --git a/src/scrapy_redis/pipelines.py b/src/scrapy_redis/pipelines.py index 8ae4ef0f..57267a79 100644 --- a/src/scrapy_redis/pipelines.py +++ b/src/scrapy_redis/pipelines.py @@ -4,11 +4,10 @@ from . import connection, defaults - default_serialize = ScrapyJSONEncoder().encode -class RedisPipeline(object): +class RedisPipeline: """Pushes serialized item into a redis list/queue Settings @@ -20,9 +19,9 @@ class RedisPipeline(object): """ - def __init__(self, server, - key=defaults.PIPELINE_KEY, - serialize_func=default_serialize): + def __init__( + self, server, key=defaults.PIPELINE_KEY, serialize_func=default_serialize + ): """Initialize pipeline. Parameters @@ -42,14 +41,12 @@ def __init__(self, server, @classmethod def from_settings(cls, settings): params = { - 'server': connection.from_settings(settings), + "server": connection.from_settings(settings), } - if settings.get('REDIS_ITEMS_KEY'): - params['key'] = settings['REDIS_ITEMS_KEY'] - if settings.get('REDIS_ITEMS_SERIALIZER'): - params['serialize_func'] = load_object( - settings['REDIS_ITEMS_SERIALIZER'] - ) + if settings.get("REDIS_ITEMS_KEY"): + params["key"] = settings["REDIS_ITEMS_KEY"] + if settings.get("REDIS_ITEMS_SERIALIZER"): + params["serialize_func"] = load_object(settings["REDIS_ITEMS_SERIALIZER"]) return cls(**params) @@ -73,4 +70,4 @@ def item_key(self, item, spider): and/or spider. 
""" - return self.key % {'spider': spider.name} + return self.key % {"spider": spider.name} diff --git a/src/scrapy_redis/queue.py b/src/scrapy_redis/queue.py index 7039d1a1..075f0cac 100644 --- a/src/scrapy_redis/queue.py +++ b/src/scrapy_redis/queue.py @@ -6,7 +6,7 @@ from . import picklecompat -class Base(object): +class Base: """Per-spider base queue class""" def __init__(self, server, spider, key, serializer=None): @@ -28,14 +28,18 @@ def __init__(self, server, spider, key, serializer=None): # Backward compatibility. # TODO: deprecate pickle. serializer = picklecompat - if not hasattr(serializer, 'loads'): - raise TypeError(f"serializer does not implement 'loads' function: {serializer}") - if not hasattr(serializer, 'dumps'): - raise TypeError(f"serializer does not implement 'dumps' function: {serializer}") + if not hasattr(serializer, "loads"): + raise TypeError( + f"serializer does not implement 'loads' function: {serializer}" + ) + if not hasattr(serializer, "dumps"): + raise TypeError( + f"serializer does not implement 'dumps' function: {serializer}" + ) self.server = server self.spider = spider - self.key = key % {'spider': spider.name} + self.key = key % {"spider": spider.name} self.serializer = serializer def _encode_request(self, request): @@ -105,7 +109,7 @@ def push(self, request): # We don't use zadd method as the order of arguments change depending on # whether the class is Redis or StrictRedis, and the option of using # kwargs only accepts strings, not bytes. - self.server.execute_command('ZADD', self.key, score, data) + self.server.execute_command("ZADD", self.key, score, data) def pop(self, timeout=0): """ diff --git a/src/scrapy_redis/scheduler.py b/src/scrapy_redis/scheduler.py index 28bc1973..0814d59a 100644 --- a/src/scrapy_redis/scheduler.py +++ b/src/scrapy_redis/scheduler.py @@ -1,5 +1,4 @@ import importlib -import six from scrapy.utils.misc import load_object @@ -7,7 +6,7 @@ # TODO: add SCRAPY_JOB support. -class Scheduler(object): +class Scheduler: """Redis-based scheduler Settings @@ -31,15 +30,18 @@ class Scheduler(object): """ - def __init__(self, server, - persist=False, - flush_on_start=False, - queue_key=defaults.SCHEDULER_QUEUE_KEY, - queue_cls=defaults.SCHEDULER_QUEUE_CLASS, - dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, - dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, - idle_before_close=0, - serializer=None): + def __init__( + self, + server, + persist=False, + flush_on_start=False, + queue_key=defaults.SCHEDULER_QUEUE_KEY, + queue_cls=defaults.SCHEDULER_QUEUE_CLASS, + dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, + dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, + idle_before_close=0, + serializer=None, + ): """Initialize scheduler. Parameters @@ -82,21 +84,21 @@ def __len__(self): @classmethod def from_settings(cls, settings): kwargs = { - 'persist': settings.getbool('SCHEDULER_PERSIST'), - 'flush_on_start': settings.getbool('SCHEDULER_FLUSH_ON_START'), - 'idle_before_close': settings.getint('SCHEDULER_IDLE_BEFORE_CLOSE'), + "persist": settings.getbool("SCHEDULER_PERSIST"), + "flush_on_start": settings.getbool("SCHEDULER_FLUSH_ON_START"), + "idle_before_close": settings.getint("SCHEDULER_IDLE_BEFORE_CLOSE"), } # If these values are missing, it means we want to use the defaults. optional = { # TODO: Use custom prefixes for this settings to note that are # specific to scrapy-redis. 
- 'queue_key': 'SCHEDULER_QUEUE_KEY', - 'queue_cls': 'SCHEDULER_QUEUE_CLASS', - 'dupefilter_key': 'SCHEDULER_DUPEFILTER_KEY', + "queue_key": "SCHEDULER_QUEUE_KEY", + "queue_cls": "SCHEDULER_QUEUE_CLASS", + "dupefilter_key": "SCHEDULER_DUPEFILTER_KEY", # We use the default setting name to keep compatibility. - 'dupefilter_cls': 'DUPEFILTER_CLASS', - 'serializer': 'SCHEDULER_SERIALIZER', + "dupefilter_cls": "DUPEFILTER_CLASS", + "serializer": "SCHEDULER_SERIALIZER", } for name, setting_name in optional.items(): val = settings.get(setting_name) @@ -104,8 +106,8 @@ def from_settings(cls, settings): kwargs[name] = val # Support serializer as a path to a module. - if isinstance(kwargs.get('serializer'), six.string_types): - kwargs['serializer'] = importlib.import_module(kwargs['serializer']) + if isinstance(kwargs.get("serializer"), str): + kwargs["serializer"] = importlib.import_module(kwargs["serializer"]) server = connection.from_settings(settings) # Ensure the connection is working. @@ -127,11 +129,13 @@ def open(self, spider): self.queue = load_object(self.queue_cls)( server=self.server, spider=spider, - key=self.queue_key % {'spider': spider.name}, + key=self.queue_key % {"spider": spider.name}, serializer=self.serializer, ) except TypeError as e: - raise ValueError(f"Failed to instantiate queue class '{self.queue_cls}': {e}") + raise ValueError( + f"Failed to instantiate queue class '{self.queue_cls}': {e}" + ) self.df = load_object(self.dupefilter_cls).from_spider(spider) @@ -154,7 +158,7 @@ def enqueue_request(self, request): self.df.log(request, self.spider) return False if self.stats: - self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) + self.stats.inc_value("scheduler/enqueued/redis", spider=self.spider) self.queue.push(request) return True @@ -162,7 +166,7 @@ def next_request(self): block_pop_timeout = self.idle_before_close request = self.queue.pop(block_pop_timeout) if request and self.stats: - self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider) + self.stats.inc_value("scheduler/dequeued/redis", spider=self.spider) return request def has_pending_requests(self): diff --git a/src/scrapy_redis/spiders.py b/src/scrapy_redis/spiders.py index 36c56b28..9ca48a87 100644 --- a/src/scrapy_redis/spiders.py +++ b/src/scrapy_redis/spiders.py @@ -1,17 +1,21 @@ import json +import time from collections.abc import Iterable -from scrapy import signals, FormRequest, version_info as scrapy_version + +from scrapy import FormRequest, signals +from scrapy import version_info as scrapy_version from scrapy.exceptions import DontCloseSpider -from scrapy.spiders import Spider, CrawlSpider +from scrapy.spiders import CrawlSpider, Spider + from scrapy_redis.utils import TextColor -import time from . import connection, defaults from .utils import bytes_to_str, is_dict -class RedisMixin(object): +class RedisMixin: """Mixin class to implement reading urls from a redis queue.""" + redis_key = None redis_batch_size = None redis_encoding = None @@ -39,7 +43,7 @@ def setup_redis(self, crawler=None): # We allow optional crawler argument to keep backwards # compatibility. # XXX: Raise a deprecation warning. 
- crawler = getattr(self, 'crawler', None) + crawler = getattr(self, "crawler", None) if crawler is None: raise ValueError("crawler is required") @@ -48,16 +52,19 @@ def setup_redis(self, crawler=None): if self.redis_key is None: self.redis_key = settings.get( - 'REDIS_START_URLS_KEY', defaults.START_URLS_KEY, + "REDIS_START_URLS_KEY", + defaults.START_URLS_KEY, ) - self.redis_key = self.redis_key % {'name': self.name} + self.redis_key = self.redis_key % {"name": self.name} if not self.redis_key.strip(): raise ValueError("redis_key must not be empty") if self.redis_batch_size is None: - self.redis_batch_size = settings.getint('CONCURRENT_REQUESTS', defaults.REDIS_CONCURRENT_REQUESTS) + self.redis_batch_size = settings.getint( + "CONCURRENT_REQUESTS", defaults.REDIS_CONCURRENT_REQUESTS + ) try: self.redis_batch_size = int(self.redis_batch_size) @@ -65,18 +72,22 @@ def setup_redis(self, crawler=None): raise ValueError("redis_batch_size must be an integer") if self.redis_encoding is None: - self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING) + self.redis_encoding = settings.get( + "REDIS_ENCODING", defaults.REDIS_ENCODING + ) - self.logger.info("Reading start URLs from redis key '%(redis_key)s' " - "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)", - self.__dict__) + self.logger.info( + "Reading start URLs from redis key '%(redis_key)s' " + "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)", + self.__dict__, + ) self.server = connection.from_settings(crawler.settings) - if settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET): + if settings.getbool("REDIS_START_URLS_AS_SET", defaults.START_URLS_AS_SET): self.fetch_data = self.server.spop self.count_size = self.server.scard - elif settings.getbool('REDIS_START_URLS_AS_ZSET', defaults.START_URLS_AS_ZSET): + elif settings.getbool("REDIS_START_URLS_AS_ZSET", defaults.START_URLS_AS_ZSET): self.fetch_data = self.pop_priority_queue self.count_size = self.server.zcard else: @@ -85,8 +96,7 @@ def setup_redis(self, crawler=None): if self.max_idle_time is None: self.max_idle_time = settings.get( - "MAX_IDLE_TIME_BEFORE_CLOSE", - defaults.MAX_IDLE_TIME + "MAX_IDLE_TIME_BEFORE_CLOSE", defaults.MAX_IDLE_TIME ) try: @@ -124,7 +134,7 @@ def next_requests(self): yield req # XXX: should be here? found += 1 - self.logger.info(f'start req url:{req.url}') + self.logger.info(f"start req url:{req.url}") elif reqs: yield reqs found += 1 @@ -135,28 +145,29 @@ def next_requests(self): self.logger.debug(f"Read {found} requests from '{self.redis_key}'") def make_request_from_data(self, data): - """ - Returns a `Request` instance for data coming from Redis. + """Returns a `Request` instance for data coming from Redis. Overriding this function to support the `json` requested `data` that contains `url` ,`meta` and other optional parameters. `meta` is a nested json which contains sub-data. Along with: After accessing the data, sending the FormRequest with `url`, `meta` and addition `formdata`, `method` + For example: - { - "url": "https://exaple.com", - "meta": { - 'job-id':'123xsd', - 'start-date':'dd/mm/yy' - }, - "url_cookie_key":"fertxsas", - "method":"POST" - } - - If `url` is empty, return []. So you should verify the `url` in the data. + + { + "url": "https://example.com", + "meta": { + "job-id":"123xsd", + "start-date":"dd/mm/yy", + }, + "url_cookie_key":"fertxsas", + "method":"POST", + } + + If `url` is empty, return `[]`. So you should verify the `url` in the data. 
If `method` is empty, the request object will set method to 'GET', optional. - If `meta` is empty, the request object will set `meta` to {}, optional. + If `meta` is empty, the request object will set `meta` to an empty dictionary, optional. This json supported data can be accessed from 'scrapy.spider' through response. 'request.url', 'request.meta', 'request.cookies', 'request.method' @@ -178,15 +189,19 @@ def make_request_from_data(self, data): ) return FormRequest(formatted_data, dont_filter=True) - if parameter.get('url', None) is None: - self.logger.warning(f"{TextColor.WARNING}The data from Redis has no url key in push data{TextColor.ENDC}") + if parameter.get("url", None) is None: + self.logger.warning( + f"{TextColor.WARNING}The data from Redis has no url key in push data{TextColor.ENDC}" + ) return [] url = parameter.pop("url") method = parameter.pop("method").upper() if "method" in parameter else "GET" metadata = parameter.pop("meta") if "meta" in parameter else {} - return FormRequest(url, dont_filter=True, method=method, formdata=parameter, meta=metadata) + return FormRequest( + url, dont_filter=True, method=method, formdata=parameter, meta=metadata + ) def schedule_next_requests(self): """Schedules a request if available""" @@ -243,7 +258,7 @@ class RedisSpider(RedisMixin, Spider): @classmethod def from_crawler(cls, crawler, *args, **kwargs): - obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs) + obj = super().from_crawler(crawler, *args, **kwargs) obj.setup_redis(crawler) return obj @@ -275,6 +290,6 @@ class RedisCrawlSpider(RedisMixin, CrawlSpider): @classmethod def from_crawler(cls, crawler, *args, **kwargs): - obj = super(RedisCrawlSpider, cls).from_crawler(crawler, *args, **kwargs) + obj = super().from_crawler(crawler, *args, **kwargs) obj.setup_redis(crawler) return obj diff --git a/src/scrapy_redis/stats.py b/src/scrapy_redis/stats.py index ebd18841..29c8eb7a 100644 --- a/src/scrapy_redis/stats.py +++ b/src/scrapy_redis/stats.py @@ -1,8 +1,9 @@ -from scrapy.statscollectors import StatsCollector -from .connection import from_settings as redis_from_settings -from .defaults import STATS_KEY, SCHEDULER_PERSIST from datetime import datetime +from scrapy.statscollectors import StatsCollector + +from .connection import from_settings as redis_from_settings +from .defaults import SCHEDULER_PERSIST, STATS_KEY from .utils import convert_bytes_to_str @@ -16,17 +17,16 @@ def __init__(self, crawler, spider=None): self.server = redis_from_settings(crawler.settings) self.spider = spider self.spider_name = spider.name if spider else crawler.spidercls.name - self.stats_key = crawler.settings.get('STATS_KEY', STATS_KEY) - self.persist = crawler.settings.get( - 'SCHEDULER_PERSIST', SCHEDULER_PERSIST) + self.stats_key = crawler.settings.get("STATS_KEY", STATS_KEY) + self.persist = crawler.settings.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST) def _get_key(self, spider=None): """Return the hash name of stats""" if spider: - return self.stats_key % {'spider': spider.name} + return self.stats_key % {"spider": spider.name} if self.spider: - return self.stats_key % {'spider': self.spider.name} - return self.stats_key % {'spider': self.spider_name or 'scrapy'} + return self.stats_key % {"spider": self.spider.name} + return self.stats_key % {"spider": self.spider_name or "scrapy"} @classmethod def from_crawler(cls, crawler): diff --git a/src/scrapy_redis/utils.py b/src/scrapy_redis/utils.py index 2a8dbbf5..224782ec 100644 --- a/src/scrapy_redis/utils.py +++ 
b/src/scrapy_redis/utils.py @@ -5,18 +5,18 @@ class TextColor: - HEADER = '\033[95m' - OKBLUE = '\033[94m' - OKCYAN = '\033[96m' - OKGREEN = '\033[92m' - WARNING = '\033[93m' - FAIL = '\033[91m' - ENDC = '\033[0m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - - -def bytes_to_str(s, encoding='utf-8'): + HEADER = "\033[95m" + OKBLUE = "\033[94m" + OKCYAN = "\033[96m" + OKGREEN = "\033[92m" + WARNING = "\033[93m" + FAIL = "\033[91m" + ENDC = "\033[0m" + BOLD = "\033[1m" + UNDERLINE = "\033[4m" + + +def bytes_to_str(s, encoding="utf-8"): """Returns a str if a bytes object is given.""" if six.PY3 and isinstance(s, bytes): return s.decode(encoding) @@ -32,9 +32,9 @@ def is_dict(string_content): return True -def convert_bytes_to_str(data, encoding='utf-8'): +def convert_bytes_to_str(data, encoding="utf-8"): """Convert a dict's keys & values from `bytes` to `str` - or convert bytes to str""" + or convert bytes to str""" if isinstance(data, bytes): return data.decode(encoding) if isinstance(data, dict): diff --git a/tests/test_connection.py b/tests/test_connection.py index b126e2fe..bf84959e 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -1,16 +1,12 @@ -import mock +from unittest import mock from scrapy.settings import Settings from scrapy_redis import defaults -from scrapy_redis.connection import ( - from_settings, - get_redis, - get_redis_from_settings, -) +from scrapy_redis.connection import from_settings, get_redis, get_redis_from_settings -class TestGetRedis(object): +class TestGetRedis: def test_default_instance(self): server = get_redis() @@ -18,47 +14,51 @@ def test_default_instance(self): def test_custom_class(self): client_cls = mock.Mock() - server = get_redis(param='foo', redis_cls=client_cls) + server = get_redis(param="foo", redis_cls=client_cls) assert server is client_cls.return_value - client_cls.assert_called_with(param='foo') + client_cls.assert_called_with(param="foo") def test_from_url(self): client_cls = mock.Mock() - url = 'redis://localhost' - server = get_redis(redis_cls=client_cls, url=url, param='foo') + url = "redis://localhost" + server = get_redis(redis_cls=client_cls, url=url, param="foo") assert server is client_cls.from_url.return_value - client_cls.from_url.assert_called_with(url, param='foo') + client_cls.from_url.assert_called_with(url, param="foo") -class TestFromSettings(object): +class TestFromSettings: def setup(self): self.redis_cls = mock.Mock() self.expected_params = { - 'timeout': 0, - 'flag': False, + "timeout": 0, + "flag": False, } - self.settings = Settings({ - 'REDIS_PARAMS': dict(self.expected_params, redis_cls=self.redis_cls), - }) + self.settings = Settings( + { + "REDIS_PARAMS": dict(self.expected_params, redis_cls=self.redis_cls), + } + ) def test_redis_cls_default(self): server = from_settings(Settings()) assert isinstance(server, defaults.REDIS_CLS) def test_redis_cls_custom_path(self): - self.settings['REDIS_PARAMS']['redis_cls'] = 'mock.Mock' + self.settings["REDIS_PARAMS"]["redis_cls"] = "unittest.mock.Mock" server = from_settings(self.settings) assert isinstance(server, mock.Mock) def test_default_params(self): server = from_settings(self.settings) assert server is self.redis_cls.return_value - self.redis_cls.assert_called_with(**dict(defaults.REDIS_PARAMS, **self.expected_params)) + self.redis_cls.assert_called_with( + **dict(defaults.REDIS_PARAMS, **self.expected_params) + ) def test_override_default_params(self): - for key, val in defaults.REDIS_PARAMS.items(): - self.expected_params[key] = 
self.settings['REDIS_PARAMS'][key] = object() + for key, _ in defaults.REDIS_PARAMS.items(): + self.expected_params[key] = self.settings["REDIS_PARAMS"][key] = object() server = from_settings(self.settings) assert server is self.redis_cls.return_value diff --git a/tests/test_dupefilter.py b/tests/test_dupefilter.py index b5aeb9d6..04192a5d 100644 --- a/tests/test_dupefilter.py +++ b/tests/test_dupefilter.py @@ -1,4 +1,4 @@ -import mock +from unittest import mock from scrapy.http import Request from scrapy.settings import Settings @@ -9,7 +9,7 @@ def get_redis_mock(): server = mock.Mock() - def sadd(key, fp, added=0, db={}): + def sadd(key, fp, added=0, db={}): # noqa: mutable db fingerprints = db.setdefault(key, set()) if fp not in fingerprints: fingerprints.add(fp) @@ -21,27 +21,27 @@ def sadd(key, fp, added=0, db={}): return server -class TestRFPDupeFilter(object): +class TestRFPDupeFilter: def setup(self): self.server = get_redis_mock() - self.key = 'dupefilter:1' + self.key = "dupefilter:1" self.df = RFPDupeFilter(self.server, self.key) def test_request_seen(self): - req = Request('http://example.com') + req = Request("http://example.com") def same_request(): assert not self.df.request_seen(req) assert self.df.request_seen(req) def diff_method(): - diff_method = Request('http://example.com', method='POST') + diff_method = Request("http://example.com", method="POST") assert self.df.request_seen(req) assert not self.df.request_seen(diff_method) def diff_url(): - diff_url = Request('http://example2.com') + diff_url = Request("http://example2.com") assert self.df.request_seen(req) assert not self.df.request_seen(diff_url) @@ -50,7 +50,7 @@ def diff_url(): diff_url() def test_overridable_request_fingerprinter(self): - req = Request('http://example.com') + req = Request("http://example.com") self.df.request_fingerprint = mock.Mock(wraps=self.df.request_fingerprint) assert not self.df.request_seen(req) self.df.request_fingerprint.assert_called_with(req) @@ -62,34 +62,36 @@ def test_clear_deletes(self): def test_close_calls_clear(self): self.df.clear = mock.Mock(wraps=self.df.clear) self.df.close() - self.df.close(reason='foo') + self.df.close(reason="foo") assert self.df.clear.call_count == 2 def test_log_dupes(): def _test(df, dupes, logcount): df.logger.debug = mock.Mock(wraps=df.logger.debug) - for i in range(dupes): - req = Request('http://example') + for _ in range(dupes): + req = Request("http://example") df.log(req, spider=mock.Mock()) assert df.logger.debug.call_count == logcount server = get_redis_mock() - df_quiet = RFPDupeFilter(server, 'foo') # debug=False + df_quiet = RFPDupeFilter(server, "foo") # debug=False _test(df_quiet, 5, 1) - df_debug = RFPDupeFilter(server, 'foo', debug=True) + df_debug = RFPDupeFilter(server, "foo", debug=True) _test(df_debug, 5, 5) -@mock.patch('scrapy_redis.dupefilter.get_redis_from_settings') -class TestFromMethods(object): +@mock.patch("scrapy_redis.dupefilter.get_redis_from_settings") +class TestFromMethods: def setup(self): - self.settings = Settings({ - 'DUPEFILTER_DEBUG': True, - }) + self.settings = Settings( + { + "DUPEFILTER_DEBUG": True, + } + ) def test_from_settings(self, get_redis_from_settings): df = RFPDupeFilter.from_settings(self.settings) @@ -102,5 +104,5 @@ def test_from_crawler(self, get_redis_from_settings): def assert_dupefilter(self, df, get_redis_from_settings): assert df.server is get_redis_from_settings.return_value - assert df.key.startswith('dupefilter:') + assert df.key.startswith("dupefilter:") assert df.debug # true diff 
--git a/tests/test_picklecompat.py b/tests/test_picklecompat.py index b9b3b40d..5c9c243f 100644 --- a/tests/test_picklecompat.py +++ b/tests/test_picklecompat.py @@ -3,16 +3,16 @@ def test_picklecompat(): obj = { - '_encoding': 'utf-8', - 'body': '', - 'callback': '_response_downloaded', - 'cookies': {}, - 'dont_filter': False, - 'errback': None, - 'headers': {'Referer': ['http://www.dmoz.org/']}, - 'meta': {'depth': 1, 'link_text': 'Fran\xe7ais', 'rule': 0}, - 'method': 'GET', - 'priority': 0, - 'url': 'http://www.dmoz.org/World/Fran%C3%A7ais/', + "_encoding": "utf-8", + "body": "", + "callback": "_response_downloaded", + "cookies": {}, + "dont_filter": False, + "errback": None, + "headers": {"Referer": ["http://www.dmoz.org/"]}, + "meta": {"depth": 1, "link_text": "Fran\xe7ais", "rule": 0}, + "method": "GET", + "priority": 0, + "url": "http://www.dmoz.org/World/Fran%C3%A7ais/", } assert obj == picklecompat.loads(picklecompat.dumps(obj)) diff --git a/tests/test_queue.py b/tests/test_queue.py index adcbe716..84bd1165 100644 --- a/tests/test_queue.py +++ b/tests/test_queue.py @@ -1,4 +1,4 @@ -import mock +from unittest import mock from scrapy import Spider from scrapy.http import Request @@ -6,23 +6,23 @@ from scrapy_redis.queue import Base -class TestBaseQueue(object): +class TestBaseQueue: queue_cls = Base def setup(self): self.server = mock.Mock() - self.spider = Spider(name='foo') + self.spider = Spider(name="foo") self.spider.parse_method = lambda x: x - self.key = 'key' + self.key = "key" self.q = self.queue_cls(self.server, self.spider, self.key) def test_encode_decode_requests(self, q=None): if q is None: q = self.q - req = Request('http://example.com', - callback=self.spider.parse, - meta={'foo': 'bar'}) + req = Request( + "http://example.com", callback=self.spider.parse, meta={"foo": "bar"} + ) out = q._decode_request(q._encode_request(req)) assert req.url == out.url assert req.meta == out.meta diff --git a/tests/test_scrapy_redis.py b/tests/test_scrapy_redis.py index f5db4e40..5babbcc3 100644 --- a/tests/test_scrapy_redis.py +++ b/tests/test_scrapy_redis.py @@ -1,40 +1,39 @@ import os +from unittest import TestCase, mock -import mock import redis - from scrapy import Request, Spider from scrapy.settings import Settings from scrapy.utils.test import get_crawler -from unittest import TestCase from scrapy_redis import connection from scrapy_redis.dupefilter import RFPDupeFilter from scrapy_redis.queue import FifoQueue, LifoQueue, PriorityQueue from scrapy_redis.scheduler import Scheduler - # allow test settings from environment -REDIS_HOST = os.environ.get('REDIS_HOST', 'localhost') -REDIS_PORT = int(os.environ.get('REDIS_PORT', 6379)) +REDIS_HOST = os.environ.get("REDIS_HOST", "localhost") +REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379)) def get_spider(*args, **kwargs): - crawler = get_crawler(spidercls=kwargs.pop('spidercls', None), - settings_dict=kwargs.pop('settings_dict', None)) + crawler = get_crawler( + spidercls=kwargs.pop("spidercls", None), + settings_dict=kwargs.pop("settings_dict", None), + ) return crawler._create_spider(*args, **kwargs) -class RedisTestMixin(object): +class RedisTestMixin: @property def server(self): - if not hasattr(self, '_redis'): + if not hasattr(self, "_redis"): self._redis = redis.Redis(REDIS_HOST, REDIS_PORT) return self._redis def clear_keys(self, prefix): - keys = self.server.keys(prefix + '*') + keys = self.server.keys(prefix + "*") if keys: self.server.delete(*keys) @@ -42,19 +41,19 @@ def clear_keys(self, prefix): class 
DupeFilterTest(RedisTestMixin, TestCase): def setUp(self): - self.key = 'scrapy_redis:tests:dupefilter:' + self.key = "scrapy_redis:tests:dupefilter:" self.df = RFPDupeFilter(self.server, self.key) def tearDown(self): self.clear_keys(self.key) def test_dupe_filter(self): - req = Request('http://example.com') + req = Request("http://example.com") self.assertFalse(self.df.request_seen(req)) self.assertTrue(self.df.request_seen(req)) - self.df.close('nothing') + self.df.close("nothing") class QueueTestMixin(RedisTestMixin): @@ -62,9 +61,9 @@ class QueueTestMixin(RedisTestMixin): queue_cls = None def setUp(self): - self.spider = get_spider(name='myspider') - self.key = f'scrapy_redis:tests:{self.spider.name}:queue' - self.q = self.queue_cls(self.server, Spider('myspider'), self.key) + self.spider = get_spider(name="myspider") + self.key = f"scrapy_redis:tests:{self.spider.name}:queue" + self.q = self.queue_cls(self.server, Spider("myspider"), self.key) def tearDown(self): self.clear_keys(self.key) @@ -80,7 +79,7 @@ def test_clear(self): # duplication filter whenever the serielized requests are the same. # This might be unwanted on repetitive requests to the same page # even with dont_filter=True flag. - req = Request(f'http://example.com/?page={i}') + req = Request(f"http://example.com/?page={i}") self.q.push(req) self.assertEqual(len(self.q), 10) @@ -93,8 +92,8 @@ class FifoQueueTest(QueueTestMixin, TestCase): queue_cls = FifoQueue def test_queue(self): - req1 = Request('http://example.com/page1') - req2 = Request('http://example.com/page2') + req1 = Request("http://example.com/page1") + req2 = Request("http://example.com/page2") self.q.push(req1) self.q.push(req2) @@ -111,9 +110,9 @@ class PriorityQueueTest(QueueTestMixin, TestCase): queue_cls = PriorityQueue def test_queue(self): - req1 = Request('http://example.com/page1', priority=100) - req2 = Request('http://example.com/page2', priority=50) - req3 = Request('http://example.com/page2', priority=200) + req1 = Request("http://example.com/page1", priority=100) + req2 = Request("http://example.com/page2", priority=50) + req3 = Request("http://example.com/page2", priority=200) self.q.push(req1) self.q.push(req2) @@ -133,8 +132,8 @@ class LifoQueueTest(QueueTestMixin, TestCase): queue_cls = LifoQueue def test_queue(self): - req1 = Request('http://example.com/page1') - req2 = Request('http://example.com/page2') + req1 = Request("http://example.com/page1") + req2 = Request("http://example.com/page2") self.q.push(req1) self.q.push(req2) @@ -149,19 +148,22 @@ def test_queue(self): class SchedulerTest(RedisTestMixin, TestCase): def setUp(self): - self.key_prefix = 'scrapy_redis:tests:' - self.queue_key = self.key_prefix + '%(spider)s:requests' - self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter' - self.spider = get_spider(name='myspider', settings_dict={ - 'REDIS_HOST': REDIS_HOST, - 'REDIS_PORT': REDIS_PORT, - 'SCHEDULER_QUEUE_KEY': self.queue_key, - 'SCHEDULER_DUPEFILTER_KEY': self.dupefilter_key, - 'SCHEDULER_FLUSH_ON_START': False, - 'SCHEDULER_PERSIST': False, - 'SCHEDULER_SERIALIZER': 'pickle', - 'DUPEFILTER_CLASS': 'scrapy_redis.dupefilter.RFPDupeFilter', - }) + self.key_prefix = "scrapy_redis:tests:" + self.queue_key = self.key_prefix + "%(spider)s:requests" + self.dupefilter_key = self.key_prefix + "%(spider)s:dupefilter" + self.spider = get_spider( + name="myspider", + settings_dict={ + "REDIS_HOST": REDIS_HOST, + "REDIS_PORT": REDIS_PORT, + "SCHEDULER_QUEUE_KEY": self.queue_key, + "SCHEDULER_DUPEFILTER_KEY": 
self.dupefilter_key, + "SCHEDULER_FLUSH_ON_START": False, + "SCHEDULER_PERSIST": False, + "SCHEDULER_SERIALIZER": "pickle", + "DUPEFILTER_CLASS": "scrapy_redis.dupefilter.RFPDupeFilter", + }, + ) self.scheduler = Scheduler.from_crawler(self.spider.crawler) def tearDown(self): @@ -174,7 +176,7 @@ def test_scheduler(self): self.scheduler.open(self.spider) self.assertEqual(len(self.scheduler), 0) - req = Request('http://example.com') + req = Request("http://example.com") self.scheduler.enqueue_request(req) self.assertTrue(self.scheduler.has_pending_requests()) self.assertEqual(len(self.scheduler), 1) @@ -189,7 +191,7 @@ def test_scheduler(self): self.assertFalse(self.scheduler.has_pending_requests()) self.assertEqual(len(self.scheduler), 0) - self.scheduler.close('finish') + self.scheduler.close("finish") def test_scheduler_persistent(self): # TODO: Improve this test to avoid the need to check for log messages. @@ -200,20 +202,22 @@ def test_scheduler_persistent(self): self.assertEqual(self.spider.log.call_count, 0) - self.scheduler.enqueue_request(Request('http://example.com/page1')) - self.scheduler.enqueue_request(Request('http://example.com/page2')) + self.scheduler.enqueue_request(Request("http://example.com/page1")) + self.scheduler.enqueue_request(Request("http://example.com/page2")) self.assertTrue(self.scheduler.has_pending_requests()) - self.scheduler.close('finish') + self.scheduler.close("finish") self.scheduler.open(self.spider) - self.spider.log.assert_has_calls([ - mock.call("Resuming crawl (2 requests scheduled)"), - ]) + self.spider.log.assert_has_calls( + [ + mock.call("Resuming crawl (2 requests scheduled)"), + ] + ) self.assertEqual(len(self.scheduler), 2) self.scheduler.persist = False - self.scheduler.close('finish') + self.scheduler.close("finish") self.assertEqual(len(self.scheduler), 0) @@ -222,60 +226,64 @@ class ConnectionTest(TestCase): # We can get a connection from just REDIS_URL. def test_redis_url(self): - settings = Settings({ - 'REDIS_URL': 'redis://foo:bar@localhost:9001/42', - }) + settings = Settings( + { + "REDIS_URL": "redis://foo:bar@localhost:9001/42", + } + ) server = connection.from_settings(settings) connect_args = server.connection_pool.connection_kwargs - self.assertEqual(connect_args['host'], 'localhost') - self.assertEqual(connect_args['port'], 9001) - self.assertEqual(connect_args['password'], 'bar') - self.assertEqual(connect_args['db'], 42) + self.assertEqual(connect_args["host"], "localhost") + self.assertEqual(connect_args["port"], 9001) + self.assertEqual(connect_args["password"], "bar") + self.assertEqual(connect_args["db"], 42) # We can get a connection from REDIS_HOST/REDIS_PORT. def test_redis_host_port(self): - settings = Settings({ - 'REDIS_HOST': 'localhost', - 'REDIS_PORT': 9001, - }) + settings = Settings( + { + "REDIS_HOST": "localhost", + "REDIS_PORT": 9001, + } + ) server = connection.from_settings(settings) connect_args = server.connection_pool.connection_kwargs - self.assertEqual(connect_args['host'], 'localhost') - self.assertEqual(connect_args['port'], 9001) + self.assertEqual(connect_args["host"], "localhost") + self.assertEqual(connect_args["port"], 9001) # REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT. 
def test_redis_url_precedence(self): - settings = Settings(dict( - REDIS_HOST='baz', - REDIS_PORT=1337, - REDIS_URL='redis://foo:bar@localhost:9001/42' - )) + settings = Settings( + { + "REDIS_HOST": "baz", + "REDIS_PORT": 1337, + "REDIS_URL": "redis://foo:bar@localhost:9001/42", + } + ) server = connection.from_settings(settings) connect_args = server.connection_pool.connection_kwargs - self.assertEqual(connect_args['host'], 'localhost') - self.assertEqual(connect_args['port'], 9001) - self.assertEqual(connect_args['password'], 'bar') - self.assertEqual(connect_args['db'], 42) + self.assertEqual(connect_args["host"], "localhost") + self.assertEqual(connect_args["port"], 9001) + self.assertEqual(connect_args["password"], "bar") + self.assertEqual(connect_args["db"], 42) # We fallback to REDIS_HOST/REDIS_PORT if REDIS_URL is None. def test_redis_host_port_fallback(self): - settings = Settings(dict( - REDIS_HOST='baz', - REDIS_PORT=1337, - REDIS_URL=None - )) + settings = Settings( + {"REDIS_HOST": "baz", "REDIS_PORT": 1337, "REDIS_URL": None} + ) server = connection.from_settings(settings) connect_args = server.connection_pool.connection_kwargs - self.assertEqual(connect_args['host'], 'baz') - self.assertEqual(connect_args['port'], 1337) + self.assertEqual(connect_args["host"], "baz") + self.assertEqual(connect_args["port"], 1337) # We use default values for REDIS_HOST/REDIS_PORT. def test_redis_default(self): @@ -284,5 +292,5 @@ def test_redis_default(self): server = connection.from_settings(settings) connect_args = server.connection_pool.connection_kwargs - self.assertEqual(connect_args['host'], 'localhost') - self.assertEqual(connect_args['port'], 6379) + self.assertEqual(connect_args["host"], "localhost") + self.assertEqual(connect_args["port"], 6379) diff --git a/tests/test_spiders.py b/tests/test_spiders.py index 1dce5cbd..11025f6f 100644 --- a/tests/test_spiders.py +++ b/tests/test_spiders.py @@ -1,20 +1,16 @@ import contextlib -import mock import os -import pytest +from unittest import mock +import pytest from scrapy import signals from scrapy.exceptions import DontCloseSpider from scrapy.settings import Settings -from scrapy_redis.spiders import ( - RedisCrawlSpider, - RedisSpider, -) +from scrapy_redis.spiders import RedisCrawlSpider, RedisSpider - -REDIS_HOST = os.environ.get('REDIS_HOST', 'localhost') -REDIS_PORT = int(os.environ.get('REDIS_PORT', 6379)) +REDIS_HOST = os.environ.get("REDIS_HOST", "localhost") +REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379)) @contextlib.contextmanager @@ -26,21 +22,26 @@ def flushall(server): class MySpider(RedisSpider): - name = 'myspider' + name = "myspider" class MyCrawlSpider(RedisCrawlSpider): - name = 'myspider' + name = "myspider" def get_crawler(**kwargs): - return mock.Mock(settings=Settings({ - "REDIS_HOST": REDIS_HOST, - "REDIS_PORT": REDIS_PORT, - }), **kwargs) + return mock.Mock( + settings=Settings( + { + "REDIS_HOST": REDIS_HOST, + "REDIS_PORT": REDIS_PORT, + } + ), + **kwargs, + ) -class TestRedisMixin_setup_redis(object): +class TestRedisMixin_setup_redis: def setup(self): self.myspider = MySpider() @@ -52,33 +53,35 @@ def test_crawler_required(self): def test_requires_redis_key(self): self.myspider.crawler = get_crawler() - self.myspider.redis_key = '' + self.myspider.redis_key = "" with pytest.raises(ValueError) as excinfo: self.myspider.setup_redis() assert "redis_key" in str(excinfo.value) def test_invalid_batch_size(self): - self.myspider.redis_batch_size = 'x' + self.myspider.redis_batch_size = "x" 
self.myspider.crawler = get_crawler() with pytest.raises(ValueError) as excinfo: self.myspider.setup_redis() assert "redis_batch_size" in str(excinfo.value) def test_invalid_idle_time(self): - self.myspider.max_idle_time = 'x' + self.myspider.max_idle_time = "x" self.myspider.crawler = get_crawler() with pytest.raises(ValueError) as excinfo: self.myspider.setup_redis() assert "max_idle_time" in str(excinfo.value) - @mock.patch('scrapy_redis.spiders.connection') + @mock.patch("scrapy_redis.spiders.connection") def test_via_from_crawler(self, connection): server = connection.from_settings.return_value = mock.Mock() crawler = get_crawler() myspider = MySpider.from_crawler(crawler) assert myspider.server is server connection.from_settings.assert_called_with(crawler.settings) - crawler.signals.connect.assert_called_with(myspider.spider_idle, signal=signals.spider_idle) + crawler.signals.connect.assert_called_with( + myspider.spider_idle, signal=signals.spider_idle + ) # Second call does nothing. server = myspider.server crawler.signals.connect.reset_mock() @@ -87,27 +90,31 @@ def test_via_from_crawler(self, connection): assert crawler.signals.connect.call_count == 0 -@pytest.mark.parametrize('spider_cls', [ - MySpider, - MyCrawlSpider, -]) +@pytest.mark.parametrize( + "spider_cls", + [ + MySpider, + MyCrawlSpider, + ], +) def test_from_crawler_with_spider_arguments(spider_cls): crawler = get_crawler() spider = spider_cls.from_crawler( - crawler, 'foo', - redis_key='key:%(name)s', - redis_batch_size='2000', - max_idle_time='100', + crawler, + "foo", + redis_key="key:%(name)s", + redis_batch_size="2000", + max_idle_time="100", ) - assert spider.name == 'foo' - assert spider.redis_key == 'key:foo' + assert spider.name == "foo" + assert spider.redis_key == "key:foo" assert spider.redis_batch_size == 2000 assert spider.max_idle_time == 100 class MockRequest(mock.Mock): def __init__(self, url, **kwargs): - super(MockRequest, self).__init__() + super().__init__() self.url = url def __eq__(self, other): @@ -117,38 +124,44 @@ def __hash__(self): return hash(self.url) def __repr__(self): - return f'<{self.__class__.__name__}({self.url})>' + return f"<{self.__class__.__name__}({self.url})>" -@pytest.mark.parametrize('spider_cls', [ - MySpider, - MyCrawlSpider, -]) -@pytest.mark.parametrize('start_urls_as_zset', [False, True]) -@pytest.mark.parametrize('start_urls_as_set', [False, True]) -@mock.patch('scrapy.spiders.Request', MockRequest) +@pytest.mark.parametrize( + "spider_cls", + [ + MySpider, + MyCrawlSpider, + ], +) +@pytest.mark.parametrize("start_urls_as_zset", [False, True]) +@pytest.mark.parametrize("start_urls_as_set", [False, True]) +@mock.patch("scrapy.spiders.Request", MockRequest) def test_consume_urls_from_redis(start_urls_as_zset, start_urls_as_set, spider_cls): batch_size = 5 - redis_key = 'start:urls' + redis_key = "start:urls" crawler = get_crawler() - crawler.settings.setdict({ - 'REDIS_HOST': REDIS_HOST, - 'REDIS_PORT': REDIS_PORT, - 'REDIS_START_URLS_KEY': redis_key, - 'REDIS_START_URLS_AS_ZSET': start_urls_as_zset, - 'REDIS_START_URLS_AS_SET': start_urls_as_set, - 'CONCURRENT_REQUESTS': batch_size, - }) + crawler.settings.setdict( + { + "REDIS_HOST": REDIS_HOST, + "REDIS_PORT": REDIS_PORT, + "REDIS_START_URLS_KEY": redis_key, + "REDIS_START_URLS_AS_ZSET": start_urls_as_zset, + "REDIS_START_URLS_AS_SET": start_urls_as_set, + "CONCURRENT_REQUESTS": batch_size, + } + ) spider = spider_cls.from_crawler(crawler) with flushall(spider.server): - urls = [ - f'http://example.com/{i}' for i in 
range(batch_size * 2) - ] + urls = [f"http://example.com/{i}" for i in range(batch_size * 2)] reqs = [] if start_urls_as_set: server_put = spider.server.sadd elif start_urls_as_zset: - server_put = lambda key, value: spider.server.zadd(key, {value: 0}) + + def server_put(key, value): + spider.server.zadd(key, {value: 0}) + else: server_put = spider.server.rpush for url in urls: @@ -159,7 +172,7 @@ def test_consume_urls_from_redis(start_urls_as_zset, start_urls_as_set, spider_c start_requests = list(spider.start_requests()) if start_urls_as_zset or start_urls_as_set: assert len(start_requests) == batch_size - assert set(map(lambda x: x.url, start_requests)).issubset(map(lambda x: x.url, reqs)) + assert {r.url for r in start_requests}.issubset(r.url for r in reqs) else: assert start_requests == reqs[:batch_size] @@ -174,10 +187,11 @@ def test_consume_urls_from_redis(start_urls_as_zset, start_urls_as_set, spider_c assert crawler.engine.crawl.call_count == batch_size if start_urls_as_zset or start_urls_as_set: - crawler.engine.crawl.assert_has_calls([ - mock.call(req) for req in reqs if req not in start_requests - ], any_order=True) + crawler.engine.crawl.assert_has_calls( + [mock.call(req) for req in reqs if req not in start_requests], + any_order=True, + ) else: - crawler.engine.crawl.assert_has_calls([ - mock.call(req) for req in reqs[batch_size:] - ]) + crawler.engine.crawl.assert_has_calls( + [mock.call(req) for req in reqs[batch_size:]] + ) diff --git a/tests/test_utils.py b/tests/test_utils.py index b0a7b656..d57bc24f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,6 +2,6 @@ def test_bytes_to_str(): - assert bytes_to_str(b'foo') == 'foo' + assert bytes_to_str(b"foo") == "foo" # This char is the same in bytes or latin1. - assert bytes_to_str(b'\xc1', 'latin1') == '\xc1' + assert bytes_to_str(b"\xc1", "latin1") == "\xc1"
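
Note for reviewers skimming the reformatted tests: a minimal usage sketch of the duplicate filter those tests exercise, not part of the patch itself. It assumes a reachable local Redis on the default host/port used by the test suite; the key name is illustrative.

    import redis
    from scrapy.http import Request
    from scrapy_redis.dupefilter import RFPDupeFilter

    server = redis.Redis("localhost", 6379)           # same defaults the tests fall back to
    df = RFPDupeFilter(server, "example:dupefilter")  # key name is illustrative only

    req = Request("http://example.com")
    assert not df.request_seen(req)  # first sighting: fingerprint stored in the Redis set
    assert df.request_seen(req)      # same fingerprint again: reported as a duplicate
    df.clear()                       # deletes the fingerprint key, as test_clear_deletes checks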
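Likewise, the settings-resolution behaviour pinned down by ConnectionTest (REDIS_URL takes precedence over REDIS_HOST/REDIS_PORT, with a fallback when REDIS_URL is None) can be seen directly in a short sketch. The values below are the ones from test_redis_url_precedence; no Redis server is needed because the client is created lazily.

    from scrapy.settings import Settings
    from scrapy_redis import connection

    settings = Settings(
        {
            "REDIS_HOST": "baz",   # ignored: REDIS_URL wins when it is set
            "REDIS_PORT": 1337,    # ignored for the same reason
            "REDIS_URL": "redis://foo:bar@localhost:9001/42",
        }
    )
    server = connection.from_settings(settings)
    kwargs = server.connection_pool.connection_kwargs
    assert kwargs["host"] == "localhost" and kwargs["port"] == 9001
    assert kwargs["password"] == "bar" and kwargs["db"] == 42
    # With REDIS_URL set to None, the same call falls back to REDIS_HOST/REDIS_PORT,
    # and with no settings at all it defaults to localhost:6379, as the remaining
    # ConnectionTest cases assert.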