From 4d25ab848754ac69ced2bc8657a0693b992b5c63 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 22 Jul 2024 02:22:06 -0400 Subject: [PATCH] docs: Add contributing docs and code comments to explain inter-process communication --- docs/contributing.rst | 19 +++++++++++++++++++ scrapyd/environ.py | 6 ++++++ scrapyd/runner.py | 2 ++ 3 files changed, 27 insertions(+) diff --git a/docs/contributing.rst b/docs/contributing.rst index c5cef0d7..5d05857c 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -36,3 +36,22 @@ To install an editable version for development, clone the repository, change to .. code-block:: shell pip install -e . + +Developer documentation +----------------------- + +Scrapyd starts Scrapy processes. It runs ``scrapy crawl`` in the :ref:`launcher`, and ``scrapy list`` in the :ref:`schedule.json` (to check the spider exists), :ref:`addversion.json` (to return the number of spiders) and :ref:`listspiders.json` (to return the names of spiders) webservices. + +Environment variables +~~~~~~~~~~~~~~~~~~~~~ + +Scrapyd uses environment variables to communicate between the Scrapyd process and the Scrapy processes that it starts. + +SCRAPY_PROJECT + The project to use. See ``scrapyd/runner.py``. +SCRAPYD_EGG_VERSION + The version of the project, to be retrieved as an egg from :ref:`eggstorage` and activated. +SCRAPY_SETTINGS_MODULE + The Python path to the `settings <https://docs.scrapy.org/en/latest/topics/settings.html#designating-the-settings>`__ module of the project. + + This is usually the module from the `entry points <https://setuptools.pypa.io/en/latest/userguide/entry_point.html>`__ of the egg, but can be the module from the ``[settings]`` section of a :ref:`scrapy.cfg` file. See ``scrapyd/environ.py``. 
diff --git a/scrapyd/environ.py b/scrapyd/environ.py index c1c26ab8..1ed3ed8c 100644 --- a/scrapyd/environ.py +++ b/scrapyd/environ.py @@ -30,12 +30,18 @@ def get_settings(self, message): def get_environment(self, message, slot): project = message["_project"] + env = self.initenv.copy() env["SCRAPY_PROJECT"] = project + # If the version is not provided, then the runner uses the default version, determined by egg storage. if "_version" in message: env["SCRAPYD_EGG_VERSION"] = message["_version"] + # Scrapy discovers the same scrapy.cfg files as Scrapyd. So, this is only needed if users are adding [settings] + # sections to Scrapyd configuration files (which Scrapy doesn't discover). This might lead to strange behavior + # if an egg project and a [settings] project have the same name (unlikely). Preserved, since committed in 2010. if project in self.settings: env["SCRAPY_SETTINGS_MODULE"] = self.settings[project] + return env def _get_feed_uri(self, message, extension): diff --git a/scrapyd/runner.py b/scrapyd/runner.py index 394cd45f..a36173e3 100644 --- a/scrapyd/runner.py +++ b/scrapyd/runner.py @@ -26,6 +26,8 @@ def activate_egg(eggpath): distribution.activate() + # setdefault() was added in https://github.com/scrapy/scrapyd/commit/0641a57. It's not clear why, since the egg + # should control its settings module. That said, it is unlikely to already be set. os.environ.setdefault("SCRAPY_SETTINGS_MODULE", distribution.get_entry_info("scrapy", "settings").module_name)