diff --git a/sh_scrapy/middlewares.py b/sh_scrapy/middlewares.py index 848ad77..6b93d3e 100644 --- a/sh_scrapy/middlewares.py +++ b/sh_scrapy/middlewares.py @@ -15,11 +15,11 @@ class HubstorageSpiderMiddleware(object): """Hubstorage spider middleware. - + What it does: - + - Sets parent request ids to the requests coming out of the spider. - + """ def __init__(self): @@ -27,23 +27,26 @@ def __init__(self): def process_spider_output(self, response, result, spider): parent = self._seen_requests.pop(response.request, None) + print(f"[SpiderMw] parent={parent}") for x in result: + print(f"[SpiderMw] result={x}") if isinstance(x, Request): x.meta[HS_PARENT_ID_KEY] = parent # Remove request id if it was for some reason set in the request coming from Spider. + print(f"[SpiderMw] x.meta={x.meta}") x.meta.pop(HS_REQUEST_ID_KEY, None) yield x class HubstorageDownloaderMiddleware(object): """Hubstorage dowloader middleware. - + What it does: - + - Generates request ids for all downloaded requests. - Sets parent request ids for requests generated in downloader middlewares. - Stores all downloaded requests into Hubstorage. - + """ def __init__(self): @@ -55,14 +58,19 @@ def process_request(self, request, spider): # Check if request id is set, which usually happens for retries or redirects because # those requests are usually copied from the original one. request_id = request.meta.pop(HS_REQUEST_ID_KEY, None) + print(f"[DownloaderMw] parent_or_request_id={request_id}") if request_id is not None: # Set original request id or None as a parent request id. request.meta[HS_PARENT_ID_KEY] = request_id + print(f"[DownloaderMw] request.meta={request.meta}") def process_response(self, request, response, spider): + print(f"[DownloaderMw] request={request} parent={request.meta.setdefault(HS_PARENT_ID_KEY)}") + # This class of response check is intended to fix the bug described here # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/112 if type(response).__name__ == "DummyResponse" and type(response).__module__.startswith("scrapy_poet"): + print(f"[DownloaderMw] skip") return response self.pipe_writer.write_request(