Skip to content

Commit

Permalink
tmp: trace
Browse files Browse the repository at this point in the history
  • Loading branch information
BurnzZ committed Jan 22, 2024
1 parent b27e8e9 commit be5d951
Showing 1 changed file with 14 additions and 6 deletions.
20 changes: 14 additions & 6 deletions sh_scrapy/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,35 +15,38 @@

class HubstorageSpiderMiddleware(object):
    """Hubstorage spider middleware.

    What it does:

    - Sets parent request ids to the requests coming out of the spider.
    """

    def __init__(self):
        # ``seen_requests`` is defined elsewhere in this module; it is used
        # here as a mapping keyed by request objects (see the ``pop`` below).
        self._seen_requests = seen_requests

    def process_spider_output(self, response, result, spider):
        """Tag requests produced by the spider with their parent request id.

        Args:
            response: The response the spider callback was handling; its
                ``request`` attribute is the key looked up (and removed) in
                the seen-requests mapping.
            result: Iterable of objects produced by the spider callback.
            spider: The running spider (unused here).

        Yields:
            Every element of ``result`` unchanged in order. ``Request``
            elements get ``meta[HS_PARENT_ID_KEY]`` set to the value popped
            for ``response.request`` (``None`` when there is no entry) and
            have ``meta[HS_REQUEST_ID_KEY]`` removed.
        """
        # Trace output goes through ``logging`` at DEBUG level instead of
        # ``print`` so it can be filtered or disabled and does not write to
        # stdout unconditionally for every request and item.
        import logging

        logger = logging.getLogger(__name__)

        parent = self._seen_requests.pop(response.request, None)
        logger.debug("[SpiderMw] parent=%s", parent)
        for x in result:
            logger.debug("[SpiderMw] result=%s", x)
            if isinstance(x, Request):
                x.meta[HS_PARENT_ID_KEY] = parent
                logger.debug("[SpiderMw] x.meta=%s", x.meta)
                # Remove request id if it was for some reason set in the
                # request coming from Spider.
                x.meta.pop(HS_REQUEST_ID_KEY, None)
            yield x


class HubstorageDownloaderMiddleware(object):
"""Hubstorage downloader middleware.
What it does:
- Generates request ids for all downloaded requests.
- Sets parent request ids for requests generated in downloader middlewares.
- Stores all downloaded requests into Hubstorage.
"""

def __init__(self):
Expand All @@ -55,14 +58,19 @@ def process_request(self, request, spider):
# Check if request id is set, which usually happens for retries or redirects because
# those requests are usually copied from the original one.
request_id = request.meta.pop(HS_REQUEST_ID_KEY, None)
print(f"[DownloaderMw] parent_or_request_id={request_id}")
if request_id is not None:
# Set original request id or None as a parent request id.
request.meta[HS_PARENT_ID_KEY] = request_id
print(f"[DownloaderMw] request.meta={request.meta}")

def process_response(self, request, response, spider):
print(f"[DownloaderMw] request={request} parent={request.meta.setdefault(HS_PARENT_ID_KEY)}")

# This class of response check is intended to fix the bug described here
# https://github.com/scrapy-plugins/scrapy-zyte-api/issues/112
if type(response).__name__ == "DummyResponse" and type(response).__module__.startswith("scrapy_poet"):
print(f"[DownloaderMw] skip")
return response

self.pipe_writer.write_request(
Expand Down

0 comments on commit be5d951

Please sign in to comment.