From e4d3f805a1d7ab056921e25f4a3d5441c41fbd7a Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 29 Jan 2025 14:28:07 +0100 Subject: [PATCH 01/51] try threads --- jg/plucker/scrapers.py | 160 +++++++++++++++++++++++++++-------------- jg/plucker/settings.py | 2 +- 2 files changed, 107 insertions(+), 55 deletions(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index e927af8..a1c7f5a 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -1,18 +1,22 @@ import asyncio import logging import pickle -from threading import Thread -import traceback from pathlib import Path -from typing import Any, Callable, Coroutine, Generator, Type +from threading import Thread +from typing import Any, Coroutine, Generator, Type, cast import nest_asyncio +from crawlee import Request as ApifyRequest from apify import Actor, Configuration from apify.apify_storage_client import ApifyStorageClient +from apify.scrapy.requests import to_apify_request, to_scrapy_request from apify.scrapy.utils import apply_apify_settings -from apify.storages import KeyValueStore +from apify.storages import KeyValueStore, RequestQueue +from crawlee._utils.crypto import crypto_random_object_id from crawlee.storage_clients import MemoryStorageClient # pyright: ignore +from itemadapter import ItemAdapter # pyright: ignore from scrapy import Item, Request, Spider +from scrapy.core.scheduler import BaseScheduler from scrapy.crawler import CrawlerProcess from scrapy.http.headers import Headers from scrapy.http.response import Response @@ -22,6 +26,7 @@ from scrapy.statscollectors import StatsCollector from scrapy.utils.reactor import is_asyncio_reactor_installed from scrapy.utils.request import RequestFingerprinterProtocol +from crawlee.storage_clients.models import ProcessedRequest logger = logging.getLogger("jg.plucker") @@ -55,13 +60,22 @@ async def run_actor( Actor.log.info(f"Spider {spider_class.name}") spider_params = dict(spider_params or (await Actor.get_input()) or {}) proxy_config = spider_params.pop("proxyConfig", None) - settings = apply_apify_settings(settings=settings, proxy_config=proxy_config) - settings["HTTPCACHE_STORAGE"] = "jg.plucker.scrapers.KeyValueCacheStorage" + # settings = apply_apify_settings(settings=settings, proxy_config=proxy_config) + + # TODO experimenting + # settings["HTTPCACHE_STORAGE"] = "jg.plucker.scrapers.CacheStorage" + # del settings["ITEM_PIPELINES"][ + # "apify.scrapy.pipelines.ActorDatasetPushPipeline" + # ] + # settings["ITEM_PIPELINES"]["jg.plucker.scrapers.Pipeline"] = 1000 + # settings["SCHEDULER"] = "jg.plucker.scrapers.Scheduler" + run_spider(settings, spider_class, spider_params) def configure_async(): - nest_asyncio.apply() + # nest_asyncio.apply() + pass def iter_actor_paths(path: Path | str) -> Generator[Path, None, None]: @@ -141,36 +155,20 @@ def evaluate_stats(stats: dict[str, Any], min_items: int): raise StatsError(f"Items missing required fields: {item_count}") -# class NestedLoopThread(Thread): -# def __init__(self, func, *args, **kwargs): -# self.func = func -# self.args = args -# self.kwargs = kwargs -# super().__init__() - -# def run(self): -# return asyncio.run(self.func(*self.args, **self.kwargs)) +def run_async(coroutine: Coroutine) -> Any: + result = None + def run(): + nonlocal result + result = asyncio.run(coroutine) -class AsyncThread(Thread): - def __init__(self, *args, **kwargs): - self._target: Callable | None - self._args: tuple - self._kwargs: dict - super().__init__(*args, **kwargs) # sets the above attributes - self.result = None + 
t = Thread(target=run) + t.start() + t.join() + return result - def run(self) -> Any: - try: - if self._target is not None: - self.result = asyncio.run(self._target(*self._args, **self._kwargs)) - finally: - # Avoid a refcycle if the thread is running a function with - # an argument that has a member that points to the thread. - del self._target, self._args, self._kwargs - -class KeyValueCacheStorage: +class CacheStorage: # TODO implement expiration as in https://github.com/scrapy/scrapy/blob/a8d9746f562681ed5a268148ec959dcf0881d859/scrapy/extensions/httpcache.py#L250 # TODO implement gzipping @@ -189,16 +187,7 @@ def open_spider(self, spider: Spider) -> None: logger.debug("Using Apify key value cache storage", extra={"spider": spider}) self.spider = spider self._fingerprinter = spider.crawler.request_fingerprinter - - config = Configuration.get_global_configuration() - storage_client = ( - ApifyStorageClient.from_config(config) - if config.is_at_home - else MemoryStorageClient.from_config(config) - ) - self._kv = self._run_async( - KeyValueStore.open(configuration=config, storage_client=storage_client) - ) + self._kv = run_async(Actor.open_key_value_store()) def close_spider(self, spider: Spider) -> None: pass @@ -208,7 +197,7 @@ def retrieve_response(self, spider: Spider, request: Request) -> Response | None assert self._fingerprinter is not None, "Request fingerprinter not initialized" key = self._fingerprinter.fingerprint(request).hex() - value = self._run_async(self._kv.get_value(key)) + value = run_async(self._kv.get_value(key)) if value is None: return None # not cached @@ -234,16 +223,79 @@ def store_response( "body": response.body, } value = pickle.dumps(data, protocol=4) - self._run_async(self._kv.set_value(key, value)) + run_async(self._kv.set_value(key, value)) + + +class Pipeline: + async def process_item( + self, + item: Item, + spider: Spider, + ) -> Item: + item_dict = ItemAdapter(item).asdict() + Actor.log.debug( + f"Pushing item={item_dict} produced by spider={spider} to the dataset." + ) + run_async(Actor.push_data(item_dict)) + return item + - def _run_async(self, coroutine: Coroutine) -> Any: - result = None +class Scheduler(BaseScheduler): + def __init__(self) -> None: + self._rq: RequestQueue | None = None + self.spider: Spider | None = None + + def open(self, spider: Spider) -> None: # this has to be named "open" + self.spider = spider + self._rq = run_async(Actor.open_request_queue()) + + def has_pending_requests(self) -> bool: + assert self._rq is not None, "Request queue not initialized" + + is_finished = cast(bool, run_async(self._rq.is_finished())) + return not is_finished + + def enqueue_request(self, request: Request) -> bool: + assert self.spider is not None, "Spider not initialized" + assert self._rq is not None, "Request queue not initialized" - def run(): - nonlocal result - result = asyncio.run(coroutine) + call_id = crypto_random_object_id(8) + Actor.log.debug( + f"[{call_id}]: ApifyScheduler.enqueue_request was called (scrapy_request={request})..." + ) + apify_request = to_apify_request(request, spider=self.spider) + if apify_request is None: + Actor.log.error( + f"Request {request} was not enqueued because it could not be converted to Apify request." 
+ ) + return False + Actor.log.debug( + f"[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})" + ) + result = cast(ProcessedRequest, run_async(self._rq.add_request(apify_request))) + Actor.log.debug(f"[{call_id}]: rq.add_request.result={result}...") + return bool(result.was_already_present) + + def next_request(self) -> Request | None: + assert self._rq is not None, "Request queue not initialized" + assert self.spider is not None, "Spider not initialized" + + call_id = crypto_random_object_id(8) + Actor.log.debug(f"[{call_id}]: ApifyScheduler.next_request was called...") + apify_request = cast(ApifyRequest, run_async(self._rq.fetch_next_request())) + Actor.log.debug( + f"[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})" + ) + if apify_request is None: + return None + + # Let the Request Queue know that the request is being handled. Every request should be marked as handled, + # retrying is handled by the Scrapy's RetryMiddleware. + run_async(self._rq.mark_request_as_handled(apify_request)) - t = Thread(target=run) - t.start() - t.join() - return result + scrapy_request = to_scrapy_request(apify_request, spider=self.spider) + Actor.log.debug( + f"[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned " + f"(scrapy_request={scrapy_request})", + ) + return scrapy_request diff --git a/jg/plucker/settings.py b/jg/plucker/settings.py index 97dba48..d18a276 100644 --- a/jg/plucker/settings.py +++ b/jg/plucker/settings.py @@ -22,7 +22,7 @@ RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 403, 408, 429, 999] -HTTPCACHE_ENABLED = True +# HTTPCACHE_ENABLED = True HTTPCACHE_EXPIRATION_SECS = 18000 # 5 hours From f6c197f6c7a200d58bd9d2d5d93c4815a0b7f68a Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 30 Jan 2025 20:51:32 +0100 Subject: [PATCH 02/51] use thread pool --- jg/plucker/cli.py | 2 +- jg/plucker/scrapers.py | 68 +++++++++++++++++++++++++----------------- jg/plucker/settings.py | 2 +- 3 files changed, 42 insertions(+), 30 deletions(-) diff --git a/jg/plucker/cli.py b/jg/plucker/cli.py index c17c1d3..de97abd 100644 --- a/jg/plucker/cli.py +++ b/jg/plucker/cli.py @@ -94,7 +94,7 @@ def crawl( raise click.BadParameter( f"Actor {actor_path} not found! 
Valid actors: {actors}" ) - asyncio.run(run_actor(settings, spider_class, spider_params)) + run_actor(settings, spider_class, spider_params) else: logger.info(f"Crawling as Scrapy spider {spider_name!r}") run_spider(settings, spider_class, spider_params) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index a1c7f5a..d0c3f9c 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -1,19 +1,20 @@ import asyncio import logging import pickle +from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from threading import Thread from typing import Any, Coroutine, Generator, Type, cast import nest_asyncio -from crawlee import Request as ApifyRequest from apify import Actor, Configuration from apify.apify_storage_client import ApifyStorageClient from apify.scrapy.requests import to_apify_request, to_scrapy_request from apify.scrapy.utils import apply_apify_settings -from apify.storages import KeyValueStore, RequestQueue +from apify.storages import Dataset, KeyValueStore, RequestQueue +from crawlee import Request as ApifyRequest from crawlee._utils.crypto import crypto_random_object_id from crawlee.storage_clients import MemoryStorageClient # pyright: ignore +from crawlee.storage_clients.models import ProcessedRequest from itemadapter import ItemAdapter # pyright: ignore from scrapy import Item, Request, Spider from scrapy.core.scheduler import BaseScheduler @@ -26,12 +27,14 @@ from scrapy.statscollectors import StatsCollector from scrapy.utils.reactor import is_asyncio_reactor_installed from scrapy.utils.request import RequestFingerprinterProtocol -from crawlee.storage_clients.models import ProcessedRequest logger = logging.getLogger("jg.plucker") +thread_pool = ThreadPoolExecutor() + + def run_spider( settings: Settings, spider_class: type[Spider], spider_params: dict[str, Any] | None ) -> None: @@ -51,26 +54,41 @@ def run_spider( evaluate_stats_fn(stats_collector.get_stats(), min_items=min_items) -async def run_actor( +def run_actor( settings: Settings, spider_class: Type[Spider], spider_params: dict[str, Any] | None ) -> None: - config = Configuration.get_global_configuration() - config.purge_on_start = True - async with Actor: + run_async(Actor.init()) + try: Actor.log.info(f"Spider {spider_class.name}") - spider_params = dict(spider_params or (await Actor.get_input()) or {}) + Actor.log.info("Reading input") + spider_params = dict(spider_params or (run_async(Actor.get_input())) or {}) proxy_config = spider_params.pop("proxyConfig", None) - # settings = apply_apify_settings(settings=settings, proxy_config=proxy_config) - # TODO experimenting - # settings["HTTPCACHE_STORAGE"] = "jg.plucker.scrapers.CacheStorage" - # del settings["ITEM_PIPELINES"][ - # "apify.scrapy.pipelines.ActorDatasetPushPipeline" - # ] - # settings["ITEM_PIPELINES"]["jg.plucker.scrapers.Pipeline"] = 1000 - # settings["SCHEDULER"] = "jg.plucker.scrapers.Scheduler" + Actor.log.info("Applying Apify settings") + settings = apply_apify_settings(settings=settings, proxy_config=proxy_config) + Actor.log.info("Overriding Apify settings with custom ones") + settings["HTTPCACHE_STORAGE"] = "jg.plucker.scrapers.CacheStorage" + del settings["ITEM_PIPELINES"][ + "apify.scrapy.pipelines.ActorDatasetPushPipeline" + ] + settings["ITEM_PIPELINES"]["jg.plucker.scrapers.Pipeline"] = 1000 + settings["SCHEDULER"] = "jg.plucker.scrapers.Scheduler" + + Actor.log.info("Purging the default dataset") + dataset = cast(Dataset, run_async(Actor.open_dataset())) + run_async(dataset.drop()) + + 
Actor.log.info("Purging the default request queue") + request_queue = cast(RequestQueue, run_async(Actor.open_request_queue())) + run_async(request_queue.drop()) + + Actor.log.info("Starting the spider") run_spider(settings, spider_class, spider_params) + except Exception as e: + run_async(Actor.fail(exception=e)) + else: + run_async(Actor.exit()) def configure_async(): @@ -156,16 +174,8 @@ def evaluate_stats(stats: dict[str, Any], min_items: int): def run_async(coroutine: Coroutine) -> Any: - result = None - - def run(): - nonlocal result - result = asyncio.run(coroutine) - - t = Thread(target=run) - t.start() - t.join() - return result + future = thread_pool.submit(asyncio.run, coroutine) + return future.result() class CacheStorage: @@ -187,7 +197,9 @@ def open_spider(self, spider: Spider) -> None: logger.debug("Using Apify key value cache storage", extra={"spider": spider}) self.spider = spider self._fingerprinter = spider.crawler.request_fingerprinter - self._kv = run_async(Actor.open_key_value_store()) + self._kv = run_async( + Actor.open_key_value_store(name=f"httpcache-{spider.name}") + ) def close_spider(self, spider: Spider) -> None: pass diff --git a/jg/plucker/settings.py b/jg/plucker/settings.py index d18a276..97dba48 100644 --- a/jg/plucker/settings.py +++ b/jg/plucker/settings.py @@ -22,7 +22,7 @@ RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 403, 408, 429, 999] -# HTTPCACHE_ENABLED = True +HTTPCACHE_ENABLED = True HTTPCACHE_EXPIRATION_SECS = 18000 # 5 hours From b771eba80379a35c05e48e13016d69159375863c Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 30 Jan 2025 21:00:12 +0100 Subject: [PATCH 03/51] threads cannot be reused, the asyncio loop would be closed --- jg/plucker/scrapers.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index d0c3f9c..412b1fc 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -1,7 +1,7 @@ import asyncio import logging import pickle -from concurrent.futures import ThreadPoolExecutor +from threading import Thread from pathlib import Path from typing import Any, Coroutine, Generator, Type, cast @@ -32,9 +32,6 @@ logger = logging.getLogger("jg.plucker") -thread_pool = ThreadPoolExecutor() - - def run_spider( settings: Settings, spider_class: type[Spider], spider_params: dict[str, Any] | None ) -> None: @@ -174,8 +171,16 @@ def evaluate_stats(stats: dict[str, Any], min_items: int): def run_async(coroutine: Coroutine) -> Any: - future = thread_pool.submit(asyncio.run, coroutine) - return future.result() + result = None + + def run() -> None: + nonlocal result + result = asyncio.run(coroutine) + + t = Thread(target=run) + t.start() + t.join() + return result class CacheStorage: From b5e28b65cc12516be1d540508de6a90030aa2ec2 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 30 Jan 2025 21:14:30 +0100 Subject: [PATCH 04/51] make sure to have a new loop --- jg/plucker/scrapers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 412b1fc..dca3b0b 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -175,6 +175,7 @@ def run_async(coroutine: Coroutine) -> Any: def run() -> None: nonlocal result + asyncio.set_event_loop(asyncio.new_event_loop()) result = asyncio.run(coroutine) t = Thread(target=run) From 9120952f9f37a3d2f452f49e3dc37b40e3a0f1d2 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 30 Jan 2025 21:18:36 +0100 Subject: [PATCH 05/51] debug --- 
jg/plucker/scrapers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index dca3b0b..d4c748d 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -3,6 +3,7 @@ import pickle from threading import Thread from pathlib import Path +import threading from typing import Any, Coroutine, Generator, Type, cast import nest_asyncio @@ -176,6 +177,9 @@ def run_async(coroutine: Coroutine) -> Any: def run() -> None: nonlocal result asyncio.set_event_loop(asyncio.new_event_loop()) + print( + f"Thread {threading.current_thread().name} has event loop: {asyncio.get_event_loop()}, executing {coroutine.__name__}" + ) result = asyncio.run(coroutine) t = Thread(target=run) From 3cd8a499031805876888323a65ff2d407f3543a6 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 30 Jan 2025 23:04:21 +0100 Subject: [PATCH 06/51] cast magic See https://github.com/encode/httpx/discussions/2959#discussioncomment-7665278 --- jg/plucker/scrapers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index d4c748d..97ff002 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -56,6 +56,7 @@ def run_actor( settings: Settings, spider_class: Type[Spider], spider_params: dict[str, Any] | None ) -> None: run_async(Actor.init()) + Actor._apify_client.http_client.httpx_client.headers["Connection"] = "close" try: Actor.log.info(f"Spider {spider_class.name}") Actor.log.info("Reading input") From 1739af2063ee74fd0cb67d1b48cb5e6ced5412c4 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 30 Jan 2025 23:10:25 +0100 Subject: [PATCH 07/51] cast more magic --- jg/plucker/scrapers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 97ff002..669227d 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -56,7 +56,7 @@ def run_actor( settings: Settings, spider_class: Type[Spider], spider_params: dict[str, Any] | None ) -> None: run_async(Actor.init()) - Actor._apify_client.http_client.httpx_client.headers["Connection"] = "close" + Actor._apify_client.http_client.httpx_client._headers["Connection"] = "close" try: Actor.log.info(f"Spider {spider_class.name}") Actor.log.info("Reading input") From 0e8d848e7b20a434f41972c84c9547437c002f0a Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 30 Jan 2025 23:29:23 +0100 Subject: [PATCH 08/51] cast async magic --- jg/plucker/scrapers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 669227d..9a4c949 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -57,6 +57,7 @@ def run_actor( ) -> None: run_async(Actor.init()) Actor._apify_client.http_client.httpx_client._headers["Connection"] = "close" + Actor._apify_client.http_client.httpx_async_client._headers["Connection"] = "close" try: Actor.log.info(f"Spider {spider_class.name}") Actor.log.info("Reading input") From e87a3447216bdb69ae90549393f668a8df236e27 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 30 Jan 2025 23:32:18 +0100 Subject: [PATCH 09/51] debug --- jg/plucker/scrapers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 9a4c949..4413e98 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -182,6 +182,7 @@ def run() -> None: print( f"Thread {threading.current_thread().name} has event loop: {asyncio.get_event_loop()}, executing {coroutine.__name__}" ) + print(f"Headers: 
{Actor._apify_client.http_client.httpx_async_client._headers}") result = asyncio.run(coroutine) t = Thread(target=run) From 63c581478c93b9f00c9dd2044df6784a89819dcb Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 30 Jan 2025 23:33:12 +0100 Subject: [PATCH 10/51] re-raise --- jg/plucker/scrapers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 4413e98..e29e9be 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -87,6 +87,7 @@ def run_actor( run_spider(settings, spider_class, spider_params) except Exception as e: run_async(Actor.fail(exception=e)) + raise else: run_async(Actor.exit()) From 0ff21d4763e5a2e3e694c80415b7fd6d1449e5a0 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 30 Jan 2025 23:46:41 +0100 Subject: [PATCH 11/51] perform sorcery --- jg/plucker/scrapers.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index e29e9be..db9a61e 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -6,6 +6,7 @@ import threading from typing import Any, Coroutine, Generator, Type, cast +import httpx import nest_asyncio from apify import Actor, Configuration from apify.apify_storage_client import ApifyStorageClient @@ -56,8 +57,15 @@ def run_actor( settings: Settings, spider_class: Type[Spider], spider_params: dict[str, Any] | None ) -> None: run_async(Actor.init()) - Actor._apify_client.http_client.httpx_client._headers["Connection"] = "close" - Actor._apify_client.http_client.httpx_async_client._headers["Connection"] = "close" + Actor._apify_client.http_client.httpx_client = None + headers = Actor._apify_client.http_client.httpx_async_client.headers + headers["Connection"] = "close" + Actor._apify_client.http_client.httpx_async_client = httpx.AsyncClient( + headers=headers, + follow_redirects=True, + timeout=Actor._apify_client.http_client.httpx_async_client.timeout, + limits=httpx.Limits(max_keepalive_connections=0, max_connections=1), + ) try: Actor.log.info(f"Spider {spider_class.name}") Actor.log.info("Reading input") From fa0a15290aa731aecb90e202ef944dbb147927e5 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 31 Jan 2025 00:13:49 +0100 Subject: [PATCH 12/51] expire --- jg/plucker/scrapers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index db9a61e..e25b3a7 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -64,7 +64,9 @@ def run_actor( headers=headers, follow_redirects=True, timeout=Actor._apify_client.http_client.httpx_async_client.timeout, - limits=httpx.Limits(max_keepalive_connections=0, max_connections=1), + limits=httpx.Limits( + max_keepalive_connections=0, max_connections=1, keepalive_expiry=0 + ), ) try: Actor.log.info(f"Spider {spider_class.name}") From 07beb2c2b52ab20259daac4aa91b30a4490adc4f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 31 Jan 2025 00:20:43 +0100 Subject: [PATCH 13/51] debug --- jg/plucker/scrapers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index e25b3a7..14900c2 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -189,11 +189,9 @@ def run_async(coroutine: Coroutine) -> Any: def run() -> None: nonlocal result - asyncio.set_event_loop(asyncio.new_event_loop()) print( - f"Thread {threading.current_thread().name} has event loop: {asyncio.get_event_loop()}, executing {coroutine.__name__}" + 
f"Thread {threading.current_thread().name} executing {coroutine.__name__}" ) - print(f"Headers: {Actor._apify_client.http_client.httpx_async_client._headers}") result = asyncio.run(coroutine) t = Thread(target=run) From 905d429ba0dd0955d38df747214f24e99ff13cfd Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 31 Jan 2025 00:27:16 +0100 Subject: [PATCH 14/51] aggressively close connections before doing anything --- jg/plucker/scrapers.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 14900c2..fd07030 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -187,12 +187,18 @@ def evaluate_stats(stats: dict[str, Any], min_items: int): def run_async(coroutine: Coroutine) -> Any: result = None + async def pokus(): + await ( + Actor._apify_client.http_client.httpx_async_client._transport._pool.aclose() + ) + return await coroutine + def run() -> None: nonlocal result print( f"Thread {threading.current_thread().name} executing {coroutine.__name__}" ) - result = asyncio.run(coroutine) + result = asyncio.run(pokus()) t = Thread(target=run) t.start() From 92ab6d917787fd4ce95522f40fd5222336f6d7f6 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 11:02:45 +0100 Subject: [PATCH 15/51] use Twisted reactor to run asyncio --- jg/plucker/cli.py | 3 - jg/plucker/scrapers.py | 179 ++++++++++++++++++++++++++--------------- 2 files changed, 114 insertions(+), 68 deletions(-) diff --git a/jg/plucker/cli.py b/jg/plucker/cli.py index de97abd..d5a02b8 100644 --- a/jg/plucker/cli.py +++ b/jg/plucker/cli.py @@ -1,4 +1,3 @@ -import asyncio import importlib import json import logging @@ -26,7 +25,6 @@ from jg.plucker.scrapers import ( StatsError, - configure_async, generate_schema, get_spider_module_name, iter_actor_paths, @@ -85,7 +83,6 @@ def crawl( logger.info("Reading spider params from stdin") spider_params = json.load(spider_params_f) - configure_async() try: if apify: logger.info(f"Crawling as Apify actor {actor_path}") diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index fd07030..bbfe97e 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -1,110 +1,165 @@ import asyncio +import builtins import logging import pickle -from threading import Thread -from pathlib import Path import threading +from pathlib import Path +from threading import Thread from typing import Any, Coroutine, Generator, Type, cast -import httpx -import nest_asyncio -from apify import Actor, Configuration -from apify.apify_storage_client import ApifyStorageClient +from apify import Actor from apify.scrapy.requests import to_apify_request, to_scrapy_request from apify.scrapy.utils import apply_apify_settings from apify.storages import Dataset, KeyValueStore, RequestQueue from crawlee import Request as ApifyRequest from crawlee._utils.crypto import crypto_random_object_id -from crawlee.storage_clients import MemoryStorageClient # pyright: ignore from crawlee.storage_clients.models import ProcessedRequest from itemadapter import ItemAdapter # pyright: ignore from scrapy import Item, Request, Spider from scrapy.core.scheduler import BaseScheduler -from scrapy.crawler import CrawlerProcess +from scrapy.crawler import CrawlerProcess, CrawlerRunner from scrapy.http.headers import Headers from scrapy.http.response import Response from scrapy.responsetypes import responsetypes from scrapy.settings import BaseSettings, Settings from scrapy.spiderloader import SpiderLoader as BaseSpiderLoader from 
scrapy.statscollectors import StatsCollector +from scrapy.utils.defer import deferred_from_coro from scrapy.utils.reactor import is_asyncio_reactor_installed from scrapy.utils.request import RequestFingerprinterProtocol +from twisted.internet import asyncioreactor, defer +from twisted.internet.task import react logger = logging.getLogger("jg.plucker") +# new_client_original = _ActorType.new_client + + +# def new_client_patch(self, **kwargs) -> ApifyClientAsync: +# print(f"PATCH thread {threading.current_thread().name}") +# client = new_client_original(self, **kwargs) +# # client.http_client.httpx_async_client._headers["Connection"] = "close" +# # client.http_client = HTTPClientAsync( +# # token=token, +# # max_retries=client.max_retries, +# # min_delay_between_retries_millis=client.min_delay_between_retries_millis, +# # timeout_secs=client.timeout_secs, +# # ) +# http_client = client.http_client +# http_client.httpx_async_client = httpx.AsyncClient( +# headers={"Fuck": "you"}, +# follow_redirects=True, +# timeout=http_client.timeout_secs, +# ) +# return client + + +# _ActorType.new_client = new_client_patch + + def run_spider( settings: Settings, spider_class: type[Spider], spider_params: dict[str, Any] | None ) -> None: - logger.debug(f"Spider params: {spider_params!r}") - settings.set("SPIDER_PARAMS", spider_params) + raise NotImplementedError() + # logger.debug(f"Spider params: {spider_params!r}") + # settings.set("SPIDER_PARAMS", spider_params) - crawler_process = CrawlerProcess(settings, install_root_handler=False) - crawler_process.crawl(spider_class) - stats_collector = get_stats_collector(crawler_process) - crawler_process.start() + # crawler_process = CrawlerProcess(settings, install_root_handler=False) + # crawler_process.crawl(spider_class) + # stats_collector = get_stats_collector(crawler_process) + # crawler_process.start() - min_items = getattr(spider_class, "min_items", settings.getint("SPIDER_MIN_ITEMS")) - logger.debug(f"Min items required: {min_items}") + # min_items = getattr(spider_class, "min_items", settings.getint("SPIDER_MIN_ITEMS")) + # logger.debug(f"Min items required: {min_items}") - logger.debug(f"Custom evaluate_stats(): {hasattr(spider_class, 'evaluate_stats')}") - evaluate_stats_fn = getattr(spider_class, "evaluate_stats", evaluate_stats) - evaluate_stats_fn(stats_collector.get_stats(), min_items=min_items) + # logger.debug(f"Custom evaluate_stats(): {hasattr(spider_class, 'evaluate_stats')}") + # evaluate_stats_fn = getattr(spider_class, "evaluate_stats", evaluate_stats) + # evaluate_stats_fn(stats_collector.get_stats(), min_items=min_items) def run_actor( - settings: Settings, spider_class: Type[Spider], spider_params: dict[str, Any] | None + base_settings: Settings, + spider_class: Type[Spider], + spider_params: dict[str, Any] | None, ) -> None: - run_async(Actor.init()) - Actor._apify_client.http_client.httpx_client = None - headers = Actor._apify_client.http_client.httpx_async_client.headers - headers["Connection"] = "close" - Actor._apify_client.http_client.httpx_async_client = httpx.AsyncClient( - headers=headers, - follow_redirects=True, - timeout=Actor._apify_client.http_client.httpx_async_client.timeout, - limits=httpx.Limits( - max_keepalive_connections=0, max_connections=1, keepalive_expiry=0 - ), - ) - try: + asyncioreactor.install() + + @defer.inlineCallbacks + def crawl(reactor): + Actor.log.info("Starting actor") + yield deferred_from_coro(Actor.init()) + Actor.log.info(f"Spider {spider_class.name}") Actor.log.info("Reading input") - 
spider_params = dict(spider_params or (run_async(Actor.get_input())) or {}) - proxy_config = spider_params.pop("proxyConfig", None) + params = spider_params or (yield deferred_from_coro(Actor.get_input())) or {} + proxy_config = params.pop("proxyConfig", None) Actor.log.info("Applying Apify settings") - settings = apply_apify_settings(settings=settings, proxy_config=proxy_config) - - Actor.log.info("Overriding Apify settings with custom ones") - settings["HTTPCACHE_STORAGE"] = "jg.plucker.scrapers.CacheStorage" - del settings["ITEM_PIPELINES"][ - "apify.scrapy.pipelines.ActorDatasetPushPipeline" - ] - settings["ITEM_PIPELINES"]["jg.plucker.scrapers.Pipeline"] = 1000 - settings["SCHEDULER"] = "jg.plucker.scrapers.Scheduler" + settings = apply_apify_settings( + settings=base_settings, proxy_config=proxy_config + ) + runner = CrawlerRunner(settings) Actor.log.info("Purging the default dataset") - dataset = cast(Dataset, run_async(Actor.open_dataset())) - run_async(dataset.drop()) + dataset = cast(Dataset, (yield deferred_from_coro(Actor.open_dataset()))) + yield deferred_from_coro(dataset.drop()) Actor.log.info("Purging the default request queue") - request_queue = cast(RequestQueue, run_async(Actor.open_request_queue())) - run_async(request_queue.drop()) + request_queue = cast( + RequestQueue, (yield deferred_from_coro(Actor.open_request_queue())) + ) + yield deferred_from_coro(request_queue.drop()) Actor.log.info("Starting the spider") - run_spider(settings, spider_class, spider_params) - except Exception as e: - run_async(Actor.fail(exception=e)) - raise - else: - run_async(Actor.exit()) + yield runner.crawl(spider_class) + Actor.log.info("Exiting actor") + builtins.__IPYTHON__ = True # deception, Actor.exit() won't call sys.exit() + yield deferred_from_coro(Actor.exit()) + del builtins.__IPYTHON__ -def configure_async(): - # nest_asyncio.apply() - pass + Actor.log.info("Done!") + + react(crawl, []) + + # # httpx_client = Actor._apify_client.http_client.httpx_async_client + # # httpx_client._headers["Connection"] = "close" + # # print(f"HTTPX setting connection {httpx_client._headers} (id: {id(httpx_client)})") + # try: + # Actor.log.info(f"Spider {spider_class.name}") + # Actor.log.info("Reading input") + # spider_params = dict(spider_params or (run_async(Actor.get_input())) or {}) + # proxy_config = spider_params.pop("proxyConfig", None) + + # Actor.log.info("Applying Apify settings") + # settings = apply_apify_settings(settings=settings, proxy_config=proxy_config) + + # Actor.log.info("Overriding Apify settings with custom ones") + # settings["HTTPCACHE_STORAGE"] = "jg.plucker.scrapers.CacheStorage" + # del settings["ITEM_PIPELINES"][ + # "apify.scrapy.pipelines.ActorDatasetPushPipeline" + # ] + # settings["ITEM_PIPELINES"]["jg.plucker.scrapers.Pipeline"] = 1000 + # settings["SCHEDULER"] = "jg.plucker.scrapers.Scheduler" + + # Actor.log.info("Purging the default dataset") + # dataset = cast(Dataset, run_async(Actor.open_dataset())) + # run_async(dataset.drop()) + + # Actor.log.info("Purging the default request queue") + # request_queue = cast(RequestQueue, run_async(Actor.open_request_queue())) + # run_async(request_queue.drop()) + + # Actor.log.info("Starting the spider") + # # run_spider(settings, spider_class, spider_params) + + # except Exception as e: + # run_async(Actor.fail(exception=e)) + # raise + # else: + # run_async(Actor.exit()) def iter_actor_paths(path: Path | str) -> Generator[Path, None, None]: @@ -187,18 +242,12 @@ def evaluate_stats(stats: dict[str, Any], min_items: 
int): def run_async(coroutine: Coroutine) -> Any: result = None - async def pokus(): - await ( - Actor._apify_client.http_client.httpx_async_client._transport._pool.aclose() - ) - return await coroutine - def run() -> None: nonlocal result print( - f"Thread {threading.current_thread().name} executing {coroutine.__name__}" + f"Thread {threading.current_thread().name}, executing {coroutine.__name__}" ) - result = asyncio.run(pokus()) + result = asyncio.run(coroutine) t = Thread(target=run) t.start() From ca39323e50dedc25eff8959f4f245be33c8bc1e0 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 15:03:51 +0100 Subject: [PATCH 16/51] nicer --- jg/plucker/scrapers.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index bbfe97e..0ff0c44 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -1,5 +1,6 @@ import asyncio import builtins +import contextlib import logging import pickle import threading @@ -116,9 +117,8 @@ def crawl(reactor): yield runner.crawl(spider_class) Actor.log.info("Exiting actor") - builtins.__IPYTHON__ = True # deception, Actor.exit() won't call sys.exit() - yield deferred_from_coro(Actor.exit()) - del builtins.__IPYTHON__ + with prevent_sys_exit(): + yield deferred_from_coro(Actor.exit()) Actor.log.info("Done!") @@ -162,6 +162,16 @@ def crawl(reactor): # run_async(Actor.exit()) +@contextlib.contextmanager +def prevent_sys_exit(): + """Deception, Actor.exit() won't call sys.exit(), see also https://github.com/apify/apify-sdk-python/pull/389""" + builtins.__IPYTHON__ = True + try: + yield + finally: + builtins.__IPYTHON__ = False + + def iter_actor_paths(path: Path | str) -> Generator[Path, None, None]: for actor_spec in Path(path).rglob(".actor/actor.json"): yield actor_spec.parent.parent.relative_to(".") From d51ace59d031baa264422e01d2e94d66aa7e34cc Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 15:34:51 +0100 Subject: [PATCH 17/51] make it work, somehow --- jg/plucker/scrapers.py | 98 ++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 60 deletions(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 0ff0c44..8d9a261 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -36,8 +36,6 @@ # new_client_original = _ActorType.new_client - - # def new_client_patch(self, **kwargs) -> ApifyClientAsync: # print(f"PATCH thread {threading.current_thread().name}") # client = new_client_original(self, **kwargs) @@ -55,29 +53,39 @@ # timeout=http_client.timeout_secs, # ) # return client +# _ActorType.new_client = new_client_patch +# httpx_client = Actor._apify_client.http_client.httpx_async_client +# httpx_client._headers["Connection"] = "close" +# print(f"HTTPX setting connection {httpx_client._headers} (id: {id(httpx_client)})") -# _ActorType.new_client = new_client_patch +# Actor.log.info("Overriding Apify settings with custom ones") +# settings["HTTPCACHE_STORAGE"] = "jg.plucker.scrapers.CacheStorage" +# del settings["ITEM_PIPELINES"][ +# "apify.scrapy.pipelines.ActorDatasetPushPipeline" +# ] +# settings["ITEM_PIPELINES"]["jg.plucker.scrapers.Pipeline"] = 1000 +# settings["SCHEDULER"] = "jg.plucker.scrapers.Scheduler" def run_spider( settings: Settings, spider_class: type[Spider], spider_params: dict[str, Any] | None ) -> None: - raise NotImplementedError() - # logger.debug(f"Spider params: {spider_params!r}") - # settings.set("SPIDER_PARAMS", spider_params) + # TODO use crawler runner instead? 
make run_spider() and run_actor() DRY? + logger.debug(f"Spider params: {spider_params!r}") + settings.set("SPIDER_PARAMS", spider_params) - # crawler_process = CrawlerProcess(settings, install_root_handler=False) - # crawler_process.crawl(spider_class) - # stats_collector = get_stats_collector(crawler_process) - # crawler_process.start() + crawler_process = CrawlerProcess(settings, install_root_handler=False) + crawler_process.crawl(spider_class) + stats_collector = get_stats_collector(crawler_process) + crawler_process.start() - # min_items = getattr(spider_class, "min_items", settings.getint("SPIDER_MIN_ITEMS")) - # logger.debug(f"Min items required: {min_items}") + min_items = getattr(spider_class, "min_items", settings.getint("SPIDER_MIN_ITEMS")) + logger.debug(f"Min items required: {min_items}") - # logger.debug(f"Custom evaluate_stats(): {hasattr(spider_class, 'evaluate_stats')}") - # evaluate_stats_fn = getattr(spider_class, "evaluate_stats", evaluate_stats) - # evaluate_stats_fn(stats_collector.get_stats(), min_items=min_items) + logger.debug(f"Custom evaluate_stats(): {hasattr(spider_class, 'evaluate_stats')}") + evaluate_stats_fn = getattr(spider_class, "evaluate_stats", evaluate_stats) + evaluate_stats_fn(stats_collector.get_stats(), min_items=min_items) def run_actor( @@ -85,6 +93,7 @@ def run_actor( spider_class: Type[Spider], spider_params: dict[str, Any] | None, ) -> None: + logger.debug("Installing asyncio reactor") asyncioreactor.install() @defer.inlineCallbacks @@ -97,25 +106,31 @@ def crawl(reactor): params = spider_params or (yield deferred_from_coro(Actor.get_input())) or {} proxy_config = params.pop("proxyConfig", None) + logger.debug(f"Spider params: {spider_params!r}") + base_settings.set("SPIDER_PARAMS", spider_params) + Actor.log.info("Applying Apify settings") settings = apply_apify_settings( settings=base_settings, proxy_config=proxy_config ) runner = CrawlerRunner(settings) - Actor.log.info("Purging the default dataset") - dataset = cast(Dataset, (yield deferred_from_coro(Actor.open_dataset()))) - yield deferred_from_coro(dataset.drop()) + # TODO purge on start + # Actor.log.info("Purging the default dataset") + # dataset = cast(Dataset, (yield deferred_from_coro(Actor.open_dataset()))) + # yield deferred_from_coro(dataset.drop()) - Actor.log.info("Purging the default request queue") - request_queue = cast( - RequestQueue, (yield deferred_from_coro(Actor.open_request_queue())) - ) - yield deferred_from_coro(request_queue.drop()) + # Actor.log.info("Purging the default request queue") + # request_queue = cast( + # RequestQueue, (yield deferred_from_coro(Actor.open_request_queue())) + # ) + # yield deferred_from_coro(request_queue.drop()) Actor.log.info("Starting the spider") yield runner.crawl(spider_class) + # TODO evaluate stats + Actor.log.info("Exiting actor") with prevent_sys_exit(): yield deferred_from_coro(Actor.exit()) @@ -124,43 +139,6 @@ def crawl(reactor): react(crawl, []) - # # httpx_client = Actor._apify_client.http_client.httpx_async_client - # # httpx_client._headers["Connection"] = "close" - # # print(f"HTTPX setting connection {httpx_client._headers} (id: {id(httpx_client)})") - # try: - # Actor.log.info(f"Spider {spider_class.name}") - # Actor.log.info("Reading input") - # spider_params = dict(spider_params or (run_async(Actor.get_input())) or {}) - # proxy_config = spider_params.pop("proxyConfig", None) - - # Actor.log.info("Applying Apify settings") - # settings = apply_apify_settings(settings=settings, proxy_config=proxy_config) - - # 
Actor.log.info("Overriding Apify settings with custom ones") - # settings["HTTPCACHE_STORAGE"] = "jg.plucker.scrapers.CacheStorage" - # del settings["ITEM_PIPELINES"][ - # "apify.scrapy.pipelines.ActorDatasetPushPipeline" - # ] - # settings["ITEM_PIPELINES"]["jg.plucker.scrapers.Pipeline"] = 1000 - # settings["SCHEDULER"] = "jg.plucker.scrapers.Scheduler" - - # Actor.log.info("Purging the default dataset") - # dataset = cast(Dataset, run_async(Actor.open_dataset())) - # run_async(dataset.drop()) - - # Actor.log.info("Purging the default request queue") - # request_queue = cast(RequestQueue, run_async(Actor.open_request_queue())) - # run_async(request_queue.drop()) - - # Actor.log.info("Starting the spider") - # # run_spider(settings, spider_class, spider_params) - - # except Exception as e: - # run_async(Actor.fail(exception=e)) - # raise - # else: - # run_async(Actor.exit()) - @contextlib.contextmanager def prevent_sys_exit(): From ea69f522ade5aacd608b1f2c55eb035524e4adb9 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 15:36:50 +0100 Subject: [PATCH 18/51] use apify from branch --- poetry.lock | 111 +++++++++++++++++++++++++++---------------------- pyproject.toml | 2 +- 2 files changed, 62 insertions(+), 51 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7a34cf6..e9c08e4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -45,30 +45,34 @@ trio = ["trio (>=0.26.1)"] [[package]] name = "apify" -version = "2.2.1" +version = "2.2.2" description = "Apify SDK for Python" optional = false -python-versions = "<4.0,>=3.9" -files = [ - {file = "apify-2.2.1-py3-none-any.whl", hash = "sha256:60b190d6e7d438d2ccbeeb40151adcbbd1adfb3bf85936fa01c37e3fbb8e2edb"}, - {file = "apify-2.2.1.tar.gz", hash = "sha256:9a30828e5f908c020e85fc14f70c74e890ab1b20157ce20b50d199564b12d649"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] apify-client = ">=1.8.1" apify-shared = ">=1.2.1" -crawlee = ">=0.5.1,<0.6.0" +crawlee = "~0.5.1" cryptography = ">=42.0.0" httpx = ">=0.27.0" lazy-object-proxy = ">=1.10.0" more_itertools = ">=10.2.0" -scrapy = {version = ">=2.11.0", optional = true, markers = "extra == \"scrapy\""} +scrapy = {version = ">=2.11.0", optional = true} typing-extensions = ">=4.1.0" -websockets = ">=10.0,<14.0.0" +websockets = ">=10.0 <14.0.0" [package.extras] scrapy = ["scrapy (>=2.11.0)"] +[package.source] +type = "git" +url = "https://github.com/apify/apify-sdk-python.git" +reference = "fixing-scrapy" +resolved_reference = "bf284319ec576143155b5ee3e1cf572fcd5d03a6" + [[package]] name = "apify-client" version = "1.8.1" @@ -153,17 +157,18 @@ visualize = ["Twisted (>=16.1.1)", "graphviz (>0.5.1)"] [[package]] name = "beautifulsoup4" -version = "4.12.3" +version = "4.13.3" description = "Screen-scraping library" optional = false -python-versions = ">=3.6.0" +python-versions = ">=3.7.0" files = [ - {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, - {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, + {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"}, + {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"}, ] [package.dependencies] soupsieve = ">1.2" +typing-extensions = ">=4.0.0" [package.extras] cchardet = ["cchardet"] @@ -335,13 +340,13 @@ cffi = 
">=1.0.0" [[package]] name = "certifi" -version = "2024.12.14" +version = "2025.1.31" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56"}, - {file = "certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db"}, + {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"}, + {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, ] [[package]] @@ -594,13 +599,13 @@ rich = "*" [[package]] name = "crawlee" -version = "0.5.2" +version = "0.5.4" description = "Crawlee for Python" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "crawlee-0.5.2-py3-none-any.whl", hash = "sha256:7d72850c7fbc8b627250449e3c733d68a7701252385a4482ca635c3b431b0885"}, - {file = "crawlee-0.5.2.tar.gz", hash = "sha256:cd90b984dc3ec2e48339761fc2032db12663447c8e18f07ab7dae31a19263067"}, + {file = "crawlee-0.5.4-py3-none-any.whl", hash = "sha256:2b02ebab913a9bbc74f1a52fc1f4c9c46ce40c0d7f0342482a0576b645aa0e9d"}, + {file = "crawlee-0.5.4.tar.gz", hash = "sha256:24b2c18e784fc94adfb3c7f061b9694e5148c3050b279349801971ae8f0db0d9"}, ] [package.dependencies] @@ -623,7 +628,8 @@ typing-extensions = ">=4.1.0" yarl = ">=1.18.0" [package.extras] -all = ["beautifulsoup4 (>=4.12.0)", "curl-cffi (>=0.7.2)", "html5lib (>=1.0)", "lxml (>=5.2.0)", "parsel (>=1.9.0)", "playwright (>=1.27.0)"] +adaptive-playwright = ["jaro-winkler (>=2.0.3)", "playwright (>=1.27.0)", "scikit-learn (==1.5.2)", "scikit-learn (>=1.6.0)"] +all = ["beautifulsoup4 (>=4.12.0)", "curl-cffi (>=0.7.2)", "html5lib (>=1.0)", "jaro-winkler (>=2.0.3)", "lxml (>=5.2.0)", "parsel (>=1.9.0)", "playwright (>=1.27.0)", "scikit-learn (==1.5.2)", "scikit-learn (>=1.6.0)"] beautifulsoup = ["beautifulsoup4 (>=4.12.0)", "html5lib (>=1.0)", "lxml (>=5.2.0)"] curl-impersonate = ["curl-cffi (>=0.7.2)"] parsel = ["parsel (>=1.9.0)"] @@ -847,18 +853,18 @@ files = [ [[package]] name = "h2" -version = "4.1.0" -description = "HTTP/2 State-Machine based protocol implementation" +version = "4.2.0" +description = "Pure-Python HTTP/2 protocol implementation" optional = false -python-versions = ">=3.6.1" +python-versions = ">=3.9" files = [ - {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"}, - {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"}, + {file = "h2-4.2.0-py3-none-any.whl", hash = "sha256:479a53ad425bb29af087f3458a61d30780bc818e4ebcf01f0b536ba916462ed0"}, + {file = "h2-4.2.0.tar.gz", hash = "sha256:c8a52129695e88b1a0578d8d2cc6842bbd79128ac685463b887ee278126ad01f"}, ] [package.dependencies] -hpack = ">=4.0,<5" -hyperframe = ">=6.0,<7" +hpack = ">=4.1,<5" +hyperframe = ">=6.1,<7" [[package]] name = "hpack" @@ -1099,15 +1105,20 @@ readchar = ">=4.2.0" [[package]] name = "itemadapter" -version = "0.10.0" +version = "0.11.0" description = "Common interface for data container classes" optional = false python-versions = ">=3.9" files = [ - {file = "itemadapter-0.10.0-py3-none-any.whl", hash = "sha256:d404a91cd0ebf17b7983af1aae43116d375e8d831a1dcbe98de5723b2c66e36d"}, - {file = "itemadapter-0.10.0.tar.gz", hash = 
"sha256:2655c8c50f1a8405c9fa74b8cdc4da7fec541ca217bc821b90acc8451c98a9d2"}, + {file = "itemadapter-0.11.0-py3-none-any.whl", hash = "sha256:07bc1a26a51f124ec155b80ee3d170eda06ffccd7ceba99c08bea68ad4de5fcd"}, + {file = "itemadapter-0.11.0.tar.gz", hash = "sha256:3b0f27f4c5e2e8ae415d83e3d60d33adb7ba09b98c30638bc606fb1dff2ecdd2"}, ] +[package.extras] +attrs = ["attrs (>=18.1.0)"] +pydantic = ["pydantic (>=1.8)"] +scrapy = ["scrapy (>=2.2)"] + [[package]] name = "itemloaders" version = "1.3.2" @@ -2358,29 +2369,29 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "ruff" -version = "0.9.3" +version = "0.9.4" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.9.3-py3-none-linux_armv6l.whl", hash = "sha256:7f39b879064c7d9670197d91124a75d118d00b0990586549949aae80cdc16624"}, - {file = "ruff-0.9.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:a187171e7c09efa4b4cc30ee5d0d55a8d6c5311b3e1b74ac5cb96cc89bafc43c"}, - {file = "ruff-0.9.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:c59ab92f8e92d6725b7ded9d4a31be3ef42688a115c6d3da9457a5bda140e2b4"}, - {file = "ruff-0.9.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dc153c25e715be41bb228bc651c1e9b1a88d5c6e5ed0194fa0dfea02b026439"}, - {file = "ruff-0.9.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:646909a1e25e0dc28fbc529eab8eb7bb583079628e8cbe738192853dbbe43af5"}, - {file = "ruff-0.9.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a5a46e09355695fbdbb30ed9889d6cf1c61b77b700a9fafc21b41f097bfbba4"}, - {file = "ruff-0.9.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c4bb09d2bbb394e3730d0918c00276e79b2de70ec2a5231cd4ebb51a57df9ba1"}, - {file = "ruff-0.9.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96a87ec31dc1044d8c2da2ebbed1c456d9b561e7d087734336518181b26b3aa5"}, - {file = "ruff-0.9.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bb7554aca6f842645022fe2d301c264e6925baa708b392867b7a62645304df4"}, - {file = "ruff-0.9.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cabc332b7075a914ecea912cd1f3d4370489c8018f2c945a30bcc934e3bc06a6"}, - {file = "ruff-0.9.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:33866c3cc2a575cbd546f2cd02bdd466fed65118e4365ee538a3deffd6fcb730"}, - {file = "ruff-0.9.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:006e5de2621304c8810bcd2ee101587712fa93b4f955ed0985907a36c427e0c2"}, - {file = "ruff-0.9.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:ba6eea4459dbd6b1be4e6bfc766079fb9b8dd2e5a35aff6baee4d9b1514ea519"}, - {file = "ruff-0.9.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:90230a6b8055ad47d3325e9ee8f8a9ae7e273078a66401ac66df68943ced029b"}, - {file = "ruff-0.9.3-py3-none-win32.whl", hash = "sha256:eabe5eb2c19a42f4808c03b82bd313fc84d4e395133fb3fc1b1516170a31213c"}, - {file = "ruff-0.9.3-py3-none-win_amd64.whl", hash = "sha256:040ceb7f20791dfa0e78b4230ee9dce23da3b64dd5848e40e3bf3ab76468dcf4"}, - {file = "ruff-0.9.3-py3-none-win_arm64.whl", hash = "sha256:800d773f6d4d33b0a3c60e2c6ae8f4c202ea2de056365acfa519aa48acf28e0b"}, - {file = "ruff-0.9.3.tar.gz", hash = "sha256:8293f89985a090ebc3ed1064df31f3b4b56320cdfcec8b60d3295bddb955c22a"}, + {file = "ruff-0.9.4-py3-none-linux_armv6l.whl", hash = "sha256:64e73d25b954f71ff100bb70f39f1ee09e880728efb4250c632ceed4e4cdf706"}, + {file = 
"ruff-0.9.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6ce6743ed64d9afab4fafeaea70d3631b4d4b28b592db21a5c2d1f0ef52934bf"}, + {file = "ruff-0.9.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:54499fb08408e32b57360f6f9de7157a5fec24ad79cb3f42ef2c3f3f728dfe2b"}, + {file = "ruff-0.9.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37c892540108314a6f01f105040b5106aeb829fa5fb0561d2dcaf71485021137"}, + {file = "ruff-0.9.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:de9edf2ce4b9ddf43fd93e20ef635a900e25f622f87ed6e3047a664d0e8f810e"}, + {file = "ruff-0.9.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:87c90c32357c74f11deb7fbb065126d91771b207bf9bfaaee01277ca59b574ec"}, + {file = "ruff-0.9.4-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:56acd6c694da3695a7461cc55775f3a409c3815ac467279dfa126061d84b314b"}, + {file = "ruff-0.9.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e0c93e7d47ed951b9394cf352d6695b31498e68fd5782d6cbc282425655f687a"}, + {file = "ruff-0.9.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1d4c8772670aecf037d1bf7a07c39106574d143b26cfe5ed1787d2f31e800214"}, + {file = "ruff-0.9.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfc5f1d7afeda8d5d37660eeca6d389b142d7f2b5a1ab659d9214ebd0e025231"}, + {file = "ruff-0.9.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:faa935fc00ae854d8b638c16a5f1ce881bc3f67446957dd6f2af440a5fc8526b"}, + {file = "ruff-0.9.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a6c634fc6f5a0ceae1ab3e13c58183978185d131a29c425e4eaa9f40afe1e6d6"}, + {file = "ruff-0.9.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:433dedf6ddfdec7f1ac7575ec1eb9844fa60c4c8c2f8887a070672b8d353d34c"}, + {file = "ruff-0.9.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:d612dbd0f3a919a8cc1d12037168bfa536862066808960e0cc901404b77968f0"}, + {file = "ruff-0.9.4-py3-none-win32.whl", hash = "sha256:db1192ddda2200671f9ef61d9597fcef89d934f5d1705e571a93a67fb13a4402"}, + {file = "ruff-0.9.4-py3-none-win_amd64.whl", hash = "sha256:05bebf4cdbe3ef75430d26c375773978950bbf4ee3c95ccb5448940dc092408e"}, + {file = "ruff-0.9.4-py3-none-win_arm64.whl", hash = "sha256:585792f1e81509e38ac5123492f8875fbc36f3ede8185af0a26df348e5154f41"}, + {file = "ruff-0.9.4.tar.gz", hash = "sha256:6907ee3529244bb0ed066683e075f09285b38dd5b4039370df6ff06041ca19e7"}, ] [[package]] @@ -3108,4 +3119,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "3.12.*" -content-hash = "0fe178f41b420f134b9c3bbb105fc1b8eb302f68fda574981026e0e48963a9e7" +content-hash = "27274f4fbccdbebda12a59ca4811760caaef849b6dc434e4e94e16665afe3389" diff --git a/pyproject.toml b/pyproject.toml index af9c7db..098fb63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ plucker = "jg.plucker.cli:main" [tool.poetry.dependencies] python = "3.12.*" -apify = {version = "2.2.1", extras = ["scrapy"]} +apify = { git = "https://github.com/apify/apify-sdk-python.git", branch = "fixing-scrapy", extras = ["scrapy"] } apify-client = "1.8.1" # deployment of actors, monitoring, automation apify-shared = "*" # importing a few enums click = "8.1.8" From 21120c8384c443b6202af3052da9f598065a119f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 16:31:29 +0100 Subject: [PATCH 19/51] build Plucker on top of current Apify WIP --- jg/plucker/cli.py | 21 +++--- jg/plucker/loggers.py | 12 ++-- jg/plucker/scrapers.py | 154 
++++++----------------------------------- 3 files changed, 39 insertions(+), 148 deletions(-) diff --git a/jg/plucker/cli.py b/jg/plucker/cli.py index d5a02b8..e21987d 100644 --- a/jg/plucker/cli.py +++ b/jg/plucker/cli.py @@ -1,3 +1,4 @@ +import asyncio import importlib import json import logging @@ -12,11 +13,12 @@ from pydantic import BaseModel from scrapy.utils.project import get_project_settings -from jg.plucker.loggers import configure_logging +from apify.scrapy import run_scrapy_actor, setup_logging +from twisted.internet import asyncioreactor -settings = get_project_settings() -configure_logging(settings, sys.argv) +# settings = get_project_settings() +# configure_logging(settings, sys.argv) # ruff: noqa: E402 @@ -25,11 +27,12 @@ from jg.plucker.scrapers import ( StatsError, + actor_main, generate_schema, get_spider_module_name, iter_actor_paths, - run_actor, - run_spider, + # run_actor, + # run_spider, ) @@ -48,7 +51,7 @@ def __str__(self) -> str: @click.group() @click.option("-d", "--debug", default=False, is_flag=True) def main(debug: bool = False): - pass # --debug is processed in configure_logging() + setup_logging() # TODO process --debug @main.command(context_settings={"ignore_unknown_options": True}) @@ -91,10 +94,12 @@ def crawl( raise click.BadParameter( f"Actor {actor_path} not found! Valid actors: {actors}" ) - run_actor(settings, spider_class, spider_params) + asyncioreactor.install(asyncio.get_event_loop()) + run_scrapy_actor(actor_main(spider_class, spider_params)) else: logger.info(f"Crawling as Scrapy spider {spider_name!r}") - run_spider(settings, spider_class, spider_params) + raise NotImplementedError() + # TODO run_spider(settings, spider_class, spider_params) except StatsError as e: logger.error(e) raise click.Abort() diff --git a/jg/plucker/loggers.py b/jg/plucker/loggers.py index 5bd2478..6f75dca 100644 --- a/jg/plucker/loggers.py +++ b/jg/plucker/loggers.py @@ -4,7 +4,7 @@ from apify.log import ActorLogFormatter from scrapy.settings import Settings -from scrapy.utils import log as scrapy_logging +from scrapy.utils import log as scrapy_log CUSTOM_LOGGER_NAMES = ["jg.plucker", "apify", "apify_client"] @@ -26,16 +26,16 @@ def configure_logging(settings: Settings, argv: list[str]): configure_logger(logger_name, logging_level, handler) # We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging` - # call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though + # call here: https://github.com/scrapy/scrapy/blob/2.12.0/scrapy/utils/log.py#L117 (even though # `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method # like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because # otherwise we would lose some log messages. 
- scrapy_logging.configure_logging = reconfigure_scrapy_logging( - logging_level, handler - )(scrapy_logging.configure_logging) + scrapy_log.configure_logging = reconfigure_scrapy_log(logging_level, handler)( + scrapy_log.configure_logging + ) -def reconfigure_scrapy_logging( +def reconfigure_scrapy_log( logging_level: str, *handlers: logging.StreamHandler ) -> Callable: def decorator(configure_logging: Callable) -> Callable: diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 8d9a261..51ffe53 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -25,11 +25,9 @@ from scrapy.settings import BaseSettings, Settings from scrapy.spiderloader import SpiderLoader as BaseSpiderLoader from scrapy.statscollectors import StatsCollector -from scrapy.utils.defer import deferred_from_coro +from scrapy.utils.defer import deferred_to_future from scrapy.utils.reactor import is_asyncio_reactor_installed from scrapy.utils.request import RequestFingerprinterProtocol -from twisted.internet import asyncioreactor, defer -from twisted.internet.task import react logger = logging.getLogger("jg.plucker") @@ -67,6 +65,17 @@ # settings["ITEM_PIPELINES"]["jg.plucker.scrapers.Pipeline"] = 1000 # settings["SCHEDULER"] = "jg.plucker.scrapers.Scheduler" +# TODO purge on start +# Actor.log.info("Purging the default dataset") +# dataset = cast(Dataset, (yield deferred_from_coro(Actor.open_dataset()))) +# yield deferred_from_coro(dataset.drop()) + +# Actor.log.info("Purging the default request queue") +# request_queue = cast( +# RequestQueue, (yield deferred_from_coro(Actor.open_request_queue())) +# ) +# yield deferred_from_coro(request_queue.drop()) + def run_spider( settings: Settings, spider_class: type[Spider], spider_params: dict[str, Any] | None @@ -88,66 +97,18 @@ def run_spider( evaluate_stats_fn(stats_collector.get_stats(), min_items=min_items) -def run_actor( - base_settings: Settings, - spider_class: Type[Spider], - spider_params: dict[str, Any] | None, -) -> None: - logger.debug("Installing asyncio reactor") - asyncioreactor.install() +async def actor_main(spider_class: Type[Spider], spider_params: dict[str, Any] | None): + async with Actor: + Actor.log.info(f"Starting actor for spider {spider_class.name}") - @defer.inlineCallbacks - def crawl(reactor): - Actor.log.info("Starting actor") - yield deferred_from_coro(Actor.init()) - - Actor.log.info(f"Spider {spider_class.name}") - Actor.log.info("Reading input") - params = spider_params or (yield deferred_from_coro(Actor.get_input())) or {} + params = spider_params or (await Actor.get_input()) or {} proxy_config = params.pop("proxyConfig", None) - logger.debug(f"Spider params: {spider_params!r}") - base_settings.set("SPIDER_PARAMS", spider_params) - - Actor.log.info("Applying Apify settings") - settings = apply_apify_settings( - settings=base_settings, proxy_config=proxy_config - ) - runner = CrawlerRunner(settings) - - # TODO purge on start - # Actor.log.info("Purging the default dataset") - # dataset = cast(Dataset, (yield deferred_from_coro(Actor.open_dataset()))) - # yield deferred_from_coro(dataset.drop()) - - # Actor.log.info("Purging the default request queue") - # request_queue = cast( - # RequestQueue, (yield deferred_from_coro(Actor.open_request_queue())) - # ) - # yield deferred_from_coro(request_queue.drop()) - - Actor.log.info("Starting the spider") - yield runner.crawl(spider_class) + settings = apply_apify_settings(proxy_config=proxy_config) + settings.set("SPIDER_PARAMS", spider_params) - # TODO evaluate stats - - 
Actor.log.info("Exiting actor") - with prevent_sys_exit(): - yield deferred_from_coro(Actor.exit()) - - Actor.log.info("Done!") - - react(crawl, []) - - -@contextlib.contextmanager -def prevent_sys_exit(): - """Deception, Actor.exit() won't call sys.exit(), see also https://github.com/apify/apify-sdk-python/pull/389""" - builtins.__IPYTHON__ = True - try: - yield - finally: - builtins.__IPYTHON__ = False + crawler_runner = CrawlerRunner(settings) + await deferred_to_future(crawler_runner.crawl(spider_class)) def iter_actor_paths(path: Path | str) -> Generator[Path, None, None]: @@ -301,78 +262,3 @@ def store_response( } value = pickle.dumps(data, protocol=4) run_async(self._kv.set_value(key, value)) - - -class Pipeline: - async def process_item( - self, - item: Item, - spider: Spider, - ) -> Item: - item_dict = ItemAdapter(item).asdict() - Actor.log.debug( - f"Pushing item={item_dict} produced by spider={spider} to the dataset." - ) - run_async(Actor.push_data(item_dict)) - return item - - -class Scheduler(BaseScheduler): - def __init__(self) -> None: - self._rq: RequestQueue | None = None - self.spider: Spider | None = None - - def open(self, spider: Spider) -> None: # this has to be named "open" - self.spider = spider - self._rq = run_async(Actor.open_request_queue()) - - def has_pending_requests(self) -> bool: - assert self._rq is not None, "Request queue not initialized" - - is_finished = cast(bool, run_async(self._rq.is_finished())) - return not is_finished - - def enqueue_request(self, request: Request) -> bool: - assert self.spider is not None, "Spider not initialized" - assert self._rq is not None, "Request queue not initialized" - - call_id = crypto_random_object_id(8) - Actor.log.debug( - f"[{call_id}]: ApifyScheduler.enqueue_request was called (scrapy_request={request})..." - ) - apify_request = to_apify_request(request, spider=self.spider) - if apify_request is None: - Actor.log.error( - f"Request {request} was not enqueued because it could not be converted to Apify request." - ) - return False - Actor.log.debug( - f"[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})" - ) - result = cast(ProcessedRequest, run_async(self._rq.add_request(apify_request))) - Actor.log.debug(f"[{call_id}]: rq.add_request.result={result}...") - return bool(result.was_already_present) - - def next_request(self) -> Request | None: - assert self._rq is not None, "Request queue not initialized" - assert self.spider is not None, "Spider not initialized" - - call_id = crypto_random_object_id(8) - Actor.log.debug(f"[{call_id}]: ApifyScheduler.next_request was called...") - apify_request = cast(ApifyRequest, run_async(self._rq.fetch_next_request())) - Actor.log.debug( - f"[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})" - ) - if apify_request is None: - return None - - # Let the Request Queue know that the request is being handled. Every request should be marked as handled, - # retrying is handled by the Scrapy's RetryMiddleware. 
- run_async(self._rq.mark_request_as_handled(apify_request)) - - scrapy_request = to_scrapy_request(apify_request, spider=self.spider) - Actor.log.debug( - f"[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned " - f"(scrapy_request={scrapy_request})", - ) - return scrapy_request From 278d0bd99d104e93c2779444ebbccbaf0c4547ae Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 18:55:41 +0100 Subject: [PATCH 20/51] try stuff --- jg/plucker/scrapers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 51ffe53..606447f 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -101,12 +101,15 @@ async def actor_main(spider_class: Type[Spider], spider_params: dict[str, Any] | async with Actor: Actor.log.info(f"Starting actor for spider {spider_class.name}") - params = spider_params or (await Actor.get_input()) or {} + params = spider_params or {} # TODO or (await Actor.get_input()) or {} proxy_config = params.pop("proxyConfig", None) settings = apply_apify_settings(proxy_config=proxy_config) settings.set("SPIDER_PARAMS", spider_params) + # await Actor.open_request_queue(name="default") + + Actor.log.info("Starting the spider") crawler_runner = CrawlerRunner(settings) await deferred_to_future(crawler_runner.crawl(spider_class)) From f51e92b11b6b4f8578bb0600bc40cb39572b431b Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 19:16:48 +0100 Subject: [PATCH 21/51] update apify code --- poetry.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index e9c08e4..58c0915 100644 --- a/poetry.lock +++ b/poetry.lock @@ -71,7 +71,7 @@ scrapy = ["scrapy (>=2.11.0)"] type = "git" url = "https://github.com/apify/apify-sdk-python.git" reference = "fixing-scrapy" -resolved_reference = "bf284319ec576143155b5ee3e1cf572fcd5d03a6" +resolved_reference = "cc1af3d763621b5d46fe467fc41d413d102bbf8a" [[package]] name = "apify-client" From 016e738d7fa756b5b5171aa91f4ba758656062fe Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 19:16:58 +0100 Subject: [PATCH 22/51] debug --- jg/plucker/scrapers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 606447f..275bda5 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -15,6 +15,7 @@ from crawlee import Request as ApifyRequest from crawlee._utils.crypto import crypto_random_object_id from crawlee.storage_clients.models import ProcessedRequest +import httpx from itemadapter import ItemAdapter # pyright: ignore from scrapy import Item, Request, Spider from scrapy.core.scheduler import BaseScheduler @@ -100,6 +101,11 @@ def run_spider( async def actor_main(spider_class: Type[Spider], spider_params: dict[str, Any] | None): async with Actor: Actor.log.info(f"Starting actor for spider {spider_class.name}") + # Actor.apify_client.http_client.httpx_async_client = httpx.AsyncClient( + # headers={"Connection": "close"}, + # follow_redirects=True, + # timeout=Actor.apify_client.http_client.timeout_secs, + # ) params = spider_params or {} # TODO or (await Actor.get_input()) or {} proxy_config = params.pop("proxyConfig", None) From e361699034b667a1bf0b5006aff4191d83210e74 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 20:53:05 +0100 Subject: [PATCH 23/51] this doesn't help --- jg/plucker/scrapers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/jg/plucker/scrapers.py 
b/jg/plucker/scrapers.py index 275bda5..88825bf 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -113,8 +113,6 @@ async def actor_main(spider_class: Type[Spider], spider_params: dict[str, Any] | settings = apply_apify_settings(proxy_config=proxy_config) settings.set("SPIDER_PARAMS", spider_params) - # await Actor.open_request_queue(name="default") - Actor.log.info("Starting the spider") crawler_runner = CrawlerRunner(settings) await deferred_to_future(crawler_runner.crawl(spider_class)) From c2251983b10475bfda361b5a5d9116d4f1a3375a Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 20:59:33 +0100 Subject: [PATCH 24/51] read input --- jg/plucker/scrapers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 88825bf..2905e26 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -107,7 +107,7 @@ async def actor_main(spider_class: Type[Spider], spider_params: dict[str, Any] | # timeout=Actor.apify_client.http_client.timeout_secs, # ) - params = spider_params or {} # TODO or (await Actor.get_input()) or {} + params = spider_params or (await Actor.get_input()) or {} proxy_config = params.pop("proxyConfig", None) settings = apply_apify_settings(proxy_config=proxy_config) From c8f0ef592a0416e127b4522df3317ddcae7b0d84 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 21:43:49 +0100 Subject: [PATCH 25/51] implement debug and cache storage --- jg/plucker/cli.py | 3 +- jg/plucker/scrapers.py | 67 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 56 insertions(+), 14 deletions(-) diff --git a/jg/plucker/cli.py b/jg/plucker/cli.py index e21987d..1242f1c 100644 --- a/jg/plucker/cli.py +++ b/jg/plucker/cli.py @@ -51,7 +51,8 @@ def __str__(self) -> str: @click.group() @click.option("-d", "--debug", default=False, is_flag=True) def main(debug: bool = False): - setup_logging() # TODO process --debug + setup_logging() + logging.getLogger().setLevel(logging.DEBUG if debug else logging.INFO) @main.command(context_settings={"ignore_unknown_options": True}) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 2905e26..c642579 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -8,7 +8,7 @@ from threading import Thread from typing import Any, Coroutine, Generator, Type, cast -from apify import Actor +from apify import Actor, Configuration from apify.scrapy.requests import to_apify_request, to_scrapy_request from apify.scrapy.utils import apply_apify_settings from apify.storages import Dataset, KeyValueStore, RequestQueue @@ -29,6 +29,15 @@ from scrapy.utils.defer import deferred_to_future from scrapy.utils.reactor import is_asyncio_reactor_installed from scrapy.utils.request import RequestFingerprinterProtocol +from apify.scrapy.scheduler import ( + _start_event_loop, + _run_async_coro, + _TIMEOUT, + _shutdown_async_tasks, + _force_exit_event_loop, +) +from apify.apify_storage_client import ApifyStorageClient +import traceback logger = logging.getLogger("jg.plucker") @@ -101,16 +110,12 @@ def run_spider( async def actor_main(spider_class: Type[Spider], spider_params: dict[str, Any] | None): async with Actor: Actor.log.info(f"Starting actor for spider {spider_class.name}") - # Actor.apify_client.http_client.httpx_async_client = httpx.AsyncClient( - # headers={"Connection": "close"}, - # follow_redirects=True, - # timeout=Actor.apify_client.http_client.timeout_secs, - # ) params = spider_params or (await Actor.get_input()) or {} proxy_config = 
params.pop("proxyConfig", None) settings = apply_apify_settings(proxy_config=proxy_config) + settings.set("HTTPCACHE_STORAGE", "jg.plucker.scrapers.CacheStorage") settings.set("SPIDER_PARAMS", spider_params) Actor.log.info("Starting the spider") @@ -226,25 +231,59 @@ def __init__(self, settings: BaseSettings): self._kv: KeyValueStore | None = None self._fingerprinter: RequestFingerprinterProtocol | None = None + logger.debug("Starting background thread for cache storage's event loop") + self._eventloop = asyncio.new_event_loop() + self._thread = threading.Thread( + target=lambda: _start_event_loop(self._eventloop), daemon=True + ) + self._thread.start() + def open_spider(self, spider: Spider) -> None: logger.debug("Using Apify key value cache storage", extra={"spider": spider}) self.spider = spider self._fingerprinter = spider.crawler.request_fingerprinter - self._kv = run_async( - Actor.open_key_value_store(name=f"httpcache-{spider.name}") - ) + kv_name = f"httpcache-{spider.name}" + + async def open_kv() -> KeyValueStore: + config = Configuration.get_global_configuration() + if config.is_at_home: + storage_client = ApifyStorageClient.from_config(config) + return await KeyValueStore.open( + name=kv_name, storage_client=storage_client + ) + return await KeyValueStore.open(name=kv_name) + + logger.debug(f"Opening cache storage's {kv_name!r} key value store") + self._kv = _run_async_coro(self._eventloop, open_kv()) def close_spider(self, spider: Spider) -> None: - pass + logger.debug("Closing cache storage...") + try: + if self._eventloop.is_running(): + _run_async_coro(self._eventloop, _shutdown_async_tasks(self._eventloop)) + self._eventloop.call_soon_threadsafe(self._eventloop.stop) + self._thread.join(timeout=_TIMEOUT) + if self._thread.is_alive(): + logger.warning( + "Background thread for cache storage didn't exit cleanly! Forcing shutdown..." 
+ ) + _force_exit_event_loop(self._eventloop, self._thread) + except KeyboardInterrupt: + logger.warning("Shutdown interrupted by KeyboardInterrupt!") + except Exception: + logger.exception("Exception occurred while shutting down cache storage") + finally: + logger.debug("Cache storage closed") def retrieve_response(self, spider: Spider, request: Request) -> Response | None: assert self._kv is not None, "Key value store not initialized" assert self._fingerprinter is not None, "Request fingerprinter not initialized" key = self._fingerprinter.fingerprint(request).hex() - value = run_async(self._kv.get_value(key)) + value = _run_async_coro(self._eventloop, self._kv.get_value(key)) if value is None: - return None # not cached + logger.debug("Cache miss", extra={"request": request}) + return None data = pickle.loads(value) url = data["url"] @@ -252,6 +291,8 @@ def retrieve_response(self, spider: Spider, request: Request) -> Response | None headers = Headers(data["headers"]) body = data["body"] respcls = responsetypes.from_args(headers=headers, url=url, body=body) + + logger.debug("Cache hit", extra={"request": request}) return respcls(url=url, headers=headers, status=status, body=body) def store_response( @@ -268,4 +309,4 @@ def store_response( "body": response.body, } value = pickle.dumps(data, protocol=4) - run_async(self._kv.set_value(key, value)) + _run_async_coro(self._eventloop, self._kv.set_value(key, value)) From 21974b7ef62f09e7f9b8181a61938613b28fa272 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 22:05:14 +0100 Subject: [PATCH 26/51] implement cache expiration --- jg/plucker/cli.py | 17 +++-------------- jg/plucker/scrapers.py | 18 +++++++++++++++--- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/jg/plucker/cli.py b/jg/plucker/cli.py index 1242f1c..2802c28 100644 --- a/jg/plucker/cli.py +++ b/jg/plucker/cli.py @@ -8,22 +8,13 @@ from pathlib import Path from typing import IO, Callable, Generator, Type +import click +from apify.scrapy import run_scrapy_actor, setup_logging from apify_client import ApifyClient from apify_shared.consts import ActorJobStatus, ActorSourceType from pydantic import BaseModel -from scrapy.utils.project import get_project_settings - -from apify.scrapy import run_scrapy_actor, setup_logging -from twisted.internet import asyncioreactor - - -# settings = get_project_settings() -# configure_logging(settings, sys.argv) - - -# ruff: noqa: E402 -import click from scrapy import Item +from twisted.internet import asyncioreactor from jg.plucker.scrapers import ( StatsError, @@ -31,8 +22,6 @@ generate_schema, get_spider_module_name, iter_actor_paths, - # run_actor, - # run_spider, ) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index c642579..bbab079 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -6,6 +6,7 @@ import threading from pathlib import Path from threading import Thread +from time import time from typing import Any, Coroutine, Generator, Type, cast from apify import Actor, Configuration @@ -217,7 +218,6 @@ def run() -> None: class CacheStorage: - # TODO implement expiration as in https://github.com/scrapy/scrapy/blob/a8d9746f562681ed5a268148ec959dcf0881d859/scrapy/extensions/httpcache.py#L250 # TODO implement gzipping def __init__(self, settings: BaseSettings): @@ -227,6 +227,7 @@ def __init__(self, settings: BaseSettings): "Make sure you have it configured in the TWISTED_REACTOR setting. 
See the asyncio " "documentation of Scrapy for more information.", ) + self.expiration_secs: int = settings.getint("HTTPCACHE_EXPIRATION_SECS") self.spider: Spider | None = None self._kv: KeyValueStore | None = None self._fingerprinter: RequestFingerprinterProtocol | None = None @@ -280,7 +281,17 @@ def retrieve_response(self, spider: Spider, request: Request) -> Response | None assert self._fingerprinter is not None, "Request fingerprinter not initialized" key = self._fingerprinter.fingerprint(request).hex() - value = _run_async_coro(self._eventloop, self._kv.get_value(key)) + + seconds = _run_async_coro(self._eventloop, self._kv.get_value(f"{key}_time")) + if seconds is None: + logger.debug("Cache miss", extra={"request": request}) + return None + + if 0 < self.expiration_secs < time() - seconds: + logger.debug("Cache expired", extra={"request": request}) + return None + + value = _run_async_coro(self._eventloop, self._kv.get_value(f"{key}_data")) if value is None: logger.debug("Cache miss", extra={"request": request}) return None @@ -309,4 +320,5 @@ def store_response( "body": response.body, } value = pickle.dumps(data, protocol=4) - _run_async_coro(self._eventloop, self._kv.set_value(key, value)) + _run_async_coro(self._eventloop, self._kv.set_value(f"{key}_data", value)) + _run_async_coro(self._eventloop, self._kv.set_value(f"{key}_time", time())) From 0818a3cf4489007a05a595d5ae48e87571c5cd6b Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 22:07:48 +0100 Subject: [PATCH 27/51] cleanup --- jg/plucker/scrapers.py | 99 ++++++++---------------------------------- 1 file changed, 18 insertions(+), 81 deletions(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index bbab079..97aa948 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -1,25 +1,23 @@ import asyncio -import builtins -import contextlib import logging import pickle import threading from pathlib import Path -from threading import Thread from time import time -from typing import Any, Coroutine, Generator, Type, cast +from typing import Any, Generator, Type from apify import Actor, Configuration -from apify.scrapy.requests import to_apify_request, to_scrapy_request +from apify.apify_storage_client import ApifyStorageClient +from apify.scrapy.scheduler import ( + _TIMEOUT, + _force_exit_event_loop, + _run_async_coro, + _shutdown_async_tasks, + _start_event_loop, +) from apify.scrapy.utils import apply_apify_settings -from apify.storages import Dataset, KeyValueStore, RequestQueue -from crawlee import Request as ApifyRequest -from crawlee._utils.crypto import crypto_random_object_id -from crawlee.storage_clients.models import ProcessedRequest -import httpx -from itemadapter import ItemAdapter # pyright: ignore +from apify.storages import KeyValueStore from scrapy import Item, Request, Spider -from scrapy.core.scheduler import BaseScheduler from scrapy.crawler import CrawlerProcess, CrawlerRunner from scrapy.http.headers import Headers from scrapy.http.response import Response @@ -30,64 +28,11 @@ from scrapy.utils.defer import deferred_to_future from scrapy.utils.reactor import is_asyncio_reactor_installed from scrapy.utils.request import RequestFingerprinterProtocol -from apify.scrapy.scheduler import ( - _start_event_loop, - _run_async_coro, - _TIMEOUT, - _shutdown_async_tasks, - _force_exit_event_loop, -) -from apify.apify_storage_client import ApifyStorageClient -import traceback logger = logging.getLogger("jg.plucker") -# new_client_original = _ActorType.new_client -# def 
new_client_patch(self, **kwargs) -> ApifyClientAsync: -# print(f"PATCH thread {threading.current_thread().name}") -# client = new_client_original(self, **kwargs) -# # client.http_client.httpx_async_client._headers["Connection"] = "close" -# # client.http_client = HTTPClientAsync( -# # token=token, -# # max_retries=client.max_retries, -# # min_delay_between_retries_millis=client.min_delay_between_retries_millis, -# # timeout_secs=client.timeout_secs, -# # ) -# http_client = client.http_client -# http_client.httpx_async_client = httpx.AsyncClient( -# headers={"Fuck": "you"}, -# follow_redirects=True, -# timeout=http_client.timeout_secs, -# ) -# return client -# _ActorType.new_client = new_client_patch - -# httpx_client = Actor._apify_client.http_client.httpx_async_client -# httpx_client._headers["Connection"] = "close" -# print(f"HTTPX setting connection {httpx_client._headers} (id: {id(httpx_client)})") - -# Actor.log.info("Overriding Apify settings with custom ones") -# settings["HTTPCACHE_STORAGE"] = "jg.plucker.scrapers.CacheStorage" -# del settings["ITEM_PIPELINES"][ -# "apify.scrapy.pipelines.ActorDatasetPushPipeline" -# ] -# settings["ITEM_PIPELINES"]["jg.plucker.scrapers.Pipeline"] = 1000 -# settings["SCHEDULER"] = "jg.plucker.scrapers.Scheduler" - -# TODO purge on start -# Actor.log.info("Purging the default dataset") -# dataset = cast(Dataset, (yield deferred_from_coro(Actor.open_dataset()))) -# yield deferred_from_coro(dataset.drop()) - -# Actor.log.info("Purging the default request queue") -# request_queue = cast( -# RequestQueue, (yield deferred_from_coro(Actor.open_request_queue())) -# ) -# yield deferred_from_coro(request_queue.drop()) - - def run_spider( settings: Settings, spider_class: type[Spider], spider_params: dict[str, Any] | None ) -> None: @@ -119,6 +64,14 @@ async def actor_main(spider_class: Type[Spider], spider_params: dict[str, Any] | settings.set("HTTPCACHE_STORAGE", "jg.plucker.scrapers.CacheStorage") settings.set("SPIDER_PARAMS", spider_params) + Actor.log.info("Purging the default dataset") + dataset = await Actor.open_dataset() + await dataset.drop() + + Actor.log.info("Purging the default request queue") + request_queue = await Actor.open_request_queue() + await request_queue.drop() + Actor.log.info("Starting the spider") crawler_runner = CrawlerRunner(settings) await deferred_to_future(crawler_runner.crawl(spider_class)) @@ -201,22 +154,6 @@ def evaluate_stats(stats: dict[str, Any], min_items: int): raise StatsError(f"Items missing required fields: {item_count}") -def run_async(coroutine: Coroutine) -> Any: - result = None - - def run() -> None: - nonlocal result - print( - f"Thread {threading.current_thread().name}, executing {coroutine.__name__}" - ) - result = asyncio.run(coroutine) - - t = Thread(target=run) - t.start() - t.join() - return result - - class CacheStorage: # TODO implement gzipping From 4fa331e3a33629402b8995d9daabdff43f276d8c Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 22:11:06 +0100 Subject: [PATCH 28/51] try this --- jg/plucker/scrapers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 97aa948..7a6cae3 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -67,10 +67,12 @@ async def actor_main(spider_class: Type[Spider], spider_params: dict[str, Any] | Actor.log.info("Purging the default dataset") dataset = await Actor.open_dataset() await dataset.drop() + await Actor.open_dataset() Actor.log.info("Purging the default request queue") 
request_queue = await Actor.open_request_queue() await request_queue.drop() + await Actor.open_request_queue() Actor.log.info("Starting the spider") crawler_runner = CrawlerRunner(settings) From aa899967427a03ae9068c15a09cb977bedb71367 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Wed, 5 Feb 2025 22:14:02 +0100 Subject: [PATCH 29/51] give up on clearing the storages for now --- jg/plucker/scrapers.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 7a6cae3..76647f7 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -64,16 +64,6 @@ async def actor_main(spider_class: Type[Spider], spider_params: dict[str, Any] | settings.set("HTTPCACHE_STORAGE", "jg.plucker.scrapers.CacheStorage") settings.set("SPIDER_PARAMS", spider_params) - Actor.log.info("Purging the default dataset") - dataset = await Actor.open_dataset() - await dataset.drop() - await Actor.open_dataset() - - Actor.log.info("Purging the default request queue") - request_queue = await Actor.open_request_queue() - await request_queue.drop() - await Actor.open_request_queue() - Actor.log.info("Starting the spider") crawler_runner = CrawlerRunner(settings) await deferred_to_future(crawler_runner.crawl(spider_class)) From e35c9f0456d6145ffce1d7300ad100fdc221accb Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 6 Feb 2025 08:48:05 +0100 Subject: [PATCH 30/51] remove any artificial limitations --- jg/plucker/jobs_jobscz/spider.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index ff16155..a7b3c55 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -85,11 +85,11 @@ class Spider(BaseSpider): name = "jobs-jobscz" - custom_settings = { - "CONCURRENT_REQUESTS_PER_DOMAIN": 2, - "DOWNLOAD_DELAY": 0.5, - "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.5, - } + # custom_settings = { + # "CONCURRENT_REQUESTS_PER_DOMAIN": 2, + # "DOWNLOAD_DELAY": 0.5, + # "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.5, + # } start_urls = [ "https://www.jobs.cz/prace/programator/", From b723244762189a520a112d62ed210628002acc13 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 6 Feb 2025 08:50:29 +0100 Subject: [PATCH 31/51] longer expiration --- jg/plucker/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jg/plucker/settings.py b/jg/plucker/settings.py index 97dba48..ded8f88 100644 --- a/jg/plucker/settings.py +++ b/jg/plucker/settings.py @@ -24,7 +24,7 @@ HTTPCACHE_ENABLED = True -HTTPCACHE_EXPIRATION_SECS = 18000 # 5 hours +HTTPCACHE_EXPIRATION_SECS = 43200 # 12 hours SPIDER_LOADER_CLASS = "jg.plucker.scrapers.SpiderLoader" From adadd29d3358320c1d7a0aa26b709852a453dd4f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 6 Feb 2025 12:23:24 +0100 Subject: [PATCH 32/51] remove custom settings --- jg/plucker/jobs_jobscz/spider.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index a7b3c55..2fe6b18 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -85,12 +85,6 @@ class Spider(BaseSpider): name = "jobs-jobscz" - # custom_settings = { - # "CONCURRENT_REQUESTS_PER_DOMAIN": 2, - # "DOWNLOAD_DELAY": 0.5, - # "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.5, - # } - start_urls = [ "https://www.jobs.cz/prace/programator/", "https://www.jobs.cz/prace/tester/", From 5c0dbceb3b8b12eaa6ea0fccd7af687081a8ccc7 Mon Sep 17 00:00:00 2001 
From: Honza Javorek Date: Thu, 6 Feb 2025 15:50:36 +0100 Subject: [PATCH 33/51] remove keys from cache when expired --- jg/plucker/scrapers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 76647f7..6017d6a 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -218,6 +218,8 @@ def retrieve_response(self, spider: Spider, request: Request) -> Response | None if 0 < self.expiration_secs < time() - seconds: logger.debug("Cache expired", extra={"request": request}) + _run_async_coro(self._eventloop, self._kv.set_value(f"{key}_data", None)) + _run_async_coro(self._eventloop, self._kv.set_value(f"{key}_time", None)) return None value = _run_async_coro(self._eventloop, self._kv.get_value(f"{key}_data")) From 1b452f56ce5d35af04a3de345fa1fd1ca4440894 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 7 Feb 2025 10:11:09 +0100 Subject: [PATCH 34/51] implement running spiders just under scrapy --- jg/plucker/cli.py | 15 ++++----- jg/plucker/scrapers.py | 72 +++++++++++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 33 deletions(-) diff --git a/jg/plucker/cli.py b/jg/plucker/cli.py index 2802c28..3c0b2eb 100644 --- a/jg/plucker/cli.py +++ b/jg/plucker/cli.py @@ -1,4 +1,3 @@ -import asyncio import importlib import json import logging @@ -9,19 +8,20 @@ from typing import IO, Callable, Generator, Type import click -from apify.scrapy import run_scrapy_actor, setup_logging +from apify.scrapy import setup_logging from apify_client import ApifyClient from apify_shared.consts import ActorJobStatus, ActorSourceType from pydantic import BaseModel from scrapy import Item -from twisted.internet import asyncioreactor from jg.plucker.scrapers import ( StatsError, - actor_main, generate_schema, get_spider_module_name, iter_actor_paths, + run_as_actor, + run_as_spider, + start_reactor, ) @@ -84,12 +84,11 @@ def crawl( raise click.BadParameter( f"Actor {actor_path} not found! 
Valid actors: {actors}" ) - asyncioreactor.install(asyncio.get_event_loop()) - run_scrapy_actor(actor_main(spider_class, spider_params)) + run = run_as_actor(spider_class, spider_params) else: logger.info(f"Crawling as Scrapy spider {spider_name!r}") - raise NotImplementedError() - # TODO run_spider(settings, spider_class, spider_params) + run = run_as_spider(spider_class, spider_params) + start_reactor(run) except StatsError as e: logger.error(e) raise click.Abort() diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 6017d6a..dd9f00f 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -4,10 +4,11 @@ import threading from pathlib import Path from time import time -from typing import Any, Generator, Type +from typing import Any, Coroutine, Generator, Type from apify import Actor, Configuration from apify.apify_storage_client import ApifyStorageClient +from apify.scrapy import run_scrapy_actor from apify.scrapy.scheduler import ( _TIMEOUT, _force_exit_event_loop, @@ -18,55 +19,66 @@ from apify.scrapy.utils import apply_apify_settings from apify.storages import KeyValueStore from scrapy import Item, Request, Spider -from scrapy.crawler import CrawlerProcess, CrawlerRunner +from scrapy.crawler import Crawler, CrawlerRunner from scrapy.http.headers import Headers from scrapy.http.response import Response from scrapy.responsetypes import responsetypes -from scrapy.settings import BaseSettings, Settings +from scrapy.settings import BaseSettings from scrapy.spiderloader import SpiderLoader as BaseSpiderLoader -from scrapy.statscollectors import StatsCollector +from scrapy.statscollectors import StatsT from scrapy.utils.defer import deferred_to_future +from scrapy.utils.project import get_project_settings from scrapy.utils.reactor import is_asyncio_reactor_installed from scrapy.utils.request import RequestFingerprinterProtocol +from twisted.internet import asyncioreactor logger = logging.getLogger("jg.plucker") -def run_spider( - settings: Settings, spider_class: type[Spider], spider_params: dict[str, Any] | None +def start_reactor(coroutine: Coroutine) -> None: + asyncioreactor.install(asyncio.get_event_loop()) + run_scrapy_actor(coroutine) + + +async def run_as_spider( + spider_class: Type[Spider], spider_params: dict[str, Any] | None ) -> None: - # TODO use crawler runner instead? make run_spider() and run_actor() DRY? 
- logger.debug(f"Spider params: {spider_params!r}") + settings = get_project_settings() settings.set("SPIDER_PARAMS", spider_params) + logger.debug(f"Spider params: {spider_params!r}") - crawler_process = CrawlerProcess(settings, install_root_handler=False) - crawler_process.crawl(spider_class) - stats_collector = get_stats_collector(crawler_process) - crawler_process.start() + logger.info("Starting the spider") + runner = CrawlerRunner(settings) + crawler = runner.create_crawler(spider_class) - min_items = getattr(spider_class, "min_items", settings.getint("SPIDER_MIN_ITEMS")) - logger.debug(f"Min items required: {min_items}") + await deferred_to_future(runner.crawl(crawler)) - logger.debug(f"Custom evaluate_stats(): {hasattr(spider_class, 'evaluate_stats')}") - evaluate_stats_fn = getattr(spider_class, "evaluate_stats", evaluate_stats) - evaluate_stats_fn(stats_collector.get_stats(), min_items=min_items) + check_crawl_results(crawler) -async def actor_main(spider_class: Type[Spider], spider_params: dict[str, Any] | None): +async def run_as_actor( + spider_class: Type[Spider], spider_params: dict[str, Any] | None +): async with Actor: Actor.log.info(f"Starting actor for spider {spider_class.name}") params = spider_params or (await Actor.get_input()) or {} proxy_config = params.pop("proxyConfig", None) + Actor.log.debug(f"Proxy config: {proxy_config!r}") settings = apply_apify_settings(proxy_config=proxy_config) settings.set("HTTPCACHE_STORAGE", "jg.plucker.scrapers.CacheStorage") settings.set("SPIDER_PARAMS", spider_params) + Actor.log.debug(f"Spider params: {spider_params!r}") Actor.log.info("Starting the spider") - crawler_runner = CrawlerRunner(settings) - await deferred_to_future(crawler_runner.crawl(spider_class)) + runner = CrawlerRunner(settings) + crawler = runner.create_crawler(spider_class) + + await deferred_to_future(runner.crawl(crawler)) + + check_crawl_results(crawler) def iter_actor_paths(path: Path | str) -> Generator[Path, None, None]: @@ -119,17 +131,27 @@ def generate_schema(item_class: Type[Item]) -> dict: } -def get_stats_collector(crawler_process: CrawlerProcess) -> StatsCollector: - assert len(crawler_process.crawlers) == 1, "Exactly one crawler expected" - crawler = crawler_process.crawlers.pop() - return crawler.stats +def check_crawl_results(crawler: Crawler) -> None: + spider_class = crawler.spidercls + + assert crawler.stats is not None, "Stats collector not initialized" + stats = crawler.stats.get_stats() + assert stats, "Stats not collected" + + default_min_items = crawler.settings.getint("SPIDER_MIN_ITEMS") + min_items = getattr(spider_class, "min_items", default_min_items) + logger.debug(f"Min items required: {min_items}") + + logger.debug(f"Custom evaluate_stats(): {hasattr(spider_class, 'evaluate_stats')}") + evaluate_stats_fn = getattr(spider_class, "evaluate_stats", evaluate_stats) + evaluate_stats_fn(stats, min_items) class StatsError(RuntimeError): pass -def evaluate_stats(stats: dict[str, Any], min_items: int): +def evaluate_stats(stats: StatsT, min_items: int): item_count = stats.get("item_scraped_count", 0) if exc_count := stats.get("spider_exceptions"): raise StatsError(f"Exceptions raised: {exc_count}") From bee87284c3147c30fa759a2da77eeeb5ec1eccc2 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 7 Feb 2025 11:48:54 +0100 Subject: [PATCH 35/51] debug never ending looping of the request queue --- jg/plucker/jobs_jobscz/spider.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/jg/plucker/jobs_jobscz/spider.py 
b/jg/plucker/jobs_jobscz/spider.py index 2fe6b18..324c4f9 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -86,10 +86,12 @@ class Spider(BaseSpider): name = "jobs-jobscz" start_urls = [ - "https://www.jobs.cz/prace/programator/", - "https://www.jobs.cz/prace/tester/", - "https://www.jobs.cz/prace/datovy-analytik/", + # "https://www.jobs.cz/prace/programator/", + # "https://www.jobs.cz/prace/tester/", + # "https://www.jobs.cz/prace/datovy-analytik/", + "https://www.jobs.cz/prace/truhlar/", ] + min_items = 5 # TODO DEBUG ! employment_types_labels = [ "Typ pracovního poměru", From 6f17aa806508463ea77de87d0a6df781b903fe4f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 7 Feb 2025 12:04:01 +0100 Subject: [PATCH 36/51] this is better --- jg/plucker/jobs_jobscz/spider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index 324c4f9..8293b55 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -89,7 +89,7 @@ class Spider(BaseSpider): # "https://www.jobs.cz/prace/programator/", # "https://www.jobs.cz/prace/tester/", # "https://www.jobs.cz/prace/datovy-analytik/", - "https://www.jobs.cz/prace/truhlar/", + "https://www.jobs.cz/prace/kuchar/", ] min_items = 5 # TODO DEBUG ! From e2a74ba054dea3f2ee6d09f5f9b635114c27e67a Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 7 Feb 2025 14:51:53 +0100 Subject: [PATCH 37/51] fix parsing error --- jg/plucker/jobs_jobscz/spider.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index 8293b55..c19acae 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -372,7 +372,9 @@ def select_widget(names: list[str]) -> str: def parse_widget_script_json(text: str) -> dict[str, Any] | None: for match in re.finditer(WIDGET_DATA_SCRIPT_JSON_RE, text): - data_text = re.sub(r"\'", r"\\'", match.group("data")) + data_text = match.group("data") + data_text = re.sub(r"\'", r"\\'", data_text) + data_text = re.sub(r'\\\\"', r"\"", data_text) data = json.loads(data_text) if "widgets" in data: return data From bbcb50a20ca76d68ef3849e4c5b16ea7ce8204fd Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 7 Feb 2025 15:08:57 +0100 Subject: [PATCH 38/51] test specimen --- tests/jobs_jobscz/job_widget_script9.js | 1 + 1 file changed, 1 insertion(+) create mode 100644 tests/jobs_jobscz/job_widget_script9.js diff --git a/tests/jobs_jobscz/job_widget_script9.js b/tests/jobs_jobscz/job_widget_script9.js new file mode 100644 index 0000000..2a2d3f2 --- /dev/null +++ b/tests/jobs_jobscz/job_widget_script9.js @@ -0,0 +1 @@ +!function(n){var r={};function o(t){var e;return(r[t]||(e=r[t]={i:t,l:!1,exports:{}},n[t].call(e.exports,e,e.exports,o),e.l=!0,e)).exports}o.m=n,o.c=r,o.d=function(t,e,n){o.o(t,e)||Object.defineProperty(t,e,{enumerable:!0,get:n})},o.r=function(t){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(t,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(t,"__esModule",{value:!0})},o.t=function(e,t){if(1&t&&(e=o(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(o.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var r in e)o.d(n,r,function(t){return e[t]}.bind(null,r));return n},o.n=function(t){var e=t&&t.__esModule?function(){return t.default}:function(){return t};return 
o.d(e,"a",e),e},o.o=function(t,e){return Object.prototype.hasOwnProperty.call(t,e)},o.p="",o(o.s=199)}([function(t,e,n){var n=n(46),r=Function.prototype,o=r.call,r=n&&r.bind.bind(o,o);t.exports=n?r:function(t){return function(){return o.apply(t,arguments)}}},function(t,e,n){t.exports=function(t){try{return!!t()}catch(t){return!0}}},function(t,e,n){var u=n(4),l=n(50).f,f=n(24),d=n(13),p=n(63),h=n(97),v=n(55);t.exports=function(t,e){var n,r,o,i=t.target,a=t.global,s=t.stat,c=a?u:s?u[i]||p(i,{}):u[i]&&u[i].prototype;if(c)for(n in e){if(r=e[n],o=t.dontCallGetSet?(o=l(c,n))&&o.value:c[n],!v(a?n:i+(s?".":"#")+n,t.forced)&&void 0!==o){if(typeof r==typeof o)continue;h(r,o)}(t.sham||o&&o.sham)&&f(r,"sham",!0),d(c,n,r,t)}}},function(t,e,n){var r=n(4),o=n(28),i=n(7),a=n(64),s=n(29),n=n(89),c=r.Symbol,u=o("wks"),l=n?c.for||c:c&&c.withoutSetter||a;t.exports=function(t){return i(u,t)||(u[t]=s&&i(c,t)?c[t]:l("Symbol."+t)),u[t]}},function(n,t,e){!function(t){function e(t){return t&&t.Math===Math&&t}n.exports=e("object"==typeof globalThis&&globalThis)||e("object"==typeof window&&window)||e("object"==typeof self&&self)||e("object"==typeof t&&t)||e("object"==typeof this&&this)||function(){return this}()||Function("return this")()}.call(this,e(144))},function(t,e,n){var r="object"==typeof document&&document.all;t.exports=void 0===r&&void 0!==r?function(t){return"function"==typeof t||t===r}:function(t){return"function"==typeof t}},function(t,e,n){var n=n(46),r=Function.prototype.call;t.exports=n?r.bind(r):function(){return r.apply(r,arguments)}},function(t,e,n){var r=n(0),o=n(14),i=r({}.hasOwnProperty);t.exports=Object.hasOwn||function(t,e){return i(o(t),e)}},function(t,e,n){n=n(1);t.exports=!n(function(){return 7!==Object.defineProperty({},1,{get:function(){return 7}})[1]})},function(t,e,n){var r=n(11),o=String,i=TypeError;t.exports=function(t){if(r(t))return t;throw new i(o(t)+" is not an object")}},function(t,e,n){var r=n(49),o=String;t.exports=function(t){if("Symbol"===r(t))throw new TypeError("Cannot convert a Symbol value to a string");return o(t)}},function(t,e,n){var r=n(5);t.exports=function(t){return"object"==typeof t?null!==t:r(t)}},function(t,e,n){var r=n(8),o=n(90),i=n(91),a=n(9),s=n(65),c=TypeError,u=Object.defineProperty,l=Object.getOwnPropertyDescriptor,f="enumerable",d="configurable",p="writable";e.f=r?i?function(t,e,n){var r;return a(t),e=s(e),a(n),"function"==typeof t&&"prototype"===e&&"value"in n&&p in n&&!n[p]&&(r=l(t,e))&&r[p]&&(t[e]=n.value,n={configurable:(d in n?n:r)[d],enumerable:(f in n?n:r)[f],writable:!1}),u(t,e,n)}:u:function(t,e,n){if(a(t),e=s(e),a(n),o)try{return u(t,e,n)}catch(t){}if("get"in n||"set"in n)throw new c("Accessors not supported");return"value"in n&&(t[e]=n.value),t}},function(t,e,n){var a=n(5),s=n(12),c=n(93),u=n(63);t.exports=function(t,e,n,r){var o=(r=r||{}).enumerable,i=void 0!==r.name?r.name:e;if(a(n)&&c(n,i,r),r.global)o?t[e]=n:u(e,n);else{try{r.unsafe?t[e]&&(o=!0):delete t[e]}catch(t){}o?t[e]=n:s.f(t,e,{value:n,enumerable:!1,configurable:!r.nonConfigurable,writable:!r.nonWritable})}return t}},function(t,e,n){var r=n(15),o=Object;t.exports=function(t){return o(r(t))}},function(t,e,n){var r=n(21),o=TypeError;t.exports=function(t){if(r(t))throw new o("Can't call method on "+t);return t}},function(t,e,n){var r=n(4),o=n(5);t.exports=function(t,e){return arguments.length<2?(n=r[t],o(n)?n:void 0):r[t]&&r[t][e];var n}},function(t,e,n){var r=n(67),o=n(15);t.exports=function(t){return r(o(t))}},function(t,e,n){var 
n=n(0),r=n({}.toString),o=n("".slice);t.exports=function(t){return o(r(t),8,-1)}},function(t,e,n){var r=n(61),o=n(13),n=n(146);r||o(Object.prototype,"toString",n,{unsafe:!0})},function(t,e,n){t.exports=!1},function(t,e,n){t.exports=function(t){return null==t}},function(t,e,n){n=n(0);t.exports=n({}.isPrototypeOf)},function(t,e,n){var r=n(5),o=n(34),i=TypeError;t.exports=function(t){if(r(t))return t;throw new i(o(t)+" is not a function")}},function(t,e,n){var r=n(8),o=n(12),i=n(36);t.exports=r?function(t,e,n){return o.f(t,e,i(1,n))}:function(t,e,n){return t[e]=n,t}},function(t,e,n){var r=n(54);t.exports=function(t){return r(t.length)}},function(t,e,n){var r=n(23),o=n(21);t.exports=function(t,e){t=t[e];return o(t)?void 0:r(t)}},function(t,e,n){var r,o,i,a,s=n(145),c=n(4),u=n(11),l=n(24),f=n(7),d=n(62),p=n(47),n=n(48),h="Object already initialized",v=c.TypeError,c=c.WeakMap,g=s||d.state?((i=d.state||(d.state=new c)).get=i.get,i.has=i.has,i.set=i.set,r=function(t,e){if(i.has(t))throw new v(h);return e.facade=t,i.set(t,e),e},o=function(t){return i.get(t)||{}},function(t){return i.has(t)}):(n[a=p("state")]=!0,r=function(t,e){if(f(t,a))throw new v(h);return e.facade=t,l(t,a,e),e},o=function(t){return f(t,a)?t[a]:{}},function(t){return f(t,a)});t.exports={set:r,get:o,has:g,enforce:function(t){return g(t)?o(t):r(t,{})},getterFor:function(e){return function(t){if(u(t)&&(t=o(t)).type===e)return t;throw new v("Incompatible receiver, "+e+" required")}}}},function(t,e,n){var r=n(62);t.exports=function(t,e){return r[t]||(r[t]=e||{})}},function(t,e,n){var r=n(43),o=n(1),i=n(4).String;t.exports=!!Object.getOwnPropertySymbols&&!o(function(){var t=Symbol("symbol detection");return!i(t)||!(Object(t)instanceof Symbol)||!Symbol.sham&&r&&r<41})},function(t,e,n){var r=n(149);t.exports=function(t){t=+t;return t!=t||0==t?0:r(t)}},function(t,e,n){function r(){}function o(t){t.write(m("")),t.close();var e=t.parentWindow.Object;return t=null,e}var i,a=n(9),s=n(99),c=n(73),u=n(48),l=n(129),f=n(52),n=n(47),d=">",p="<",h="prototype",v="script",g=n("IE_PROTO"),m=function(t){return p+v+d+t+p+"/"+v+d},y=function(){try{i=new ActiveXObject("htmlfile")}catch(t){}y="undefined"==typeof document||document.domain&&i?o(i):(t=f("iframe"),e="java"+v+":",t.style.display="none",l.appendChild(t),t.src=String(e),(e=t.contentWindow.document).open(),e.write(m("document.F=Object")),e.close(),e.F);for(var t,e,n=c.length;n--;)delete y[h][c[n]];return y()};u[g]=!0,t.exports=Object.create||function(t,e){var n;return null!==t?(r[h]=a(t),n=new r,r[h]=null,n[g]=t):n=y(),void 0===e?n:s.f(n,e)}},function(t,e,n){t.exports="undefined"!=typeof navigator&&String(navigator.userAgent)||""},function(t,e,n){var r=n(16),o=n(5),i=n(22),n=n(89),a=Object;t.exports=n?function(t){return"symbol"==typeof t}:function(t){var e=r("Symbol");return o(e)&&i(e.prototype,a(t))}},function(t,e,n){var r=String;t.exports=function(t){try{return r(t)}catch(t){return"Object"}}},function(t,e,n){var r=n(8),n=n(7),o=Function.prototype,i=r&&Object.getOwnPropertyDescriptor,n=n(o,"name"),a=n&&"something"===function(){}.name,r=n&&(!r||i(o,"name").configurable);t.exports={EXISTS:n,PROPER:a,CONFIGURABLE:r}},function(t,e,n){t.exports=function(t,e){return{enumerable:!(1&t),configurable:!(2&t),writable:!(4&t),value:e}}},function(t,e,n){function r(d){var p=1===d,h=2===d,v=3===d,g=4===d,m=6===d,y=7===d,b=5===d||m;return function(t,e,n,r){for(var o,i,a=x(t),s=_(a),c=S(s),u=w(e,n),l=0,e=r||k,f=p?e(t,c):h||y?e(t,0):void 0;l")})||!n||f)},function(t,e,n){var 
r=n(17),o=n(79),i=n(42),a=n(27),s=n(12).f,c=n(114),u=n(117),l=n(20),n=n(8),f="Array Iterator",d=a.set,p=a.getterFor(f),a=(t.exports=c(Array,"Array",function(t,e){d(this,{type:f,target:r(t),index:0,kind:e})},function(){var t=p(this),e=t.target,n=t.index++;if(!e||n>=e.length)return t.target=void 0,u(void 0,!0);switch(t.kind){case"keys":return u(n,!1);case"values":return u(e[n],!1)}return u([n,e[n]],!1)},"values"),i.Arguments=i.Array);if(o("keys"),o("values"),o("entries"),!l&&n&&"values"!==a.name)try{s(a,"name",{value:"values"})}catch(t){}},function(t,e,n){var r={};r[n(3)("toStringTag")]="z",t.exports="[object z]"===String(r)},function(t,e,n){var r=n(20),o=n(4),n=n(63),i="__core-js_shared__",t=t.exports=o[i]||n(i,{});(t.versions||(t.versions=[])).push({version:"3.37.1",mode:r?"pure":"global",copyright:"© 2014-2024 Denis Pushkarev (zloirock.ru)",license:"https://github.com/zloirock/core-js/blob/v3.37.1/LICENSE",source:"https://github.com/zloirock/core-js"})},function(t,e,n){var r=n(4),o=Object.defineProperty;t.exports=function(e,n){try{o(r,e,{value:n,configurable:!0,writable:!0})}catch(t){r[e]=n}return n}},function(t,e,n){var n=n(0),r=0,o=Math.random(),i=n(1..toString);t.exports=function(t){return"Symbol("+(void 0===t?"":t)+")_"+i(++r+o,36)}},function(t,e,n){var r=n(92),o=n(33);t.exports=function(t){t=r(t,"string");return o(t)?t:t+""}},function(t,e,n){var r=n(148),o=n(23),i=n(46),a=r(r.bind);t.exports=function(t,e){return o(t),void 0===e?t:i?a(t,e):function(){return t.apply(e,arguments)}}},function(t,e,n){var r=n(0),o=n(1),i=n(18),a=Object,s=r("".split);t.exports=o(function(){return!a("z").propertyIsEnumerable(0)})?function(t){return"String"===i(t)?s(t,""):a(t)}:a},function(t,e,n){var r=n(150);t.exports=function(t,e){return new(r(t))(0===e?0:e)}},function(t,e,n){function r(){}function o(t){if(!c(t))return!1;try{return d(r,[],t),!0}catch(t){return!1}}function i(t){if(!c(t))return!1;switch(u(t)){case"AsyncFunction":case"GeneratorFunction":case"AsyncGeneratorFunction":return!1}try{return v||!!h(p,f(t))}catch(t){return!0}}var a=n(0),s=n(1),c=n(5),u=n(49),l=n(16),f=n(78),d=l("Reflect","construct"),p=/^\s*(?:class|function)\b/,h=a(p.exec),v=!p.test(r);i.sham=!0,t.exports=!d||s(function(){var t;return o(o.call)||!o(Object)||!o(function(){t=!0})||t})?i:o},function(t,e,n){var r=n(1);t.exports=function(t,e){var n=[][t];return!!n&&r(function(){n.call(null,e||function(){return 1},1)})}},function(t,e,n){var r=n(2),o=n(37).find,n=n(79),i="find",a=!0;i in[]&&Array(1)[i](function(){a=!1}),r({target:"Array",proto:!0,forced:a},{find:function(t){return o(this,t,1o;)!a(r,n=e[o++])||~c(i,n)||l(i,n);return i}},function(t,e,n){var r=n(8),o=n(91),s=n(12),c=n(9),u=n(17),l=n(75);e.f=r&&!o?Object.defineProperties:function(t,e){c(t);for(var n,r=u(e),o=l(e),i=o.length,a=0;ab)","g");return"b"!==t.exec("b").groups.a||"bc"!=="b".replace(t,"$c")})},function(t,e,n){var r=n(6),o=n(7),i=n(22),a=n(103),s=RegExp.prototype;t.exports=function(t){var e=t.flags;return void 0!==e||"flags"in s||o(t,"flags")||!i(s,t)?e:r(a,t)}},function(t,e,n){function r(o){return function(t,e){var n,t=a(s(t)),e=i(e),r=t.length;return e<0||r<=e?o?"":void 0:(n=u(t,e))<55296||56319=e.length?s(void 0,!0):(e=r(e,n),t.index+=e.length,s(e,!1))})},function(t,e,n){var r=n(49),o=n(26),i=n(21),a=n(42),s=n(3)("iterator");t.exports=function(t){if(!i(t))return o(t,s)||o(t,"@@iterator")||a[r(t)]}},function(t,e,n){var r=n(2),o=n(37).map;r({target:"Array",proto:!0,forced:!n(40)("map")},{map:function(t){return o(this,t,1o;o++)d(e,n=r[o])&&!d(t,n)&&w(t,n,b(e,n))}var 
i=n(2),a=n(20),s=n(8),c=n(4),u=n(110),l=n(0),f=n(55),d=n(7),p=n(118),h=n(22),v=n(33),g=n(92),m=n(1),y=n(39).f,b=n(50).f,w=n(12).f,_=n(134),x=n(87).trim,n="Number",S=c[n],k=u[n],P=S.prototype,O=c.TypeError,E=l("".slice),j=l("".charCodeAt),I=function(t){var e,n,r,o,i,a,s,c=g(t,"number");if(v(c))throw new O("Cannot convert a Symbol value to a number");if("string"==typeof c&&2=t.length?{done:!0}:{done:!1,value:t[i++]}},e:function(t){throw t},f:e};throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method.")}function s(t,e){(null==e||e>t.length)&&(e=t.length);for(var n=0,r=Array(e);n]*>)/g,m=/\$([$&'`]|\d{1,2})/g;t.exports=function(i,a,s,c,u,t){var l=s+i.length,f=c.length,e=m;return void 0!==u&&(u=o(u),e=g),h(t,e,function(t,e){var n;switch(p(e,0)){case"$":return"$";case"&":return i;case"`":return v(a,0,s);case"'":return v(a,l);case"<":n=u[v(e,1,-1)];break;default:var r,o=+e;if(0==o)return t;if(f@^][^\s!#%&*+<=>@^]*>/,D=/a/g,z=/a/g,t=new k(D)!==D,C=a.MISSED_STICKY,B=a.UNSUPPORTED_Y,w=e&&(!t||C||_||x||g(function(){return z[S]=!1,k(D)!==D||k(z)===z||"/a/i"!==String(k(D,"i"))}));if(o("RegExp",w)){for(var R=function(t,e){var n,r,o=d(P,this),i=p(t),a=void 0===e,s=[],c=t;if(!o&&i&&a&&t.constructor===R)return t;if((i||d(P,t))&&(t=t.source,a)&&(e=v(c)),t=void 0===t?"":h(t),e=void 0===e?"":h(e),c=t,i=e=_&&"dotAll"in D&&(n=!!e&&-1"===e&&c:if(""===l||m(a,l))throw new O("Invalid capture group name");a[l]=!0,c=!(i[i.length]=[l,u]),l="";continue}c?l+=e:o+=e}return[o,i]}(t))[0],s=a[1]),a=u(k(t,e),o?this:P,R),(n||r||s.length)&&(e=y(a),n&&(e.dotAll=!0,e.raw=R(function(t){for(var e,n=t.length,r=0,o="",i=!1;r<=n;r++)"\\"===(e=E(t,r))?o+=e+E(t,++r):i||"."!==e?("["===e?i=!0:"]"===e&&(i=!1),o+=e):o+="[\\s\\S]";return o}(t),i)),r&&(e.sticky=!0),s.length)&&(e.groups=s),t!==c)try{l(a,"source",""===c?"(?:)":c)}catch(t){}return a},A=i(k),L=0;A.length>L;)s(R,k,A[L++]);(P.constructor=R).prototype=P,c(n,"RegExp",R,{constructor:!0})}b("RegExp")},function(t,e,n){var r=n(12).f;t.exports=function(t,e,n){n in t||r(t,n,{configurable:!0,get:function(){return e[n]},set:function(t){e[n]=t}})}},function(t,e,n){var r=n(2),o=n(87).trim;r({target:"String",proto:!0,forced:n(137)("trim")},{trim:function(){return o(this)}})},function(t,e,n){var i=n(6),a=n(9),s=n(26);t.exports=function(t,e,n){var r,o;a(t);try{if(!(r=s(t,"return"))){if("throw"===e)throw n;return n}r=i(r,t)}catch(t){o=!0,r=t}if("throw"===e)throw n;if(o)throw r;return a(r),n}},function(t,e,n){var r=n(3),o=n(42),i=r("iterator"),a=Array.prototype;t.exports=function(t){return void 0!==t&&(o.Array===t||a[i]===t)}},function(t,e,n){var r=n(6),o=n(23),i=n(9),a=n(34),s=n(121),c=TypeError;t.exports=function(t,e){e=arguments.length<2?s(t):e;if(o(e))return i(r(e,t));throw new c(a(t)+" is not iterable")}},function(t,e,n){var o=n(3)("iterator"),i=!1;try{var r=0,a={next:function(){return{done:!!r++}},return:function(){i=!0}};a[o]=function(){return this},Array.from(a,function(){throw 2})}catch(t){}t.exports=function(t,e){try{if(!e&&!i)return!1}catch(t){return!1}var n=!1;try{var r={};r[o]=function(){return{next:function(){return{done:n=!0}}}},t(r)}catch(t){}return n}},function(t,e,n){var r=n(2),o=n(0),s=n(23),c=n(14),u=n(25),l=n(102),f=n(10),i=n(1),d=n(174),a=n(70),p=n(175),h=n(176),v=n(43),g=n(177),m=[],y=o(m.sort),b=o(m.push),n=i(function(){m.sort(void 0)}),o=i(function(){m.sort(null)}),a=a("sort"),w=!i(function(){if(v)return v<70;if(!(p&&3f(e)?1:-1})),n=u(o),a=0;a 
.m-nav__link"),o=document.querySelector(".m-nav__overlay"),i=document.getElementById("m-showMenu"),a=document.getElementsByTagName("html")[0];function s(){i.checked?a.classList.add("js-menu-open"):a.classList.remove("js-menu-open")}for(var c=0,u=r.length;cn.left&&on.top&&ie.top&&te.top+r-o&&(a+="s"),i>e.left&&ie.left+n-o&&(a+="e");for(var s=this.get("handles").split(","),c=0;cO oddělení

Nazev medailonku

Stručně popište (prodejte uchazeči), co medailonkem představujete.Stručně popište (prodejte uchazeči), co medailonkem představujete.Stručně popište (prodejte uchazeči), co medailonkem představujete.Stručně popište (prodejte uchazeči), co medailonkem představujete.Stručně popište (prodejte uchazeči), co medailonkem představujete.

',e.insertBefore(n,t.nextSibling))}var r=document.querySelector("#widget_container"),o=-1!==window.location.href.indexOf("useExampleData");o&&(r&&r.addEventListener("LMC_career_widget_pageRendered",function(t){var e;t.detail&&(t=t.detail.pageType,e=document.querySelector(".cp-detail__content"),"detail"===t)&&n(e)}),setTimeout(function(){var t=document.querySelector("#vacancy-detail .cp-detail__content");n(t)},1500))}]); \ No newline at end of file From 81d272a38c17ce5e23702e474b7aedcb1567f9fd Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 7 Feb 2025 15:32:35 +0100 Subject: [PATCH 39/51] logging --- jg/plucker/cli.py | 2 ++ jg/plucker/jobs_jobscz/spider.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/jg/plucker/cli.py b/jg/plucker/cli.py index 3c0b2eb..b7363ec 100644 --- a/jg/plucker/cli.py +++ b/jg/plucker/cli.py @@ -42,6 +42,8 @@ def __str__(self) -> str: def main(debug: bool = False): setup_logging() logging.getLogger().setLevel(logging.DEBUG if debug else logging.INFO) + for name in ["asyncio", "filelock", "crawlee"]: + logging.getLogger(name).setLevel(logging.WARNING) @main.command(context_settings={"ignore_unknown_options": True}) diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index c19acae..ca4d2c5 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -139,7 +139,7 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]: def parse_job( self, response: HtmlResponse, item: Job, track_id: str ) -> Generator[Job | Request, None, None]: - self.track_logger(track_id).debug("Parsing job page") + self.track_logger(track_id).debug(f"Parsing job page {response.url}") loader = Loader(item=item, response=response) loader.add_value("url", response.url) loader.add_value("source_urls", response.url) From 2865ed8f764699974b65b708331c5cc886ae8e8f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 7 Feb 2025 15:34:35 +0100 Subject: [PATCH 40/51] update deps --- jg/plucker/cli.py | 2 +- poetry.lock | 40 ++++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/jg/plucker/cli.py b/jg/plucker/cli.py index b7363ec..a8521df 100644 --- a/jg/plucker/cli.py +++ b/jg/plucker/cli.py @@ -8,7 +8,7 @@ from typing import IO, Callable, Generator, Type import click -from apify.scrapy import setup_logging +from apify.scrapy.logging_config import setup_logging from apify_client import ApifyClient from apify_shared.consts import ActorJobStatus, ActorSourceType from pydantic import BaseModel diff --git a/poetry.lock b/poetry.lock index af89eb4..983061e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -71,7 +71,7 @@ scrapy = ["scrapy (>=2.11.0)"] type = "git" url = "https://github.com/apify/apify-sdk-python.git" reference = "fixing-scrapy" -resolved_reference = "cc1af3d763621b5d46fe467fc41d413d102bbf8a" +resolved_reference = "4fb9f8740d6064febcd4a728fa67cf93cd319f49" [[package]] name = "apify-client" @@ -2411,29 +2411,29 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "ruff" -version = "0.9.4" +version = "0.9.5" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.9.4-py3-none-linux_armv6l.whl", hash = "sha256:64e73d25b954f71ff100bb70f39f1ee09e880728efb4250c632ceed4e4cdf706"}, - {file = "ruff-0.9.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6ce6743ed64d9afab4fafeaea70d3631b4d4b28b592db21a5c2d1f0ef52934bf"}, - {file = "ruff-0.9.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:54499fb08408e32b57360f6f9de7157a5fec24ad79cb3f42ef2c3f3f728dfe2b"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37c892540108314a6f01f105040b5106aeb829fa5fb0561d2dcaf71485021137"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:de9edf2ce4b9ddf43fd93e20ef635a900e25f622f87ed6e3047a664d0e8f810e"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:87c90c32357c74f11deb7fbb065126d91771b207bf9bfaaee01277ca59b574ec"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:56acd6c694da3695a7461cc55775f3a409c3815ac467279dfa126061d84b314b"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e0c93e7d47ed951b9394cf352d6695b31498e68fd5782d6cbc282425655f687a"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1d4c8772670aecf037d1bf7a07c39106574d143b26cfe5ed1787d2f31e800214"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfc5f1d7afeda8d5d37660eeca6d389b142d7f2b5a1ab659d9214ebd0e025231"}, - {file = "ruff-0.9.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:faa935fc00ae854d8b638c16a5f1ce881bc3f67446957dd6f2af440a5fc8526b"}, - {file = "ruff-0.9.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a6c634fc6f5a0ceae1ab3e13c58183978185d131a29c425e4eaa9f40afe1e6d6"}, - {file = "ruff-0.9.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:433dedf6ddfdec7f1ac7575ec1eb9844fa60c4c8c2f8887a070672b8d353d34c"}, - {file = "ruff-0.9.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:d612dbd0f3a919a8cc1d12037168bfa536862066808960e0cc901404b77968f0"}, - {file = "ruff-0.9.4-py3-none-win32.whl", hash = "sha256:db1192ddda2200671f9ef61d9597fcef89d934f5d1705e571a93a67fb13a4402"}, - {file = "ruff-0.9.4-py3-none-win_amd64.whl", hash = "sha256:05bebf4cdbe3ef75430d26c375773978950bbf4ee3c95ccb5448940dc092408e"}, - {file = "ruff-0.9.4-py3-none-win_arm64.whl", hash = "sha256:585792f1e81509e38ac5123492f8875fbc36f3ede8185af0a26df348e5154f41"}, - {file = "ruff-0.9.4.tar.gz", hash = "sha256:6907ee3529244bb0ed066683e075f09285b38dd5b4039370df6ff06041ca19e7"}, + {file = "ruff-0.9.5-py3-none-linux_armv6l.whl", hash = "sha256:d466d2abc05f39018d53f681fa1c0ffe9570e6d73cde1b65d23bb557c846f442"}, + {file = "ruff-0.9.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:38840dbcef63948657fa7605ca363194d2fe8c26ce8f9ae12eee7f098c85ac8a"}, + {file = "ruff-0.9.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d56ba06da53536b575fbd2b56517f6f95774ff7be0f62c80b9e67430391eeb36"}, + {file = "ruff-0.9.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7cb2a01da08244c50b20ccfaeb5972e4228c3c3a1989d3ece2bc4b1f996001"}, + {file = "ruff-0.9.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:96d5c76358419bc63a671caac70c18732d4fd0341646ecd01641ddda5c39ca0b"}, + {file = "ruff-0.9.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:deb8304636ed394211f3a6d46c0e7d9535b016f53adaa8340139859b2359a070"}, + {file = "ruff-0.9.5-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:df455000bf59e62b3e8c7ba5ed88a4a2bc64896f900f311dc23ff2dc38156440"}, + {file = "ruff-0.9.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de92170dfa50c32a2b8206a647949590e752aca8100a0f6b8cefa02ae29dce80"}, + {file = "ruff-0.9.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d28532d73b1f3f627ba88e1456f50748b37f3a345d2be76e4c653bec6c3e393"}, + {file = "ruff-0.9.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c746d7d1df64f31d90503ece5cc34d7007c06751a7a3bbeee10e5f2463d52d2"}, + {file = "ruff-0.9.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:11417521d6f2d121fda376f0d2169fb529976c544d653d1d6044f4c5562516ee"}, + {file = "ruff-0.9.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:5b9d71c3879eb32de700f2f6fac3d46566f644a91d3130119a6378f9312a38e1"}, + {file = "ruff-0.9.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:2e36c61145e70febcb78483903c43444c6b9d40f6d2f800b5552fec6e4a7bb9a"}, + {file = "ruff-0.9.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:2f71d09aeba026c922aa7aa19a08d7bd27c867aedb2f74285a2639644c1c12f5"}, + {file = "ruff-0.9.5-py3-none-win32.whl", hash = "sha256:134f958d52aa6fdec3b294b8ebe2320a950d10c041473c4316d2e7d7c2544723"}, + {file = "ruff-0.9.5-py3-none-win_amd64.whl", hash = "sha256:78cc6067f6d80b6745b67498fb84e87d32c6fc34992b52bffefbdae3442967d6"}, + {file = "ruff-0.9.5-py3-none-win_arm64.whl", hash = "sha256:18a29f1a005bddb229e580795627d297dfa99f16b30c7039e73278cf6b5f9fa9"}, + {file = "ruff-0.9.5.tar.gz", hash = "sha256:11aecd7a633932875ab3cb05a484c99970b9d52606ce9ea912b690b02653d56c"}, ] [[package]] From 6380e084dea4a22f2f8866c5c90b9bad53286f3e Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 7 Feb 2025 15:45:46 +0100 Subject: [PATCH 41/51] rename track_id to trk --- jg/plucker/jobs_jobscz/spider.py | 61 +++++++++++++++----------------- 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index ca4d2c5..cb89442 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -102,7 +102,7 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]: card_xpath = "//article[contains(@class, 'SearchResultCard')]" for n, card in enumerate(response.xpath(card_xpath), start=1): url = cast(str, card.css('a[data-link="jd-detail"]::attr(href)').get()) - track_id = get_track_id(url) + trk = get_trk(url) # logging track ID for each job loader = Loader(item=Job(), response=response) card_loader = loader.nested_xpath(f"{card_xpath}[{n}]") @@ -120,11 +120,11 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]: card_loader.add_value("source_urls", url) item = loader.load_item() - self.track_logger(track_id).debug(f"Parsing card for {url}") + self.logger.debug(f"Parsing card for {url}", extra={"trk": trk}) yield response.follow( url, callback=self.parse_job, - cb_kwargs=dict(item=item, track_id=track_id), + cb_kwargs=dict(item=item, trk=trk), meta={"impersonate": "edge101"}, ) urls = [ @@ -137,17 +137,17 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]: ) def parse_job( - self, response: HtmlResponse, item: Job, track_id: str + self, response: HtmlResponse, item: Job, trk: str ) -> Generator[Job | Request, None, None]: - 
self.track_logger(track_id).debug(f"Parsing job page {response.url}") + self.logger.debug(f"Parsing job page {response.url}", extra={"trk": trk}) loader = Loader(item=item, response=response) loader.add_value("url", response.url) loader.add_value("source_urls", response.url) if "www.jobs.cz" not in response.url: - yield from self.parse_job_widget_data(response, item, track_id) + yield from self.parse_job_widget_data(response, item, trk) else: - self.track_logger(track_id).debug("Parsing as standard job page") + self.logger.debug("Parsing as standard job page", extra={"trk": trk}) for label in self.employment_types_labels: loader.add_xpath( "employment_types", @@ -157,7 +157,7 @@ def parse_job( loader.add_css("description_html", '[data-jobad="body"]') if response.css('[class*="CompanyProfileNavigation"]').get(): - self.track_logger(track_id).debug("Parsing as company job page") + self.logger.debug("Parsing as company job page", extra={"trk": trk}) loader.add_css( "company_logo_urls", ".CompanyProfileNavigation__logo img::attr(src)", @@ -172,14 +172,15 @@ def parse_job( yield loader.load_item() def parse_job_widget_data( - self, response: HtmlResponse, item: Job, track_id: str + self, response: HtmlResponse, item: Job, trk: str ) -> Generator[Request, None, None]: try: - self.track_logger(track_id).debug("Looking for widget data in the HTML") + self.logger.debug("Looking for widget data in the HTML", extra={"trk": trk}) widget_data = json.loads(response.css("script::text").re(WIDGET_DATA_RE)[0]) except IndexError: - self.track_logger(track_id).debug( - "Looking for widget data in attached JavaScript" + self.logger.debug( + "Looking for widget data in attached JavaScript", + extra={"trk": trk}, ) script_urls = sorted( map( @@ -190,7 +191,7 @@ def parse_job_widget_data( ), key=get_script_relevance, ) - self.track_logger(track_id).debug(f"Script URLs: {script_urls!r}") + self.logger.debug(f"Script URLs: {script_urls!r}", extra={"trk": trk}) yield response.follow( script_urls.pop(0), callback=self.parse_job_widget_script, @@ -198,7 +199,7 @@ def parse_job_widget_data( item=item, url=response.url, script_urls=script_urls, - track_id=track_id, + trk=trk, ), meta={"impersonate": "edge101"}, ) @@ -209,7 +210,7 @@ def parse_job_widget_data( widget_host=widget_data["host"], widget_api_key=widget_data["apiKey"], widget_id=widget_data["widgetId"], - track_id=track_id, + trk=trk, ) def parse_job_widget_script( @@ -218,7 +219,7 @@ def parse_job_widget_script( url: str, item: Job, script_urls: list[str], - track_id: str, + trk: str, ) -> Generator[Request, None, None]: if data := parse_widget_script_json(script_response.text): widget_name = select_widget(list(data["widgets"].keys())) @@ -229,7 +230,7 @@ def parse_job_widget_script( widget_host=data["host"], widget_api_key=widget_data["apiKey"], widget_id=widget_data["id"], - track_id=track_id, + trk=trk, ) elif mess := parse_widget_script_mess(script_response.text): yield from self.parse_job_widget( @@ -238,13 +239,13 @@ def parse_job_widget_script( widget_host=get_widget_host(url), widget_api_key=mess["widgetApiKey"], widget_id=mess["widgetId"], - track_id=track_id, + trk=trk, ) elif chunk_names := parse_react_chunk_names(script_response.text): chunk_urls = [ url.replace("react.min.js", chunk_name) for chunk_name in chunk_names ] - self.track_logger(track_id).debug(f"Chunk URLs: {chunk_urls!r}") + self.logger.debug(f"Chunk URLs: {chunk_urls!r}", extra={"trk": trk}) yield Request( chunk_urls.pop(0), callback=self.parse_job_widget_script, @@ -252,12 +253,12 
@@ def parse_job_widget_script( item=item, url=url, script_urls=chunk_urls, - track_id=track_id, + trk=trk, ), meta={"impersonate": "edge101"}, ) elif script_urls: - self.track_logger(track_id).debug(f"Script URLs: {script_urls!r}") + self.logger.debug(f"Script URLs: {script_urls!r}", extra={"trk": trk}) yield Request( script_urls.pop(0), callback=self.parse_job_widget_script, @@ -265,7 +266,7 @@ def parse_job_widget_script( item=item, url=url, script_urls=script_urls, - track_id=track_id, + trk=trk, ), meta={"impersonate": "edge101"}, ) @@ -279,14 +280,14 @@ def parse_job_widget( widget_host: str, widget_api_key: str, widget_id: str, - track_id: str, + trk: str, ) -> Generator[Request, None, None]: loader = Loader(item=item) loader.add_value("url", url) loader.add_value("company_url", f"https://{widget_host}") loader.add_value("source_urls", url) - self.track_logger(track_id).debug("Requesting data from job widget API") + self.logger.debug("Requesting data from job widget API", extra={"trk": trk}) params = get_params(url) yield Request( "https://api.capybara.lmc.cz/api/graphql/widget", @@ -318,14 +319,14 @@ def parse_job_widget( ) ), callback=self.parse_job_widget_api, - cb_kwargs=dict(item=loader.load_item(), track_id=track_id), + cb_kwargs=dict(item=loader.load_item(), trk=trk), meta={"impersonate": "edge101"}, ) def parse_job_widget_api( - self, response: TextResponse, item: Job, track_id: str + self, response: TextResponse, item: Job, trk: str ) -> Generator[Job, None, None]: - self.track_logger(track_id).debug("Parsing job widget API response") + self.logger.debug("Parsing job widget API response", extra={"trk": trk}) try: payload = cast(dict, response.json()) except json.JSONDecodeError as e: @@ -348,13 +349,9 @@ def parse_job_widget_api( yield loader.load_item() - def track_logger(self, track_id: str) -> logging.LoggerAdapter: - logger = logging.getLogger(f"{self.name}.{track_id}") - return logging.LoggerAdapter(logger, {"spider": self, "track_id": track_id}) - @lru_cache -def get_track_id(seed: str) -> str: +def get_trk(seed: str) -> str: return hashlib.sha1(seed.encode()).hexdigest()[:10] From 999d3e70a665d929eb5338be429a6b87fc78ffa1 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 7 Feb 2025 16:25:44 +0100 Subject: [PATCH 42/51] logging --- jg/plucker/jobs_jobscz/spider.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index cb89442..8432175 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -1,7 +1,7 @@ import hashlib import json -import logging import re +from logging import Logger, LoggerAdapter import uuid from datetime import date, datetime from functools import lru_cache @@ -98,7 +98,11 @@ class Spider(BaseSpider): "Employment form", ] + def logger_trk(self, trk: str) -> Logger: + return self.logger.logger.getChild(trk) + def parse(self, response: HtmlResponse) -> Generator[Request, None, None]: + self.logger.debug(f"Parsing listing {response.url}") card_xpath = "//article[contains(@class, 'SearchResultCard')]" for n, card in enumerate(response.xpath(card_xpath), start=1): url = cast(str, card.css('a[data-link="jd-detail"]::attr(href)').get()) @@ -120,7 +124,7 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]: card_loader.add_value("source_urls", url) item = loader.load_item() - self.logger.debug(f"Parsing card for {url}", extra={"trk": trk}) + self.logger_trk(trk).debug(f"Parsing 
card for {url}") yield response.follow( url, callback=self.parse_job, @@ -139,7 +143,7 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]: def parse_job( self, response: HtmlResponse, item: Job, trk: str ) -> Generator[Job | Request, None, None]: - self.logger.debug(f"Parsing job page {response.url}", extra={"trk": trk}) + self.logger_trk(trk).debug(f"Parsing job page {response.url}") loader = Loader(item=item, response=response) loader.add_value("url", response.url) loader.add_value("source_urls", response.url) @@ -147,7 +151,7 @@ def parse_job( if "www.jobs.cz" not in response.url: yield from self.parse_job_widget_data(response, item, trk) else: - self.logger.debug("Parsing as standard job page", extra={"trk": trk}) + self.logger_trk(trk).debug("Parsing as standard job page") for label in self.employment_types_labels: loader.add_xpath( "employment_types", @@ -157,7 +161,7 @@ def parse_job( loader.add_css("description_html", '[data-jobad="body"]') if response.css('[class*="CompanyProfileNavigation"]').get(): - self.logger.debug("Parsing as company job page", extra={"trk": trk}) + self.logger_trk(trk).debug("Parsing as company job page") loader.add_css( "company_logo_urls", ".CompanyProfileNavigation__logo img::attr(src)", @@ -175,13 +179,10 @@ def parse_job_widget_data( self, response: HtmlResponse, item: Job, trk: str ) -> Generator[Request, None, None]: try: - self.logger.debug("Looking for widget data in the HTML", extra={"trk": trk}) + self.logger_trk(trk).debug("Looking for widget data in the HTML") widget_data = json.loads(response.css("script::text").re(WIDGET_DATA_RE)[0]) except IndexError: - self.logger.debug( - "Looking for widget data in attached JavaScript", - extra={"trk": trk}, - ) + self.logger_trk(trk).debug("Looking for widget data in attached JavaScript") script_urls = sorted( map( response.urljoin, @@ -191,7 +192,7 @@ def parse_job_widget_data( ), key=get_script_relevance, ) - self.logger.debug(f"Script URLs: {script_urls!r}", extra={"trk": trk}) + self.logger_trk(trk).debug(f"Script URLs: {script_urls!r}") yield response.follow( script_urls.pop(0), callback=self.parse_job_widget_script, @@ -245,7 +246,7 @@ def parse_job_widget_script( chunk_urls = [ url.replace("react.min.js", chunk_name) for chunk_name in chunk_names ] - self.logger.debug(f"Chunk URLs: {chunk_urls!r}", extra={"trk": trk}) + self.logger_trk(trk).debug(f"Chunk URLs: {chunk_urls!r}") yield Request( chunk_urls.pop(0), callback=self.parse_job_widget_script, @@ -258,7 +259,7 @@ def parse_job_widget_script( meta={"impersonate": "edge101"}, ) elif script_urls: - self.logger.debug(f"Script URLs: {script_urls!r}", extra={"trk": trk}) + self.logger_trk(trk).debug(f"Script URLs: {script_urls!r}") yield Request( script_urls.pop(0), callback=self.parse_job_widget_script, @@ -287,7 +288,7 @@ def parse_job_widget( loader.add_value("company_url", f"https://{widget_host}") loader.add_value("source_urls", url) - self.logger.debug("Requesting data from job widget API", extra={"trk": trk}) + self.logger_trk(trk).debug("Requesting data from job widget API") params = get_params(url) yield Request( "https://api.capybara.lmc.cz/api/graphql/widget", @@ -326,7 +327,7 @@ def parse_job_widget( def parse_job_widget_api( self, response: TextResponse, item: Job, trk: str ) -> Generator[Job, None, None]: - self.logger.debug("Parsing job widget API response", extra={"trk": trk}) + self.logger_trk(trk).debug("Parsing job widget API response") try: payload = cast(dict, response.json()) except 
json.JSONDecodeError as e: From ab4939eb38a52c9973f63ff4aabffd425b39a031 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 7 Feb 2025 16:26:03 +0100 Subject: [PATCH 43/51] imports --- jg/plucker/jobs_jobscz/spider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index 8432175..54904c3 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -1,10 +1,10 @@ import hashlib import json import re -from logging import Logger, LoggerAdapter import uuid from datetime import date, datetime from functools import lru_cache +from logging import Logger from pathlib import Path from typing import Any, Generator, Iterable, cast from urllib.parse import urljoin, urlparse From a1354b1771a3ccac9482827792b98f7d774b3768 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 7 Feb 2025 21:02:24 +0100 Subject: [PATCH 44/51] fix types --- jg/plucker/jobs_jobscz/spider.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index 54904c3..38f8467 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -11,6 +11,7 @@ from itemloaders.processors import Compose, Identity, MapCompose, TakeFirst from scrapy import Request, Spider as BaseSpider +from scrapy.http.response import Response from scrapy.http.response.html import HtmlResponse from scrapy.http.response.text import TextResponse from scrapy.loader import ItemLoader @@ -101,8 +102,10 @@ class Spider(BaseSpider): def logger_trk(self, trk: str) -> Logger: return self.logger.logger.getChild(trk) - def parse(self, response: HtmlResponse) -> Generator[Request, None, None]: + def parse(self, response: Response) -> Generator[Request, None, None]: + response = cast(HtmlResponse, response) self.logger.debug(f"Parsing listing {response.url}") + card_xpath = "//article[contains(@class, 'SearchResultCard')]" for n, card in enumerate(response.xpath(card_xpath), start=1): url = cast(str, card.css('a[data-link="jd-detail"]::attr(href)').get()) @@ -141,9 +144,11 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]: ) def parse_job( - self, response: HtmlResponse, item: Job, trk: str + self, response: Response, item: Job, trk: str ) -> Generator[Job | Request, None, None]: + response = cast(HtmlResponse, response) self.logger_trk(trk).debug(f"Parsing job page {response.url}") + loader = Loader(item=item, response=response) loader.add_value("url", response.url) loader.add_value("source_urls", response.url) @@ -216,12 +221,14 @@ def parse_job_widget_data( def parse_job_widget_script( self, - script_response: TextResponse, + script_response: Response, url: str, item: Job, script_urls: list[str], trk: str, ) -> Generator[Request, None, None]: + script_response = cast(TextResponse, script_response) + if data := parse_widget_script_json(script_response.text): widget_name = select_widget(list(data["widgets"].keys())) widget_data = data["widgets"][widget_name] @@ -325,9 +332,11 @@ def parse_job_widget( ) def parse_job_widget_api( - self, response: TextResponse, item: Job, trk: str + self, response: Response, item: Job, trk: str ) -> Generator[Job, None, None]: + response = cast(TextResponse, response) self.logger_trk(trk).debug("Parsing job widget API response") + try: payload = cast(dict, response.json()) except json.JSONDecodeError as e: From 94b94b27414bcaca9faab0549395c7a3327788ce Mon Sep 17 00:00:00 2001 
From: Honza Javorek Date: Fri, 7 Feb 2025 21:03:58 +0100 Subject: [PATCH 45/51] format --- jg/plucker/jobs_jobscz/spider.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index 38f8467..dcf37c4 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -92,12 +92,8 @@ class Spider(BaseSpider): # "https://www.jobs.cz/prace/datovy-analytik/", "https://www.jobs.cz/prace/kuchar/", ] - min_items = 5 # TODO DEBUG ! - employment_types_labels = [ - "Typ pracovního poměru", - "Employment form", - ] + employment_types_labels = ["Typ pracovního poměru", "Employment form"] def logger_trk(self, trk: str) -> Logger: return self.logger.logger.getChild(trk) From f4fa0bdb5032b2e1d981ccecadaf68734a3ec591 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 13 Feb 2025 15:49:16 +0100 Subject: [PATCH 46/51] change how pagination works, fix changes after SDK PR got merged --- jg/plucker/jobs_jobscz/spider.py | 32 +- jg/plucker/scrapers.py | 51 +- poetry.lock | 124 +- pyproject.toml | 6 +- tests/jobs_jobscz/listing_page.html | 5185 +++++++++++++++++++++ tests/jobs_jobscz/listing_page_first.html | 4285 +++++++++++++++++ tests/jobs_jobscz/listing_page_last.html | 4370 +++++++++++++++++ tests/jobs_jobscz/test_spider.py | 47 +- 8 files changed, 13989 insertions(+), 111 deletions(-) create mode 100644 tests/jobs_jobscz/listing_page.html create mode 100644 tests/jobs_jobscz/listing_page_first.html create mode 100644 tests/jobs_jobscz/listing_page_last.html diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index dcf37c4..06fce49 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -18,7 +18,7 @@ from jg.plucker.items import Job from jg.plucker.processors import first, split -from jg.plucker.url_params import get_params, strip_params +from jg.plucker.url_params import get_param, get_params, strip_params MULTIPLE_LOCATIONS_RE = re.compile( @@ -100,7 +100,8 @@ def logger_trk(self, trk: str) -> Logger: def parse(self, response: Response) -> Generator[Request, None, None]: response = cast(HtmlResponse, response) - self.logger.debug(f"Parsing listing {response.url}") + page = get_page(response.url) + self.logger.debug(f"Parsing listing {response.url} (page: {page})") card_xpath = "//article[contains(@class, 'SearchResultCard')]" for n, card in enumerate(response.xpath(card_xpath), start=1): @@ -130,14 +131,18 @@ def parse(self, response: Response) -> Generator[Request, None, None]: cb_kwargs=dict(item=item, trk=trk), meta={"impersonate": "edge101"}, ) - urls = [ - response.urljoin(relative_url) - for relative_url in response.css(".Pagination__link::attr(href)").getall() - if "page=" in relative_url - ] - yield from response.follow_all( - urls, callback=self.parse, meta={"impersonate": "edge101"} - ) + self.logger.debug(f"Found {n} job cards on {response.url}") + + next_page_css = f'.Pagination__link[href*="page={page + 1}"]::attr(href)' + if next_page_link := response.css(next_page_css).get(): + yield response.follow( + next_page_link, + callback=self.parse, + cb_kwargs={"page": page + 1}, + meta={"impersonate": "edge101"}, + ) + else: + self.logger.debug(f"No next page found for {response.url}") def parse_job( self, response: Response, item: Job, trk: str @@ -356,7 +361,12 @@ def parse_job_widget_api( yield loader.load_item() -@lru_cache +def get_page(url: str) -> int: + if page := get_param(url, "page"): + return int(page) + return 1 + 
+ def get_trk(seed: str) -> str: return hashlib.sha1(seed.encode()).hexdigest()[:10] diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 5168b33..dcf4c25 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -1,7 +1,7 @@ import asyncio import logging +import os import pickle -import threading from pathlib import Path from time import time from pathlib import Path @@ -11,13 +11,7 @@ from apify import Actor, Configuration from apify.apify_storage_client import ApifyStorageClient from apify.scrapy import run_scrapy_actor -from apify.scrapy.scheduler import ( - _TIMEOUT, - _force_exit_event_loop, - _run_async_coro, - _shutdown_async_tasks, - _start_event_loop, -) +from apify.scrapy._async_thread import AsyncThread from apify.scrapy.utils import apply_apify_settings from apify.storages import KeyValueStore from scrapy import Item, Request, Spider @@ -62,19 +56,22 @@ async def run_as_spider( async def run_as_actor( spider_class: Type[Spider], spider_params: dict[str, Any] | None ): + # workaround https://github.com/apify/apify-sdk-python/issues/401 + os.environ["SCRAPY_SETTINGS_MODULE"] = "jg.plucker.settings" + async with Actor: - Actor.log.info(f"Starting actor for spider {spider_class.name}") + logger.info(f"Starting actor for spider {spider_class.name}") params = spider_params or (await Actor.get_input()) or {} proxy_config = params.pop("proxyConfig", None) - Actor.log.debug(f"Proxy config: {proxy_config!r}") + logger.debug(f"Proxy config: {proxy_config!r}") settings = apply_apify_settings(proxy_config=proxy_config) settings.set("HTTPCACHE_STORAGE", "jg.plucker.scrapers.CacheStorage") settings.set("SPIDER_PARAMS", spider_params) - Actor.log.debug(f"Spider params: {spider_params!r}") + logger.debug(f"Spider params: {spider_params!r}") - Actor.log.info("Starting the spider") + logger.info("Starting the spider") runner = CrawlerRunner(settings) crawler = runner.create_crawler(spider_class) @@ -184,11 +181,7 @@ def __init__(self, settings: BaseSettings): self._fingerprinter: RequestFingerprinterProtocol | None = None logger.debug("Starting background thread for cache storage's event loop") - self._eventloop = asyncio.new_event_loop() - self._thread = threading.Thread( - target=lambda: _start_event_loop(self._eventloop), daemon=True - ) - self._thread.start() + self._async_thread = AsyncThread() def open_spider(self, spider: Spider) -> None: logger.debug("Using Apify key value cache storage", extra={"spider": spider}) @@ -206,20 +199,12 @@ async def open_kv() -> KeyValueStore: return await KeyValueStore.open(name=kv_name) logger.debug(f"Opening cache storage's {kv_name!r} key value store") - self._kv = _run_async_coro(self._eventloop, open_kv()) + self._kv = self._async_thread.run_coro(open_kv()) def close_spider(self, spider: Spider) -> None: logger.debug("Closing cache storage...") try: - if self._eventloop.is_running(): - _run_async_coro(self._eventloop, _shutdown_async_tasks(self._eventloop)) - self._eventloop.call_soon_threadsafe(self._eventloop.stop) - self._thread.join(timeout=_TIMEOUT) - if self._thread.is_alive(): - logger.warning( - "Background thread for cache storage didn't exit cleanly! Forcing shutdown..." 
- ) - _force_exit_event_loop(self._eventloop, self._thread) + self._async_thread.close() except KeyboardInterrupt: logger.warning("Shutdown interrupted by KeyboardInterrupt!") except Exception: @@ -233,18 +218,18 @@ def retrieve_response(self, spider: Spider, request: Request) -> Response | None key = self._fingerprinter.fingerprint(request).hex() - seconds = _run_async_coro(self._eventloop, self._kv.get_value(f"{key}_time")) + seconds = self._async_thread.run_coro(self._kv.get_value(f"{key}_time")) if seconds is None: logger.debug("Cache miss", extra={"request": request}) return None if 0 < self.expiration_secs < time() - seconds: logger.debug("Cache expired", extra={"request": request}) - _run_async_coro(self._eventloop, self._kv.set_value(f"{key}_data", None)) - _run_async_coro(self._eventloop, self._kv.set_value(f"{key}_time", None)) + self._async_thread.run_coro(self._kv.set_value(f"{key}_data", None)) + self._async_thread.run_coro(self._kv.set_value(f"{key}_time", None)) return None - value = _run_async_coro(self._eventloop, self._kv.get_value(f"{key}_data")) + value = self._async_thread.run_coro(self._kv.get_value(f"{key}_data")) if value is None: logger.debug("Cache miss", extra={"request": request}) return None @@ -273,5 +258,5 @@ def store_response( "body": response.body, } value = pickle.dumps(data, protocol=4) - _run_async_coro(self._eventloop, self._kv.set_value(f"{key}_data", value)) - _run_async_coro(self._eventloop, self._kv.set_value(f"{key}_time", time())) + self._async_thread.run_coro(self._kv.set_value(f"{key}_data", value)) + self._async_thread.run_coro(self._kv.set_value(f"{key}_time", time())) diff --git a/poetry.lock b/poetry.lock index 983061e..10aa9ab 100644 --- a/poetry.lock +++ b/poetry.lock @@ -45,7 +45,7 @@ trio = ["trio (>=0.26.1)"] [[package]] name = "apify" -version = "2.2.2" +version = "2.3.0" description = "Apify SDK for Python" optional = false python-versions = "^3.9" @@ -53,7 +53,7 @@ files = [] develop = false [package.dependencies] -apify-client = ">=1.8.1" +apify-client = ">=1.9.1" apify-shared = ">=1.2.1" crawlee = "~0.5.1" cryptography = ">=42.0.0" @@ -70,23 +70,23 @@ scrapy = ["scrapy (>=2.11.0)"] [package.source] type = "git" url = "https://github.com/apify/apify-sdk-python.git" -reference = "fixing-scrapy" -resolved_reference = "4fb9f8740d6064febcd4a728fa67cf93cd319f49" +reference = "master" +resolved_reference = "9706c94193db588fd59ff91e7b39d7c4b3c5b6a4" [[package]] name = "apify-client" -version = "1.8.1" +version = "1.9.1" description = "Apify API client for Python" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "apify_client-1.8.1-py3-none-any.whl", hash = "sha256:cfa6df3816c436204e37457fba28981a0ef6a7602cde372463f0f078eee64747"}, - {file = "apify_client-1.8.1.tar.gz", hash = "sha256:2be1be7879570655bddeebf126833efe94cabb95b3755592845e92c20c70c674"}, + {file = "apify_client-1.9.1-py3-none-any.whl", hash = "sha256:7f1eaf0c66a077d47eb267f4af6a7ec53d10b683ea4c585daa2a2a01f7e74e3f"}, + {file = "apify_client-1.9.1.tar.gz", hash = "sha256:aadaa5800845ca1c5b1c7416f23358cc1fcbd2aab035521e40235f33721e2a26"}, ] [package.dependencies] apify-shared = ">=1.1.2" -httpx = ">=0.25.0" +httpx = ">=0.25" more_itertools = ">=10.0.0" [[package]] @@ -679,38 +679,42 @@ playwright = ["playwright (>=1.27.0)"] [[package]] name = "cryptography" -version = "44.0.0" +version = "44.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
optional = false python-versions = "!=3.9.0,!=3.9.1,>=3.7" files = [ - {file = "cryptography-44.0.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:84111ad4ff3f6253820e6d3e58be2cc2a00adb29335d4cacb5ab4d4d34f2a123"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15492a11f9e1b62ba9d73c210e2416724633167de94607ec6069ef724fad092"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:831c3c4d0774e488fdc83a1923b49b9957d33287de923d58ebd3cec47a0ae43f"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:761817a3377ef15ac23cd7834715081791d4ec77f9297ee694ca1ee9c2c7e5eb"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3c672a53c0fb4725a29c303be906d3c1fa99c32f58abe008a82705f9ee96f40b"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:4ac4c9f37eba52cb6fbeaf5b59c152ea976726b865bd4cf87883a7e7006cc543"}, - {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ed3534eb1090483c96178fcb0f8893719d96d5274dfde98aa6add34614e97c8e"}, - {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f3f6fdfa89ee2d9d496e2c087cebef9d4fcbb0ad63c40e821b39f74bf48d9c5e"}, - {file = "cryptography-44.0.0-cp37-abi3-win32.whl", hash = "sha256:eb33480f1bad5b78233b0ad3e1b0be21e8ef1da745d8d2aecbb20671658b9053"}, - {file = "cryptography-44.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:abc998e0c0eee3c8a1904221d3f67dcfa76422b23620173e28c11d3e626c21bd"}, - {file = "cryptography-44.0.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:660cb7312a08bc38be15b696462fa7cc7cd85c3ed9c576e81f4dc4d8b2b31591"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1923cb251c04be85eec9fda837661c67c1049063305d6be5721643c22dd4e2b7"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:404fdc66ee5f83a1388be54300ae978b2efd538018de18556dde92575e05defc"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c5eb858beed7835e5ad1faba59e865109f3e52b3783b9ac21e7e47dc5554e289"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f53c2c87e0fb4b0c00fa9571082a057e37690a8f12233306161c8f4b819960b7"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:9e6fc8a08e116fb7c7dd1f040074c9d7b51d74a8ea40d4df2fc7aa08b76b9e6c"}, - {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d2436114e46b36d00f8b72ff57e598978b37399d2786fd39793c36c6d5cb1c64"}, - {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a01956ddfa0a6790d594f5b34fc1bfa6098aca434696a03cfdbe469b8ed79285"}, - {file = "cryptography-44.0.0-cp39-abi3-win32.whl", hash = "sha256:eca27345e1214d1b9f9490d200f9db5a874479be914199194e746c893788d417"}, - {file = "cryptography-44.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:708ee5f1bafe76d041b53a4f95eb28cdeb8d18da17e597d46d7833ee59b97ede"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:37d76e6863da3774cd9db5b409a9ecfd2c71c981c38788d3fcfaf177f447b731"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:f677e1268c4e23420c3acade68fac427fffcb8d19d7df95ed7ad17cdef8404f4"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = 
"sha256:f5e7cb1e5e56ca0933b4873c0220a78b773b24d40d186b6738080b73d3d0a756"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:8b3e6eae66cf54701ee7d9c83c30ac0a1e3fa17be486033000f2a73a12ab507c"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:be4ce505894d15d5c5037167ffb7f0ae90b7be6f2a98f9a5c3442395501c32fa"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:62901fb618f74d7d81bf408c8719e9ec14d863086efe4185afd07c352aee1d2c"}, - {file = "cryptography-44.0.0.tar.gz", hash = "sha256:cd4e834f340b4293430701e772ec543b0fbe6c2dea510a5286fe0acabe153a02"}, + {file = "cryptography-44.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf688f615c29bfe9dfc44312ca470989279f0e94bb9f631f85e3459af8efc009"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd7c7e2d71d908dc0f8d2027e1604102140d84b155e658c20e8ad1304317691f"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887143b9ff6bad2b7570da75a7fe8bbf5f65276365ac259a5d2d5147a73775f2"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:322eb03ecc62784536bc173f1483e76747aafeb69c8728df48537eb431cd1911"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:21377472ca4ada2906bc313168c9dc7b1d7ca417b63c1c3011d0c74b7de9ae69"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:df978682c1504fc93b3209de21aeabf2375cb1571d4e61907b3e7a2540e83026"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:eb3889330f2a4a148abead555399ec9a32b13b7c8ba969b72d8e500eb7ef84cd"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:8e6a85a93d0642bd774460a86513c5d9d80b5c002ca9693e63f6e540f1815ed0"}, + {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6f76fdd6fd048576a04c5210d53aa04ca34d2ed63336d4abd306d0cbe298fddf"}, + {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6c8acf6f3d1f47acb2248ec3ea261171a671f3d9428e34ad0357148d492c7864"}, + {file = "cryptography-44.0.1-cp37-abi3-win32.whl", hash = "sha256:24979e9f2040c953a94bf3c6782e67795a4c260734e5264dceea65c8f4bae64a"}, + {file = "cryptography-44.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:fd0ee90072861e276b0ff08bd627abec29e32a53b2be44e41dbcdf87cbee2b00"}, + {file = "cryptography-44.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a2d8a7045e1ab9b9f803f0d9531ead85f90c5f2859e653b61497228b18452008"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8272f257cf1cbd3f2e120f14c68bff2b6bdfcc157fafdee84a1b795efd72862"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e8d181e90a777b63f3f0caa836844a1182f1f265687fac2115fcf245f5fbec3"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:436df4f203482f41aad60ed1813811ac4ab102765ecae7a2bbb1dbb66dcff5a7"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4f422e8c6a28cf8b7f883eb790695d6d45b0c385a2583073f3cec434cc705e1a"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:72198e2b5925155497a5a3e8c216c7fb3e64c16ccee11f0e7da272fa93b35c4c"}, + {file = 
"cryptography-44.0.1-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:2a46a89ad3e6176223b632056f321bc7de36b9f9b93b2cc1cccf935a3849dc62"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:53f23339864b617a3dfc2b0ac8d5c432625c80014c25caac9082314e9de56f41"}, + {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:888fcc3fce0c888785a4876ca55f9f43787f4c5c1cc1e2e0da71ad481ff82c5b"}, + {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00918d859aa4e57db8299607086f793fa7813ae2ff5a4637e318a25ef82730f7"}, + {file = "cryptography-44.0.1-cp39-abi3-win32.whl", hash = "sha256:9b336599e2cb77b1008cb2ac264b290803ec5e8e89d618a5e978ff5eb6f715d9"}, + {file = "cryptography-44.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:e403f7f766ded778ecdb790da786b418a9f2394f36e8cc8b796cc056ab05f44f"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1f9a92144fa0c877117e9748c74501bea842f93d21ee00b0cf922846d9d0b183"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:610a83540765a8d8ce0f351ce42e26e53e1f774a6efb71eb1b41eb01d01c3d12"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5fed5cd6102bb4eb843e3315d2bf25fede494509bddadb81e03a859c1bc17b83"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:f4daefc971c2d1f82f03097dc6f216744a6cd2ac0f04c68fb935ea2ba2a0d420"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:94f99f2b943b354a5b6307d7e8d19f5c423a794462bde2bf310c770ba052b1c4"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7"}, + {file = "cryptography-44.0.1.tar.gz", hash = "sha256:f51f5705ab27898afda1aaa430f34ad90dc117421057782022edf0600bec5f14"}, ] [package.dependencies] @@ -723,7 +727,7 @@ nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2)"] pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] sdist = ["build (>=1.0.0)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["certifi (>=2024)", "cryptography-vectors (==44.0.0)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] +test = ["certifi (>=2024)", "cryptography-vectors (==44.0.1)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] test-randomorder = ["pytest-randomly"] [[package]] @@ -921,13 +925,13 @@ files = [ [[package]] name = "html-text" -version = "0.6.2" +version = "0.7.0" description = "Extract text from HTML" optional = false python-versions = "*" files = [ - {file = "html_text-0.6.2-py2.py3-none-any.whl", hash = "sha256:d83d619ccd4b4d6172e21084d8a46e29e49ce87a08cc02161e7ca8c2918e7bca"}, - {file = "html_text-0.6.2.tar.gz", hash = "sha256:81455b4de5430cf63ce7c45a870fb8629e79ca8518e240f172d62409c2f2ff72"}, + {file = "html_text-0.7.0-py3-none-any.whl", hash = "sha256:11a95d5588a7b954aa229394bcd4a802d195c793d9970d5d8fc80d3d0ea9618e"}, + {file = "html_text-0.7.0.tar.gz", hash = "sha256:3dcb7006945d8ff06b4be639678f633a06ea70bc494163d256066995e1eb9182"}, ] [package.dependencies] @@ -2411,29 +2415,29 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "ruff" -version = "0.9.5" +version = "0.9.6" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.9.5-py3-none-linux_armv6l.whl", hash = "sha256:d466d2abc05f39018d53f681fa1c0ffe9570e6d73cde1b65d23bb557c846f442"}, - {file = "ruff-0.9.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:38840dbcef63948657fa7605ca363194d2fe8c26ce8f9ae12eee7f098c85ac8a"}, - {file = "ruff-0.9.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d56ba06da53536b575fbd2b56517f6f95774ff7be0f62c80b9e67430391eeb36"}, - {file = "ruff-0.9.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7cb2a01da08244c50b20ccfaeb5972e4228c3c3a1989d3ece2bc4b1f996001"}, - {file = "ruff-0.9.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:96d5c76358419bc63a671caac70c18732d4fd0341646ecd01641ddda5c39ca0b"}, - {file = "ruff-0.9.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:deb8304636ed394211f3a6d46c0e7d9535b016f53adaa8340139859b2359a070"}, - {file = "ruff-0.9.5-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:df455000bf59e62b3e8c7ba5ed88a4a2bc64896f900f311dc23ff2dc38156440"}, - {file = "ruff-0.9.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de92170dfa50c32a2b8206a647949590e752aca8100a0f6b8cefa02ae29dce80"}, - {file = "ruff-0.9.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d28532d73b1f3f627ba88e1456f50748b37f3a345d2be76e4c653bec6c3e393"}, - {file = "ruff-0.9.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c746d7d1df64f31d90503ece5cc34d7007c06751a7a3bbeee10e5f2463d52d2"}, - {file = "ruff-0.9.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:11417521d6f2d121fda376f0d2169fb529976c544d653d1d6044f4c5562516ee"}, - {file = "ruff-0.9.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:5b9d71c3879eb32de700f2f6fac3d46566f644a91d3130119a6378f9312a38e1"}, - {file = "ruff-0.9.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:2e36c61145e70febcb78483903c43444c6b9d40f6d2f800b5552fec6e4a7bb9a"}, - {file = "ruff-0.9.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:2f71d09aeba026c922aa7aa19a08d7bd27c867aedb2f74285a2639644c1c12f5"}, - {file = "ruff-0.9.5-py3-none-win32.whl", hash = "sha256:134f958d52aa6fdec3b294b8ebe2320a950d10c041473c4316d2e7d7c2544723"}, - {file = "ruff-0.9.5-py3-none-win_amd64.whl", hash = "sha256:78cc6067f6d80b6745b67498fb84e87d32c6fc34992b52bffefbdae3442967d6"}, - {file = "ruff-0.9.5-py3-none-win_arm64.whl", hash = "sha256:18a29f1a005bddb229e580795627d297dfa99f16b30c7039e73278cf6b5f9fa9"}, - {file = "ruff-0.9.5.tar.gz", hash = "sha256:11aecd7a633932875ab3cb05a484c99970b9d52606ce9ea912b690b02653d56c"}, + {file = "ruff-0.9.6-py3-none-linux_armv6l.whl", hash = "sha256:2f218f356dd2d995839f1941322ff021c72a492c470f0b26a34f844c29cdf5ba"}, + {file = "ruff-0.9.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b908ff4df65dad7b251c9968a2e4560836d8f5487c2f0cc238321ed951ea0504"}, + {file = "ruff-0.9.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b109c0ad2ececf42e75fa99dc4043ff72a357436bb171900714a9ea581ddef83"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1de4367cca3dac99bcbd15c161404e849bb0bfd543664db39232648dc00112dc"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac3ee4d7c2c92ddfdaedf0bf31b2b176fa7aa8950efc454628d477394d35638b"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:5dc1edd1775270e6aa2386119aea692039781429f0be1e0949ea5884e011aa8e"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4a091729086dffa4bd070aa5dab7e39cc6b9d62eb2bef8f3d91172d30d599666"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1bbc6808bf7b15796cef0815e1dfb796fbd383e7dbd4334709642649625e7c5"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:589d1d9f25b5754ff230dce914a174a7c951a85a4e9270613a2b74231fdac2f5"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc61dd5131742e21103fbbdcad683a8813be0e3c204472d520d9a5021ca8b217"}, + {file = "ruff-0.9.6-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:5e2d9126161d0357e5c8f30b0bd6168d2c3872372f14481136d13de9937f79b6"}, + {file = "ruff-0.9.6-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:68660eab1a8e65babb5229a1f97b46e3120923757a68b5413d8561f8a85d4897"}, + {file = "ruff-0.9.6-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c4cae6c4cc7b9b4017c71114115db0445b00a16de3bcde0946273e8392856f08"}, + {file = "ruff-0.9.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:19f505b643228b417c1111a2a536424ddde0db4ef9023b9e04a46ed8a1cb4656"}, + {file = "ruff-0.9.6-py3-none-win32.whl", hash = "sha256:194d8402bceef1b31164909540a597e0d913c0e4952015a5b40e28c146121b5d"}, + {file = "ruff-0.9.6-py3-none-win_amd64.whl", hash = "sha256:03482d5c09d90d4ee3f40d97578423698ad895c87314c4de39ed2af945633caa"}, + {file = "ruff-0.9.6-py3-none-win_arm64.whl", hash = "sha256:0e2bb706a2be7ddfea4a4af918562fdc1bcb16df255e5fa595bbd800ce322a5a"}, + {file = "ruff-0.9.6.tar.gz", hash = "sha256:81761592f72b620ec8fa1068a6fd00e98a5ebee342a3642efd84454f3031dca9"}, ] [[package]] @@ -3161,4 +3165,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "3.12.*" -content-hash = "27274f4fbccdbebda12a59ca4811760caaef849b6dc434e4e94e16665afe3389" +content-hash = "e7439726bea09577bbcdd4a19a7c54811a1ae80d4d659d618748b20421c8693a" diff --git a/pyproject.toml b/pyproject.toml index 098fb63..3a9d4ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,11 +14,10 @@ plucker = "jg.plucker.cli:main" [tool.poetry.dependencies] python = "3.12.*" -apify = { git = "https://github.com/apify/apify-sdk-python.git", branch = "fixing-scrapy", extras = ["scrapy"] } -apify-client = "1.8.1" # deployment of actors, monitoring, automation +apify = { git = "https://github.com/apify/apify-sdk-python.git", branch = "master", extras = ["scrapy"] } +apify-client = "1.9.1" # deployment of actors, monitoring, automation apify-shared = "*" # importing a few enums click = "8.1.8" -crawlee = "*" # importing MemoryStorageClient diskcache = "5.6.3" extruct = "0.18.0" feedparser = "6.0.11" @@ -44,6 +43,7 @@ addopts = "--import-mode=importlib --ff --ruff --ruff-format" filterwarnings = [ "ignore:twisted.web.http.HTTPClient was deprecated:DeprecationWarning", # scrapy "ignore:invalid escape sequence:SyntaxWarning", # extruct + "ignore:There is no current event loop:DeprecationWarning", # apify ] [tool.ruff] diff --git a/tests/jobs_jobscz/listing_page.html b/tests/jobs_jobscz/listing_page.html new file mode 100644 index 0000000..ee0afa1 --- /dev/null +++ b/tests/jobs_jobscz/listing_page.html @@ -0,0 +1,5185 @@ + + + + + + + + + + + + Nabídka práce Programátor – Jobs.cz + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + +
+
+ +
+
+ + + +
+ + + + + + + + + + + +
+ Přihlásit + +
+ + + + +
+ +
+ +
+
+ + +
+
+ + +
+
+
+ + + +
+ +
+
+
+ +
+ + + +
+

Našli jsme 608 nabídek +

+ +
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + +
+
Významní zaměstnavatelé v IT +
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + +
+
+

+ PHP programátor/ka + +

+
+ 11. února +
+ + + + +
+
+ + + + Možnost občasné práce z domova + + + +
+
+
    +
  • + + + ALENSA, s.r.o. +
  • +
  • + + + Praha – Libeň +
  • +
+
+
+ + + + + + +
+
+

+ Technology Lead + +

+ +
+ 11. února +
+ + + + +
+
+ + + + Možnost občasné práce z domova + + + +
+
+
    +
  • + + + Asseco Central Europe, a.s. +
  • +
  • + + + Brno – Veveří +
  • +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+ + + +
+
+

+ PROGRAMÁTOR - Java + +

+
+ 11. února +
+ + + + +
+
+ + + + 55 000 ‍–‍ 85 000 Kč + + + + Možnost občasné práce z domova + + + +
+
+
    +
  • + + + Syntea software group a.s. +
  • +
  • + + + Praha – Jinonice +
  • +
+
+
+ + + + + + +
+
+

+ Game UX Designer + +

+
+ 11. února +
+ + + + +
+
+ + + + Možnost občasné práce z domova + + + +
+
+
    +
  • + + + SCS Software s.r.o. +
  • +
  • + + + Praha – Michle +
  • +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

+ Pracovník technické podpory (PLC a MaR) + +

+
+ 11. února +
+ + + + +
+
+ + + + 40 000 ‍–‍ 50 000 Kč + + + Odpověď do 2 týdnů + + Možnost občasné práce z domova + + + +
+
+
    +
  • + + + Unipi Technology s.r.o. +
  • +
  • + + + Brno – Lesná +
  • +
+
+
+ + + + + + +
+
+

+ Vývojář ERP HELIOS + +

+
+ 11. února +
+ + + + +
+
+ + + Odpověď do 2 týdnů + + Možnost občasné práce z domova + + + Odpovězte teď a budete mezi prvními + +
+
+
    +
  • + + + Gatema IT a.s +
  • +
  • + + + Boskovice +
  • +
+
+
+ + + + + + + + + + + + + + + + + + + + +
+
+

+ Vývojář produktů + +

+
+ 11. února +
+ + + + +
+
+ + + + + + Odpovězte teď a budete mezi prvními + +
+
+
    +
  • + + + SCHURTER, společnost s ručením omezeným +
  • +
  • + + + Malá Skála +
  • +
+
+
+ + + + + + + + + + + +
+ + + + + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +

Životopis za 10 minut? +

+ +

Klik sem, klik tam a je to! Profi grafika, pdf ke stažení. A zdarma. To se pak práce hledá sama. +

+
+
+ +

Klik sem, klik tam a je to! Profi grafika, pdf ke stažení. A zdarma. To se pak práce hledá sama. +

+ + To chci zkusit + +
+
+ +
+ + + +
+
+ +
+
+

Vytvořením upozornění na nabídky poskytujete společnostem ze skupiny Alma Career, jako společným správcům, své osobní údaje za účelem usnadnění získání +nových pracovních nabídek na portálu Jobs.cz. +

+V souvislosti se zpracováním Vašich osobních údajů máte právo (i) na přístup k osobním údajům; (ii) na opravu nepřesných nebo +doplnění neúplných osobních údajů; (iii) na výmaz osobních údajů, nejsou-li již osobní údaje potřebné pro účely, +pro které byly shromážděny či jinak zpracovány, anebo zjistíte-li, že byly zpracovávány protiprávně; (iv) na +omezení zpracování osobních údajů ve zvláštních případech; (v) na přenositelnost údajů; (vi) vznést námitku, +po níž zpracování Vašich osobních údajů bude ukončeno, neprokáže-li se, že existují závažné oprávněné důvody pro +zpracování, jež převažují nad Vašimi zájmy nebo právy a svobodami zejména, je-li důvodem případné vymáhání právních +nároků; (vii) obrátit se na Úřad pro ochranu osobních údajů a (viii) odvolat souhlas, je-li zpracování založeno na souhlasu. +

+

+ Alma Career
+ Alma Career Czechia s.r.o., Menclova 2538/2, 180 00 Praha 8, Česko, IČO: 264 41 381
+Alma Career Slovakia s.r.o., Pribinova 19, 811 09 Bratislava, Slovensko, IČO: 35 800 861
+Alma Career Poland Sp. z o.o., Przeskok 2, 00-032 Varšava, Polsko, KRS 0000988078
+ +

+

Další informace o zpracování údajů jsou dostupné na +https://almacareer.com/gdpr. +

+
+
+
+ + + + +
+ + + + +
+ + + + + + + + + + + + + + + + + + diff --git a/tests/jobs_jobscz/listing_page_first.html b/tests/jobs_jobscz/listing_page_first.html new file mode 100644 index 0000000..340ad3a --- /dev/null +++ b/tests/jobs_jobscz/listing_page_first.html @@ -0,0 +1,4285 @@ + + + + + + + + + + + + Nabídka práce Kuchař – Jobs.cz + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + +
+
+ +
+
+ + + +
+ + + + + + + + + + + +
+ Přihlásit + +
+ + + + +
+ +
+ +
+
+ + +
+
+ + +
+
+
+ + + +
+ +
+
+
+ +
+ + + +
+

Našli jsme 58 nabídek +

+ +
+ + +
+ + + + + + + + +
+
+

+ Kuchař/ka + +

+
+ Aktualizováno dnes +
+ + + + +
+
+ + + + + + +
+
+
    +
+[… fixture body continues: more Jobs.cz "kuchař" job cards (title, company such as Hospic sv. Štěpána or CZECH INN HOTELS s.r.o., location, salary range, posting date like "Přidáno včera" = added yesterday or "11. února" = 11 February), a CV-builder promo ("Životopis za 10 minut?" = a CV in 10 minutes?), the Alma Career GDPR notice, company footer and trailing page scripts …]
diff --git a/tests/jobs_jobscz/listing_page_last.html b/tests/jobs_jobscz/listing_page_last.html
new file mode 100644
index 0000000..1e8f408
--- /dev/null
+++ b/tests/jobs_jobscz/listing_page_last.html
@@ -0,0 +1,4370 @@
+[… fixture body: the last Jobs.cz listing page for "kuchař", titled "Nabídka práce Kuchař – Jobs.cz", with a "Přihlásit" (log in) header, the result count "Našli jsme 58 nabídek" (we found 58 offers), the remaining job cards (company, location, salary, deadlines like "Končí zítra" = ends tomorrow or "Končí za 8 hodin" = ends in 8 hours), the closing note "Toto jsou všechny nabídky, které odpovídají Vašemu zadání." (these are all the offers matching your query), a CV-builder promo, the Alma Career GDPR notice and footer, and trailing page scripts …]
+ + + + + + + + + + + + + + + + + + diff --git a/tests/jobs_jobscz/test_spider.py b/tests/jobs_jobscz/test_spider.py index 0b0bb13..65c2bfa 100644 --- a/tests/jobs_jobscz/test_spider.py +++ b/tests/jobs_jobscz/test_spider.py @@ -5,7 +5,8 @@ from typing import cast import pytest -from scrapy.http import HtmlResponse, TextResponse +from scrapy.http.response.html import HtmlResponse +from scrapy.http.response.text import TextResponse from jg.plucker.items import Job from jg.plucker.jobs_jobscz.spider import Spider, select_widget @@ -21,7 +22,7 @@ def test_spider_parse(): ) requests = list(Spider().parse(response)) - assert len(requests) == 30 + 4 # jobs + pagination (without page=1) + assert len(requests) == 30 + 1 # jobs + next page assert ( requests[1].url @@ -53,12 +54,50 @@ def test_spider_parse(): } assert ( - requests[30].url + requests[-1].url == "https://beta.www.jobs.cz/prace/programator/?profession%5B0%5D=201100249&page=2" ) + + +def test_spider_parse_listing_page(): + url = "https://www.jobs.cz/prace/programator/?profession[0]=201100249&page=5" + response = HtmlResponse( + url, body=Path(FIXTURES_DIR / "listing_page.html").read_bytes() + ) + requests = list(Spider().parse(response)) + + assert len(requests) == 30 + 1 # jobs + next page + assert ( + requests[-1].url + == "https://www.jobs.cz/prace/programator/?profession%5B0%5D=201100249&page=6" + ) + + +def test_spider_parse_listing_page_first(): + url = "https://www.jobs.cz/prace/kuchar/" + response = HtmlResponse( + url, body=Path(FIXTURES_DIR / "listing_page_first.html").read_bytes() + ) + requests = list(Spider().parse(response)) + + assert len(requests) == 30 + 1 # jobs + next page + assert ( + requests[-1].url + == "https://www.jobs.cz/prace/kuchar/?profession%5B0%5D=201100136&page=2" + ) + + +def test_spider_parse_listing_page_last(): + url = "https://www.jobs.cz/prace/kuchar/?page=2" + response = HtmlResponse( + url, body=Path(FIXTURES_DIR / "listing_page_last.html").read_bytes() + ) + requests = list(Spider().parse(response)) + + assert len(requests) == 28 + 0 # jobs + next page assert ( requests[-1].url - == "https://beta.www.jobs.cz/prace/programator/?profession%5B0%5D=201100249&page=5" + == "https://www.jobs.cz/rpd/2000404882/?searchId=6cbd9a5a-f1fc-4944-a64f-4445894fdff0&rps=233" ) From 7490418575851f1fe169aaaee5223cda2c35a610 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 13 Feb 2025 15:52:33 +0100 Subject: [PATCH 47/51] update deps --- poetry.lock | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/poetry.lock b/poetry.lock index 10aa9ab..514f9e5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -224,10 +224,6 @@ files = [ {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a37b8f0391212d29b3a91a799c8e4a2855e0576911cdfb2515487e30e322253d"}, {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e84799f09591700a4154154cab9787452925578841a94321d5ee8fb9a9a328f0"}, {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f66b5337fa213f1da0d9000bc8dc0cb5b896b726eefd9c6046f699b169c41b9e"}, - {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5dab0844f2cf82be357a0eb11a9087f70c5430b2c241493fc122bb6f2bb0917c"}, - {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e4fe605b917c70283db7dfe5ada75e04561479075761a0b3866c081d035b01c1"}, - {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = 
"sha256:1e9a65b5736232e7a7f91ff3d02277f11d339bf34099a56cdab6a8b3410a02b2"}, - {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:58d4b711689366d4a03ac7957ab8c28890415e267f9b6589969e74b6e42225ec"}, {file = "Brotli-1.1.0-cp310-cp310-win32.whl", hash = "sha256:be36e3d172dc816333f33520154d708a2657ea63762ec16b62ece02ab5e4daf2"}, {file = "Brotli-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:0c6244521dda65ea562d5a69b9a26120769b7a9fb3db2fe9545935ed6735b128"}, {file = "Brotli-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a3daabb76a78f829cafc365531c972016e4aa8d5b4bf60660ad8ecee19df7ccc"}, @@ -240,14 +236,8 @@ files = [ {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:19c116e796420b0cee3da1ccec3b764ed2952ccfcc298b55a10e5610ad7885f9"}, {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:510b5b1bfbe20e1a7b3baf5fed9e9451873559a976c1a78eebaa3b86c57b4265"}, {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a1fd8a29719ccce974d523580987b7f8229aeace506952fa9ce1d53a033873c8"}, - {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c247dd99d39e0338a604f8c2b3bc7061d5c2e9e2ac7ba9cc1be5a69cb6cd832f"}, - {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1b2c248cd517c222d89e74669a4adfa5577e06ab68771a529060cf5a156e9757"}, - {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:2a24c50840d89ded6c9a8fdc7b6ed3692ed4e86f1c4a4a938e1e92def92933e0"}, - {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f31859074d57b4639318523d6ffdca586ace54271a73ad23ad021acd807eb14b"}, {file = "Brotli-1.1.0-cp311-cp311-win32.whl", hash = "sha256:39da8adedf6942d76dc3e46653e52df937a3c4d6d18fdc94a7c29d263b1f5b50"}, {file = "Brotli-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:aac0411d20e345dc0920bdec5548e438e999ff68d77564d5e9463a7ca9d3e7b1"}, - {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:32d95b80260d79926f5fab3c41701dbb818fde1c9da590e77e571eefd14abe28"}, - {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b760c65308ff1e462f65d69c12e4ae085cff3b332d894637f6273a12a482d09f"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:316cc9b17edf613ac76b1f1f305d2a748f1b976b033b049a6ecdfd5612c70409"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:caf9ee9a5775f3111642d33b86237b05808dafcd6268faa492250e9b78046eb2"}, {file = "Brotli-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70051525001750221daa10907c77830bc889cb6d865cc0b813d9db7fefc21451"}, @@ -258,24 +248,8 @@ files = [ {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:4093c631e96fdd49e0377a9c167bfd75b6d0bad2ace734c6eb20b348bc3ea180"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7e4c4629ddad63006efa0ef968c8e4751c5868ff0b1c5c40f76524e894c50248"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:861bf317735688269936f755fa136a99d1ed526883859f86e41a5d43c61d8966"}, - {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:87a3044c3a35055527ac75e419dfa9f4f3667a1e887ee80360589eb8c90aabb9"}, - {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c5529b34c1c9d937168297f2c1fde7ebe9ebdd5e121297ff9c043bdb2ae3d6fb"}, - {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = 
"sha256:ca63e1890ede90b2e4454f9a65135a4d387a4585ff8282bb72964fab893f2111"}, - {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e79e6520141d792237c70bcd7a3b122d00f2613769ae0cb61c52e89fd3443839"}, {file = "Brotli-1.1.0-cp312-cp312-win32.whl", hash = "sha256:5f4d5ea15c9382135076d2fb28dde923352fe02951e66935a9efaac8f10e81b0"}, {file = "Brotli-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:906bc3a79de8c4ae5b86d3d75a8b77e44404b0f4261714306e3ad248d8ab0951"}, - {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8bf32b98b75c13ec7cf774164172683d6e7891088f6316e54425fde1efc276d5"}, - {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7bc37c4d6b87fb1017ea28c9508b36bbcb0c3d18b4260fcdf08b200c74a6aee8"}, - {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c0ef38c7a7014ffac184db9e04debe495d317cc9c6fb10071f7fefd93100a4f"}, - {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91d7cc2a76b5567591d12c01f019dd7afce6ba8cba6571187e21e2fc418ae648"}, - {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a93dde851926f4f2678e704fadeb39e16c35d8baebd5252c9fd94ce8ce68c4a0"}, - {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0db75f47be8b8abc8d9e31bc7aad0547ca26f24a54e6fd10231d623f183d089"}, - {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6967ced6730aed543b8673008b5a391c3b1076d834ca438bbd70635c73775368"}, - {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7eedaa5d036d9336c95915035fb57422054014ebdeb6f3b42eac809928e40d0c"}, - {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d487f5432bf35b60ed625d7e1b448e2dc855422e87469e3f450aa5552b0eb284"}, - {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7"}, - {file = "Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0"}, - {file = "Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b"}, {file = "Brotli-1.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a090ca607cbb6a34b0391776f0cb48062081f5f60ddcce5d11838e67a01928d1"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de9d02f5bda03d27ede52e8cfe7b865b066fa49258cbab568720aa5be80a47d"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2333e30a5e00fe0fe55903c8832e08ee9c3b1382aacf4db26664a16528d51b4b"}, @@ -285,10 +259,6 @@ files = [ {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:069a121ac97412d1fe506da790b3e69f52254b9df4eb665cd42460c837193354"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:e93dfc1a1165e385cc8239fab7c036fb2cd8093728cbd85097b284d7b99249a2"}, - {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:aea440a510e14e818e67bfc4027880e2fb500c2ccb20ab21c7a7c8b5b4703d75"}, - {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_i686.whl", hash = 
"sha256:6974f52a02321b36847cd19d1b8e381bf39939c21efd6ee2fc13a28b0d99348c"}, - {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:a7e53012d2853a07a4a79c00643832161a910674a893d296c9f1259859a289d2"}, - {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:d7702622a8b40c49bffb46e1e3ba2e81268d5c04a34f460978c6b5517a34dd52"}, {file = "Brotli-1.1.0-cp36-cp36m-win32.whl", hash = "sha256:a599669fd7c47233438a56936988a2478685e74854088ef5293802123b5b2460"}, {file = "Brotli-1.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d143fd47fad1db3d7c27a1b1d66162e855b5d50a89666af46e1679c496e8e579"}, {file = "Brotli-1.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:11d00ed0a83fa22d29bc6b64ef636c4552ebafcef57154b4ddd132f5638fbd1c"}, @@ -300,10 +270,6 @@ files = [ {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:919e32f147ae93a09fe064d77d5ebf4e35502a8df75c29fb05788528e330fe74"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:23032ae55523cc7bccb4f6a0bf368cd25ad9bcdcc1990b64a647e7bbcce9cb5b"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:224e57f6eac61cc449f498cc5f0e1725ba2071a3d4f48d5d9dffba42db196438"}, - {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:cb1dac1770878ade83f2ccdf7d25e494f05c9165f5246b46a621cc849341dc01"}, - {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:3ee8a80d67a4334482d9712b8e83ca6b1d9bc7e351931252ebef5d8f7335a547"}, - {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5e55da2c8724191e5b557f8e18943b1b4839b8efc3ef60d65985bcf6f587dd38"}, - {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:d342778ef319e1026af243ed0a07c97acf3bad33b9f29e7ae6a1f68fd083e90c"}, {file = "Brotli-1.1.0-cp37-cp37m-win32.whl", hash = "sha256:587ca6d3cef6e4e868102672d3bd9dc9698c309ba56d41c2b9c85bbb903cdb95"}, {file = "Brotli-1.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2954c1c23f81c2eaf0b0717d9380bd348578a94161a65b3a2afc62c86467dd68"}, {file = "Brotli-1.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:efa8b278894b14d6da122a72fefcebc28445f2d3f880ac59d46c90f4c13be9a3"}, @@ -316,10 +282,6 @@ files = [ {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ab4fbee0b2d9098c74f3057b2bc055a8bd92ccf02f65944a241b4349229185a"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:141bd4d93984070e097521ed07e2575b46f817d08f9fa42b16b9b5f27b5ac088"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fce1473f3ccc4187f75b4690cfc922628aed4d3dd013d047f95a9b3919a86596"}, - {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d2b35ca2c7f81d173d2fadc2f4f31e88cc5f7a39ae5b6db5513cf3383b0e0ec7"}, - {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:af6fa6817889314555aede9a919612b23739395ce767fe7fcbea9a80bf140fe5"}, - {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:2feb1d960f760a575dbc5ab3b1c00504b24caaf6986e2dc2b01c09c87866a943"}, - {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4410f84b33374409552ac9b6903507cdb31cd30d2501fc5ca13d18f73548444a"}, {file = "Brotli-1.1.0-cp38-cp38-win32.whl", hash = "sha256:db85ecf4e609a48f4b29055f1e144231b90edc90af7481aa731ba2d059226b1b"}, {file = "Brotli-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3d7954194c36e304e1523f55d7042c59dc53ec20dd4e9ea9d151f1b62b4415c0"}, {file = 
"Brotli-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5fb2ce4b8045c78ebbc7b8f3c15062e435d47e7393cc57c25115cfd49883747a"}, @@ -332,10 +294,6 @@ files = [ {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:949f3b7c29912693cee0afcf09acd6ebc04c57af949d9bf77d6101ebb61e388c"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:89f4988c7203739d48c6f806f1e87a1d96e0806d44f0fba61dba81392c9e474d"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:de6551e370ef19f8de1807d0a9aa2cdfdce2e85ce88b122fe9f6b2b076837e59"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0737ddb3068957cf1b054899b0883830bb1fec522ec76b1098f9b6e0f02d9419"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4f3607b129417e111e30637af1b56f24f7a49e64763253bbc275c75fa887d4b2"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:6c6e0c425f22c1c719c42670d561ad682f7bfeeef918edea971a79ac5252437f"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:494994f807ba0b92092a163a0a283961369a65f6cbe01e8891132b7a320e61eb"}, {file = "Brotli-1.1.0-cp39-cp39-win32.whl", hash = "sha256:f0d8a7a6b5983c2496e364b969f0e526647a06b075d034f3297dc66f3b360c64"}, {file = "Brotli-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdad5b9014d83ca68c25d2e9444e28e967ef16e80f6b436918c700c117a85467"}, {file = "Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724"}, From 36b3912083f99681a22fcaee524d7b7947562b46 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 13 Feb 2025 15:52:38 +0100 Subject: [PATCH 48/51] format code --- jg/plucker/scrapers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index dcf4c25..66ec29f 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -4,8 +4,6 @@ import pickle from pathlib import Path from time import time -from pathlib import Path -from threading import Thread from typing import Any, Coroutine, Generator, Type from apify import Actor, Configuration From 77ead8a0432a1b4b558c09ba6fcb9a6625ca9574 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 13 Feb 2025 15:57:44 +0100 Subject: [PATCH 49/51] clean up logging --- jg/plucker/cli.py | 3 +- jg/plucker/loggers.py | 81 ------------------------------------------- tests/test_loggers.py | 21 ----------- 3 files changed, 2 insertions(+), 103 deletions(-) delete mode 100644 jg/plucker/loggers.py delete mode 100644 tests/test_loggers.py diff --git a/jg/plucker/cli.py b/jg/plucker/cli.py index a8521df..6c0b952 100644 --- a/jg/plucker/cli.py +++ b/jg/plucker/cli.py @@ -8,7 +8,7 @@ from typing import IO, Callable, Generator, Type import click -from apify.scrapy.logging_config import setup_logging +from apify.scrapy.logging_config import _MAIN_LOGGER_NAMES, setup_logging from apify_client import ApifyClient from apify_shared.consts import ActorJobStatus, ActorSourceType from pydantic import BaseModel @@ -40,6 +40,7 @@ def __str__(self) -> str: @click.group() @click.option("-d", "--debug", default=False, is_flag=True) def main(debug: bool = False): + _MAIN_LOGGER_NAMES.append("jg.plucker") setup_logging() logging.getLogger().setLevel(logging.DEBUG if debug else logging.INFO) for name in ["asyncio", "filelock", "crawlee"]: diff --git a/jg/plucker/loggers.py b/jg/plucker/loggers.py deleted file mode 100644 index 6f75dca..0000000 --- a/jg/plucker/loggers.py +++ /dev/null @@ -1,81 +0,0 @@ -import 
logging -from functools import wraps -from typing import Callable - -from apify.log import ActorLogFormatter -from scrapy.settings import Settings -from scrapy.utils import log as scrapy_log - - -CUSTOM_LOGGER_NAMES = ["jg.plucker", "apify", "apify_client"] - -SCRAPY_LOGGER_NAMES = ["filelock", "hpack", "httpx", "scrapy", "twisted"] - -ALL_LOGGER_NAMES = CUSTOM_LOGGER_NAMES + SCRAPY_LOGGER_NAMES - - -def configure_logging(settings: Settings, argv: list[str]): - logging_level = get_logging_level(settings, argv) - - handler = logging.StreamHandler() - handler.setFormatter(ActorLogFormatter(include_logger_name=True)) - - # Local loggers have to be set up here and in the `reconfigure_scrapy_logging` as well to be able - # to use them both from CLI and Scrapy components. - for logger_name in CUSTOM_LOGGER_NAMES: - configure_logger(logger_name, logging_level, handler) - - # We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging` - # call here: https://github.com/scrapy/scrapy/blob/2.12.0/scrapy/utils/log.py#L117 (even though - # `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method - # like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because - # otherwise we would lose some log messages. - scrapy_log.configure_logging = reconfigure_scrapy_log(logging_level, handler)( - scrapy_log.configure_logging - ) - - -def reconfigure_scrapy_log( - logging_level: str, *handlers: logging.StreamHandler -) -> Callable: - def decorator(configure_logging: Callable) -> Callable: - @wraps(configure_logging) - def wrapper(*args, **kwargs): - # We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root - # logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary - # loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77 - configure_logging(*args, **kwargs) - - # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger` - # property within spiders. See details in the Spider logger property: - # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46. - configure_logger(None, logging_level, *handlers) - - # We modify other loggers only by setting up their log level. A custom log handler is added - # only to the root logger to avoid duplicate log messages. - for logger_name in ALL_LOGGER_NAMES: - configure_logger(logger_name, logging_level) - - # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless - # messages, especially when running on the platform. 
- configure_logger("httpx", "WARNING") - - return wrapper - - return decorator - - -def configure_logger( - logger_name: str | None, log_level: str, *handlers: logging.StreamHandler -) -> None: - logger = logging.getLogger(logger_name) - logger.setLevel(log_level) - logger.handlers = [] - for handler in handlers: - logger.addHandler(handler) - - -def get_logging_level(settings: Settings, argv: list[str]) -> str: - if "--debug" in argv or "-d" in argv: - return "DEBUG" - return settings.get("LOG_LEVEL") diff --git a/tests/test_loggers.py b/tests/test_loggers.py deleted file mode 100644 index 23ccd2c..0000000 --- a/tests/test_loggers.py +++ /dev/null @@ -1,21 +0,0 @@ -import pytest -from scrapy.settings import Settings - -from jg.plucker.loggers import get_logging_level - - -@pytest.mark.parametrize( - "settings, argv, expected", - [ - pytest.param({}, [], "DEBUG", id="default from Scrapy"), - ({"LOG_LEVEL": "DEBUG"}, [], "DEBUG"), - ({"LOG_LEVEL": "DEBUG"}, ["--debug"], "DEBUG"), - ({"LOG_LEVEL": "DEBUG"}, ["-d"], "DEBUG"), - ({"LOG_LEVEL": "INFO"}, [], "INFO"), - ({"LOG_LEVEL": "INFO"}, ["--debug"], "DEBUG"), - ({"LOG_LEVEL": "INFO"}, ["-d"], "DEBUG"), - ({"LOG_LEVEL": "WARNING"}, [], "WARNING"), - ], -) -def test_get_logging_level(settings: dict, argv: list[str], expected: str): - assert get_logging_level(Settings(settings), argv) == expected From d493099ceb32272dba5f1261d38d19df8ba6564f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 13 Feb 2025 17:52:05 +0100 Subject: [PATCH 50/51] improve cache --- jg/plucker/cache.py | 156 +++++++++++++++++++++++++++++++ jg/plucker/cli.py | 11 ++- jg/plucker/jobs_jobscz/spider.py | 5 +- jg/plucker/scrapers.py | 111 +--------------------- jg/plucker/settings.py | 2 +- tests/test_cache.py | 36 +++++++ 6 files changed, 205 insertions(+), 116 deletions(-) create mode 100644 jg/plucker/cache.py create mode 100644 tests/test_cache.py diff --git a/jg/plucker/cache.py b/jg/plucker/cache.py new file mode 100644 index 0000000..0dfd735 --- /dev/null +++ b/jg/plucker/cache.py @@ -0,0 +1,156 @@ +import gzip +import io +import logging +import pickle +import struct +from time import time + +from apify import Configuration +from apify.apify_storage_client import ApifyStorageClient +from apify.scrapy._async_thread import AsyncThread +from apify.storages import KeyValueStore +from scrapy import Request, Spider +from scrapy.http.headers import Headers +from scrapy.http.response import Response +from scrapy.responsetypes import responsetypes +from scrapy.settings import BaseSettings +from scrapy.utils.request import RequestFingerprinterProtocol + + +logger = logging.getLogger("jg.plucker.cache") + + +class CacheStorage: + def __init__(self, settings: BaseSettings): + self.expiration_max_items = 100 + self.expiration_secs: int = settings.getint("HTTPCACHE_EXPIRATION_SECS") + self.spider: Spider | None = None + self._kv: KeyValueStore | None = None + self._fingerprinter: RequestFingerprinterProtocol | None = None + self._async_thread: AsyncThread | None = None + + def open_spider(self, spider: Spider) -> None: + logger.debug("Using Apify key value cache storage", extra={"spider": spider}) + self.spider = spider + self._fingerprinter = spider.crawler.request_fingerprinter + kv_name = f"httpcache-{spider.name}" + + async def open_kv() -> KeyValueStore: + config = Configuration.get_global_configuration() + if config.is_at_home: + storage_client = ApifyStorageClient.from_config(config) + return await KeyValueStore.open( + name=kv_name, storage_client=storage_client + ) 
+ return await KeyValueStore.open(name=kv_name) + + logger.debug("Starting background thread for cache storage's event loop") + self._async_thread = AsyncThread() + logger.debug(f"Opening cache storage's {kv_name!r} key value store") + self._kv = self._async_thread.run_coro(open_kv()) + + def close_spider(self, spider: Spider, current_time: int | None = None) -> None: + assert self._async_thread is not None, "Async thread not initialized" + + logger.info(f"Cleaning up cache items (max {self.expiration_max_items})") + if 0 < self.expiration_secs: + if current_time is None: + current_time = int(time()) + + async def expire_kv() -> None: + assert self._kv is not None, "Key value store not initialized" + i = 0 + async for item in self._kv.iterate_keys(): + value = await self._kv.get_value(item.key) + try: + gzip_time = read_gzip_time(value) + except Exception as e: + logger.warning(f"Malformed cache item {item.key}: {e}") + await self._kv.set_value(item.key, None) + else: + if self.expiration_secs < current_time - gzip_time: + logger.debug(f"Expired cache item {item.key}") + await self._kv.set_value(item.key, None) + else: + logger.debug(f"Valid cache item {item.key}") + if i == self.expiration_max_items: + break + i += 1 + + self._async_thread.run_coro(expire_kv()) + + logger.debug("Closing cache storage") + try: + self._async_thread.close() + except KeyboardInterrupt: + logger.warning("Shutdown interrupted by KeyboardInterrupt!") + except Exception: + logger.exception("Exception occurred while shutting down cache storage") + finally: + logger.debug("Cache storage closed") + + def retrieve_response( + self, spider: Spider, request: Request, current_time: int | None = None + ) -> Response | None: + assert self._async_thread is not None, "Async thread not initialized" + assert self._kv is not None, "Key value store not initialized" + assert self._fingerprinter is not None, "Request fingerprinter not initialized" + + key = self._fingerprinter.fingerprint(request).hex() + value = self._async_thread.run_coro(self._kv.get_value(key)) + + if value is None: + logger.debug("Cache miss", extra={"request": request}) + return None + + if current_time is None: + current_time = int(time()) + if 0 < self.expiration_secs < current_time - read_gzip_time(value): + logger.debug("Cache expired", extra={"request": request}) + return None + + data = from_gzip(value) + url = data["url"] + status = data["status"] + headers = Headers(data["headers"]) + body = data["body"] + respcls = responsetypes.from_args(headers=headers, url=url, body=body) + + logger.debug("Cache hit", extra={"request": request}) + return respcls(url=url, headers=headers, status=status, body=body) + + def store_response( + self, spider: Spider, request: Request, response: Response + ) -> None: + assert self._async_thread is not None, "Async thread not initialized" + assert self._kv is not None, "Key value store not initialized" + assert self._fingerprinter is not None, "Request fingerprinter not initialized" + + key = self._fingerprinter.fingerprint(request).hex() + data = { + "status": response.status, + "url": response.url, + "headers": dict(response.headers), + "body": response.body, + } + value = to_gzip(data) + self._async_thread.run_coro(self._kv.set_value(key, value)) + + +def to_gzip(data: dict, mtime: int | None = None) -> bytes: + with io.BytesIO() as byte_stream: + with gzip.GzipFile(fileobj=byte_stream, mode="wb", mtime=mtime) as gzip_file: + pickle.dump(data, gzip_file, protocol=4) + return byte_stream.getvalue() + + +def 
from_gzip(gzip_bytes: bytes) -> dict: + with io.BytesIO(gzip_bytes) as byte_stream: + with gzip.GzipFile(fileobj=byte_stream, mode="rb") as gzip_file: + return pickle.load(gzip_file) + + +def read_gzip_time(gzip_bytes: bytes) -> int: + header = gzip_bytes[:10] + header_components = struct.unpack(" str: @click.group() @click.option("-d", "--debug", default=False, is_flag=True) def main(debug: bool = False): - _MAIN_LOGGER_NAMES.append("jg.plucker") setup_logging() - logging.getLogger().setLevel(logging.DEBUG if debug else logging.INFO) + level = logging.DEBUG if debug else logging.INFO + logging.getLogger().setLevel(level) + logger.setLevel(level) for name in ["asyncio", "filelock", "crawlee"]: logging.getLogger(name).setLevel(logging.WARNING) diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index 06fce49..8d666ff 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -136,10 +136,7 @@ def parse(self, response: Response) -> Generator[Request, None, None]: next_page_css = f'.Pagination__link[href*="page={page + 1}"]::attr(href)' if next_page_link := response.css(next_page_css).get(): yield response.follow( - next_page_link, - callback=self.parse, - cb_kwargs={"page": page + 1}, - meta={"impersonate": "edge101"}, + next_page_link, callback=self.parse, meta={"impersonate": "edge101"} ) else: self.logger.debug(f"No next page found for {response.url}") diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 66ec29f..b57d6aa 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -1,29 +1,19 @@ import asyncio import logging import os -import pickle from pathlib import Path -from time import time from typing import Any, Coroutine, Generator, Type -from apify import Actor, Configuration -from apify.apify_storage_client import ApifyStorageClient +from apify import Actor from apify.scrapy import run_scrapy_actor -from apify.scrapy._async_thread import AsyncThread from apify.scrapy.utils import apply_apify_settings -from apify.storages import KeyValueStore -from scrapy import Item, Request, Spider +from scrapy import Item, Spider from scrapy.crawler import Crawler, CrawlerRunner -from scrapy.http.headers import Headers -from scrapy.http.response import Response -from scrapy.responsetypes import responsetypes from scrapy.settings import BaseSettings from scrapy.spiderloader import SpiderLoader as BaseSpiderLoader from scrapy.statscollectors import StatsT from scrapy.utils.defer import deferred_to_future from scrapy.utils.project import get_project_settings -from scrapy.utils.reactor import is_asyncio_reactor_installed -from scrapy.utils.request import RequestFingerprinterProtocol from twisted.internet import asyncioreactor @@ -65,7 +55,7 @@ async def run_as_actor( logger.debug(f"Proxy config: {proxy_config!r}") settings = apply_apify_settings(proxy_config=proxy_config) - settings.set("HTTPCACHE_STORAGE", "jg.plucker.scrapers.CacheStorage") + settings.set("HTTPCACHE_STORAGE", "jg.plucker.cache.CacheStorage") settings.set("SPIDER_PARAMS", spider_params) logger.debug(f"Spider params: {spider_params!r}") @@ -163,98 +153,3 @@ def evaluate_stats(stats: StatsT, min_items: int): raise StatsError(f"Scraping finished with reason {reason!r}") if item_count := stats.get("item_dropped_reasons_count/MissingRequiredFields"): raise StatsError(f"Items missing required fields: {item_count}") - - -class CacheStorage: - def __init__(self, settings: BaseSettings): - if not is_asyncio_reactor_installed(): - raise ValueError( - 
f"{self.__class__.__qualname__} requires the asyncio Twisted reactor. " - "Make sure you have it configured in the TWISTED_REACTOR setting. See the asyncio " - "documentation of Scrapy for more information.", - ) - self.expiration_secs: int = settings.getint("HTTPCACHE_EXPIRATION_SECS") - self.spider: Spider | None = None - self._kv: KeyValueStore | None = None - self._fingerprinter: RequestFingerprinterProtocol | None = None - - logger.debug("Starting background thread for cache storage's event loop") - self._async_thread = AsyncThread() - - def open_spider(self, spider: Spider) -> None: - logger.debug("Using Apify key value cache storage", extra={"spider": spider}) - self.spider = spider - self._fingerprinter = spider.crawler.request_fingerprinter - kv_name = f"httpcache-{spider.name}" - - async def open_kv() -> KeyValueStore: - config = Configuration.get_global_configuration() - if config.is_at_home: - storage_client = ApifyStorageClient.from_config(config) - return await KeyValueStore.open( - name=kv_name, storage_client=storage_client - ) - return await KeyValueStore.open(name=kv_name) - - logger.debug(f"Opening cache storage's {kv_name!r} key value store") - self._kv = self._async_thread.run_coro(open_kv()) - - def close_spider(self, spider: Spider) -> None: - logger.debug("Closing cache storage...") - try: - self._async_thread.close() - except KeyboardInterrupt: - logger.warning("Shutdown interrupted by KeyboardInterrupt!") - except Exception: - logger.exception("Exception occurred while shutting down cache storage") - finally: - logger.debug("Cache storage closed") - - def retrieve_response(self, spider: Spider, request: Request) -> Response | None: - assert self._kv is not None, "Key value store not initialized" - assert self._fingerprinter is not None, "Request fingerprinter not initialized" - - key = self._fingerprinter.fingerprint(request).hex() - - seconds = self._async_thread.run_coro(self._kv.get_value(f"{key}_time")) - if seconds is None: - logger.debug("Cache miss", extra={"request": request}) - return None - - if 0 < self.expiration_secs < time() - seconds: - logger.debug("Cache expired", extra={"request": request}) - self._async_thread.run_coro(self._kv.set_value(f"{key}_data", None)) - self._async_thread.run_coro(self._kv.set_value(f"{key}_time", None)) - return None - - value = self._async_thread.run_coro(self._kv.get_value(f"{key}_data")) - if value is None: - logger.debug("Cache miss", extra={"request": request}) - return None - - data = pickle.loads(value) - url = data["url"] - status = data["status"] - headers = Headers(data["headers"]) - body = data["body"] - respcls = responsetypes.from_args(headers=headers, url=url, body=body) - - logger.debug("Cache hit", extra={"request": request}) - return respcls(url=url, headers=headers, status=status, body=body) - - def store_response( - self, spider: Spider, request: Request, response: Response - ) -> None: - assert self._kv is not None, "Key value store not initialized" - assert self._fingerprinter is not None, "Request fingerprinter not initialized" - - key = self._fingerprinter.fingerprint(request).hex() - data = { - "status": response.status, - "url": response.url, - "headers": dict(response.headers), - "body": response.body, - } - value = pickle.dumps(data, protocol=4) - self._async_thread.run_coro(self._kv.set_value(f"{key}_data", value)) - self._async_thread.run_coro(self._kv.set_value(f"{key}_time", time())) diff --git a/jg/plucker/settings.py b/jg/plucker/settings.py index e764fc4..ded8f88 100644 --- 
a/jg/plucker/settings.py +++ b/jg/plucker/settings.py @@ -22,7 +22,7 @@ RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 403, 408, 429, 999] -# HTTPCACHE_ENABLED = True +HTTPCACHE_ENABLED = True HTTPCACHE_EXPIRATION_SECS = 43200 # 12 hours diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..8f15980 --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,36 @@ +from time import time + +from jg.plucker.cache import from_gzip, read_gzip_time, to_gzip + + +FIXTURE_BYTES = ( + b"\x1f\x8b\x08\x00\x00\x00\x00\x00\x02\xffk`\x99*\xcc\x00\x01\xb5SzX\xf2\x12s" + b"S\xa7\xf4\xb0:\xe6d&\xa7N)\xd6\x03\x00\x1c\xe8U\x9c\x1e\x00\x00\x00" +) + + +def test_gzip(): + assert from_gzip(to_gzip({"name": "Alice"})) == {"name": "Alice"} + + +def test_to_gzip(): + data_bytes = to_gzip({"name": "Alice"}, mtime=0) + + assert data_bytes == FIXTURE_BYTES + + +def test_from_gzip(): + data_dict = from_gzip(FIXTURE_BYTES) + + assert data_dict == {"name": "Alice"} + + +def test_read_gzip_time(): + assert read_gzip_time(FIXTURE_BYTES) == 0 + + +def test_read_gzip_time_non_zero(): + current_time = int(time()) + data_bytes = to_gzip({"name": "Alice"}, mtime=current_time) + + assert read_gzip_time(data_bytes) == current_time From bc451968f18bd92154957804741d589ae14fa091 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 13 Feb 2025 17:52:49 +0100 Subject: [PATCH 51/51] final fixes --- jg/plucker/cli.py | 6 +----- jg/plucker/jobs_jobscz/spider.py | 7 +++---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/jg/plucker/cli.py b/jg/plucker/cli.py index cfd8067..49efbd8 100644 --- a/jg/plucker/cli.py +++ b/jg/plucker/cli.py @@ -8,11 +8,7 @@ from typing import IO, Callable, Generator, Type import click -from apify.scrapy.logging_config import ( - _ALL_LOGGER_NAMES, - _MAIN_LOGGER_NAMES, - setup_logging, -) +from apify.scrapy.logging_config import setup_logging from apify_client import ApifyClient from apify_shared.consts import ActorJobStatus, ActorSourceType from pydantic import BaseModel diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index 8d666ff..aaf509f 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -87,10 +87,9 @@ class Spider(BaseSpider): name = "jobs-jobscz" start_urls = [ - # "https://www.jobs.cz/prace/programator/", - # "https://www.jobs.cz/prace/tester/", - # "https://www.jobs.cz/prace/datovy-analytik/", - "https://www.jobs.cz/prace/kuchar/", + "https://www.jobs.cz/prace/programator/", + "https://www.jobs.cz/prace/tester/", + "https://www.jobs.cz/prace/datovy-analytik/", ] employment_types_labels = ["Typ pracovního poměru", "Employment form"]