diff --git a/jg/plucker/cache.py b/jg/plucker/cache.py
new file mode 100644
index 0000000..0dfd735
--- /dev/null
+++ b/jg/plucker/cache.py
@@ -0,0 +1,156 @@
+import gzip
+import io
+import logging
+import pickle
+import struct
+from time import time
+
+from apify import Configuration
+from apify.apify_storage_client import ApifyStorageClient
+from apify.scrapy._async_thread import AsyncThread
+from apify.storages import KeyValueStore
+from scrapy import Request, Spider
+from scrapy.http.headers import Headers
+from scrapy.http.response import Response
+from scrapy.responsetypes import responsetypes
+from scrapy.settings import BaseSettings
+from scrapy.utils.request import RequestFingerprinterProtocol
+
+
+logger = logging.getLogger("jg.plucker.cache")
+
+
+class CacheStorage:
+    def __init__(self, settings: BaseSettings):
+        self.expiration_max_items = 100
+        self.expiration_secs: int = settings.getint("HTTPCACHE_EXPIRATION_SECS")
+        self.spider: Spider | None = None
+        self._kv: KeyValueStore | None = None
+        self._fingerprinter: RequestFingerprinterProtocol | None = None
+        self._async_thread: AsyncThread | None = None
+
+    def open_spider(self, spider: Spider) -> None:
+        logger.debug("Using Apify key value cache storage", extra={"spider": spider})
+        self.spider = spider
+        self._fingerprinter = spider.crawler.request_fingerprinter
+        kv_name = f"httpcache-{spider.name}"
+
+        async def open_kv() -> KeyValueStore:
+            config = Configuration.get_global_configuration()
+            if config.is_at_home:
+                storage_client = ApifyStorageClient.from_config(config)
+                return await KeyValueStore.open(
+                    name=kv_name, storage_client=storage_client
+                )
+            return await KeyValueStore.open(name=kv_name)
+
+        logger.debug("Starting background thread for cache storage's event loop")
+        self._async_thread = AsyncThread()
+        logger.debug(f"Opening cache storage's {kv_name!r} key value store")
+        self._kv = self._async_thread.run_coro(open_kv())
+
+    def close_spider(self, spider: Spider, current_time: int | None = None) -> None:
+        assert self._async_thread is not None, "Async thread not initialized"
+
+        logger.info(f"Cleaning up cache items (max {self.expiration_max_items})")
+        if 0 < self.expiration_secs:
+            if current_time is None:
+                current_time = int(time())
+
+            async def expire_kv() -> None:
+                assert self._kv is not None, "Key value store not initialized"
+                i = 0
+                async for item in self._kv.iterate_keys():
+                    value = await self._kv.get_value(item.key)
+                    try:
+                        gzip_time = read_gzip_time(value)
+                    except Exception as e:
+                        logger.warning(f"Malformed cache item {item.key}: {e}")
+                        await self._kv.set_value(item.key, None)
+                    else:
+                        if self.expiration_secs < current_time - gzip_time:
+                            logger.debug(f"Expired cache item {item.key}")
+                            await self._kv.set_value(item.key, None)
+                        else:
+                            logger.debug(f"Valid cache item {item.key}")
+                    if i == self.expiration_max_items:
+                        break
+                    i += 1
+
+            self._async_thread.run_coro(expire_kv())
+
+        logger.debug("Closing cache storage")
+        try:
+            self._async_thread.close()
+        except KeyboardInterrupt:
+            logger.warning("Shutdown interrupted by KeyboardInterrupt!")
+        except Exception:
+            logger.exception("Exception occurred while shutting down cache storage")
+        finally:
+            logger.debug("Cache storage closed")
+
+    def retrieve_response(
+        self, spider: Spider, request: Request, current_time: int | None = None
+    ) -> Response | None:
+        assert self._async_thread is not None, "Async thread not initialized"
+        assert self._kv is not None, "Key value store not initialized"
"Request fingerprinter not initialized" + + key = self._fingerprinter.fingerprint(request).hex() + value = self._async_thread.run_coro(self._kv.get_value(key)) + + if value is None: + logger.debug("Cache miss", extra={"request": request}) + return None + + if current_time is None: + current_time = int(time()) + if 0 < self.expiration_secs < current_time - read_gzip_time(value): + logger.debug("Cache expired", extra={"request": request}) + return None + + data = from_gzip(value) + url = data["url"] + status = data["status"] + headers = Headers(data["headers"]) + body = data["body"] + respcls = responsetypes.from_args(headers=headers, url=url, body=body) + + logger.debug("Cache hit", extra={"request": request}) + return respcls(url=url, headers=headers, status=status, body=body) + + def store_response( + self, spider: Spider, request: Request, response: Response + ) -> None: + assert self._async_thread is not None, "Async thread not initialized" + assert self._kv is not None, "Key value store not initialized" + assert self._fingerprinter is not None, "Request fingerprinter not initialized" + + key = self._fingerprinter.fingerprint(request).hex() + data = { + "status": response.status, + "url": response.url, + "headers": dict(response.headers), + "body": response.body, + } + value = to_gzip(data) + self._async_thread.run_coro(self._kv.set_value(key, value)) + + +def to_gzip(data: dict, mtime: int | None = None) -> bytes: + with io.BytesIO() as byte_stream: + with gzip.GzipFile(fileobj=byte_stream, mode="wb", mtime=mtime) as gzip_file: + pickle.dump(data, gzip_file, protocol=4) + return byte_stream.getvalue() + + +def from_gzip(gzip_bytes: bytes) -> dict: + with io.BytesIO(gzip_bytes) as byte_stream: + with gzip.GzipFile(fileobj=byte_stream, mode="rb") as gzip_file: + return pickle.load(gzip_file) + + +def read_gzip_time(gzip_bytes: bytes) -> int: + header = gzip_bytes[:10] + header_components = struct.unpack(" str: @click.group() @click.option("-d", "--debug", default=False, is_flag=True) def main(debug: bool = False): - pass # --debug is processed in configure_logging() + setup_logging() + level = logging.DEBUG if debug else logging.INFO + logging.getLogger().setLevel(level) + logger.setLevel(level) + for name in ["asyncio", "filelock", "crawlee"]: + logging.getLogger(name).setLevel(logging.WARNING) @main.command(context_settings={"ignore_unknown_options": True}) @@ -85,7 +80,6 @@ def crawl( logger.info("Reading spider params from stdin") spider_params = json.load(spider_params_f) - configure_async() try: if apify: logger.info(f"Crawling as Apify actor {actor_path}") @@ -94,10 +88,11 @@ def crawl( raise click.BadParameter( f"Actor {actor_path} not found! 
Valid actors: {actors}" ) - asyncio.run(run_actor(settings, spider_class, spider_params)) + run = run_as_actor(spider_class, spider_params) else: logger.info(f"Crawling as Scrapy spider {spider_name!r}") - run_spider(settings, spider_class, spider_params) + run = run_as_spider(spider_class, spider_params) + start_reactor(run) except StatsError as e: logger.error(e) raise click.Abort() diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py index ff16155..aaf509f 100644 --- a/jg/plucker/jobs_jobscz/spider.py +++ b/jg/plucker/jobs_jobscz/spider.py @@ -1,23 +1,24 @@ import hashlib import json -import logging import re import uuid from datetime import date, datetime from functools import lru_cache +from logging import Logger from pathlib import Path from typing import Any, Generator, Iterable, cast from urllib.parse import urljoin, urlparse from itemloaders.processors import Compose, Identity, MapCompose, TakeFirst from scrapy import Request, Spider as BaseSpider +from scrapy.http.response import Response from scrapy.http.response.html import HtmlResponse from scrapy.http.response.text import TextResponse from scrapy.loader import ItemLoader from jg.plucker.items import Job from jg.plucker.processors import first, split -from jg.plucker.url_params import get_params, strip_params +from jg.plucker.url_params import get_param, get_params, strip_params MULTIPLE_LOCATIONS_RE = re.compile( @@ -85,28 +86,26 @@ class Spider(BaseSpider): name = "jobs-jobscz" - custom_settings = { - "CONCURRENT_REQUESTS_PER_DOMAIN": 2, - "DOWNLOAD_DELAY": 0.5, - "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.5, - } - start_urls = [ "https://www.jobs.cz/prace/programator/", "https://www.jobs.cz/prace/tester/", "https://www.jobs.cz/prace/datovy-analytik/", ] - employment_types_labels = [ - "Typ pracovního poměru", - "Employment form", - ] + employment_types_labels = ["Typ pracovního poměru", "Employment form"] + + def logger_trk(self, trk: str) -> Logger: + return self.logger.logger.getChild(trk) + + def parse(self, response: Response) -> Generator[Request, None, None]: + response = cast(HtmlResponse, response) + page = get_page(response.url) + self.logger.debug(f"Parsing listing {response.url} (page: {page})") - def parse(self, response: HtmlResponse) -> Generator[Request, None, None]: card_xpath = "//article[contains(@class, 'SearchResultCard')]" for n, card in enumerate(response.xpath(card_xpath), start=1): url = cast(str, card.css('a[data-link="jd-detail"]::attr(href)').get()) - track_id = get_track_id(url) + trk = get_trk(url) # logging track ID for each job loader = Loader(item=Job(), response=response) card_loader = loader.nested_xpath(f"{card_xpath}[{n}]") @@ -124,34 +123,37 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]: card_loader.add_value("source_urls", url) item = loader.load_item() - self.track_logger(track_id).debug(f"Parsing card for {url}") + self.logger_trk(trk).debug(f"Parsing card for {url}") yield response.follow( url, callback=self.parse_job, - cb_kwargs=dict(item=item, track_id=track_id), + cb_kwargs=dict(item=item, trk=trk), meta={"impersonate": "edge101"}, ) - urls = [ - response.urljoin(relative_url) - for relative_url in response.css(".Pagination__link::attr(href)").getall() - if "page=" in relative_url - ] - yield from response.follow_all( - urls, callback=self.parse, meta={"impersonate": "edge101"} - ) + self.logger.debug(f"Found {n} job cards on {response.url}") + + next_page_css = f'.Pagination__link[href*="page={page + 1}"]::attr(href)' + if 
diff --git a/jg/plucker/jobs_jobscz/spider.py b/jg/plucker/jobs_jobscz/spider.py
index ff16155..aaf509f 100644
--- a/jg/plucker/jobs_jobscz/spider.py
+++ b/jg/plucker/jobs_jobscz/spider.py
@@ -1,23 +1,24 @@
 import hashlib
 import json
-import logging
 import re
 import uuid
 from datetime import date, datetime
 from functools import lru_cache
+from logging import Logger
 from pathlib import Path
 from typing import Any, Generator, Iterable, cast
 from urllib.parse import urljoin, urlparse
 
 from itemloaders.processors import Compose, Identity, MapCompose, TakeFirst
 from scrapy import Request, Spider as BaseSpider
+from scrapy.http.response import Response
 from scrapy.http.response.html import HtmlResponse
 from scrapy.http.response.text import TextResponse
 from scrapy.loader import ItemLoader
 
 from jg.plucker.items import Job
 from jg.plucker.processors import first, split
-from jg.plucker.url_params import get_params, strip_params
+from jg.plucker.url_params import get_param, get_params, strip_params
 
 
 MULTIPLE_LOCATIONS_RE = re.compile(
@@ -85,28 +86,26 @@
 class Spider(BaseSpider):
     name = "jobs-jobscz"
 
-    custom_settings = {
-        "CONCURRENT_REQUESTS_PER_DOMAIN": 2,
-        "DOWNLOAD_DELAY": 0.5,
-        "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.5,
-    }
-
     start_urls = [
         "https://www.jobs.cz/prace/programator/",
         "https://www.jobs.cz/prace/tester/",
         "https://www.jobs.cz/prace/datovy-analytik/",
     ]
 
-    employment_types_labels = [
-        "Typ pracovního poměru",
-        "Employment form",
-    ]
+    employment_types_labels = ["Typ pracovního poměru", "Employment form"]
+
+    def logger_trk(self, trk: str) -> Logger:
+        return self.logger.logger.getChild(trk)
+
+    def parse(self, response: Response) -> Generator[Request, None, None]:
+        response = cast(HtmlResponse, response)
+        page = get_page(response.url)
+        self.logger.debug(f"Parsing listing {response.url} (page: {page})")
 
-    def parse(self, response: HtmlResponse) -> Generator[Request, None, None]:
         card_xpath = "//article[contains(@class, 'SearchResultCard')]"
         for n, card in enumerate(response.xpath(card_xpath), start=1):
             url = cast(str, card.css('a[data-link="jd-detail"]::attr(href)').get())
-            track_id = get_track_id(url)
+            trk = get_trk(url)  # logging track ID for each job
 
             loader = Loader(item=Job(), response=response)
             card_loader = loader.nested_xpath(f"{card_xpath}[{n}]")
@@ -124,34 +123,37 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]:
             card_loader.add_value("source_urls", url)
             item = loader.load_item()
 
-            self.track_logger(track_id).debug(f"Parsing card for {url}")
+            self.logger_trk(trk).debug(f"Parsing card for {url}")
             yield response.follow(
                 url,
                 callback=self.parse_job,
-                cb_kwargs=dict(item=item, track_id=track_id),
+                cb_kwargs=dict(item=item, trk=trk),
                 meta={"impersonate": "edge101"},
             )
 
-        urls = [
-            response.urljoin(relative_url)
-            for relative_url in response.css(".Pagination__link::attr(href)").getall()
-            if "page=" in relative_url
-        ]
-        yield from response.follow_all(
-            urls, callback=self.parse, meta={"impersonate": "edge101"}
-        )
+        self.logger.debug(f"Found {n} job cards on {response.url}")
+
+        next_page_css = f'.Pagination__link[href*="page={page + 1}"]::attr(href)'
+        if next_page_link := response.css(next_page_css).get():
+            yield response.follow(
+                next_page_link, callback=self.parse, meta={"impersonate": "edge101"}
+            )
+        else:
+            self.logger.debug(f"No next page found for {response.url}")
 
     def parse_job(
-        self, response: HtmlResponse, item: Job, track_id: str
+        self, response: Response, item: Job, trk: str
     ) -> Generator[Job | Request, None, None]:
-        self.track_logger(track_id).debug("Parsing job page")
+        response = cast(HtmlResponse, response)
+        self.logger_trk(trk).debug(f"Parsing job page {response.url}")
+
         loader = Loader(item=item, response=response)
         loader.add_value("url", response.url)
         loader.add_value("source_urls", response.url)
 
         if "www.jobs.cz" not in response.url:
-            yield from self.parse_job_widget_data(response, item, track_id)
+            yield from self.parse_job_widget_data(response, item, trk)
         else:
-            self.track_logger(track_id).debug("Parsing as standard job page")
+            self.logger_trk(trk).debug("Parsing as standard job page")
             for label in self.employment_types_labels:
                 loader.add_xpath(
                     "employment_types",
@@ -161,7 +163,7 @@ def parse_job(
         loader.add_css("description_html", '[data-jobad="body"]')
 
         if response.css('[class*="CompanyProfileNavigation"]').get():
-            self.track_logger(track_id).debug("Parsing as company job page")
+            self.logger_trk(trk).debug("Parsing as company job page")
             loader.add_css(
                 "company_logo_urls",
                 ".CompanyProfileNavigation__logo img::attr(src)",
@@ -176,15 +178,13 @@ def parse_job(
         yield loader.load_item()
 
     def parse_job_widget_data(
-        self, response: HtmlResponse, item: Job, track_id: str
+        self, response: HtmlResponse, item: Job, trk: str
     ) -> Generator[Request, None, None]:
         try:
-            self.track_logger(track_id).debug("Looking for widget data in the HTML")
+            self.logger_trk(trk).debug("Looking for widget data in the HTML")
             widget_data = json.loads(response.css("script::text").re(WIDGET_DATA_RE)[0])
         except IndexError:
-            self.track_logger(track_id).debug(
-                "Looking for widget data in attached JavaScript"
-            )
+            self.logger_trk(trk).debug("Looking for widget data in attached JavaScript")
             script_urls = sorted(
                 map(
                     response.urljoin,
@@ -194,7 +194,7 @@ def parse_job_widget_data(
                 ),
                 key=get_script_relevance,
             )
-            self.track_logger(track_id).debug(f"Script URLs: {script_urls!r}")
+            self.logger_trk(trk).debug(f"Script URLs: {script_urls!r}")
             yield response.follow(
                 script_urls.pop(0),
                 callback=self.parse_job_widget_script,
@@ -202,7 +202,7 @@
                     item=item,
                     url=response.url,
                     script_urls=script_urls,
-                    track_id=track_id,
+                    trk=trk,
                 ),
                 meta={"impersonate": "edge101"},
             )
@@ -213,17 +213,19 @@
                 widget_host=widget_data["host"],
                 widget_api_key=widget_data["apiKey"],
                 widget_id=widget_data["widgetId"],
-                track_id=track_id,
+                trk=trk,
             )
 
     def parse_job_widget_script(
         self,
-        script_response: TextResponse,
+        script_response: Response,
         url: str,
         item: Job,
         script_urls: list[str],
-        track_id: str,
+        trk: str,
     ) -> Generator[Request, None, None]:
+        script_response = cast(TextResponse, script_response)
+
         if data := parse_widget_script_json(script_response.text):
             widget_name = select_widget(list(data["widgets"].keys()))
             widget_data = data["widgets"][widget_name]
@@ -233,7 +235,7 @@
                 widget_host=data["host"],
                 widget_api_key=widget_data["apiKey"],
                 widget_id=widget_data["id"],
-                track_id=track_id,
+                trk=trk,
             )
         elif mess := parse_widget_script_mess(script_response.text):
             yield from self.parse_job_widget(
@@ -242,13 +244,13 @@
                 widget_host=get_widget_host(url),
                 widget_api_key=mess["widgetApiKey"],
                 widget_id=mess["widgetId"],
-                track_id=track_id,
+                trk=trk,
             )
         elif chunk_names := parse_react_chunk_names(script_response.text):
             chunk_urls = [
                 url.replace("react.min.js", chunk_name) for chunk_name in chunk_names
             ]
-            self.track_logger(track_id).debug(f"Chunk URLs: {chunk_urls!r}")
+            self.logger_trk(trk).debug(f"Chunk URLs: {chunk_urls!r}")
             yield Request(
                 chunk_urls.pop(0),
                 callback=self.parse_job_widget_script,
@@ -256,12 +258,12 @@
                     item=item,
                     url=url,
                     script_urls=chunk_urls,
-                    track_id=track_id,
+                    trk=trk,
                 ),
                 meta={"impersonate": "edge101"},
             )
         elif script_urls:
-            self.track_logger(track_id).debug(f"Script URLs: {script_urls!r}")
+            self.logger_trk(trk).debug(f"Script URLs: {script_urls!r}")
             yield Request(
                 script_urls.pop(0),
                 callback=self.parse_job_widget_script,
@@ -269,7 +271,7 @@
                     item=item,
                     url=url,
                     script_urls=script_urls,
-                    track_id=track_id,
+                    trk=trk,
                 ),
                 meta={"impersonate": "edge101"},
             )
@@ -283,14 +285,14 @@
         widget_host: str,
         widget_api_key: str,
         widget_id: str,
-        track_id: str,
+        trk: str,
     ) -> Generator[Request, None, None]:
         loader = Loader(item=item)
         loader.add_value("url", url)
         loader.add_value("company_url", f"https://{widget_host}")
         loader.add_value("source_urls", url)
 
-        self.track_logger(track_id).debug("Requesting data from job widget API")
+        self.logger_trk(trk).debug("Requesting data from job widget API")
         params = get_params(url)
         yield Request(
             "https://api.capybara.lmc.cz/api/graphql/widget",
@@ -322,14 +324,16 @@
                 )
             ),
             callback=self.parse_job_widget_api,
-            cb_kwargs=dict(item=loader.load_item(), track_id=track_id),
+            cb_kwargs=dict(item=loader.load_item(), trk=trk),
             meta={"impersonate": "edge101"},
         )
 
     def parse_job_widget_api(
-        self, response: TextResponse, item: Job, track_id: str
+        self, response: Response, item: Job, trk: str
     ) -> Generator[Job, None, None]:
-        self.track_logger(track_id).debug("Parsing job widget API response")
+        response = cast(TextResponse, response)
+        self.logger_trk(trk).debug("Parsing job widget API response")
+
         try:
             payload = cast(dict, response.json())
         except json.JSONDecodeError as e:
@@ -352,13 +356,14 @@
 
         yield loader.load_item()
 
-    def track_logger(self, track_id: str) -> logging.LoggerAdapter:
-        logger = logging.getLogger(f"{self.name}.{track_id}")
-        return logging.LoggerAdapter(logger, {"spider": self, "track_id": track_id})
+def get_page(url: str) -> int:
+    if page := get_param(url, "page"):
+        return int(page)
+    return 1
 
-@lru_cache
-def get_track_id(seed: str) -> str:
+
+def get_trk(seed: str) -> str:
     return hashlib.sha1(seed.encode()).hexdigest()[:10]
 
@@ -376,7 +381,9 @@ def select_widget(names: list[str]) -> str:
 
 def parse_widget_script_json(text: str) -> dict[str, Any] | None:
     for match in re.finditer(WIDGET_DATA_SCRIPT_JSON_RE, text):
-        data_text = re.sub(r"\'", r"\\'", match.group("data"))
+        data_text = match.group("data")
+        data_text = re.sub(r"\'", r"\\'", data_text)
+        data_text = re.sub(r'\\\\"', r"\"", data_text)
         data = json.loads(data_text)
         if "widgets" in data:
             return data
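Two things changed in the spider's plumbing: pagination now follows a single `page={page + 1}` link per listing instead of every pagination link at once, and the per-job "track" logger is a plain child logger keyed by a short SHA-1 prefix. A sketch of the helpers involved; `get_param` is approximated with `urllib.parse` here, while the real one lives in `jg.plucker.url_params`:

```python
import hashlib
from urllib.parse import parse_qs, urlparse


def get_param(url: str, name: str) -> str | None:
    # approximation of jg.plucker.url_params.get_param
    values = parse_qs(urlparse(url).query).get(name)
    return values[0] if values else None


def get_page(url: str) -> int:
    if page := get_param(url, "page"):
        return int(page)
    return 1


def get_trk(seed: str) -> str:
    # short, stable ID grouping all log records that belong to one job posting
    return hashlib.sha1(seed.encode()).hexdigest()[:10]


assert get_page("https://www.jobs.cz/prace/programator/") == 1
assert get_page("https://www.jobs.cz/prace/programator/?page=3") == 3
assert len(get_trk("https://example.com/job/123")) == 10
assert get_trk("x") == get_trk("x")  # deterministic, so @lru_cache is unnecessary
```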
diff --git a/jg/plucker/loggers.py b/jg/plucker/loggers.py
deleted file mode 100644
index 5bd2478..0000000
--- a/jg/plucker/loggers.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import logging
-from functools import wraps
-from typing import Callable
-
-from apify.log import ActorLogFormatter
-from scrapy.settings import Settings
-from scrapy.utils import log as scrapy_logging
-
-
-CUSTOM_LOGGER_NAMES = ["jg.plucker", "apify", "apify_client"]
-
-SCRAPY_LOGGER_NAMES = ["filelock", "hpack", "httpx", "scrapy", "twisted"]
-
-ALL_LOGGER_NAMES = CUSTOM_LOGGER_NAMES + SCRAPY_LOGGER_NAMES
-
-
-def configure_logging(settings: Settings, argv: list[str]):
-    logging_level = get_logging_level(settings, argv)
-
-    handler = logging.StreamHandler()
-    handler.setFormatter(ActorLogFormatter(include_logger_name=True))
-
-    # Local loggers have to be set up here and in the `reconfigure_scrapy_logging` as well to be able
-    # to use them both from CLI and Scrapy components.
-    for logger_name in CUSTOM_LOGGER_NAMES:
-        configure_logger(logger_name, logging_level, handler)
-
-    # We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
-    # call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
-    # `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
-    # like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
-    # otherwise we would lose some log messages.
-    scrapy_logging.configure_logging = reconfigure_scrapy_logging(
-        logging_level, handler
-    )(scrapy_logging.configure_logging)
-
-
-def reconfigure_scrapy_logging(
-    logging_level: str, *handlers: logging.StreamHandler
-) -> Callable:
-    def decorator(configure_logging: Callable) -> Callable:
-        @wraps(configure_logging)
-        def wrapper(*args, **kwargs):
-            # We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
-            # logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
-            # loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77
-            configure_logging(*args, **kwargs)
-
-            # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`
-            # property within spiders. See details in the Spider logger property:
-            # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.
-            configure_logger(None, logging_level, *handlers)
-
-            # We modify other loggers only by setting up their log level. A custom log handler is added
-            # only to the root logger to avoid duplicate log messages.
-            for logger_name in ALL_LOGGER_NAMES:
-                configure_logger(logger_name, logging_level)
-
-            # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless
-            # messages, especially when running on the platform.
- configure_logger("httpx", "WARNING") - - return wrapper - - return decorator - - -def configure_logger( - logger_name: str | None, log_level: str, *handlers: logging.StreamHandler -) -> None: - logger = logging.getLogger(logger_name) - logger.setLevel(log_level) - logger.handlers = [] - for handler in handlers: - logger.addHandler(handler) - - -def get_logging_level(settings: Settings, argv: list[str]) -> str: - if "--debug" in argv or "-d" in argv: - return "DEBUG" - return settings.get("LOG_LEVEL") diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py index 1db50cc..b57d6aa 100644 --- a/jg/plucker/scrapers.py +++ b/jg/plucker/scrapers.py @@ -1,66 +1,71 @@ import asyncio import logging -import pickle +import os from pathlib import Path -from threading import Thread from typing import Any, Coroutine, Generator, Type -import nest_asyncio -from apify import Actor, Configuration -from apify.apify_storage_client import ApifyStorageClient +from apify import Actor +from apify.scrapy import run_scrapy_actor from apify.scrapy.utils import apply_apify_settings -from apify.storages import KeyValueStore -from crawlee.storage_clients import MemoryStorageClient # pyright: ignore -from scrapy import Item, Request, Spider -from scrapy.crawler import CrawlerProcess -from scrapy.http.headers import Headers -from scrapy.http.response import Response -from scrapy.responsetypes import responsetypes -from scrapy.settings import BaseSettings, Settings +from scrapy import Item, Spider +from scrapy.crawler import Crawler, CrawlerRunner +from scrapy.settings import BaseSettings from scrapy.spiderloader import SpiderLoader as BaseSpiderLoader -from scrapy.statscollectors import StatsCollector -from scrapy.utils.reactor import is_asyncio_reactor_installed -from scrapy.utils.request import RequestFingerprinterProtocol +from scrapy.statscollectors import StatsT +from scrapy.utils.defer import deferred_to_future +from scrapy.utils.project import get_project_settings +from twisted.internet import asyncioreactor logger = logging.getLogger("jg.plucker") -def run_spider( - settings: Settings, spider_class: type[Spider], spider_params: dict[str, Any] | None +def start_reactor(coroutine: Coroutine) -> None: + asyncioreactor.install(asyncio.get_event_loop()) + run_scrapy_actor(coroutine) + + +async def run_as_spider( + spider_class: Type[Spider], spider_params: dict[str, Any] | None ) -> None: - logger.debug(f"Spider params: {spider_params!r}") + settings = get_project_settings() settings.set("SPIDER_PARAMS", spider_params) + logger.debug(f"Spider params: {spider_params!r}") - crawler_process = CrawlerProcess(settings, install_root_handler=False) - crawler_process.crawl(spider_class) - stats_collector = get_stats_collector(crawler_process) - crawler_process.start() + logger.info("Starting the spider") + runner = CrawlerRunner(settings) + crawler = runner.create_crawler(spider_class) - min_items = getattr(spider_class, "min_items", settings.getint("SPIDER_MIN_ITEMS")) - logger.debug(f"Min items required: {min_items}") + await deferred_to_future(runner.crawl(crawler)) - logger.debug(f"Custom evaluate_stats(): {hasattr(spider_class, 'evaluate_stats')}") - evaluate_stats_fn = getattr(spider_class, "evaluate_stats", evaluate_stats) - evaluate_stats_fn(stats_collector.get_stats(), min_items=min_items) + check_crawl_results(crawler) -async def run_actor( - settings: Settings, spider_class: Type[Spider], spider_params: dict[str, Any] | None -) -> None: - config = Configuration.get_global_configuration() - 
diff --git a/jg/plucker/scrapers.py b/jg/plucker/scrapers.py
index 1db50cc..b57d6aa 100644
--- a/jg/plucker/scrapers.py
+++ b/jg/plucker/scrapers.py
@@ -1,66 +1,71 @@
 import asyncio
 import logging
-import pickle
+import os
 from pathlib import Path
-from threading import Thread
 from typing import Any, Coroutine, Generator, Type
 
-import nest_asyncio
-from apify import Actor, Configuration
-from apify.apify_storage_client import ApifyStorageClient
+from apify import Actor
+from apify.scrapy import run_scrapy_actor
 from apify.scrapy.utils import apply_apify_settings
-from apify.storages import KeyValueStore
-from crawlee.storage_clients import MemoryStorageClient  # pyright: ignore
-from scrapy import Item, Request, Spider
-from scrapy.crawler import CrawlerProcess
-from scrapy.http.headers import Headers
-from scrapy.http.response import Response
-from scrapy.responsetypes import responsetypes
-from scrapy.settings import BaseSettings, Settings
+from scrapy import Item, Spider
+from scrapy.crawler import Crawler, CrawlerRunner
+from scrapy.settings import BaseSettings
 from scrapy.spiderloader import SpiderLoader as BaseSpiderLoader
-from scrapy.statscollectors import StatsCollector
-from scrapy.utils.reactor import is_asyncio_reactor_installed
-from scrapy.utils.request import RequestFingerprinterProtocol
+from scrapy.statscollectors import StatsT
+from scrapy.utils.defer import deferred_to_future
+from scrapy.utils.project import get_project_settings
+from twisted.internet import asyncioreactor
 
 
 logger = logging.getLogger("jg.plucker")
 
 
-def run_spider(
-    settings: Settings, spider_class: type[Spider], spider_params: dict[str, Any] | None
+def start_reactor(coroutine: Coroutine) -> None:
+    asyncioreactor.install(asyncio.get_event_loop())
+    run_scrapy_actor(coroutine)
+
+
+async def run_as_spider(
+    spider_class: Type[Spider], spider_params: dict[str, Any] | None
 ) -> None:
-    logger.debug(f"Spider params: {spider_params!r}")
+    settings = get_project_settings()
     settings.set("SPIDER_PARAMS", spider_params)
+    logger.debug(f"Spider params: {spider_params!r}")
 
-    crawler_process = CrawlerProcess(settings, install_root_handler=False)
-    crawler_process.crawl(spider_class)
-    stats_collector = get_stats_collector(crawler_process)
-    crawler_process.start()
+    logger.info("Starting the spider")
+    runner = CrawlerRunner(settings)
+    crawler = runner.create_crawler(spider_class)
 
-    min_items = getattr(spider_class, "min_items", settings.getint("SPIDER_MIN_ITEMS"))
-    logger.debug(f"Min items required: {min_items}")
+    await deferred_to_future(runner.crawl(crawler))
 
-    logger.debug(f"Custom evaluate_stats(): {hasattr(spider_class, 'evaluate_stats')}")
-    evaluate_stats_fn = getattr(spider_class, "evaluate_stats", evaluate_stats)
-    evaluate_stats_fn(stats_collector.get_stats(), min_items=min_items)
+    check_crawl_results(crawler)
 
 
-async def run_actor(
-    settings: Settings, spider_class: Type[Spider], spider_params: dict[str, Any] | None
-) -> None:
-    config = Configuration.get_global_configuration()
-    config.purge_on_start = True
+async def run_as_actor(
+    spider_class: Type[Spider], spider_params: dict[str, Any] | None
+):
+    # workaround https://github.com/apify/apify-sdk-python/issues/401
+    os.environ["SCRAPY_SETTINGS_MODULE"] = "jg.plucker.settings"
+
     async with Actor:
-        Actor.log.info(f"Spider {spider_class.name}")
-        spider_params = dict(spider_params or (await Actor.get_input()) or {})
-        proxy_config = spider_params.pop("proxyConfig", None)
-        settings = apply_apify_settings(settings=settings, proxy_config=proxy_config)
-        settings["HTTPCACHE_STORAGE"] = "jg.plucker.scrapers.KeyValueCacheStorage"
-        run_spider(settings, spider_class, spider_params)
+        logger.info(f"Starting actor for spider {spider_class.name}")
+
+        params = spider_params or (await Actor.get_input()) or {}
+        proxy_config = params.pop("proxyConfig", None)
+        logger.debug(f"Proxy config: {proxy_config!r}")
+        settings = apply_apify_settings(proxy_config=proxy_config)
+        settings.set("HTTPCACHE_STORAGE", "jg.plucker.cache.CacheStorage")
+        settings.set("SPIDER_PARAMS", params)
+        logger.debug(f"Spider params: {params!r}")
 
-def configure_async():
-    nest_asyncio.apply()
+        logger.info("Starting the spider")
+        runner = CrawlerRunner(settings)
+        crawler = runner.create_crawler(spider_class)
+
+        await deferred_to_future(runner.crawl(crawler))
+
+        check_crawl_results(crawler)
 
 
 def iter_actor_paths(path: Path | str) -> Generator[Path, None, None]:
@@ -113,17 +118,27 @@ def generate_schema(item_class: Type[Item]) -> dict:
     }
 
-def get_stats_collector(crawler_process: CrawlerProcess) -> StatsCollector:
-    assert len(crawler_process.crawlers) == 1, "Exactly one crawler expected"
-    crawler = crawler_process.crawlers.pop()
-    return crawler.stats
+def check_crawl_results(crawler: Crawler) -> None:
+    spider_class = crawler.spidercls
+
+    assert crawler.stats is not None, "Stats collector not initialized"
+    stats = crawler.stats.get_stats()
+    assert stats, "Stats not collected"
+
+    default_min_items = crawler.settings.getint("SPIDER_MIN_ITEMS")
+    min_items = getattr(spider_class, "min_items", default_min_items)
+    logger.debug(f"Min items required: {min_items}")
+
+    logger.debug(f"Custom evaluate_stats(): {hasattr(spider_class, 'evaluate_stats')}")
+    evaluate_stats_fn = getattr(spider_class, "evaluate_stats", evaluate_stats)
+    evaluate_stats_fn(stats, min_items)
 
 
 class StatsError(RuntimeError):
     pass
 
 
-def evaluate_stats(stats: dict[str, Any], min_items: int):
+def evaluate_stats(stats: StatsT, min_items: int):
     item_count = stats.get("item_scraped_count", 0)
     if exc_count := stats.get("spider_exceptions"):
         raise StatsError(f"Exceptions raised: {exc_count}")
@@ -138,82 +153,3 @@
         raise StatsError(f"Scraping finished with reason {reason!r}")
     if item_count := stats.get("item_dropped_reasons_count/MissingRequiredFields"):
         raise StatsError(f"Items missing required fields: {item_count}")
-
-
-class KeyValueCacheStorage:
-    # TODO implement expiration as in https://github.com/scrapy/scrapy/blob/a8d9746f562681ed5a268148ec959dcf0881d859/scrapy/extensions/httpcache.py#L250
-    # TODO implement gzipping
-
-    def __init__(self, settings: BaseSettings):
-        if not is_asyncio_reactor_installed():
-            raise ValueError(
-                f"{self.__class__.__qualname__} requires the asyncio Twisted reactor. "
-                "Make sure you have it configured in the TWISTED_REACTOR setting. See the asyncio "
-                "documentation of Scrapy for more information.",
-            )
-        self.spider: Spider | None = None
-        self._kv: KeyValueStore | None = None
-        self._fingerprinter: RequestFingerprinterProtocol | None = None
-
-    def open_spider(self, spider: Spider) -> None:
-        logger.debug("Using Apify key value cache storage", extra={"spider": spider})
-        self.spider = spider
-        self._fingerprinter = spider.crawler.request_fingerprinter
-
-        config = Configuration.get_global_configuration()
-        storage_client = (
-            ApifyStorageClient.from_config(config)
-            if config.is_at_home
-            else MemoryStorageClient.from_config(config)
-        )
-        self._kv = self._run_async(
-            KeyValueStore.open(configuration=config, storage_client=storage_client)
-        )
-
-    def close_spider(self, spider: Spider) -> None:
-        pass
-
-    def retrieve_response(self, spider: Spider, request: Request) -> Response | None:
-        assert self._kv is not None, "Key value store not initialized"
-        assert self._fingerprinter is not None, "Request fingerprinter not initialized"
-
-        key = self._fingerprinter.fingerprint(request).hex()
-        value = self._run_async(self._kv.get_value(key))
-        if value is None:
-            return None  # not cached
-
-        data = pickle.loads(value)
-        url = data["url"]
-        status = data["status"]
-        headers = Headers(data["headers"])
-        body = data["body"]
-        respcls = responsetypes.from_args(headers=headers, url=url, body=body)
-        return respcls(url=url, headers=headers, status=status, body=body)
-
-    def store_response(
-        self, spider: Spider, request: Request, response: Response
-    ) -> None:
-        assert self._kv is not None, "Key value store not initialized"
-        assert self._fingerprinter is not None, "Request fingerprinter not initialized"
-
-        key = self._fingerprinter.fingerprint(request).hex()
-        data = {
-            "status": response.status,
-            "url": response.url,
-            "headers": dict(response.headers),
-            "body": response.body,
-        }
-        value = pickle.dumps(data, protocol=4)
-        self._run_async(self._kv.set_value(key, value))
-
-    def _run_async(self, coroutine: Coroutine) -> Any:
-        result = None
-
-        def run():
-            nonlocal result
-            result = asyncio.run(coroutine)
-
-        t = Thread(target=run)
-        t.start()
-        t.join()
-        return result
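`check_crawl_results` now reads stats straight off the single `Crawler` instead of popping it out of a `CrawlerProcess`. The evaluation itself stays a pure function over the stats dict, which makes it easy to exercise on its own; a simplified stand-in echoing `evaluate_stats` above (the exact failure message for a low item count is illustrative, since that part of the function is unchanged and not shown in the diff):

```python
from typing import Any


class StatsError(RuntimeError):
    pass


def evaluate_stats(stats: dict[str, Any], min_items: int) -> None:
    # trimmed copy of jg.plucker.scrapers.evaluate_stats, for illustration
    if exc_count := stats.get("spider_exceptions"):
        raise StatsError(f"Exceptions raised: {exc_count}")
    item_count = stats.get("item_scraped_count", 0)
    if item_count < min_items:
        raise StatsError(f"Expected {min_items} items, got {item_count}")


evaluate_stats({"item_scraped_count": 50}, min_items=10)  # passes silently
try:
    evaluate_stats({"item_scraped_count": 3}, min_items=10)
except StatsError as e:
    print(e)  # Expected 10 items, got 3
```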
diff --git a/jg/plucker/settings.py b/jg/plucker/settings.py
index d18a276..ded8f88 100644
--- a/jg/plucker/settings.py
+++ b/jg/plucker/settings.py
@@ -22,9 +22,9 @@
 RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 403, 408, 429, 999]
 
-# HTTPCACHE_ENABLED = True
+HTTPCACHE_ENABLED = True
 
-HTTPCACHE_EXPIRATION_SECS = 18000  # 5 hours
+HTTPCACHE_EXPIRATION_SECS = 43200  # 12 hours
 
 SPIDER_LOADER_CLASS = "jg.plucker.scrapers.SpiderLoader"
diff --git a/poetry.lock b/poetry.lock
index c455150..514f9e5 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -45,44 +45,48 @@
 [[package]]
 name = "apify"
-version = "2.2.1"
+version = "2.3.0"
 description = "Apify SDK for Python"
 optional = false
-python-versions = "<4.0,>=3.9"
-files = [
-    {file = "apify-2.2.1-py3-none-any.whl", hash = "sha256:60b190d6e7d438d2ccbeeb40151adcbbd1adfb3bf85936fa01c37e3fbb8e2edb"},
-    {file = "apify-2.2.1.tar.gz", hash = "sha256:9a30828e5f908c020e85fc14f70c74e890ab1b20157ce20b50d199564b12d649"},
-]
+python-versions = "^3.9"
+files = []
+develop = false
 
 [package.dependencies]
-apify-client = ">=1.8.1"
+apify-client = ">=1.9.1"
 apify-shared = ">=1.2.1"
-crawlee = ">=0.5.1,<0.6.0"
+crawlee = "~0.5.1"
 cryptography = ">=42.0.0"
 httpx = ">=0.27.0"
 lazy-object-proxy = ">=1.10.0"
 more_itertools = ">=10.2.0"
-scrapy = {version = ">=2.11.0", optional = true, markers = "extra ==
\"scrapy\""} +scrapy = {version = ">=2.11.0", optional = true} typing-extensions = ">=4.1.0" -websockets = ">=10.0,<14.0.0" +websockets = ">=10.0 <14.0.0" [package.extras] scrapy = ["scrapy (>=2.11.0)"] +[package.source] +type = "git" +url = "https://github.com/apify/apify-sdk-python.git" +reference = "master" +resolved_reference = "9706c94193db588fd59ff91e7b39d7c4b3c5b6a4" + [[package]] name = "apify-client" -version = "1.8.1" +version = "1.9.1" description = "Apify API client for Python" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "apify_client-1.8.1-py3-none-any.whl", hash = "sha256:cfa6df3816c436204e37457fba28981a0ef6a7602cde372463f0f078eee64747"}, - {file = "apify_client-1.8.1.tar.gz", hash = "sha256:2be1be7879570655bddeebf126833efe94cabb95b3755592845e92c20c70c674"}, + {file = "apify_client-1.9.1-py3-none-any.whl", hash = "sha256:7f1eaf0c66a077d47eb267f4af6a7ec53d10b683ea4c585daa2a2a01f7e74e3f"}, + {file = "apify_client-1.9.1.tar.gz", hash = "sha256:aadaa5800845ca1c5b1c7416f23358cc1fcbd2aab035521e40235f33721e2a26"}, ] [package.dependencies] apify-shared = ">=1.1.2" -httpx = ">=0.25.0" +httpx = ">=0.25" more_itertools = ">=10.0.0" [[package]] @@ -153,17 +157,18 @@ visualize = ["Twisted (>=16.1.1)", "graphviz (>0.5.1)"] [[package]] name = "beautifulsoup4" -version = "4.12.3" +version = "4.13.3" description = "Screen-scraping library" optional = false -python-versions = ">=3.6.0" +python-versions = ">=3.7.0" files = [ - {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, - {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, + {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"}, + {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"}, ] [package.dependencies] soupsieve = ">1.2" +typing-extensions = ">=4.0.0" [package.extras] cchardet = ["cchardet"] @@ -219,10 +224,6 @@ files = [ {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a37b8f0391212d29b3a91a799c8e4a2855e0576911cdfb2515487e30e322253d"}, {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e84799f09591700a4154154cab9787452925578841a94321d5ee8fb9a9a328f0"}, {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f66b5337fa213f1da0d9000bc8dc0cb5b896b726eefd9c6046f699b169c41b9e"}, - {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5dab0844f2cf82be357a0eb11a9087f70c5430b2c241493fc122bb6f2bb0917c"}, - {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e4fe605b917c70283db7dfe5ada75e04561479075761a0b3866c081d035b01c1"}, - {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1e9a65b5736232e7a7f91ff3d02277f11d339bf34099a56cdab6a8b3410a02b2"}, - {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:58d4b711689366d4a03ac7957ab8c28890415e267f9b6589969e74b6e42225ec"}, {file = "Brotli-1.1.0-cp310-cp310-win32.whl", hash = "sha256:be36e3d172dc816333f33520154d708a2657ea63762ec16b62ece02ab5e4daf2"}, {file = "Brotli-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:0c6244521dda65ea562d5a69b9a26120769b7a9fb3db2fe9545935ed6735b128"}, {file = "Brotli-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:a3daabb76a78f829cafc365531c972016e4aa8d5b4bf60660ad8ecee19df7ccc"}, @@ -235,14 +236,8 @@ files = [ {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:19c116e796420b0cee3da1ccec3b764ed2952ccfcc298b55a10e5610ad7885f9"}, {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:510b5b1bfbe20e1a7b3baf5fed9e9451873559a976c1a78eebaa3b86c57b4265"}, {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a1fd8a29719ccce974d523580987b7f8229aeace506952fa9ce1d53a033873c8"}, - {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c247dd99d39e0338a604f8c2b3bc7061d5c2e9e2ac7ba9cc1be5a69cb6cd832f"}, - {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1b2c248cd517c222d89e74669a4adfa5577e06ab68771a529060cf5a156e9757"}, - {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:2a24c50840d89ded6c9a8fdc7b6ed3692ed4e86f1c4a4a938e1e92def92933e0"}, - {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f31859074d57b4639318523d6ffdca586ace54271a73ad23ad021acd807eb14b"}, {file = "Brotli-1.1.0-cp311-cp311-win32.whl", hash = "sha256:39da8adedf6942d76dc3e46653e52df937a3c4d6d18fdc94a7c29d263b1f5b50"}, {file = "Brotli-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:aac0411d20e345dc0920bdec5548e438e999ff68d77564d5e9463a7ca9d3e7b1"}, - {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:32d95b80260d79926f5fab3c41701dbb818fde1c9da590e77e571eefd14abe28"}, - {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b760c65308ff1e462f65d69c12e4ae085cff3b332d894637f6273a12a482d09f"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:316cc9b17edf613ac76b1f1f305d2a748f1b976b033b049a6ecdfd5612c70409"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:caf9ee9a5775f3111642d33b86237b05808dafcd6268faa492250e9b78046eb2"}, {file = "Brotli-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70051525001750221daa10907c77830bc889cb6d865cc0b813d9db7fefc21451"}, @@ -253,24 +248,8 @@ files = [ {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:4093c631e96fdd49e0377a9c167bfd75b6d0bad2ace734c6eb20b348bc3ea180"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7e4c4629ddad63006efa0ef968c8e4751c5868ff0b1c5c40f76524e894c50248"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:861bf317735688269936f755fa136a99d1ed526883859f86e41a5d43c61d8966"}, - {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:87a3044c3a35055527ac75e419dfa9f4f3667a1e887ee80360589eb8c90aabb9"}, - {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c5529b34c1c9d937168297f2c1fde7ebe9ebdd5e121297ff9c043bdb2ae3d6fb"}, - {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ca63e1890ede90b2e4454f9a65135a4d387a4585ff8282bb72964fab893f2111"}, - {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e79e6520141d792237c70bcd7a3b122d00f2613769ae0cb61c52e89fd3443839"}, {file = "Brotli-1.1.0-cp312-cp312-win32.whl", hash = "sha256:5f4d5ea15c9382135076d2fb28dde923352fe02951e66935a9efaac8f10e81b0"}, {file = "Brotli-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:906bc3a79de8c4ae5b86d3d75a8b77e44404b0f4261714306e3ad248d8ab0951"}, - {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = 
"sha256:8bf32b98b75c13ec7cf774164172683d6e7891088f6316e54425fde1efc276d5"}, - {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7bc37c4d6b87fb1017ea28c9508b36bbcb0c3d18b4260fcdf08b200c74a6aee8"}, - {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c0ef38c7a7014ffac184db9e04debe495d317cc9c6fb10071f7fefd93100a4f"}, - {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91d7cc2a76b5567591d12c01f019dd7afce6ba8cba6571187e21e2fc418ae648"}, - {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a93dde851926f4f2678e704fadeb39e16c35d8baebd5252c9fd94ce8ce68c4a0"}, - {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0db75f47be8b8abc8d9e31bc7aad0547ca26f24a54e6fd10231d623f183d089"}, - {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6967ced6730aed543b8673008b5a391c3b1076d834ca438bbd70635c73775368"}, - {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7eedaa5d036d9336c95915035fb57422054014ebdeb6f3b42eac809928e40d0c"}, - {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d487f5432bf35b60ed625d7e1b448e2dc855422e87469e3f450aa5552b0eb284"}, - {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7"}, - {file = "Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0"}, - {file = "Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b"}, {file = "Brotli-1.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a090ca607cbb6a34b0391776f0cb48062081f5f60ddcce5d11838e67a01928d1"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de9d02f5bda03d27ede52e8cfe7b865b066fa49258cbab568720aa5be80a47d"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2333e30a5e00fe0fe55903c8832e08ee9c3b1382aacf4db26664a16528d51b4b"}, @@ -280,10 +259,6 @@ files = [ {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:069a121ac97412d1fe506da790b3e69f52254b9df4eb665cd42460c837193354"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:e93dfc1a1165e385cc8239fab7c036fb2cd8093728cbd85097b284d7b99249a2"}, - {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:aea440a510e14e818e67bfc4027880e2fb500c2ccb20ab21c7a7c8b5b4703d75"}, - {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_i686.whl", hash = "sha256:6974f52a02321b36847cd19d1b8e381bf39939c21efd6ee2fc13a28b0d99348c"}, - {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:a7e53012d2853a07a4a79c00643832161a910674a893d296c9f1259859a289d2"}, - {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:d7702622a8b40c49bffb46e1e3ba2e81268d5c04a34f460978c6b5517a34dd52"}, {file = "Brotli-1.1.0-cp36-cp36m-win32.whl", hash = "sha256:a599669fd7c47233438a56936988a2478685e74854088ef5293802123b5b2460"}, {file = "Brotli-1.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d143fd47fad1db3d7c27a1b1d66162e855b5d50a89666af46e1679c496e8e579"}, 
{file = "Brotli-1.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:11d00ed0a83fa22d29bc6b64ef636c4552ebafcef57154b4ddd132f5638fbd1c"}, @@ -295,10 +270,6 @@ files = [ {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:919e32f147ae93a09fe064d77d5ebf4e35502a8df75c29fb05788528e330fe74"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:23032ae55523cc7bccb4f6a0bf368cd25ad9bcdcc1990b64a647e7bbcce9cb5b"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:224e57f6eac61cc449f498cc5f0e1725ba2071a3d4f48d5d9dffba42db196438"}, - {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:cb1dac1770878ade83f2ccdf7d25e494f05c9165f5246b46a621cc849341dc01"}, - {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:3ee8a80d67a4334482d9712b8e83ca6b1d9bc7e351931252ebef5d8f7335a547"}, - {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5e55da2c8724191e5b557f8e18943b1b4839b8efc3ef60d65985bcf6f587dd38"}, - {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:d342778ef319e1026af243ed0a07c97acf3bad33b9f29e7ae6a1f68fd083e90c"}, {file = "Brotli-1.1.0-cp37-cp37m-win32.whl", hash = "sha256:587ca6d3cef6e4e868102672d3bd9dc9698c309ba56d41c2b9c85bbb903cdb95"}, {file = "Brotli-1.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2954c1c23f81c2eaf0b0717d9380bd348578a94161a65b3a2afc62c86467dd68"}, {file = "Brotli-1.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:efa8b278894b14d6da122a72fefcebc28445f2d3f880ac59d46c90f4c13be9a3"}, @@ -311,10 +282,6 @@ files = [ {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ab4fbee0b2d9098c74f3057b2bc055a8bd92ccf02f65944a241b4349229185a"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:141bd4d93984070e097521ed07e2575b46f817d08f9fa42b16b9b5f27b5ac088"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fce1473f3ccc4187f75b4690cfc922628aed4d3dd013d047f95a9b3919a86596"}, - {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d2b35ca2c7f81d173d2fadc2f4f31e88cc5f7a39ae5b6db5513cf3383b0e0ec7"}, - {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:af6fa6817889314555aede9a919612b23739395ce767fe7fcbea9a80bf140fe5"}, - {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:2feb1d960f760a575dbc5ab3b1c00504b24caaf6986e2dc2b01c09c87866a943"}, - {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4410f84b33374409552ac9b6903507cdb31cd30d2501fc5ca13d18f73548444a"}, {file = "Brotli-1.1.0-cp38-cp38-win32.whl", hash = "sha256:db85ecf4e609a48f4b29055f1e144231b90edc90af7481aa731ba2d059226b1b"}, {file = "Brotli-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3d7954194c36e304e1523f55d7042c59dc53ec20dd4e9ea9d151f1b62b4415c0"}, {file = "Brotli-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5fb2ce4b8045c78ebbc7b8f3c15062e435d47e7393cc57c25115cfd49883747a"}, @@ -327,10 +294,6 @@ files = [ {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:949f3b7c29912693cee0afcf09acd6ebc04c57af949d9bf77d6101ebb61e388c"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:89f4988c7203739d48c6f806f1e87a1d96e0806d44f0fba61dba81392c9e474d"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:de6551e370ef19f8de1807d0a9aa2cdfdce2e85ce88b122fe9f6b2b076837e59"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:0737ddb3068957cf1b054899b0883830bb1fec522ec76b1098f9b6e0f02d9419"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4f3607b129417e111e30637af1b56f24f7a49e64763253bbc275c75fa887d4b2"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:6c6e0c425f22c1c719c42670d561ad682f7bfeeef918edea971a79ac5252437f"}, - {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:494994f807ba0b92092a163a0a283961369a65f6cbe01e8891132b7a320e61eb"}, {file = "Brotli-1.1.0-cp39-cp39-win32.whl", hash = "sha256:f0d8a7a6b5983c2496e364b969f0e526647a06b075d034f3297dc66f3b360c64"}, {file = "Brotli-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdad5b9014d83ca68c25d2e9444e28e967ef16e80f6b436918c700c117a85467"}, {file = "Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724"}, @@ -377,13 +340,13 @@ cffi = ">=1.0.0" [[package]] name = "certifi" -version = "2024.12.14" +version = "2025.1.31" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56"}, - {file = "certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db"}, + {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"}, + {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, ] [[package]] @@ -636,13 +599,13 @@ rich = "*" [[package]] name = "crawlee" -version = "0.5.3" +version = "0.5.4" description = "Crawlee for Python" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "crawlee-0.5.3-py3-none-any.whl", hash = "sha256:03ef5e1e927b379abcea41eacee7102cb428b2f2c73df770bed994bb95d5e400"}, - {file = "crawlee-0.5.3.tar.gz", hash = "sha256:6921e161b47df268b9cf04f8a6e28ccafa40e3034fb22f307db679ef583ad620"}, + {file = "crawlee-0.5.4-py3-none-any.whl", hash = "sha256:2b02ebab913a9bbc74f1a52fc1f4c9c46ce40c0d7f0342482a0576b645aa0e9d"}, + {file = "crawlee-0.5.4.tar.gz", hash = "sha256:24b2c18e784fc94adfb3c7f061b9694e5148c3050b279349801971ae8f0db0d9"}, ] [package.dependencies] @@ -674,38 +637,42 @@ playwright = ["playwright (>=1.27.0)"] [[package]] name = "cryptography" -version = "44.0.0" +version = "44.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
optional = false python-versions = "!=3.9.0,!=3.9.1,>=3.7" files = [ - {file = "cryptography-44.0.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:84111ad4ff3f6253820e6d3e58be2cc2a00adb29335d4cacb5ab4d4d34f2a123"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15492a11f9e1b62ba9d73c210e2416724633167de94607ec6069ef724fad092"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:831c3c4d0774e488fdc83a1923b49b9957d33287de923d58ebd3cec47a0ae43f"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:761817a3377ef15ac23cd7834715081791d4ec77f9297ee694ca1ee9c2c7e5eb"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3c672a53c0fb4725a29c303be906d3c1fa99c32f58abe008a82705f9ee96f40b"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:4ac4c9f37eba52cb6fbeaf5b59c152ea976726b865bd4cf87883a7e7006cc543"}, - {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ed3534eb1090483c96178fcb0f8893719d96d5274dfde98aa6add34614e97c8e"}, - {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f3f6fdfa89ee2d9d496e2c087cebef9d4fcbb0ad63c40e821b39f74bf48d9c5e"}, - {file = "cryptography-44.0.0-cp37-abi3-win32.whl", hash = "sha256:eb33480f1bad5b78233b0ad3e1b0be21e8ef1da745d8d2aecbb20671658b9053"}, - {file = "cryptography-44.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:abc998e0c0eee3c8a1904221d3f67dcfa76422b23620173e28c11d3e626c21bd"}, - {file = "cryptography-44.0.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:660cb7312a08bc38be15b696462fa7cc7cd85c3ed9c576e81f4dc4d8b2b31591"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1923cb251c04be85eec9fda837661c67c1049063305d6be5721643c22dd4e2b7"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:404fdc66ee5f83a1388be54300ae978b2efd538018de18556dde92575e05defc"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c5eb858beed7835e5ad1faba59e865109f3e52b3783b9ac21e7e47dc5554e289"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f53c2c87e0fb4b0c00fa9571082a057e37690a8f12233306161c8f4b819960b7"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:9e6fc8a08e116fb7c7dd1f040074c9d7b51d74a8ea40d4df2fc7aa08b76b9e6c"}, - {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d2436114e46b36d00f8b72ff57e598978b37399d2786fd39793c36c6d5cb1c64"}, - {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a01956ddfa0a6790d594f5b34fc1bfa6098aca434696a03cfdbe469b8ed79285"}, - {file = "cryptography-44.0.0-cp39-abi3-win32.whl", hash = "sha256:eca27345e1214d1b9f9490d200f9db5a874479be914199194e746c893788d417"}, - {file = "cryptography-44.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:708ee5f1bafe76d041b53a4f95eb28cdeb8d18da17e597d46d7833ee59b97ede"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:37d76e6863da3774cd9db5b409a9ecfd2c71c981c38788d3fcfaf177f447b731"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:f677e1268c4e23420c3acade68fac427fffcb8d19d7df95ed7ad17cdef8404f4"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = 
"sha256:f5e7cb1e5e56ca0933b4873c0220a78b773b24d40d186b6738080b73d3d0a756"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:8b3e6eae66cf54701ee7d9c83c30ac0a1e3fa17be486033000f2a73a12ab507c"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:be4ce505894d15d5c5037167ffb7f0ae90b7be6f2a98f9a5c3442395501c32fa"}, - {file = "cryptography-44.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:62901fb618f74d7d81bf408c8719e9ec14d863086efe4185afd07c352aee1d2c"}, - {file = "cryptography-44.0.0.tar.gz", hash = "sha256:cd4e834f340b4293430701e772ec543b0fbe6c2dea510a5286fe0acabe153a02"}, + {file = "cryptography-44.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf688f615c29bfe9dfc44312ca470989279f0e94bb9f631f85e3459af8efc009"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd7c7e2d71d908dc0f8d2027e1604102140d84b155e658c20e8ad1304317691f"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887143b9ff6bad2b7570da75a7fe8bbf5f65276365ac259a5d2d5147a73775f2"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:322eb03ecc62784536bc173f1483e76747aafeb69c8728df48537eb431cd1911"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:21377472ca4ada2906bc313168c9dc7b1d7ca417b63c1c3011d0c74b7de9ae69"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:df978682c1504fc93b3209de21aeabf2375cb1571d4e61907b3e7a2540e83026"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:eb3889330f2a4a148abead555399ec9a32b13b7c8ba969b72d8e500eb7ef84cd"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:8e6a85a93d0642bd774460a86513c5d9d80b5c002ca9693e63f6e540f1815ed0"}, + {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6f76fdd6fd048576a04c5210d53aa04ca34d2ed63336d4abd306d0cbe298fddf"}, + {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6c8acf6f3d1f47acb2248ec3ea261171a671f3d9428e34ad0357148d492c7864"}, + {file = "cryptography-44.0.1-cp37-abi3-win32.whl", hash = "sha256:24979e9f2040c953a94bf3c6782e67795a4c260734e5264dceea65c8f4bae64a"}, + {file = "cryptography-44.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:fd0ee90072861e276b0ff08bd627abec29e32a53b2be44e41dbcdf87cbee2b00"}, + {file = "cryptography-44.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a2d8a7045e1ab9b9f803f0d9531ead85f90c5f2859e653b61497228b18452008"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8272f257cf1cbd3f2e120f14c68bff2b6bdfcc157fafdee84a1b795efd72862"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e8d181e90a777b63f3f0caa836844a1182f1f265687fac2115fcf245f5fbec3"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:436df4f203482f41aad60ed1813811ac4ab102765ecae7a2bbb1dbb66dcff5a7"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4f422e8c6a28cf8b7f883eb790695d6d45b0c385a2583073f3cec434cc705e1a"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:72198e2b5925155497a5a3e8c216c7fb3e64c16ccee11f0e7da272fa93b35c4c"}, + {file = 
"cryptography-44.0.1-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:2a46a89ad3e6176223b632056f321bc7de36b9f9b93b2cc1cccf935a3849dc62"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:53f23339864b617a3dfc2b0ac8d5c432625c80014c25caac9082314e9de56f41"}, + {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:888fcc3fce0c888785a4876ca55f9f43787f4c5c1cc1e2e0da71ad481ff82c5b"}, + {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00918d859aa4e57db8299607086f793fa7813ae2ff5a4637e318a25ef82730f7"}, + {file = "cryptography-44.0.1-cp39-abi3-win32.whl", hash = "sha256:9b336599e2cb77b1008cb2ac264b290803ec5e8e89d618a5e978ff5eb6f715d9"}, + {file = "cryptography-44.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:e403f7f766ded778ecdb790da786b418a9f2394f36e8cc8b796cc056ab05f44f"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1f9a92144fa0c877117e9748c74501bea842f93d21ee00b0cf922846d9d0b183"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:610a83540765a8d8ce0f351ce42e26e53e1f774a6efb71eb1b41eb01d01c3d12"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5fed5cd6102bb4eb843e3315d2bf25fede494509bddadb81e03a859c1bc17b83"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:f4daefc971c2d1f82f03097dc6f216744a6cd2ac0f04c68fb935ea2ba2a0d420"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:94f99f2b943b354a5b6307d7e8d19f5c423a794462bde2bf310c770ba052b1c4"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7"}, + {file = "cryptography-44.0.1.tar.gz", hash = "sha256:f51f5705ab27898afda1aaa430f34ad90dc117421057782022edf0600bec5f14"}, ] [package.dependencies] @@ -718,7 +685,7 @@ nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2)"] pep8test = ["check-sdist", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] sdist = ["build (>=1.0.0)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["certifi (>=2024)", "cryptography-vectors (==44.0.0)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] +test = ["certifi (>=2024)", "cryptography-vectors (==44.0.1)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] test-randomorder = ["pytest-randomly"] [[package]] @@ -890,18 +857,18 @@ files = [ [[package]] name = "h2" -version = "4.1.0" -description = "HTTP/2 State-Machine based protocol implementation" +version = "4.2.0" +description = "Pure-Python HTTP/2 protocol implementation" optional = false -python-versions = ">=3.6.1" +python-versions = ">=3.9" files = [ - {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"}, - {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"}, + {file = "h2-4.2.0-py3-none-any.whl", hash = "sha256:479a53ad425bb29af087f3458a61d30780bc818e4ebcf01f0b536ba916462ed0"}, + {file = "h2-4.2.0.tar.gz", hash = "sha256:c8a52129695e88b1a0578d8d2cc6842bbd79128ac685463b887ee278126ad01f"}, ] [package.dependencies] -hpack = ">=4.0,<5" -hyperframe = ">=6.0,<7" +hpack = ">=4.1,<5" +hyperframe = ">=6.1,<7" [[package]] name = "hpack" @@ -916,13 +883,13 @@ files = [ 
[[package]] name = "html-text" -version = "0.6.2" +version = "0.7.0" description = "Extract text from HTML" optional = false python-versions = "*" files = [ - {file = "html_text-0.6.2-py2.py3-none-any.whl", hash = "sha256:d83d619ccd4b4d6172e21084d8a46e29e49ce87a08cc02161e7ca8c2918e7bca"}, - {file = "html_text-0.6.2.tar.gz", hash = "sha256:81455b4de5430cf63ce7c45a870fb8629e79ca8518e240f172d62409c2f2ff72"}, + {file = "html_text-0.7.0-py3-none-any.whl", hash = "sha256:11a95d5588a7b954aa229394bcd4a802d195c793d9970d5d8fc80d3d0ea9618e"}, + {file = "html_text-0.7.0.tar.gz", hash = "sha256:3dcb7006945d8ff06b4be639678f633a06ea70bc494163d256066995e1eb9182"}, ] [package.dependencies] @@ -1142,15 +1109,20 @@ readchar = ">=4.2.0" [[package]] name = "itemadapter" -version = "0.10.0" +version = "0.11.0" description = "Common interface for data container classes" optional = false python-versions = ">=3.9" files = [ - {file = "itemadapter-0.10.0-py3-none-any.whl", hash = "sha256:d404a91cd0ebf17b7983af1aae43116d375e8d831a1dcbe98de5723b2c66e36d"}, - {file = "itemadapter-0.10.0.tar.gz", hash = "sha256:2655c8c50f1a8405c9fa74b8cdc4da7fec541ca217bc821b90acc8451c98a9d2"}, + {file = "itemadapter-0.11.0-py3-none-any.whl", hash = "sha256:07bc1a26a51f124ec155b80ee3d170eda06ffccd7ceba99c08bea68ad4de5fcd"}, + {file = "itemadapter-0.11.0.tar.gz", hash = "sha256:3b0f27f4c5e2e8ae415d83e3d60d33adb7ba09b98c30638bc606fb1dff2ecdd2"}, ] +[package.extras] +attrs = ["attrs (>=18.1.0)"] +pydantic = ["pydantic (>=1.8)"] +scrapy = ["scrapy (>=2.2)"] + [[package]] name = "itemloaders" version = "1.3.2" @@ -2401,29 +2373,29 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "ruff" -version = "0.9.4" +version = "0.9.6" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.9.4-py3-none-linux_armv6l.whl", hash = "sha256:64e73d25b954f71ff100bb70f39f1ee09e880728efb4250c632ceed4e4cdf706"}, - {file = "ruff-0.9.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6ce6743ed64d9afab4fafeaea70d3631b4d4b28b592db21a5c2d1f0ef52934bf"}, - {file = "ruff-0.9.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:54499fb08408e32b57360f6f9de7157a5fec24ad79cb3f42ef2c3f3f728dfe2b"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37c892540108314a6f01f105040b5106aeb829fa5fb0561d2dcaf71485021137"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:de9edf2ce4b9ddf43fd93e20ef635a900e25f622f87ed6e3047a664d0e8f810e"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:87c90c32357c74f11deb7fbb065126d91771b207bf9bfaaee01277ca59b574ec"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:56acd6c694da3695a7461cc55775f3a409c3815ac467279dfa126061d84b314b"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e0c93e7d47ed951b9394cf352d6695b31498e68fd5782d6cbc282425655f687a"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1d4c8772670aecf037d1bf7a07c39106574d143b26cfe5ed1787d2f31e800214"}, - {file = "ruff-0.9.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfc5f1d7afeda8d5d37660eeca6d389b142d7f2b5a1ab659d9214ebd0e025231"}, - {file = "ruff-0.9.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:faa935fc00ae854d8b638c16a5f1ce881bc3f67446957dd6f2af440a5fc8526b"}, - {file = "ruff-0.9.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a6c634fc6f5a0ceae1ab3e13c58183978185d131a29c425e4eaa9f40afe1e6d6"}, - {file = "ruff-0.9.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:433dedf6ddfdec7f1ac7575ec1eb9844fa60c4c8c2f8887a070672b8d353d34c"}, - {file = "ruff-0.9.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:d612dbd0f3a919a8cc1d12037168bfa536862066808960e0cc901404b77968f0"}, - {file = "ruff-0.9.4-py3-none-win32.whl", hash = "sha256:db1192ddda2200671f9ef61d9597fcef89d934f5d1705e571a93a67fb13a4402"}, - {file = "ruff-0.9.4-py3-none-win_amd64.whl", hash = "sha256:05bebf4cdbe3ef75430d26c375773978950bbf4ee3c95ccb5448940dc092408e"}, - {file = "ruff-0.9.4-py3-none-win_arm64.whl", hash = "sha256:585792f1e81509e38ac5123492f8875fbc36f3ede8185af0a26df348e5154f41"}, - {file = "ruff-0.9.4.tar.gz", hash = "sha256:6907ee3529244bb0ed066683e075f09285b38dd5b4039370df6ff06041ca19e7"}, + {file = "ruff-0.9.6-py3-none-linux_armv6l.whl", hash = "sha256:2f218f356dd2d995839f1941322ff021c72a492c470f0b26a34f844c29cdf5ba"}, + {file = "ruff-0.9.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b908ff4df65dad7b251c9968a2e4560836d8f5487c2f0cc238321ed951ea0504"}, + {file = "ruff-0.9.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b109c0ad2ececf42e75fa99dc4043ff72a357436bb171900714a9ea581ddef83"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1de4367cca3dac99bcbd15c161404e849bb0bfd543664db39232648dc00112dc"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac3ee4d7c2c92ddfdaedf0bf31b2b176fa7aa8950efc454628d477394d35638b"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:5dc1edd1775270e6aa2386119aea692039781429f0be1e0949ea5884e011aa8e"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4a091729086dffa4bd070aa5dab7e39cc6b9d62eb2bef8f3d91172d30d599666"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1bbc6808bf7b15796cef0815e1dfb796fbd383e7dbd4334709642649625e7c5"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:589d1d9f25b5754ff230dce914a174a7c951a85a4e9270613a2b74231fdac2f5"}, + {file = "ruff-0.9.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc61dd5131742e21103fbbdcad683a8813be0e3c204472d520d9a5021ca8b217"}, + {file = "ruff-0.9.6-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:5e2d9126161d0357e5c8f30b0bd6168d2c3872372f14481136d13de9937f79b6"}, + {file = "ruff-0.9.6-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:68660eab1a8e65babb5229a1f97b46e3120923757a68b5413d8561f8a85d4897"}, + {file = "ruff-0.9.6-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c4cae6c4cc7b9b4017c71114115db0445b00a16de3bcde0946273e8392856f08"}, + {file = "ruff-0.9.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:19f505b643228b417c1111a2a536424ddde0db4ef9023b9e04a46ed8a1cb4656"}, + {file = "ruff-0.9.6-py3-none-win32.whl", hash = "sha256:194d8402bceef1b31164909540a597e0d913c0e4952015a5b40e28c146121b5d"}, + {file = "ruff-0.9.6-py3-none-win_amd64.whl", hash = "sha256:03482d5c09d90d4ee3f40d97578423698ad895c87314c4de39ed2af945633caa"}, + {file = "ruff-0.9.6-py3-none-win_arm64.whl", hash = "sha256:0e2bb706a2be7ddfea4a4af918562fdc1bcb16df255e5fa595bbd800ce322a5a"}, + {file = "ruff-0.9.6.tar.gz", hash = "sha256:81761592f72b620ec8fa1068a6fd00e98a5ebee342a3642efd84454f3031dca9"}, ] [[package]] @@ -3151,4 +3123,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "3.12.*" -content-hash = "0fe178f41b420f134b9c3bbb105fc1b8eb302f68fda574981026e0e48963a9e7" +content-hash = "e7439726bea09577bbcdd4a19a7c54811a1ae80d4d659d618748b20421c8693a" diff --git a/pyproject.toml b/pyproject.toml index af9c7db..3a9d4ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,11 +14,10 @@ plucker = "jg.plucker.cli:main" [tool.poetry.dependencies] python = "3.12.*" -apify = {version = "2.2.1", extras = ["scrapy"]} -apify-client = "1.8.1" # deployment of actors, monitoring, automation +apify = { git = "https://github.com/apify/apify-sdk-python.git", branch = "master", extras = ["scrapy"] } +apify-client = "1.9.1" # deployment of actors, monitoring, automation apify-shared = "*" # importing a few enums click = "8.1.8" -crawlee = "*" # importing MemoryStorageClient diskcache = "5.6.3" extruct = "0.18.0" feedparser = "6.0.11" @@ -44,6 +43,7 @@ addopts = "--import-mode=importlib --ff --ruff --ruff-format" filterwarnings = [ "ignore:twisted.web.http.HTTPClient was deprecated:DeprecationWarning", # scrapy "ignore:invalid escape sequence:SyntaxWarning", # extruct + "ignore:There is no current event loop:DeprecationWarning", # apify ] [tool.ruff] diff --git a/tests/jobs_jobscz/job_widget_script9.js b/tests/jobs_jobscz/job_widget_script9.js new file mode 100644 index 0000000..2a2d3f2 --- /dev/null +++ b/tests/jobs_jobscz/job_widget_script9.js @@ -0,0 +1 @@ +!function(n){var r={};function o(t){var 
e;return(r[t]||(e=r[t]={i:t,l:!1,exports:{}},n[t].call(e.exports,e,e.exports,o),e.l=!0,e)).exports}o.m=n,o.c=r,o.d=function(t,e,n){o.o(t,e)||Object.defineProperty(t,e,{enumerable:!0,get:n})},o.r=function(t){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(t,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(t,"__esModule",{value:!0})},o.t=function(e,t){if(1&t&&(e=o(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(o.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var r in e)o.d(n,r,function(t){return e[t]}.bind(null,r));return n},o.n=function(t){var e=t&&t.__esModule?function(){return t.default}:function(){return t};return o.d(e,"a",e),e},o.o=function(t,e){return Object.prototype.hasOwnProperty.call(t,e)},o.p="",o(o.s=199)}([function(t,e,n){var n=n(46),r=Function.prototype,o=r.call,r=n&&r.bind.bind(o,o);t.exports=n?r:function(t){return function(){return o.apply(t,arguments)}}},function(t,e,n){t.exports=function(t){try{return!!t()}catch(t){return!0}}},function(t,e,n){var u=n(4),l=n(50).f,f=n(24),d=n(13),p=n(63),h=n(97),v=n(55);t.exports=function(t,e){var n,r,o,i=t.target,a=t.global,s=t.stat,c=a?u:s?u[i]||p(i,{}):u[i]&&u[i].prototype;if(c)for(n in e){if(r=e[n],o=t.dontCallGetSet?(o=l(c,n))&&o.value:c[n],!v(a?n:i+(s?".":"#")+n,t.forced)&&void 0!==o){if(typeof r==typeof o)continue;h(r,o)}(t.sham||o&&o.sham)&&f(r,"sham",!0),d(c,n,r,t)}}},function(t,e,n){var r=n(4),o=n(28),i=n(7),a=n(64),s=n(29),n=n(89),c=r.Symbol,u=o("wks"),l=n?c.for||c:c&&c.withoutSetter||a;t.exports=function(t){return i(u,t)||(u[t]=s&&i(c,t)?c[t]:l("Symbol."+t)),u[t]}},function(n,t,e){!function(t){function e(t){return t&&t.Math===Math&&t}n.exports=e("object"==typeof globalThis&&globalThis)||e("object"==typeof window&&window)||e("object"==typeof self&&self)||e("object"==typeof t&&t)||e("object"==typeof this&&this)||function(){return this}()||Function("return this")()}.call(this,e(144))},function(t,e,n){var r="object"==typeof document&&document.all;t.exports=void 0===r&&void 0!==r?function(t){return"function"==typeof t||t===r}:function(t){return"function"==typeof t}},function(t,e,n){var n=n(46),r=Function.prototype.call;t.exports=n?r.bind(r):function(){return r.apply(r,arguments)}},function(t,e,n){var r=n(0),o=n(14),i=r({}.hasOwnProperty);t.exports=Object.hasOwn||function(t,e){return i(o(t),e)}},function(t,e,n){n=n(1);t.exports=!n(function(){return 7!==Object.defineProperty({},1,{get:function(){return 7}})[1]})},function(t,e,n){var r=n(11),o=String,i=TypeError;t.exports=function(t){if(r(t))return t;throw new i(o(t)+" is not an object")}},function(t,e,n){var r=n(49),o=String;t.exports=function(t){if("Symbol"===r(t))throw new TypeError("Cannot convert a Symbol value to a string");return o(t)}},function(t,e,n){var r=n(5);t.exports=function(t){return"object"==typeof t?null!==t:r(t)}},function(t,e,n){var r=n(8),o=n(90),i=n(91),a=n(9),s=n(65),c=TypeError,u=Object.defineProperty,l=Object.getOwnPropertyDescriptor,f="enumerable",d="configurable",p="writable";e.f=r?i?function(t,e,n){var r;return a(t),e=s(e),a(n),"function"==typeof t&&"prototype"===e&&"value"in n&&p in n&&!n[p]&&(r=l(t,e))&&r[p]&&(t[e]=n.value,n={configurable:(d in n?n:r)[d],enumerable:(f in n?n:r)[f],writable:!1}),u(t,e,n)}:u:function(t,e,n){if(a(t),e=s(e),a(n),o)try{return u(t,e,n)}catch(t){}if("get"in n||"set"in n)throw new c("Accessors not supported");return"value"in n&&(t[e]=n.value),t}},function(t,e,n){var 
a=n(5),s=n(12),c=n(93),u=n(63);t.exports=function(t,e,n,r){var o=(r=r||{}).enumerable,i=void 0!==r.name?r.name:e;if(a(n)&&c(n,i,r),r.global)o?t[e]=n:u(e,n);else{try{r.unsafe?t[e]&&(o=!0):delete t[e]}catch(t){}o?t[e]=n:s.f(t,e,{value:n,enumerable:!1,configurable:!r.nonConfigurable,writable:!r.nonWritable})}return t}},function(t,e,n){var r=n(15),o=Object;t.exports=function(t){return o(r(t))}},function(t,e,n){var r=n(21),o=TypeError;t.exports=function(t){if(r(t))throw new o("Can't call method on "+t);return t}},function(t,e,n){var r=n(4),o=n(5);t.exports=function(t,e){return arguments.length<2?(n=r[t],o(n)?n:void 0):r[t]&&r[t][e];var n}},function(t,e,n){var r=n(67),o=n(15);t.exports=function(t){return r(o(t))}},function(t,e,n){var n=n(0),r=n({}.toString),o=n("".slice);t.exports=function(t){return o(r(t),8,-1)}},function(t,e,n){var r=n(61),o=n(13),n=n(146);r||o(Object.prototype,"toString",n,{unsafe:!0})},function(t,e,n){t.exports=!1},function(t,e,n){t.exports=function(t){return null==t}},function(t,e,n){n=n(0);t.exports=n({}.isPrototypeOf)},function(t,e,n){var r=n(5),o=n(34),i=TypeError;t.exports=function(t){if(r(t))return t;throw new i(o(t)+" is not a function")}},function(t,e,n){var r=n(8),o=n(12),i=n(36);t.exports=r?function(t,e,n){return o.f(t,e,i(1,n))}:function(t,e,n){return t[e]=n,t}},function(t,e,n){var r=n(54);t.exports=function(t){return r(t.length)}},function(t,e,n){var r=n(23),o=n(21);t.exports=function(t,e){t=t[e];return o(t)?void 0:r(t)}},function(t,e,n){var r,o,i,a,s=n(145),c=n(4),u=n(11),l=n(24),f=n(7),d=n(62),p=n(47),n=n(48),h="Object already initialized",v=c.TypeError,c=c.WeakMap,g=s||d.state?((i=d.state||(d.state=new c)).get=i.get,i.has=i.has,i.set=i.set,r=function(t,e){if(i.has(t))throw new v(h);return e.facade=t,i.set(t,e),e},o=function(t){return i.get(t)||{}},function(t){return i.has(t)}):(n[a=p("state")]=!0,r=function(t,e){if(f(t,a))throw new v(h);return e.facade=t,l(t,a,e),e},o=function(t){return f(t,a)?t[a]:{}},function(t){return f(t,a)});t.exports={set:r,get:o,has:g,enforce:function(t){return g(t)?o(t):r(t,{})},getterFor:function(e){return function(t){if(u(t)&&(t=o(t)).type===e)return t;throw new v("Incompatible receiver, "+e+" required")}}}},function(t,e,n){var r=n(62);t.exports=function(t,e){return r[t]||(r[t]=e||{})}},function(t,e,n){var r=n(43),o=n(1),i=n(4).String;t.exports=!!Object.getOwnPropertySymbols&&!o(function(){var t=Symbol("symbol detection");return!i(t)||!(Object(t)instanceof Symbol)||!Symbol.sham&&r&&r<41})},function(t,e,n){var r=n(149);t.exports=function(t){t=+t;return t!=t||0==t?0:r(t)}},function(t,e,n){function r(){}function o(t){t.write(m("")),t.close();var e=t.parentWindow.Object;return t=null,e}var i,a=n(9),s=n(99),c=n(73),u=n(48),l=n(129),f=n(52),n=n(47),d=">",p="<",h="prototype",v="script",g=n("IE_PROTO"),m=function(t){return p+v+d+t+p+"/"+v+d},y=function(){try{i=new ActiveXObject("htmlfile")}catch(t){}y="undefined"==typeof document||document.domain&&i?o(i):(t=f("iframe"),e="java"+v+":",t.style.display="none",l.appendChild(t),t.src=String(e),(e=t.contentWindow.document).open(),e.write(m("document.F=Object")),e.close(),e.F);for(var t,e,n=c.length;n--;)delete y[h][c[n]];return y()};u[g]=!0,t.exports=Object.create||function(t,e){var n;return null!==t?(r[h]=a(t),n=new r,r[h]=null,n[g]=t):n=y(),void 0===e?n:s.f(n,e)}},function(t,e,n){t.exports="undefined"!=typeof navigator&&String(navigator.userAgent)||""},function(t,e,n){var r=n(16),o=n(5),i=n(22),n=n(89),a=Object;t.exports=n?function(t){return"symbol"==typeof t}:function(t){var 
e=r("Symbol");return o(e)&&i(e.prototype,a(t))}},function(t,e,n){var r=String;t.exports=function(t){try{return r(t)}catch(t){return"Object"}}},function(t,e,n){var r=n(8),n=n(7),o=Function.prototype,i=r&&Object.getOwnPropertyDescriptor,n=n(o,"name"),a=n&&"something"===function(){}.name,r=n&&(!r||i(o,"name").configurable);t.exports={EXISTS:n,PROPER:a,CONFIGURABLE:r}},function(t,e,n){t.exports=function(t,e){return{enumerable:!(1&t),configurable:!(2&t),writable:!(4&t),value:e}}},function(t,e,n){function r(d){var p=1===d,h=2===d,v=3===d,g=4===d,m=6===d,y=7===d,b=5===d||m;return function(t,e,n,r){for(var o,i,a=x(t),s=_(a),c=S(s),u=w(e,n),l=0,e=r||k,f=p?e(t,c):h||y?e(t,0):void 0;l")})||!n||f)},function(t,e,n){var r=n(17),o=n(79),i=n(42),a=n(27),s=n(12).f,c=n(114),u=n(117),l=n(20),n=n(8),f="Array Iterator",d=a.set,p=a.getterFor(f),a=(t.exports=c(Array,"Array",function(t,e){d(this,{type:f,target:r(t),index:0,kind:e})},function(){var t=p(this),e=t.target,n=t.index++;if(!e||n>=e.length)return t.target=void 0,u(void 0,!0);switch(t.kind){case"keys":return u(n,!1);case"values":return u(e[n],!1)}return u([n,e[n]],!1)},"values"),i.Arguments=i.Array);if(o("keys"),o("values"),o("entries"),!l&&n&&"values"!==a.name)try{s(a,"name",{value:"values"})}catch(t){}},function(t,e,n){var r={};r[n(3)("toStringTag")]="z",t.exports="[object z]"===String(r)},function(t,e,n){var r=n(20),o=n(4),n=n(63),i="__core-js_shared__",t=t.exports=o[i]||n(i,{});(t.versions||(t.versions=[])).push({version:"3.37.1",mode:r?"pure":"global",copyright:"© 2014-2024 Denis Pushkarev (zloirock.ru)",license:"https://github.com/zloirock/core-js/blob/v3.37.1/LICENSE",source:"https://github.com/zloirock/core-js"})},function(t,e,n){var r=n(4),o=Object.defineProperty;t.exports=function(e,n){try{o(r,e,{value:n,configurable:!0,writable:!0})}catch(t){r[e]=n}return n}},function(t,e,n){var n=n(0),r=0,o=Math.random(),i=n(1..toString);t.exports=function(t){return"Symbol("+(void 0===t?"":t)+")_"+i(++r+o,36)}},function(t,e,n){var r=n(92),o=n(33);t.exports=function(t){t=r(t,"string");return o(t)?t:t+""}},function(t,e,n){var r=n(148),o=n(23),i=n(46),a=r(r.bind);t.exports=function(t,e){return o(t),void 0===e?t:i?a(t,e):function(){return t.apply(e,arguments)}}},function(t,e,n){var r=n(0),o=n(1),i=n(18),a=Object,s=r("".split);t.exports=o(function(){return!a("z").propertyIsEnumerable(0)})?function(t){return"String"===i(t)?s(t,""):a(t)}:a},function(t,e,n){var r=n(150);t.exports=function(t,e){return new(r(t))(0===e?0:e)}},function(t,e,n){function r(){}function o(t){if(!c(t))return!1;try{return d(r,[],t),!0}catch(t){return!1}}function i(t){if(!c(t))return!1;switch(u(t)){case"AsyncFunction":case"GeneratorFunction":case"AsyncGeneratorFunction":return!1}try{return v||!!h(p,f(t))}catch(t){return!0}}var a=n(0),s=n(1),c=n(5),u=n(49),l=n(16),f=n(78),d=l("Reflect","construct"),p=/^\s*(?:class|function)\b/,h=a(p.exec),v=!p.test(r);i.sham=!0,t.exports=!d||s(function(){var t;return o(o.call)||!o(Object)||!o(function(){t=!0})||t})?i:o},function(t,e,n){var r=n(1);t.exports=function(t,e){var n=[][t];return!!n&&r(function(){n.call(null,e||function(){return 1},1)})}},function(t,e,n){var r=n(2),o=n(37).find,n=n(79),i="find",a=!0;i in[]&&Array(1)[i](function(){a=!1}),r({target:"Array",proto:!0,forced:a},{find:function(t){return o(this,t,1o;)!a(r,n=e[o++])||~c(i,n)||l(i,n);return i}},function(t,e,n){var r=n(8),o=n(91),s=n(12),c=n(9),u=n(17),l=n(75);e.f=r&&!o?Object.defineProperties:function(t,e){c(t);for(var 
n,r=u(e),o=l(e),i=o.length,a=0;ab)","g");return"b"!==t.exec("b").groups.a||"bc"!=="b".replace(t,"$c")})},function(t,e,n){var r=n(6),o=n(7),i=n(22),a=n(103),s=RegExp.prototype;t.exports=function(t){var e=t.flags;return void 0!==e||"flags"in s||o(t,"flags")||!i(s,t)?e:r(a,t)}},function(t,e,n){function r(o){return function(t,e){var n,t=a(s(t)),e=i(e),r=t.length;return e<0||r<=e?o?"":void 0:(n=u(t,e))<55296||56319=e.length?s(void 0,!0):(e=r(e,n),t.index+=e.length,s(e,!1))})},function(t,e,n){var r=n(49),o=n(26),i=n(21),a=n(42),s=n(3)("iterator");t.exports=function(t){if(!i(t))return o(t,s)||o(t,"@@iterator")||a[r(t)]}},function(t,e,n){var r=n(2),o=n(37).map;r({target:"Array",proto:!0,forced:!n(40)("map")},{map:function(t){return o(this,t,1o;o++)d(e,n=r[o])&&!d(t,n)&&w(t,n,b(e,n))}var i=n(2),a=n(20),s=n(8),c=n(4),u=n(110),l=n(0),f=n(55),d=n(7),p=n(118),h=n(22),v=n(33),g=n(92),m=n(1),y=n(39).f,b=n(50).f,w=n(12).f,_=n(134),x=n(87).trim,n="Number",S=c[n],k=u[n],P=S.prototype,O=c.TypeError,E=l("".slice),j=l("".charCodeAt),I=function(t){var e,n,r,o,i,a,s,c=g(t,"number");if(v(c))throw new O("Cannot convert a Symbol value to a number");if("string"==typeof c&&2=t.length?{done:!0}:{done:!1,value:t[i++]}},e:function(t){throw t},f:e};throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method.")}function s(t,e){(null==e||e>t.length)&&(e=t.length);for(var n=0,r=Array(e);n]*>)/g,m=/\$([$&'`]|\d{1,2})/g;t.exports=function(i,a,s,c,u,t){var l=s+i.length,f=c.length,e=m;return void 0!==u&&(u=o(u),e=g),h(t,e,function(t,e){var n;switch(p(e,0)){case"$":return"$";case"&":return i;case"`":return v(a,0,s);case"'":return v(a,l);case"<":n=u[v(e,1,-1)];break;default:var r,o=+e;if(0==o)return t;if(f@^][^\s!#%&*+<=>@^]*>/,D=/a/g,z=/a/g,t=new k(D)!==D,C=a.MISSED_STICKY,B=a.UNSUPPORTED_Y,w=e&&(!t||C||_||x||g(function(){return z[S]=!1,k(D)!==D||k(z)===z||"/a/i"!==String(k(D,"i"))}));if(o("RegExp",w)){for(var R=function(t,e){var n,r,o=d(P,this),i=p(t),a=void 0===e,s=[],c=t;if(!o&&i&&a&&t.constructor===R)return t;if((i||d(P,t))&&(t=t.source,a)&&(e=v(c)),t=void 0===t?"":h(t),e=void 0===e?"":h(e),c=t,i=e=_&&"dotAll"in D&&(n=!!e&&-1"===e&&c:if(""===l||m(a,l))throw new O("Invalid capture group name");a[l]=!0,c=!(i[i.length]=[l,u]),l="";continue}c?l+=e:o+=e}return[o,i]}(t))[0],s=a[1]),a=u(k(t,e),o?this:P,R),(n||r||s.length)&&(e=y(a),n&&(e.dotAll=!0,e.raw=R(function(t){for(var e,n=t.length,r=0,o="",i=!1;r<=n;r++)"\\"===(e=E(t,r))?o+=e+E(t,++r):i||"."!==e?("["===e?i=!0:"]"===e&&(i=!1),o+=e):o+="[\\s\\S]";return o}(t),i)),r&&(e.sticky=!0),s.length)&&(e.groups=s),t!==c)try{l(a,"source",""===c?"(?:)":c)}catch(t){}return a},A=i(k),L=0;A.length>L;)s(R,k,A[L++]);(P.constructor=R).prototype=P,c(n,"RegExp",R,{constructor:!0})}b("RegExp")},function(t,e,n){var r=n(12).f;t.exports=function(t,e,n){n in t||r(t,n,{configurable:!0,get:function(){return e[n]},set:function(t){e[n]=t}})}},function(t,e,n){var r=n(2),o=n(87).trim;r({target:"String",proto:!0,forced:n(137)("trim")},{trim:function(){return o(this)}})},function(t,e,n){var i=n(6),a=n(9),s=n(26);t.exports=function(t,e,n){var r,o;a(t);try{if(!(r=s(t,"return"))){if("throw"===e)throw n;return n}r=i(r,t)}catch(t){o=!0,r=t}if("throw"===e)throw n;if(o)throw r;return a(r),n}},function(t,e,n){var r=n(3),o=n(42),i=r("iterator"),a=Array.prototype;t.exports=function(t){return void 0!==t&&(o.Array===t||a[i]===t)}},function(t,e,n){var 
r=n(6),o=n(23),i=n(9),a=n(34),s=n(121),c=TypeError;t.exports=function(t,e){e=arguments.length<2?s(t):e;if(o(e))return i(r(e,t));throw new c(a(t)+" is not iterable")}},function(t,e,n){var o=n(3)("iterator"),i=!1;try{var r=0,a={next:function(){return{done:!!r++}},return:function(){i=!0}};a[o]=function(){return this},Array.from(a,function(){throw 2})}catch(t){}t.exports=function(t,e){try{if(!e&&!i)return!1}catch(t){return!1}var n=!1;try{var r={};r[o]=function(){return{next:function(){return{done:n=!0}}}},t(r)}catch(t){}return n}},function(t,e,n){var r=n(2),o=n(0),s=n(23),c=n(14),u=n(25),l=n(102),f=n(10),i=n(1),d=n(174),a=n(70),p=n(175),h=n(176),v=n(43),g=n(177),m=[],y=o(m.sort),b=o(m.push),n=i(function(){m.sort(void 0)}),o=i(function(){m.sort(null)}),a=a("sort"),w=!i(function(){if(v)return v<70;if(!(p&&3f(e)?1:-1})),n=u(o),a=0;a .m-nav__link"),o=document.querySelector(".m-nav__overlay"),i=document.getElementById("m-showMenu"),a=document.getElementsByTagName("html")[0];function s(){i.checked?a.classList.add("js-menu-open"):a.classList.remove("js-menu-open")}for(var c=0,u=r.length;cn.left&&on.top&&ie.top&&te.top+r-o&&(a+="s"),i>e.left&&ie.left+n-o&&(a+="e");for(var s=this.get("handles").split(","),c=0;cO oddělení

Nazev medailonku

Stručně popište (prodejte uchazeči), co medailonkem představujete.Stručně popište (prodejte uchazeči), co medailonkem představujete.Stručně popište (prodejte uchazeči), co medailonkem představujete.Stručně popište (prodejte uchazeči), co medailonkem představujete.Stručně popište (prodejte uchaze

',e.insertBefore(n,t.nextSibling))}var r=document.querySelector("#widget_container"),o=-1!==window.location.href.indexOf("useExampleData");o&&(r&&r.addEventListener("LMC_career_widget_pageRendered",function(t){var e;t.detail&&(t=t.detail.pageType,e=document.querySelector(".cp-detail__content"),"detail"===t)&&n(e)}),setTimeout(function(){var t=document.querySelector("#vacancy-detail .cp-detail__content");n(t)},1500))}]); \ No newline at end of file diff --git a/tests/jobs_jobscz/listing_page.html b/tests/jobs_jobscz/listing_page.html new file mode 100644 index 0000000..ee0afa1 --- /dev/null +++ b/tests/jobs_jobscz/listing_page.html @@ -0,0 +1,5185 @@ + + + + + + + + + + + + Nabídka práce Programátor – Jobs.cz + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + +
+
+ +
+
+ + + +
+ + + + + + + + + + + +
+ Přihlásit + +
+ + + + +
+ +
+ +
+
+ + +
+
+ + +
+
+
+ + + +
+ +
+
+
+ +
+ + + +
+

Našli jsme 608 nabídek +

+ +
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + +
+
Významní zaměstnavatelé v IT +
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + +
+
+

+ PHP programátor/ka + +

+
+ 11. února +
+ + + + +
+
+ + + + Možnost občasné práce z domova + + + +
+
+
    +
  • + + + ALENSA, s.r.o. +
  • +
  • + + + Praha – Libeň +
  • +
+
+
+ + + + + + +
+
+

+ Technology Lead + +

+ +
+ 11. února +
+ + + + +
+
+ + + + Možnost občasné práce z domova + + + +
+
+
    +
  • + + + Asseco Central Europe, a.s. +
  • +
  • + + + Brno – Veveří +
  • +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+ + + +
+
+

+ PROGRAMÁTOR - Java + +

+
+ 11. února +
+ + + + +
+
+ + + + 55 000 ‍–‍ 85 000 Kč + + + + Možnost občasné práce z domova + + + +
+
+
    +
  • + + + Syntea software group a.s. +
  • +
  • + + + Praha – Jinonice +
  • +
+
+
+ + + + + + +
+
+

+ Game UX Designer + +

+
+ 11. února +
+ + + + +
+
+ + + + Možnost občasné práce z domova + + + +
+
+
    +
  • + + + SCS Software s.r.o. +
  • +
  • + + + Praha – Michle +
  • +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

+ Pracovník technické podpory (PLC a MaR) + +

+
+ 11. února +
+ + + + +
+
+ + + + 40 000 ‍–‍ 50 000 Kč + + + Odpověď do 2 týdnů + + Možnost občasné práce z domova + + + +
+
+
    +
  • + + + Unipi Technology s.r.o. +
  • +
  • + + + Brno – Lesná +
  • +
+
+
+ + + + + + +
+
+

+ Vývojář ERP HELIOS + +

+
+ 11. února +
+ + + + +
+
+ + + Odpověď do 2 týdnů + + Možnost občasné práce z domova + + + Odpovězte teď a budete mezi prvními + +
+
+
    +
  • + + + Gatema IT a.s +
  • +
  • + + + Boskovice +
  • +
+
+
+ + + + + + + + + + + + + + + + + + + + +
+
+

+ Vývojář produktů + +

+
+ 11. února +
+ + + + +
+
+ + + + + + Odpovězte teď a budete mezi prvními + +
+
+
    +
  • + + + SCHURTER, společnost s ručením omezeným +
  • +
  • + + + Malá Skála +
  • +
+
+
+ + + + + + + + + + + +
+ + + + + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +

Životopis za 10 minut? +

+ +

Klik sem, klik tam a je to! Profi grafika, pdf ke stažení. A zdarma. To se pak práce hledá sama. +

+
+
+ +

Klik sem, klik tam a je to! Profi grafika, pdf ke stažení. A zdarma. To se pak práce hledá sama. +

+ + To chci zkusit + +
+
+ +
+ + + +
+
+ +
+
+

Vytvořením upozornění na nabídky poskytujete společnostem ze skupiny Alma Career, jako společným správcům, své osobní údaje za účelem usnadnění získání +nových pracovních nabídek na portálu Jobs.cz. +

+V souvislosti se zpracováním Vašich osobních údajů máte právo (i) na přístup k osobním údajům; (ii) na opravu nepřesných nebo +doplnění neúplných osobních údajů; (iii) na výmaz osobních údajů, nejsou-li již osobní údaje potřebné pro účely, +pro které byly shromážděny či jinak zpracovány, anebo zjistíte-li, že byly zpracovávány protiprávně; (iv) na +omezení zpracování osobních údajů ve zvláštních případech; (v) na přenositelnost údajů; (vi) vznést námitku, +po níž zpracování Vašich osobních údajů bude ukončeno, neprokáže-li se, že existují závažné oprávněné důvody pro +zpracování, jež převažují nad Vašimi zájmy nebo právy a svobodami zejména, je-li důvodem případné vymáhání právních +nároků; (vii) obrátit se na Úřad pro ochranu osobních údajů a (viii) odvolat souhlas, je-li zpracování založeno na souhlasu. +

+

+ Alma Career
+ Alma Career Czechia s.r.o., Menclova 2538/2, 180 00 Praha 8, Česko, IČO: 264 41 381
+Alma Career Slovakia s.r.o., Pribinova 19, 811 09 Bratislava, Slovensko, IČO: 35 800 861
+Alma Career Poland Sp. z o.o., Przeskok 2, 00-032 Varšava, Polsko, KRS 0000988078
+ +

+

Další informace o zpracování údajů jsou dostupné na +https://almacareer.com/gdpr. +

+
+
+
+ + + + +
+ + + + +
+ + + + + + + + + + + + + + + + + + diff --git a/tests/jobs_jobscz/listing_page_first.html b/tests/jobs_jobscz/listing_page_first.html new file mode 100644 index 0000000..340ad3a --- /dev/null +++ b/tests/jobs_jobscz/listing_page_first.html @@ -0,0 +1,4285 @@ + + + + + + + + + + + + Nabídka práce Kuchař – Jobs.cz + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + +
+
+ +
+
+ + + +
+ + + + + + + + + + + +
+ Přihlásit + +
+ + + + +
+ +
+ +
+
+ + +
+
+ + +
+
+
+ + + +
+ +
+
+
+ +
+ + + +
+

Našli jsme 58 nabídek +

+ +
+ + +
+ + + + + + + + +
+
+

+ Kuchař/ka + +

+
+ Aktualizováno dnes +
+ + + + +
+
+ + + + + + +
+
+
    +
  • + + + Hospic sv. Štěpána, z.s. +
  • +
  • + + + Litoměřice – Předměstí +
  • +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

+ kuchař/ka + +

+
+ Přidáno včera +
+ + + + +
+
+ + + + 26 000 ‍–‍ 31 090 Kč + + + Odpověď do 2 týdnů + + + + +
+
+
    +
  • + + + Dětský diagnostický ústav, základní škola a školní jídelna, Praha 4, U Michelského lesa 222 +
  • +
  • + + + Praha – Michle +
  • +
+
+
+ + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+ + + +
+
+

+ KUCHAŘ - PIZZAŘ + +

+
+ Přidáno včera +
+ + + + +
+
+ + + + 53 000 ‍–‍ 65 000 Kč + + + + + + Odpovězte teď a budete mezi prvními + +
+
+
    +
  • + + + MARINA RISTORANTE s.r.o. +
  • +
  • + + + Praha – Staré Město +
  • +
+
+
+ + + + + + +
+
+

+ Kuchař ala cart + +

+
+ 11. února +
+ + + + +
+
+ + + + 45 000 ‍–‍ 55 000 Kč + + + + + + +
+
+
    +
  • + + + CI International s.r.o. +
  • +
  • + + + Praha – Dejvice +
  • +
+
+
+ + + + + + + + + + + + + + + + + + + + +
+
+

+ Kuchař/ka + +

+
+ 11. února +
+ + + + +
+
+ + + + + + Odpovězte teď a budete mezi prvními + +
+
+
    +
  • + + + GMF AQUAPARK PRAGUE, a.s. +
  • +
  • + + + Čestlice +
  • +
+
+
+ + + + + + +
+
+

+ Kuchař + +

+
+ 11. února +
+ + + + +
+
+ + + + 50 000 Kč + + + + + + +
+
+
    +
  • + + + PRAGUE HOTELS, s.r.o. +
  • +
  • + + + Praha +
  • +
+
+
+ + + + + + + + + + + + + + + + + + + + + +
+
+

+ Kuchař/ka + +

+
+ 11. února +
+ + + + +
+
+ + + + 25 000 ‍–‍ 28 000 Kč + + + + + + Odpovězte teď a budete mezi prvními + +
+
+
    +
  • + + + Obecně prospěšná společnost Důstojnost +
  • +
  • + + + Nýdek +
  • +
+
+
+ + + + + + +
+
+

+ Kuchař + +

+
+ 10. února +
+ + + + +
+
+ + + + + + +
+
+
    +
  • + + + Štěpánka Dušánková +
  • +
  • + + + Praha – Kunratice +
  • +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

+ Kuchař + +

+
+ 10. února +
+ + + + +
+
+ + + + 50 000 ‍–‍ 65 000 Kč + + + + + + +
+
+
    +
  • + + + CZECH INN HOTELS s.r.o. +
  • +
  • + + + Brno – Ponava +
  • +
+
+
+ + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +

Životopis za 10 minut? +

+ +

Klik sem, klik tam a je to! Profi grafika, pdf ke stažení. A zdarma. To se pak práce hledá sama. +

+
+
+ +

Klik sem, klik tam a je to! Profi grafika, pdf ke stažení. A zdarma. To se pak práce hledá sama. +

+ + To chci zkusit + +
+
+ +
+ + + +
+
+ +
+
+

Vytvořením upozornění na nabídky poskytujete společnostem ze skupiny Alma Career, jako společným správcům, své osobní údaje za účelem usnadnění získání +nových pracovních nabídek na portálu Jobs.cz. +

+V souvislosti se zpracováním Vašich osobních údajů máte právo (i) na přístup k osobním údajům; (ii) na opravu nepřesných nebo +doplnění neúplných osobních údajů; (iii) na výmaz osobních údajů, nejsou-li již osobní údaje potřebné pro účely, +pro které byly shromážděny či jinak zpracovány, anebo zjistíte-li, že byly zpracovávány protiprávně; (iv) na +omezení zpracování osobních údajů ve zvláštních případech; (v) na přenositelnost údajů; (vi) vznést námitku, +po níž zpracování Vašich osobních údajů bude ukončeno, neprokáže-li se, že existují závažné oprávněné důvody pro +zpracování, jež převažují nad Vašimi zájmy nebo právy a svobodami zejména, je-li důvodem případné vymáhání právních +nároků; (vii) obrátit se na Úřad pro ochranu osobních údajů a (viii) odvolat souhlas, je-li zpracování založeno na souhlasu. +

+

+ Alma Career
+ Alma Career Czechia s.r.o., Menclova 2538/2, 180 00 Praha 8, Česko, IČO: 264 41 381
+Alma Career Slovakia s.r.o., Pribinova 19, 811 09 Bratislava, Slovensko, IČO: 35 800 861
+Alma Career Poland Sp. z o.o., Przeskok 2, 00-032 Varšava, Polsko, KRS 0000988078
+ +

+

Další informace o zpracování údajů jsou dostupné na +https://almacareer.com/gdpr. +

+
+
+
+ + + + +
+ + + + +
+ + + + + + + + + + + + + + + + + + diff --git a/tests/jobs_jobscz/listing_page_last.html b/tests/jobs_jobscz/listing_page_last.html new file mode 100644 index 0000000..1e8f408 --- /dev/null +++ b/tests/jobs_jobscz/listing_page_last.html @@ -0,0 +1,4370 @@ + + + + + + + + + + + + Nabídka práce Kuchař – Jobs.cz + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + +
+
+ +
+
+ + + +
+ + + + + + + + + + + +
+ Přihlásit + +
+ + + + +
+ +
+ +
+
+ + +
+
+ + +
+
+
+ + + +
+ +
+
+
+ +
+ + + +
+

Našli jsme 58 nabídek +

+ +
+ + +
+ + + + + + + + +
+
+

+ Kuchař v Harvey Spa Hotelu + +

+
+ 10. února +
+ + + + +
+
+ + + + 36 000 ‍–‍ 40 000 Kč + + + + + + +
+
+
    +
  • + + + Needmore, s.r.o. +
  • +
  • + + + Františkovy Lázně +
  • +
+
+
+ + + + + + +
+
+

+ Kuchař / MyChef Kitchen + +

+
+ 9. února +
+ + + + +
+
+ + + + 60 000 ‍–‍ 65 000 Kč + + + + + + +
+
+
    +
  • + + + Kristína Tayara +
  • +
  • + + + Praha – Kobylisy +
  • +
+
+
+ + + + + + + + + + + + + +
+
+

+ Kuchař/ka + +

+
+ 8. února +
+ + + + +
+
+ + + + 29 400 Kč + + + + + + +
+
+
    +
  • + + + Rehabilitační centrum Čeladná s.r.o. +
  • +
  • + + + Čeladná +
  • +
+
+
+ + + + + + +
+
+

+ Kuchař/Kuchařka + +

+
+ 7. února +
+ + + + +
+
+ + + + + + Odpovězte teď a budete mezi prvními + +
+
+
    +
  • + + + GURDAU VINAŘSTVÍ S.R.O. +
  • +
  • + + + Kurdějov +
  • +
+
+
+ + + + + + + + + + + +
+ + + + + + + + + +
+
+

+ Kuchař / kuchařka + +

+
+ Končí za 3 dny +
+ + + + +
+
+ + + + + + +
+
+
    +
  • + + + RACEK SRBY s.r.o. +
  • +
  • + + + Tuchlovice – Srby +
  • +
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+ + + + + + + + + + +
+
+

+ kuchař/ka + +

+
+ Končí zítra +
+ + + + +
+
+ + + + + + +
+
+
    +
  • + + + Jan Barták +
  • +
  • + + + Francie +
  • +
+
+
+ + + + + + +
+
+

+ Kuchař/Kuchařka + +

+
+ Končí zítra +
+ + + + +
+
+ + + + 22 580 ‍–‍ 32 760 Kč + + + + + + +
+
+
    +
  • + + + Centrum sociálních služeb Hrabyně +
  • +
  • + + + Hrabyně +
  • +
+
+
+ + + + + + +
+
+

+ Samostatná kuchařka/samostatný kuchař + +

+
+ Končí za 8 hodin +
+ + + + +
+
+ + + + 36 000 ‍–‍ 37 000 Kč + + + Odpověď do 2 týdnů + + + + Odpovězte teď a budete mezi prvními + +
+
+
    +
  • + + + Univerzita Tomáše Bati ve Zlíně +
  • +
  • + + + Zlín +
  • +
+
+
+ + + + + + + + + + + + + + + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +

Životopis za 10 minut? +

+ +

Klik sem, klik tam a je to! Profi grafika, pdf ke stažení. A zdarma. To se pak práce hledá sama. +

+
+
+ +

Klik sem, klik tam a je to! Profi grafika, pdf ke stažení. A zdarma. To se pak práce hledá sama. +

+ + To chci zkusit + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

+ Kuchař/ka + +

+ +
+ Končí za 8 hodin +
+ + + + +
+
+ + + Odpověď do 2 týdnů + + + + Odpovězte teď a budete mezi prvními + +
+
+
    +
  • + + + PŘEDVÝBĚR.CZ s.r.o. +
  • +
  • + + + Špindlerův Mlýn +
  • +
+
+
+ + + + + + + + + + + + + +
+
+

+ Chef de Partie + +

+ +
+ Končí za 8 hodin +
+ + + + +
+
+ + + Odpověď do 2 týdnů + + + + +
+
+
    +
  • + + + PŘEDVÝBĚR.CZ s.r.o. +
  • +
  • + + + Harrachov +
  • +
+
+
+ + + + +
+ + + + +

Toto jsou všechny nabídky, které odpovídají Vašemu zadání. +

+ + + +
+ + + +
+
+ +
+
+

Vytvořením upozornění na nabídky poskytujete společnostem ze skupiny Alma Career, jako společným správcům, své osobní údaje za účelem usnadnění získání +nových pracovních nabídek na portálu Jobs.cz. +

+V souvislosti se zpracováním Vašich osobních údajů máte právo (i) na přístup k osobním údajům; (ii) na opravu nepřesných nebo +doplnění neúplných osobních údajů; (iii) na výmaz osobních údajů, nejsou-li již osobní údaje potřebné pro účely, +pro které byly shromážděny či jinak zpracovány, anebo zjistíte-li, že byly zpracovávány protiprávně; (iv) na +omezení zpracování osobních údajů ve zvláštních případech; (v) na přenositelnost údajů; (vi) vznést námitku, +po níž zpracování Vašich osobních údajů bude ukončeno, neprokáže-li se, že existují závažné oprávněné důvody pro +zpracování, jež převažují nad Vašimi zájmy nebo právy a svobodami zejména, je-li důvodem případné vymáhání právních +nároků; (vii) obrátit se na Úřad pro ochranu osobních údajů a (viii) odvolat souhlas, je-li zpracování založeno na souhlasu. +

+

+ Alma Career
+ Alma Career Czechia s.r.o., Menclova 2538/2, 180 00 Praha 8, Česko, IČO: 264 41 381
+Alma Career Slovakia s.r.o., Pribinova 19, 811 09 Bratislava, Slovensko, IČO: 35 800 861
+Alma Career Poland Sp. z o.o., Przeskok 2, 00-032 Varšava, Polsko, KRS 0000988078
+ +

+

Další informace o zpracování údajů jsou dostupné na +https://almacareer.com/gdpr. +

+
+
+
+ + + + +
+ + + + +
+ + + + +
+ + + + + + + + + + + + + + + + + +
diff --git a/tests/jobs_jobscz/test_spider.py b/tests/jobs_jobscz/test_spider.py
index 0b0bb13..65c2bfa 100644
--- a/tests/jobs_jobscz/test_spider.py
+++ b/tests/jobs_jobscz/test_spider.py
@@ -5,7 +5,8 @@ from typing import cast
 
 import pytest
-from scrapy.http import HtmlResponse, TextResponse
+from scrapy.http.response.html import HtmlResponse
+from scrapy.http.response.text import TextResponse
 
 from jg.plucker.items import Job
 from jg.plucker.jobs_jobscz.spider import Spider, select_widget
@@ -21,7 +22,7 @@ def test_spider_parse():
     )
     requests = list(Spider().parse(response))
 
-    assert len(requests) == 30 + 4  # jobs + pagination (without page=1)
+    assert len(requests) == 30 + 1  # jobs + next page
     assert (
         requests[1].url
@@ -53,12 +54,50 @@
     }
 
     assert (
-        requests[30].url
+        requests[-1].url
         == "https://beta.www.jobs.cz/prace/programator/?profession%5B0%5D=201100249&page=2"
     )
+
+
+def test_spider_parse_listing_page():
+    url = "https://www.jobs.cz/prace/programator/?profession[0]=201100249&page=5"
+    response = HtmlResponse(
+        url, body=Path(FIXTURES_DIR / "listing_page.html").read_bytes()
+    )
+    requests = list(Spider().parse(response))
+
+    assert len(requests) == 30 + 1  # jobs + next page
+    assert (
+        requests[-1].url
+        == "https://www.jobs.cz/prace/programator/?profession%5B0%5D=201100249&page=6"
+    )
+
+
+def test_spider_parse_listing_page_first():
+    url = "https://www.jobs.cz/prace/kuchar/"
+    response = HtmlResponse(
+        url, body=Path(FIXTURES_DIR / "listing_page_first.html").read_bytes()
+    )
+    requests = list(Spider().parse(response))
+
+    assert len(requests) == 30 + 1  # jobs + next page
+    assert (
+        requests[-1].url
+        == "https://www.jobs.cz/prace/kuchar/?profession%5B0%5D=201100136&page=2"
+    )
+
+
+def test_spider_parse_listing_page_last():
+    url = "https://www.jobs.cz/prace/kuchar/?page=2"
+    response = HtmlResponse(
+        url, body=Path(FIXTURES_DIR / "listing_page_last.html").read_bytes()
+    )
+    requests = list(Spider().parse(response))
+
+    assert len(requests) == 28 + 0  # jobs only, the last page yields no next-page request
     assert (
         requests[-1].url
-        == "https://beta.www.jobs.cz/prace/programator/?profession%5B0%5D=201100249&page=5"
+        == "https://www.jobs.cz/rpd/2000404882/?searchId=6cbd9a5a-f1fc-4944-a64f-4445894fdff0&rps=233"
     )
diff --git a/tests/test_cache.py b/tests/test_cache.py
new file mode 100644
index 0000000..8f15980
--- /dev/null
+++ b/tests/test_cache.py
@@ -0,0 +1,36 @@
+from time import time
+
+from jg.plucker.cache import from_gzip, read_gzip_time, to_gzip
+
+
+FIXTURE_BYTES = (
+    b"\x1f\x8b\x08\x00\x00\x00\x00\x00\x02\xffk`\x99*\xcc\x00\x01\xb5SzX\xf2\x12s"
+    b"S\xa7\xf4\xb0:\xe6d&\xa7N)\xd6\x03\x00\x1c\xe8U\x9c\x1e\x00\x00\x00"
+)
+
+
+def test_gzip():
+    assert from_gzip(to_gzip({"name": "Alice"})) == {"name": "Alice"}
+
+
+def test_to_gzip():
+    data_bytes = to_gzip({"name": "Alice"}, mtime=0)
+
+    assert data_bytes == FIXTURE_BYTES
+
+
+def test_from_gzip():
+    data_dict = from_gzip(FIXTURE_BYTES)
+
+    assert data_dict == {"name": "Alice"}
+
+
+def test_read_gzip_time():
+    assert read_gzip_time(FIXTURE_BYTES) == 0
+
+
+def test_read_gzip_time_non_zero():
+    current_time = int(time())
+    data_bytes = to_gzip({"name": "Alice"}, mtime=current_time)
+
+    assert read_gzip_time(data_bytes) == current_time
diff --git a/tests/test_loggers.py b/tests/test_loggers.py
deleted file mode 100644
index 23ccd2c..0000000
--- a/tests/test_loggers.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import pytest
-from scrapy.settings import Settings
-
-from jg.plucker.loggers import get_logging_level
-
-
-@pytest.mark.parametrize(
-    "settings, argv, expected",
-    [
-        pytest.param({}, [], "DEBUG", id="default from Scrapy"),
-        ({"LOG_LEVEL": "DEBUG"}, [], "DEBUG"),
-        ({"LOG_LEVEL": "DEBUG"}, ["--debug"], "DEBUG"),
-        ({"LOG_LEVEL": "DEBUG"}, ["-d"], "DEBUG"),
-        ({"LOG_LEVEL": "INFO"}, [], "INFO"),
-        ({"LOG_LEVEL": "INFO"}, ["--debug"], "DEBUG"),
-        ({"LOG_LEVEL": "INFO"}, ["-d"], "DEBUG"),
-        ({"LOG_LEVEL": "WARNING"}, [], "WARNING"),
-    ],
-)
-def test_get_logging_level(settings: dict, argv: list[str], expected: str):
-    assert get_logging_level(Settings(settings), argv) == expected
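
A note on the `tests/test_cache.py` fixture above: `FIXTURE_BYTES` can only be pinned because `to_gzip` is called with `mtime=0`. Gzip stores the modification time in its fixed 10-byte header, so zeroing it makes compression of the same payload byte-for-byte reproducible. Below is a minimal sketch of how that timestamp can be read back, assuming nothing beyond the standard RFC 1952 header layout; the `read_gzip_time` here is an illustrative stand-in, not the source of the helper imported from `jg.plucker.cache`:

```python
import gzip
import io
import struct


def read_gzip_time(gzip_bytes: bytes) -> int:
    # Fixed gzip header (RFC 1952), 10 bytes:
    # magic (2) | method (1) | flags (1) | MTIME (4, little-endian) | xfl (1) | os (1)
    magic, method, _flags, mtime, _xfl, _os = struct.unpack("<HBBIBB", gzip_bytes[:10])
    assert magic == 0x8B1F and method == 8, "not a gzip/deflate stream"
    return mtime


# With mtime=0 the four MTIME bytes are zero, which is why the same payload
# always compresses to identical bytes -- and why FIXTURE_BYTES can be pinned.
with io.BytesIO() as buffer:
    with gzip.GzipFile(fileobj=buffer, mode="wb", mtime=0) as gzip_file:
        gzip_file.write(b"payload")
    assert read_gzip_time(buffer.getvalue()) == 0
```

Reading the age of a cached entry from the header this way is cheap: it avoids decompressing and unpickling the stored value just to learn when it was written.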