From 3af2e421d0c2326191675c140386e793226ee99b Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Wed, 9 Oct 2024 19:20:32 +0000 Subject: [PATCH 1/9] stress test --- stress_test.py | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/stress_test.py b/stress_test.py index c4017e3..10803e7 100644 --- a/stress_test.py +++ b/stress_test.py @@ -4,10 +4,39 @@ import tempfile import pstats import cProfile -from io import StringIO +import io from il_supermarket_scarper.scrappers_factory import ScraperFactory +def format_stats_as_json(pr, project_name): + stream = io.StringIO() + ps = pstats.Stats(pr, stream=stream) + ps.sort_stats(pstats.SortKey.CUMULATIVE) # Sort by cumulative time + ps.print_stats() + + # Convert the printed stats to a list of lines + stats_output = stream.getvalue().splitlines() + + # Filter the lines to include only functions within the project + project_stats = [] + for line in stats_output: + if project_name in line: # Filter for project-specific lines + # Extract relevant fields from the profiling output + # The typical format is (Function location, Number of calls, Total time, Cumulative time, etc.) + parts = line.split() + if len(parts) >= 5: # Basic sanity check for the parts + function_data = { + "function": parts[-1], # Function path + "ncalls": parts[0], # Number of calls + "tottime": parts[1], + "tottime_per_call": parts[2],# Time spent in function + "cumtime": parts[3], # Cumulative time including subcalls + "cumtime_per_call": parts[4] # + } + project_stats.append(function_data) + + return project_stats + if __name__ == "__main__": result = {} @@ -28,14 +57,9 @@ def full_execution(scraper): pr.disable() - stream = StringIO() - ps = pstats.Stats(pr, stream=stream) - ps.print_stats() - stream.seek(0) - end_time = time.time() result[scraper_name] = { - "status": stream.read(), + "status": format_stats_as_json(pr, "israeli-supermarket-scarpers"), "execution_time": execution_time, "start_time": start_time, "end_time": end_time, From 1185f32a629174d77c8611384ef8735c7d0ab872 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Thu, 10 Oct 2024 05:12:15 +0000 Subject: [PATCH 2/9] . --- il_supermarket_scarper/engines/cerberus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/il_supermarket_scarper/engines/cerberus.py b/il_supermarket_scarper/engines/cerberus.py index 302de4b..febdca6 100644 --- a/il_supermarket_scarper/engines/cerberus.py +++ b/il_supermarket_scarper/engines/cerberus.py @@ -73,6 +73,7 @@ def scrape( return results except Exception as e: # pylint: disable=broad-except self.on_download_fail(e, files=files) + Logger.error_execption(e) return [] def collect_files_details_from_site( From 12cd26085660b93c7200fb099cdf333258600e30 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Thu, 10 Oct 2024 07:42:59 +0000 Subject: [PATCH 3/9] . 
--- il_supermarket_scarper/engines/cerberus.py | 10 ++--- il_supermarket_scarper/engines/engine.py | 45 +++++++++++++------ .../engines/multipage_web.py | 7 ++- .../engines/publishprice.py | 4 +- il_supermarket_scarper/engines/web.py | 13 +++--- il_supermarket_scarper/main.py | 6 +-- il_supermarket_scarper/scrapper_runner.py | 8 ++-- il_supermarket_scarper/scrappers/bareket.py | 2 +- .../scrappers/meshnat_yosef.py | 2 +- .../scrappers/nativ_hashed.py | 2 +- il_supermarket_scarper/scrappers/polizer.py | 2 +- .../scrappers/tests/test_cases.py | 18 ++++---- il_supermarket_scarper/scrappers/tivtaam.py | 2 +- il_supermarket_scarper/utils/gzip_utils.py | 2 +- stress_test.py | 21 ++++----- 15 files changed, 80 insertions(+), 64 deletions(-) diff --git a/il_supermarket_scarper/engines/cerberus.py b/il_supermarket_scarper/engines/cerberus.py index febdca6..9f447d1 100644 --- a/il_supermarket_scarper/engines/cerberus.py +++ b/il_supermarket_scarper/engines/cerberus.py @@ -41,7 +41,7 @@ def scrape( limit=None, files_types=None, store_id=None, - only_latest=False, + when_date=None, files_names_to_scrape=None, filter_null=False, filter_zero=False, @@ -52,7 +52,7 @@ def scrape( limit=limit, files_types=files_types, store_id=store_id, - only_latest=only_latest, + when_date=when_date, ) files = self.collect_files_details_from_site( limit=limit, @@ -60,7 +60,7 @@ def scrape( filter_null=filter_null, filter_zero=filter_zero, store_id=store_id, - only_latest=only_latest, + when_date=when_date, files_names_to_scrape=files_names_to_scrape, ) self.on_collected_details(files) @@ -83,7 +83,7 @@ def collect_files_details_from_site( filter_null=False, filter_zero=False, store_id=None, - only_latest=False, + when_date=None, files_names_to_scrape=None, ): """collect all files to download from the site""" @@ -118,7 +118,7 @@ def collect_files_details_from_site( limit=limit, files_types=files_types, store_id=store_id, - only_latest=only_latest, + when_date=when_date, files_names_to_scrape=files_names_to_scrape, ) Logger.info(f"After applying limit: Found {len(files)} files") diff --git a/il_supermarket_scarper/engines/engine.py b/il_supermarket_scarper/engines/engine.py index b36ecf8..a1ba629 100644 --- a/il_supermarket_scarper/engines/engine.py +++ b/il_supermarket_scarper/engines/engine.py @@ -1,7 +1,7 @@ from abc import ABC import os import re - +import datetime from il_supermarket_scarper.utils import ( get_output_folder, @@ -38,16 +38,16 @@ def get_storage_path(self): return self.storage_path def _is_validate_scraper_found_no_files( - self, limit=None, files_types=None, store_id=None, only_latest=False + self, limit=None, files_types=None, store_id=None, when_date=None ): Logger.info( f"check if fail is allowd with, limit={limit}," - f"files_types={files_types},store_id={store_id},only_latest={only_latest}" + f"files_types={files_types},store_id={store_id},when_date={when_date}" ) return False def is_validate_scraper_found_no_files( - self, limit=None, files_types=None, store_id=None, only_latest=False + self, limit=None, files_types=None, store_id=None, when_date=None ): """return true if its ok the scarper reuturn no enrty""" @@ -58,7 +58,7 @@ def is_validate_scraper_found_no_files( for file_type in files_types: if file_type in FileTypesFilters.all_full_files(): request_only_update_file = False - Logger.info(f"the value of {only_latest} should not affect.") + Logger.info(f"the value of {when_date} should not affect.") return ( limit == 0 or files_types == [] @@ -68,7 +68,7 @@ def 
is_validate_scraper_found_no_files( limit=limit, files_types=files_types, store_id=store_id, - only_latest=only_latest, + when_date=when_date, ) ) @@ -83,13 +83,13 @@ def apply_limit( files_types=None, by_function=lambda x: x, store_id=None, - only_latest=False, + when_date=None, files_names_to_scrape=None, ): """filter the list according to condition""" assert ( - not only_latest or limit is None - ), "only_latest flag can't be applied with limit." + when_date is not None or limit is None + ), "when_date flag can't be applied with limit." # filter files already downloaded intreable_ = self.filter_already_downloaded( @@ -124,8 +124,12 @@ def apply_limit( ) Logger.info(f"Number of entry after filter file type id is {len(intreable_)}") - if only_latest: + if isinstance(when_date, datetime.datetime): + intreable_ = self.get_by_date(when_date, by_function, intreable_) + elif isinstance(when_date, str) and when_date == "latest": intreable_ = self.get_only_latest(by_function, intreable_) + else: + raise ValueError(f"when_date should be datetime or bool, got {when_date}") Logger.info(f"Number of entry after filter keeping latast is {len(intreable_)}") @@ -144,12 +148,12 @@ def apply_limit( limit=limit, files_types=files_types, store_id=store_id, - only_latest=only_latest, + when_date=when_date, ) ): raise ValueError( f"No files to download for file files_types={files_types}," - f"limit={limit},store_id={store_id},only_latest={only_latest}" + f"limit={limit},store_id={store_id},when_date={when_date}" ) return intreable_ @@ -182,6 +186,19 @@ def get_only_latest(self, by_function, intreable_): groups_value[store_info] = file return list(groups_value.values()) + def get_by_date(self, requested_date, by_function, intreable_): + """get by date""" + + groups_value = [] + for file in intreable_: + name_split = by_function(file).split("-") + date_info = "-".join(name_split[2:]).rsplit(".", maxsplit=1)[-1] + + if date_info == requested_date: + groups_value.append(file) + + return groups_value + @classmethod def unique(cls, iterable, by_function=lambda x: x): """Returns the type of the file.""" @@ -210,7 +227,7 @@ def scrape( limit=None, files_types=None, store_id=None, - only_latest=False, + when_date=None, files_names_to_scrape=None, filter_null=False, filter_zero=False, @@ -222,7 +239,7 @@ def scrape( files_types=files_types, store_id=store_id, files_names_to_scrape=files_names_to_scrape, - only_latest=only_latest, + when_date=when_date, filter_null=filter_null, filter_zero=filter_zero, ) diff --git a/il_supermarket_scarper/engines/multipage_web.py b/il_supermarket_scarper/engines/multipage_web.py index 9b6742b..8145b81 100644 --- a/il_supermarket_scarper/engines/multipage_web.py +++ b/il_supermarket_scarper/engines/multipage_web.py @@ -77,7 +77,6 @@ def collect_files_details_from_site( files_types=None, store_id=None, when_date=None, - only_latest=False, files_names_to_scrape=None, ): self.post_scraping() @@ -109,7 +108,7 @@ def collect_files_details_from_site( limit=limit, files_types=files_types, store_id=store_id, - only_latest=only_latest, + when_date=when_date, files_names_to_scrape=files_names_to_scrape, ) @@ -125,7 +124,7 @@ def collect_files_details_from_page(self, html): return links, filenames def process_links_before_download( - self, page, limit=None, files_types=None, store_id=None, only_latest=None + self, page, limit=None, files_types=None, store_id=None, when_date=None ): """additional processing to the links before download""" response = self.session_with_cookies_by_chain(page) @@ 
-141,7 +140,7 @@ def process_links_before_download( limit=limit, files_types=files_types, store_id=store_id, - only_latest=only_latest, + when_date=when_date, ) Logger.info( diff --git a/il_supermarket_scarper/engines/publishprice.py b/il_supermarket_scarper/engines/publishprice.py index c8acd80..d87e7ab 100644 --- a/il_supermarket_scarper/engines/publishprice.py +++ b/il_supermarket_scarper/engines/publishprice.py @@ -88,14 +88,14 @@ def get_name_from_herf(x): return download_urls, file_names def _is_validate_scraper_found_no_files( - self, limit=None, files_types=None, store_id=None, only_latest=False + self, limit=None, files_types=None, store_id=None, when_date=None ): return ( super()._is_validate_scraper_found_no_files( # what fails the rest limit=limit, files_types=files_types, store_id=store_id, - only_latest=only_latest, + when_date=when_date, ) or ( # if we are looking for one store file in a weekend or holiday store_id and (_is_weekend_in_israel() or _is_holiday_in_israel()) diff --git a/il_supermarket_scarper/engines/web.py b/il_supermarket_scarper/engines/web.py index c232b56..5043ca5 100644 --- a/il_supermarket_scarper/engines/web.py +++ b/il_supermarket_scarper/engines/web.py @@ -45,7 +45,7 @@ def apply_limit_zip( files_types=None, by_function=lambda x: x[0], store_id=None, - only_latest=False, + when_date=None, files_names_to_scrape=None, ): """apply limit to zip""" @@ -55,7 +55,7 @@ def apply_limit_zip( files_types=files_types, by_function=by_function, store_id=store_id, - only_latest=only_latest, + when_date=when_date, files_names_to_scrape=files_names_to_scrape, ) if len(ziped) == 0: @@ -69,7 +69,6 @@ def collect_files_details_from_site( files_types=None, store_id=None, when_date=None, - only_latest=False, files_names_to_scrape=None, ): """collect all enteris to download from site""" @@ -95,7 +94,7 @@ def collect_files_details_from_site( limit=limit, files_types=files_types, store_id=store_id, - only_latest=only_latest, + when_date=when_date, files_names_to_scrape=files_names_to_scrape, ) @@ -109,7 +108,7 @@ def scrape( limit=None, files_types=None, store_id=None, - only_latest=False, + when_date=None, files_names_to_scrape=None, filter_null=False, filter_zero=False, @@ -121,7 +120,7 @@ def scrape( limit, files_types=files_types, store_id=store_id, - only_latest=only_latest, + when_date=when_date, filter_null=filter_null, filter_zero=filter_zero, ) @@ -130,7 +129,7 @@ def scrape( limit=limit, files_types=files_types, store_id=store_id, - only_latest=only_latest, + when_date=when_date, files_names_to_scrape=files_names_to_scrape, ) diff --git a/il_supermarket_scarper/main.py b/il_supermarket_scarper/main.py index b688941..3136359 100644 --- a/il_supermarket_scarper/main.py +++ b/il_supermarket_scarper/main.py @@ -10,7 +10,7 @@ def __init__( size_estimation_mode=False, enabled_scrapers=None, limit=None, - only_latest=False, + when_date=None, files_types=FileTypesFilters.all_types(), dump_folder_name=None, lookup_in_db=True, @@ -27,7 +27,7 @@ def __init__( self.dump_folder_name = dump_folder_name self.limit = limit self.files_types = files_types - self.only_latest = only_latest + self.when_date = when_date def get_dump_folder_name(self): """get the dump folder name""" @@ -36,5 +36,5 @@ def get_dump_folder_name(self): def start(self): """run the scraping""" return self.runner.run( - limit=self.limit, files_types=self.files_types, only_latest=self.only_latest + limit=self.limit, files_types=self.files_types, when_date=self.when_date ) diff --git 
a/il_supermarket_scarper/scrapper_runner.py b/il_supermarket_scarper/scrapper_runner.py index 3e6978b..141759c 100644 --- a/il_supermarket_scarper/scrapper_runner.py +++ b/il_supermarket_scarper/scrapper_runner.py @@ -38,7 +38,7 @@ def __init__( self.multiprocessing = multiprocessing self.lookup_in_db = lookup_in_db - def run(self, limit=None, files_types=None, only_latest=False): + def run(self, limit=None, files_types=None, when_date=False): """run the scraper""" Logger.info(f"Limit is {limit}") Logger.info(f"files_types is {files_types}") @@ -54,7 +54,7 @@ def run(self, limit=None, files_types=None, only_latest=False): { "limit": limit, "files_types": files_types, - "only_latest": only_latest, + "when_date": when_date, }, ), self.enabled_scrapers, @@ -77,7 +77,7 @@ def scrape_one( limit=None, files_types=None, store_id=None, - only_latest=False, + when_date=None, ): """scrape one""" chain_scrapper_constractor = ScraperFactory.get(chain_scrapper_class) @@ -94,7 +94,7 @@ def scrape_one( limit=limit, files_types=files_types, store_id=store_id, - only_latest=only_latest, + when_date=when_date, files_names_to_scrape=None, filter_null=False, filter_zero=False, diff --git a/il_supermarket_scarper/scrappers/bareket.py b/il_supermarket_scarper/scrappers/bareket.py index 2f2bcd9..88a7456 100644 --- a/il_supermarket_scarper/scrappers/bareket.py +++ b/il_supermarket_scarper/scrappers/bareket.py @@ -16,7 +16,7 @@ def __init__(self, folder_name=None): ) def _is_validate_scraper_found_no_files( - self, limit=None, files_types=None, store_id=None, only_latest=False + self, limit=None, files_types=None, store_id=None, when_date=None ): # no data on shabat if you test a single store file. return _is_saturday_in_israel() or _is_holiday_in_israel() and store_id diff --git a/il_supermarket_scarper/scrappers/meshnat_yosef.py b/il_supermarket_scarper/scrappers/meshnat_yosef.py index 91a1c9a..b82d942 100644 --- a/il_supermarket_scarper/scrappers/meshnat_yosef.py +++ b/il_supermarket_scarper/scrappers/meshnat_yosef.py @@ -34,7 +34,7 @@ def extract_task_from_entry(self, all_trs): return download_urls, file_names def _is_validate_scraper_found_no_files( - self, limit=None, files_types=None, store_id=None, only_latest=False + self, limit=None, files_types=None, store_id=None, when_date=None ): # no data on shabat return _is_saturday_in_israel() or _is_holiday_in_israel() diff --git a/il_supermarket_scarper/scrappers/nativ_hashed.py b/il_supermarket_scarper/scrappers/nativ_hashed.py index 81c9773..5a7f6fa 100644 --- a/il_supermarket_scarper/scrappers/nativ_hashed.py +++ b/il_supermarket_scarper/scrappers/nativ_hashed.py @@ -19,7 +19,7 @@ def __init__(self, folder_name=None): ) def _is_validate_scraper_found_no_files( - self, limit=None, files_types=None, store_id=None, only_latest=False + self, limit=None, files_types=None, store_id=None, when_date=None ): # no data on shabat return _is_saturday_in_israel() or _is_holiday_in_israel() diff --git a/il_supermarket_scarper/scrappers/polizer.py b/il_supermarket_scarper/scrappers/polizer.py index f20d171..6f8819c 100644 --- a/il_supermarket_scarper/scrappers/polizer.py +++ b/il_supermarket_scarper/scrappers/polizer.py @@ -14,7 +14,7 @@ def __init__(self, folder_name=None): ) def _is_validate_scraper_found_no_files( - self, limit=None, files_types=None, store_id=None, only_latest=False + self, limit=None, files_types=None, store_id=None, when_date=None ): # no data on shabat return ( diff --git a/il_supermarket_scarper/scrappers/tests/test_cases.py 
b/il_supermarket_scarper/scrappers/tests/test_cases.py index 79064ce..77d0e35 100644 --- a/il_supermarket_scarper/scrappers/tests/test_cases.py +++ b/il_supermarket_scarper/scrappers/tests/test_cases.py @@ -42,7 +42,7 @@ def _make_sure_filter_work( file_type=None, limit=None, store_id=None, - only_latest=False, + when_date=None, ): """make sure the file type filter works""" if file_type: @@ -55,7 +55,7 @@ def _make_sure_filter_work( for file in files_found: store_mark.append(int(file.split("-")[1])) assert len(set(store_mark)) == 1 and len(store_mark) == len(files_found) - if only_latest: + if when_date: files_sources = [] for file in files_found: source = file.split("-")[:2] @@ -107,7 +107,7 @@ def _clean_scarpe_delete( store_id=None, limit=None, file_type=None, - only_latest=False, + when_date=None, ): with tempfile.TemporaryDirectory() as tmpdirname: self.__clean_scarpe_delete( @@ -116,7 +116,7 @@ def _clean_scarpe_delete( store_id=store_id, limit=limit, file_type=file_type, - only_latest=only_latest, + when_date=when_date, ) def __clean_scarpe_delete( @@ -126,7 +126,7 @@ def __clean_scarpe_delete( store_id=None, limit=None, file_type=None, - only_latest=False, + when_date=None, ): self._delete_download_folder(dump_path) os.makedirs(dump_path) @@ -142,7 +142,7 @@ def __clean_scarpe_delete( "limit": limit, "files_types": file_type, "store_id": store_id, - "only_latest": only_latest, + "when_date": when_date, "filter_null": True, "filter_zero": True, } @@ -164,14 +164,14 @@ def __clean_scarpe_delete( limit=limit, files_types=file_type, store_id=store_id, - only_latest=only_latest, + when_date=when_date, ) and not hasattr(scraper, "_is_flaky"): self._make_sure_filter_work( files_found, file_type=file_type, limit=limit, store_id=store_id, - only_latest=only_latest, + when_date=when_date, ) for file in files_found: @@ -255,7 +255,7 @@ def test_scrape_file_from_single_store_last(self): self._clean_scarpe_delete( scraper_enum, store_id=store_id, - only_latest=True, + when_date="lastast", ) return TestScapers diff --git a/il_supermarket_scarper/scrappers/tivtaam.py b/il_supermarket_scarper/scrappers/tivtaam.py index 3fbf6ba..f6b174f 100644 --- a/il_supermarket_scarper/scrappers/tivtaam.py +++ b/il_supermarket_scarper/scrappers/tivtaam.py @@ -19,7 +19,7 @@ def __init__(self, folder_name=None): ) def is_validate_scraper_found_no_files( - self, limit=None, files_types=None, store_id=None, only_latest=False + self, limit=None, files_types=None, store_id=None, when_date=None ): return ( _is_saturday_in_israel() diff --git a/il_supermarket_scarper/utils/gzip_utils.py b/il_supermarket_scarper/utils/gzip_utils.py index 2f8ad52..20aeffe 100644 --- a/il_supermarket_scarper/utils/gzip_utils.py +++ b/il_supermarket_scarper/utils/gzip_utils.py @@ -22,7 +22,7 @@ def extract_xml_file_from_gz_file(file_save_path): with open(target_file_name, "wb") as f_out: f_out.write(the_file.read()) - except ( # pylint: disable=broad-except,redefined-outer-name + except ( # pylint: disable=broad-except,redefined-outer-name Exception ) as exception: report_failed_zip(exception, file_save_path, target_file_name) diff --git a/stress_test.py b/stress_test.py index 10803e7..fdc9b52 100644 --- a/stress_test.py +++ b/stress_test.py @@ -8,9 +8,10 @@ from il_supermarket_scarper.scrappers_factory import ScraperFactory -def format_stats_as_json(pr, project_name): +def format_stats_as_json(profile, project_name): + """get the stats from the profiler and format them as json""" stream = io.StringIO() - ps = pstats.Stats(pr, 
stream=stream) + ps = pstats.Stats(profile, stream=stream) ps.sort_stats(pstats.SortKey.CUMULATIVE) # Sort by cumulative time ps.print_stats() @@ -21,22 +22,22 @@ def format_stats_as_json(pr, project_name): project_stats = [] for line in stats_output: if project_name in line: # Filter for project-specific lines - # Extract relevant fields from the profiling output - # The typical format is (Function location, Number of calls, Total time, Cumulative time, etc.) + parts = line.split() if len(parts) >= 5: # Basic sanity check for the parts function_data = { - "function": parts[-1], # Function path - "ncalls": parts[0], # Number of calls - "tottime": parts[1], - "tottime_per_call": parts[2],# Time spent in function - "cumtime": parts[3], # Cumulative time including subcalls - "cumtime_per_call": parts[4] # + "function": parts[-1], # Function path + "ncalls": parts[0], # Number of calls + "tottime": parts[1], + "tottime_per_call": parts[2], # Time spent in function + "cumtime": parts[3], # Cumulative time including subcalls + "cumtime_per_call": parts[4], # } project_stats.append(function_data) return project_stats + if __name__ == "__main__": result = {} From 5dfaa479719108f4a55d1012625230a1122a7a41 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Thu, 10 Oct 2024 07:55:25 +0000 Subject: [PATCH 4/9] . --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6de0864..081b398 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ tests_require=dev_required, extras_require={"test": ["pytest", "pytest-xdist"]}, # *strongly* suggested for sharing - version="0.4.8", + version="0.4.9", # The license can be anything you like license="MIT", description="python package that implement a scraping for israeli supermarket data", From a65f6fe82c9fd53cb2b231abc14a37890aee061d Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Thu, 10 Oct 2024 15:21:59 +0000 Subject: [PATCH 5/9] . 
--- il_supermarket_scarper/engines/apsx.py | 55 ++++++++++--------- il_supermarket_scarper/engines/engine.py | 2 +- il_supermarket_scarper/engines/web.py | 1 + .../scrappers/tests/test_cases.py | 4 +- 4 files changed, 33 insertions(+), 29 deletions(-) diff --git a/il_supermarket_scarper/engines/apsx.py b/il_supermarket_scarper/engines/apsx.py index 53a6eca..12e6978 100644 --- a/il_supermarket_scarper/engines/apsx.py +++ b/il_supermarket_scarper/engines/apsx.py @@ -44,7 +44,7 @@ def _build_query_url(self, query_params): res.append(base + self.aspx_page + query_params) return res - def _get_all_possible_query_string_params( + def _get_all_possible_query_string_params( #pylint: disable=unused-argument self, files_types=None, store_id=None, when_date=None ): """get the arguments need to add to the url""" @@ -52,35 +52,38 @@ def _get_all_possible_query_string_params( res = [] for c_id in self.chain_id: res.append(f"?code={c_id}") - return res + chains_urls = res + chains_urls = [f"?code={self.chain_id}"] - # add file types to url - if files_types: - chains_urls_with_types = [] - for files_type in files_types: - file_type_id = self.file_type_id(files_type) - chains_urls_with_types.extend( - [ - f"{chain_url}&WFileType={file_type_id}" - for chain_url in chains_urls - ] - ) - chains_urls = chains_urls_with_types - - # add store id - if store_id: - for chain_url in chains_urls: - chain_url += f"&WStore={store_id}" - - # posting date - if when_date: - for chain_url in chains_urls: - chain_url += ( - f"&WDate={when_date.strftime('%d/%m/%Y').reaplce('/','%2F')}" - ) return chains_urls + # # add file types to url + # if files_types: + # chains_urls_with_types = [] + # for files_type in files_types: + # file_type_id = self.file_type_id(files_type) + # chains_urls_with_types.extend( + # [ + # f"{chain_url}&WFileType={file_type_id}" + # for chain_url in chains_urls + # ] + # ) + # chains_urls = chains_urls_with_types + + # # add store id + # if store_id: + # for chain_url in chains_urls: + # chain_url += f"&WStore={store_id}" + + # # posting date + # if when_date: + # for chain_url in chains_urls: + # chain_url += ( + # f"&WDate={when_date.strftime('%d/%m/%Y').reaplce('/','%2F')}" + # ) + # return chains_urls + def get_request_url(self, files_types=None, store_id=None, when_date=None): result = [] for query_params in self._get_all_possible_query_string_params( diff --git a/il_supermarket_scarper/engines/engine.py b/il_supermarket_scarper/engines/engine.py index a1ba629..c958fea 100644 --- a/il_supermarket_scarper/engines/engine.py +++ b/il_supermarket_scarper/engines/engine.py @@ -194,7 +194,7 @@ def get_by_date(self, requested_date, by_function, intreable_): name_split = by_function(file).split("-") date_info = "-".join(name_split[2:]).rsplit(".", maxsplit=1)[-1] - if date_info == requested_date: + if date_info.startswith(requested_date.strftime("%Y%d%m")): groups_value.append(file) return groups_value diff --git a/il_supermarket_scarper/engines/web.py b/il_supermarket_scarper/engines/web.py index 5043ca5..476ed89 100644 --- a/il_supermarket_scarper/engines/web.py +++ b/il_supermarket_scarper/engines/web.py @@ -152,4 +152,5 @@ def scrape( return results except Exception as e: # pylint: disable=broad-except self.on_download_fail(e, download_urls=download_urls, file_names=file_names) + Logger.error_execption(e) return [] diff --git a/il_supermarket_scarper/scrappers/tests/test_cases.py b/il_supermarket_scarper/scrappers/tests/test_cases.py index 77d0e35..03c38f2 100644 --- 
a/il_supermarket_scarper/scrappers/tests/test_cases.py +++ b/il_supermarket_scarper/scrappers/tests/test_cases.py @@ -4,7 +4,7 @@ import os import uuid import xml.etree.ElementTree as ET -from il_supermarket_scarper.utils import FileTypesFilters, Logger, DumpFolderNames +from il_supermarket_scarper.utils import FileTypesFilters, Logger, DumpFolderNames, _now from il_supermarket_scarper.scrappers_factory import ScraperFactory @@ -255,7 +255,7 @@ def test_scrape_file_from_single_store_last(self): self._clean_scarpe_delete( scraper_enum, store_id=store_id, - when_date="lastast", + when_date=_now(), ) return TestScapers From 187b80e35a6ce26c9172b4bf1a23569569468a39 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Thu, 10 Oct 2024 15:23:50 +0000 Subject: [PATCH 6/9] reduce time --- il_supermarket_scarper/engines/apsx.py | 2 +- il_supermarket_scarper/engines/engine.py | 6 +++--- il_supermarket_scarper/scrappers/tests/test_cases.py | 4 +--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/il_supermarket_scarper/engines/apsx.py b/il_supermarket_scarper/engines/apsx.py index 12e6978..34970df 100644 --- a/il_supermarket_scarper/engines/apsx.py +++ b/il_supermarket_scarper/engines/apsx.py @@ -44,7 +44,7 @@ def _build_query_url(self, query_params): res.append(base + self.aspx_page + query_params) return res - def _get_all_possible_query_string_params( #pylint: disable=unused-argument + def _get_all_possible_query_string_params( # pylint: disable=unused-argument self, files_types=None, store_id=None, when_date=None ): """get the arguments need to add to the url""" diff --git a/il_supermarket_scarper/engines/engine.py b/il_supermarket_scarper/engines/engine.py index c958fea..85cf55b 100644 --- a/il_supermarket_scarper/engines/engine.py +++ b/il_supermarket_scarper/engines/engine.py @@ -87,9 +87,9 @@ def apply_limit( files_names_to_scrape=None, ): """filter the list according to condition""" - assert ( - when_date is not None or limit is None - ), "when_date flag can't be applied with limit." + # assert ( + # when_date is not None or limit is None + # ), "when_date flag can't be applied with limit." # filter files already downloaded intreable_ = self.filter_already_downloaded( diff --git a/il_supermarket_scarper/scrappers/tests/test_cases.py b/il_supermarket_scarper/scrappers/tests/test_cases.py index 03c38f2..8e8d0c2 100644 --- a/il_supermarket_scarper/scrappers/tests/test_cases.py +++ b/il_supermarket_scarper/scrappers/tests/test_cases.py @@ -253,9 +253,7 @@ def test_scrape_file_from_single_store(self): def test_scrape_file_from_single_store_last(self): """test fetching latest file only""" self._clean_scarpe_delete( - scraper_enum, - store_id=store_id, - when_date=_now(), + scraper_enum, store_id=store_id, when_date=_now(), limit=1 ) return TestScapers From fb980994624f7beca9324399f3ce38583defab8d Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Thu, 10 Oct 2024 15:51:39 +0000 Subject: [PATCH 7/9] . 
--- il_supermarket_scarper/engines/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/il_supermarket_scarper/engines/engine.py b/il_supermarket_scarper/engines/engine.py index 85cf55b..29c95f7 100644 --- a/il_supermarket_scarper/engines/engine.py +++ b/il_supermarket_scarper/engines/engine.py @@ -128,7 +128,7 @@ def apply_limit( intreable_ = self.get_by_date(when_date, by_function, intreable_) elif isinstance(when_date, str) and when_date == "latest": intreable_ = self.get_only_latest(by_function, intreable_) - else: + elif when_date is not None: raise ValueError(f"when_date should be datetime or bool, got {when_date}") Logger.info(f"Number of entry after filter keeping latast is {len(intreable_)}") From 0089414ba1bf6b46b4f7021a74dbb42ab96efbf1 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Thu, 10 Oct 2024 17:20:48 +0000 Subject: [PATCH 8/9] runtime --- il_supermarket_scarper/engines/engine.py | 4 ++-- il_supermarket_scarper/scrappers/tests/test_cases.py | 7 +++---- il_supermarket_scarper/utils/connection.py | 4 ++-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/il_supermarket_scarper/engines/engine.py b/il_supermarket_scarper/engines/engine.py index 29c95f7..2832c40 100644 --- a/il_supermarket_scarper/engines/engine.py +++ b/il_supermarket_scarper/engines/engine.py @@ -134,7 +134,7 @@ def apply_limit( Logger.info(f"Number of entry after filter keeping latast is {len(intreable_)}") # filter by limit if the 'files_types' filter is not on. - if limit and files_types is None: + if limit: assert limit > 0, "Limit must be greater than 0" Logger.info(f"Limit: {limit}") intreable_ = intreable_[: min(limit, len(list(intreable_)))] @@ -192,7 +192,7 @@ def get_by_date(self, requested_date, by_function, intreable_): groups_value = [] for file in intreable_: name_split = by_function(file).split("-") - date_info = "-".join(name_split[2:]).rsplit(".", maxsplit=1)[-1] + date_info = name_split[-1].rsplit(".", maxsplit=1)[0] if date_info.startswith(requested_date.strftime("%Y%d%m")): groups_value.append(file) diff --git a/il_supermarket_scarper/scrappers/tests/test_cases.py b/il_supermarket_scarper/scrappers/tests/test_cases.py index 8e8d0c2..9bd0dd7 100644 --- a/il_supermarket_scarper/scrappers/tests/test_cases.py +++ b/il_supermarket_scarper/scrappers/tests/test_cases.py @@ -63,9 +63,8 @@ def _make_sure_filter_work( store_mark.append(source) assert ( - not limit or len(files_found) == limit - ), f""" Found {files_found} - f"files but should be {limit}""" + limit is None or len(files_found) == limit + ), f""" Found {files_found} f"files but should be {limit}""" def _make_sure_file_contain_chain_ids(self, chain_ids, file): """make sure the scraper download only the chain id""" @@ -201,7 +200,7 @@ def test_scrape_one(self): def test_scrape_ten(self): """scrape ten file and make sure they exists""" - self._clean_scarpe_delete(scraper_enum, limit=None) + self._clean_scarpe_delete(scraper_enum, limit=10) def test_scrape_promo(self): """scrape one promo file and make sure it exists""" diff --git a/il_supermarket_scarper/utils/connection.py b/il_supermarket_scarper/utils/connection.py index 5e17bcd..785e842 100644 --- a/il_supermarket_scarper/utils/connection.py +++ b/il_supermarket_scarper/utils/connection.py @@ -292,12 +292,12 @@ def collect_from_ftp(ftp_host, ftp_username, ftp_password, ftp_path, timeout=60 @download_connection_retry() def fetch_temporary_gz_file_from_ftp( - ftp_host, ftp_username, ftp_password, ftp_path, temporary_gz_file_path + ftp_host, 
ftp_username, ftp_password, ftp_path, temporary_gz_file_path, timeout=15 ): """download a file from a cerberus base site.""" with open(temporary_gz_file_path, "wb") as file_ftp: file_name = ntpath.basename(temporary_gz_file_path) - ftp = FTP_TLS(ftp_host, ftp_username, ftp_password) + ftp = FTP_TLS(ftp_host, ftp_username, ftp_password, timeout=timeout) ftp.trust_server_pasv_ipv4_address = True ftp.cwd(ftp_path) ftp.retrbinary("RETR " + file_name, file_ftp.write) From 966f57baee929cf7add32ed89f4cbb7216fbf623 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Fri, 11 Oct 2024 09:03:00 +0000 Subject: [PATCH 9/9] . --- il_supermarket_scarper/engines/engine.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/il_supermarket_scarper/engines/engine.py b/il_supermarket_scarper/engines/engine.py index 2832c40..a55741f 100644 --- a/il_supermarket_scarper/engines/engine.py +++ b/il_supermarket_scarper/engines/engine.py @@ -188,13 +188,15 @@ def get_only_latest(self, by_function, intreable_): def get_by_date(self, requested_date, by_function, intreable_): """get by date""" - + # + date_format = requested_date.strftime("%Y%m%d") + # groups_value = [] for file in intreable_: - name_split = by_function(file).split("-") + name_split = by_function(file).split("-", maxsplit=2) date_info = name_split[-1].rsplit(".", maxsplit=1)[0] - if date_info.startswith(requested_date.strftime("%Y%d%m")): + if date_info.startswith(date_format): groups_value.append(file) return groups_value
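A minimal sketch of the when_date file-name matching that get_by_date() in engines/engine.py settles on by PATCH 9/9: the date token is the last dash-separated field of the file name, compared against the requested day formatted as %Y%m%d. The sample file name below is assumed for illustration only and is not taken from the patches.

    import datetime

    def matches_requested_date(file_name, requested_date):
        # Mirrors get_by_date() after PATCH 9/9: split on "-" at most twice,
        # take the last piece, drop the extension, and prefix-match YYYYMMDD.
        date_format = requested_date.strftime("%Y%m%d")
        name_split = file_name.split("-", maxsplit=2)
        date_info = name_split[-1].rsplit(".", maxsplit=1)[0]
        return date_info.startswith(date_format)

    # Hypothetical chain/store/timestamp file name, assumed for the example:
    print(matches_requested_date("PriceFull7290027600007-001-202410100300.gz",
                                 datetime.datetime(2024, 10, 10)))  # True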
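A minimal usage sketch for the format_stats_as_json() helper added in PATCH 1/9 and tidied in PATCH 3/9, assuming stress_test.py is importable from the repository root. The workload below is a stand-in, not one of the real scrapers; in the stress test itself the profiler wraps each scraper run and the filter string passed is "israeli-supermarket-scarpers", so with an unrelated workload the filtered list may simply come back empty.

    import cProfile
    import json

    from stress_test import format_stats_as_json

    def workload():
        # Stand-in for scraper.scrape(...) in the real stress test.
        return sum(i * i for i in range(100_000))

    pr = cProfile.Profile()
    pr.enable()
    workload()
    pr.disable()

    # The filter string is a path fragment of the frames you want to keep.
    stats = format_stats_as_json(pr, "israeli-supermarket-scarpers")
    print(json.dumps(stats, indent=2))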