From 6666b3df56bd2ffd4e215dbddbdbe4d42bbb7ed8 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Sat, 28 Sep 2024 05:43:17 +0000 Subject: [PATCH 01/20] remove asyncio --- il_supermarket_scarper/engines/cerberus.py | 4 +- .../engines/multipage_web.py | 4 +- il_supermarket_scarper/engines/web.py | 4 +- il_supermarket_scarper/utils/__init__.py | 2 +- il_supermarket_scarper/utils/loop.py | 62 +++++-------------- 5 files changed, 24 insertions(+), 52 deletions(-) diff --git a/il_supermarket_scarper/engines/cerberus.py b/il_supermarket_scarper/engines/cerberus.py index ca91676..5d47b1b 100644 --- a/il_supermarket_scarper/engines/cerberus.py +++ b/il_supermarket_scarper/engines/cerberus.py @@ -4,7 +4,7 @@ from il_supermarket_scarper.utils import ( extract_xml_file_from_gz_file, Logger, - execute_in_event_loop, + execute_in_parallels, collect_from_ftp, fetch_temporary_gz_file_from_ftp, retry_files, @@ -62,7 +62,7 @@ def scrape( ) self.on_collected_details(files) - results = execute_in_event_loop( + results = execute_in_parallels( self.persist_from_ftp, files, max_workers=self.max_workers ) self.on_download_completed(results=results) diff --git a/il_supermarket_scarper/engines/multipage_web.py b/il_supermarket_scarper/engines/multipage_web.py index ccd3b5c..f7982bc 100644 --- a/il_supermarket_scarper/engines/multipage_web.py +++ b/il_supermarket_scarper/engines/multipage_web.py @@ -10,7 +10,7 @@ from il_supermarket_scarper.utils import ( Logger, - execute_in_event_loop, + execute_in_parallels, multiple_page_aggregtion, ) from .web import WebBase @@ -87,7 +87,7 @@ def collect_files_details_from_site( ) ) - download_urls, file_names = execute_in_event_loop( + download_urls, file_names = execute_in_parallels( self.process_links_before_download, pages_to_scrape, aggregtion_function=multiple_page_aggregtion, diff --git a/il_supermarket_scarper/engines/web.py b/il_supermarket_scarper/engines/web.py index 474a019..20742e5 100644 --- a/il_supermarket_scarper/engines/web.py +++ b/il_supermarket_scarper/engines/web.py @@ -1,7 +1,7 @@ from bs4 import BeautifulSoup from il_supermarket_scarper.utils import ( Logger, - execute_in_event_loop, + execute_in_parallels, session_and_check_status, retry_files, ) @@ -129,7 +129,7 @@ def scrape( Logger.info(f"collected {len(download_urls)} to download.") if len(download_urls) > 0: - results = execute_in_event_loop( + results = execute_in_parallels( self.save_and_extract, zip(download_urls, file_names), max_workers=self.max_workers, diff --git a/il_supermarket_scarper/utils/__init__.py b/il_supermarket_scarper/utils/__init__.py index 97acc5c..b2ccb25 100644 --- a/il_supermarket_scarper/utils/__init__.py +++ b/il_supermarket_scarper/utils/__init__.py @@ -23,7 +23,7 @@ fetch_temporary_gz_file_from_ftp, wget_file, ) -from .loop import execute_in_event_loop, multiple_page_aggregtion +from .loop import execute_in_parallels, multiple_page_aggregtion from .exceptions import RestartSessionError from .retry import retry_files from .marking import FlakyScraper diff --git a/il_supermarket_scarper/utils/loop.py b/il_supermarket_scarper/utils/loop.py index a8a5f92..02091ed 100644 --- a/il_supermarket_scarper/utils/loop.py +++ b/il_supermarket_scarper/utils/loop.py @@ -1,16 +1,4 @@ -import asyncio import concurrent.futures -from .logger import Logger - - -def get_event_loop(): - """get the current running event loop""" - try: - return asyncio.get_event_loop() - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - return loop def 
defualt_aggregtion_function(all_done): @@ -35,7 +23,7 @@ def multiple_page_aggregtion(pages_to_scrape): return download_urls, file_names -def execute_in_event_loop( +def execute_in_parallels( function_to_execute, iterable, max_workers=None, @@ -43,43 +31,27 @@ def execute_in_event_loop( ): """execute a job in the event loop""" - loop = get_event_loop() - return loop.run_until_complete( - run_task_async( - function_to_execute, - iterable, - max_workers=max_workers, - aggregtion_function=aggregtion_function, - ) + results = run_tasks( + function_to_execute, + iterable, + max_workers=max_workers, ) + + all_done = aggregtion_function(results) + print(f"Done with {len(all_done)} tasks") + return all_done - -async def run_task_async( +def run_tasks( function_to_execute, iterable, - max_workers=None, - aggregtion_function=defualt_aggregtion_function, + max_workers: int = None, ): - """run task in multi-thread""" - loop = get_event_loop() - + """Run tasks in multi-thread or sequentially""" if max_workers: - # use multi-thread - futures = [] + # Use multi-thread with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - for arg in iterable: - futures.append(loop.run_in_executor(executor, function_to_execute, arg)) - - if len(futures) == 0: - return [] - all_done, not_done = await asyncio.wait(futures) - assert len(not_done) == 0, "Not all tasks are done, should be blocking." + futures = [executor.submit(function_to_execute, arg) for arg in iterable] + return [future.result() for future in concurrent.futures.as_completed(futures)] else: - # or just itreate over all - all_done = [] - for arg in iterable: - all_done.append(function_to_execute(arg)) - all_done = aggregtion_function(list(all_done)) - - Logger.info(f"Done with {len(all_done)} files") - return all_done + # Or just iterate over all + return [function_to_execute(arg) for arg in iterable] \ No newline at end of file From 05e1ddaf1bd8e46ab18a53d9fa132c7733f897b3 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Sat, 28 Sep 2024 05:51:26 +0000 Subject: [PATCH 02/20] . --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 489255c..76b5823 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ tests_require=dev_required, extras_require={"test": ["pytest"]}, # *strongly* suggested for sharing - version="0.4.5", + version="0.4.6", # The license can be anything you like license="MIT", description="python package that implement a scraping for israeli supermarket data", From 9ba86d03999e9b3057daa40ab3e4329951154f82 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Sat, 28 Sep 2024 06:00:40 +0000 Subject: [PATCH 03/20] increase thread --- il_supermarket_scarper/engines/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/il_supermarket_scarper/engines/engine.py b/il_supermarket_scarper/engines/engine.py index cd8016b..e1c2148 100644 --- a/il_supermarket_scarper/engines/engine.py +++ b/il_supermarket_scarper/engines/engine.py @@ -29,7 +29,7 @@ def __init__(self, chain, chain_id, folder_name=None): super().__init__(chain.value, "status", folder_name=folder_name) self.chain = chain self.chain_id = chain_id - self.max_workers = 5 + self.max_workers = 10 self.storage_path = get_output_folder(self.chain.value, folder_name=folder_name) Logger.info(f"Storage path: {self.storage_path}") From b8904ad2953a96ae1b5c5044f8e0edbafa94cfd9 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Sat, 28 Sep 2024 10:54:36 +0000 Subject: [PATCH 04/20] . 
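Note on this patch: the one-character subject hides a fair amount — judging from the diff below it renames execute_in_parallels to execute_in_parallel, adds the thread name (and a "PullingThread" prefix) to the log output, stops filtering null/zero-size Cerberus files by default, and lifts the test limit. For reference, the execution pattern patches 01-04 converge on — a ThreadPoolExecutor with named workers, collected via as_completed — looks roughly like the minimal sketch below. It is illustrative only, not the package's exact code; pull_one and the parameter names are stand-ins.

    import concurrent.futures
    import logging

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s [%(threadName)s] %(message)s",
    )

    def pull_one(task):
        # stand-in for persist_from_ftp / save_and_extract
        logging.info("downloading %s", task)
        return task

    def run_in_pool(tasks, max_threads=10):
        if not max_threads:
            # sequential fallback, mirroring the no-worker branch in loop.py
            return [pull_one(t) for t in tasks]
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_threads, thread_name_prefix="PullingThread"
        ) as executor:
            futures = [executor.submit(pull_one, t) for t in tasks]
            # as_completed yields futures in completion order, not submission order
            return [f.result() for f in concurrent.futures.as_completed(futures)]

    print(run_in_pool(["a", "b", "c"]))

In this pattern future.result() re-raises any exception thrown inside the worker, so a single failed task aborts the collection loop unless the caller catches it.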
--- il_supermarket_scarper/engines/cerberus.py | 8 ++++---- il_supermarket_scarper/engines/multipage_web.py | 4 ++-- il_supermarket_scarper/engines/web.py | 4 ++-- il_supermarket_scarper/scrappers/tests/test_cases.py | 2 +- il_supermarket_scarper/utils/__init__.py | 2 +- il_supermarket_scarper/utils/logger.py | 10 +++++++++- il_supermarket_scarper/utils/loop.py | 9 +++++---- 7 files changed, 24 insertions(+), 15 deletions(-) diff --git a/il_supermarket_scarper/engines/cerberus.py b/il_supermarket_scarper/engines/cerberus.py index 5d47b1b..27a8ff8 100644 --- a/il_supermarket_scarper/engines/cerberus.py +++ b/il_supermarket_scarper/engines/cerberus.py @@ -4,7 +4,7 @@ from il_supermarket_scarper.utils import ( extract_xml_file_from_gz_file, Logger, - execute_in_parallels, + execute_in_parallel, collect_from_ftp, fetch_temporary_gz_file_from_ftp, retry_files, @@ -54,15 +54,15 @@ def scrape( files = self.collect_files_details_from_site( limit=limit, files_types=files_types, - filter_null=True, - filter_zero=True, + filter_null=False, + filter_zero=False, store_id=store_id, only_latest=only_latest, files_names_to_scrape=files_names_to_scrape, ) self.on_collected_details(files) - results = execute_in_parallels( + results = execute_in_parallel( self.persist_from_ftp, files, max_workers=self.max_workers ) self.on_download_completed(results=results) diff --git a/il_supermarket_scarper/engines/multipage_web.py b/il_supermarket_scarper/engines/multipage_web.py index f7982bc..ad3fcb3 100644 --- a/il_supermarket_scarper/engines/multipage_web.py +++ b/il_supermarket_scarper/engines/multipage_web.py @@ -10,7 +10,7 @@ from il_supermarket_scarper.utils import ( Logger, - execute_in_parallels, + execute_in_parallel, multiple_page_aggregtion, ) from .web import WebBase @@ -87,7 +87,7 @@ def collect_files_details_from_site( ) ) - download_urls, file_names = execute_in_parallels( + download_urls, file_names = execute_in_parallel( self.process_links_before_download, pages_to_scrape, aggregtion_function=multiple_page_aggregtion, diff --git a/il_supermarket_scarper/engines/web.py b/il_supermarket_scarper/engines/web.py index 20742e5..ca272fc 100644 --- a/il_supermarket_scarper/engines/web.py +++ b/il_supermarket_scarper/engines/web.py @@ -1,7 +1,7 @@ from bs4 import BeautifulSoup from il_supermarket_scarper.utils import ( Logger, - execute_in_parallels, + execute_in_parallel, session_and_check_status, retry_files, ) @@ -129,7 +129,7 @@ def scrape( Logger.info(f"collected {len(download_urls)} to download.") if len(download_urls) > 0: - results = execute_in_parallels( + results = execute_in_parallel( self.save_and_extract, zip(download_urls, file_names), max_workers=self.max_workers, diff --git a/il_supermarket_scarper/scrappers/tests/test_cases.py b/il_supermarket_scarper/scrappers/tests/test_cases.py index 3ad23d0..37fab63 100644 --- a/il_supermarket_scarper/scrappers/tests/test_cases.py +++ b/il_supermarket_scarper/scrappers/tests/test_cases.py @@ -199,7 +199,7 @@ def test_scrape_one(self): def test_scrape_ten(self): """scrape ten file and make sure they exists""" - self._clean_scarpe_delete(scraper_enum, limit=10) + self._clean_scarpe_delete(scraper_enum, limit=None) def test_scrape_promo(self): """scrape one promo file and make sure it exists""" diff --git a/il_supermarket_scarper/utils/__init__.py b/il_supermarket_scarper/utils/__init__.py index b2ccb25..97956ee 100644 --- a/il_supermarket_scarper/utils/__init__.py +++ b/il_supermarket_scarper/utils/__init__.py @@ -23,7 +23,7 @@ 
fetch_temporary_gz_file_from_ftp, wget_file, ) -from .loop import execute_in_parallels, multiple_page_aggregtion +from .loop import execute_in_parallel, multiple_page_aggregtion from .exceptions import RestartSessionError from .retry import retry_files from .marking import FlakyScraper diff --git a/il_supermarket_scarper/utils/logger.py b/il_supermarket_scarper/utils/logger.py index ae5f747..bd2d8ff 100644 --- a/il_supermarket_scarper/utils/logger.py +++ b/il_supermarket_scarper/utils/logger.py @@ -10,7 +10,7 @@ def build_logger(): if not logger.handlers: logger.setLevel(logging.DEBUG) # set logger level log_formatter = logging.Formatter( - "%(name)-12s %(asctime)s %(levelname)-8s %(filename)s:%(funcName)s %(message)s" + "%(name)-12s %(asctime)s %(levelname)-8s [%(threadName)s] %(filename)s:%(funcName)s %(message)s" ) console_handler = logging.StreamHandler( sys.stdout @@ -42,6 +42,14 @@ def info(cls, msg, *args, **kwargs): if cls.enabled: cls.logger.info(msg, *args, **kwargs) + + @classmethod + def debug(cls, msg, *args, **kwargs): + """log info""" + if cls.enabled: + cls.logger.debug(msg, *args, **kwargs) + + @classmethod def error(cls, msg, *args, **kwargs): """log error""" diff --git a/il_supermarket_scarper/utils/loop.py b/il_supermarket_scarper/utils/loop.py index 02091ed..aff36a3 100644 --- a/il_supermarket_scarper/utils/loop.py +++ b/il_supermarket_scarper/utils/loop.py @@ -1,5 +1,5 @@ import concurrent.futures - +from il_supermarket_scarper.utils import Logger def defualt_aggregtion_function(all_done): """format the scraping result to the final input""" @@ -23,7 +23,7 @@ def multiple_page_aggregtion(pages_to_scrape): return download_urls, file_names -def execute_in_parallels( +def execute_in_parallel( function_to_execute, iterable, max_workers=None, @@ -31,6 +31,7 @@ def execute_in_parallels( ): """execute a job in the event loop""" + Logger.info(f"Running {len(iterable)} tasks in parallel") results = run_tasks( function_to_execute, iterable, @@ -38,7 +39,7 @@ def execute_in_parallels( ) all_done = aggregtion_function(results) - print(f"Done with {len(all_done)} tasks") + print(f"Done with {len(all_done)} tasks in parallel") return all_done def run_tasks( @@ -49,7 +50,7 @@ def run_tasks( """Run tasks in multi-thread or sequentially""" if max_workers: # Use multi-thread - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers,thread_name_prefix="PullingThread") as executor: futures = [executor.submit(function_to_execute, arg) for arg in iterable] return [future.result() for future in concurrent.futures.as_completed(futures)] else: From 324b4879cc249fbfd6ed365e2e826deef1773f08 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Sat, 28 Sep 2024 13:54:24 +0000 Subject: [PATCH 05/20] , --- il_supermarket_scarper/engines/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/il_supermarket_scarper/engines/engine.py b/il_supermarket_scarper/engines/engine.py index e1c2148..49debb1 100644 --- a/il_supermarket_scarper/engines/engine.py +++ b/il_supermarket_scarper/engines/engine.py @@ -29,7 +29,7 @@ def __init__(self, chain, chain_id, folder_name=None): super().__init__(chain.value, "status", folder_name=folder_name) self.chain = chain self.chain_id = chain_id - self.max_workers = 10 + self.max_workers = 20 self.storage_path = get_output_folder(self.chain.value, folder_name=folder_name) Logger.info(f"Storage path: {self.storage_path}") From 
c97ec69b6fd0490c52a2545f70ccdbaaac740358 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Sat, 28 Sep 2024 15:42:27 +0000 Subject: [PATCH 06/20] max_threads --- il_supermarket_scarper/engines/apsx.py | 8 +++++-- il_supermarket_scarper/engines/cerberus.py | 5 +++-- il_supermarket_scarper/engines/engine.py | 4 ++-- .../engines/multipage_web.py | 7 ++++-- .../engines/publishprice.py | 11 +++++++++- il_supermarket_scarper/engines/web.py | 6 ++--- il_supermarket_scarper/utils/logger.py | 2 -- il_supermarket_scarper/utils/loop.py | 22 ++++++++++++------- 8 files changed, 43 insertions(+), 22 deletions(-) diff --git a/il_supermarket_scarper/engines/apsx.py b/il_supermarket_scarper/engines/apsx.py index c507be0..d822799 100644 --- a/il_supermarket_scarper/engines/apsx.py +++ b/il_supermarket_scarper/engines/apsx.py @@ -7,8 +7,12 @@ class Aspx(WebBase, ABC): """class for aspx scapers""" - def __init__(self, chain, chain_id, url, aspx_page, folder_name=None): - super().__init__(chain, chain_id, url, folder_name=folder_name) + def __init__( + self, chain, chain_id, url, aspx_page, folder_name=None, max_threads=5 + ): + super().__init__( + chain, chain_id, url, folder_name=folder_name, max_threads=max_threads + ) self.aspx_page = aspx_page def extract_task_from_entry(self, all_trs): diff --git a/il_supermarket_scarper/engines/cerberus.py b/il_supermarket_scarper/engines/cerberus.py index 27a8ff8..fbfdb6b 100644 --- a/il_supermarket_scarper/engines/cerberus.py +++ b/il_supermarket_scarper/engines/cerberus.py @@ -26,8 +26,9 @@ def __init__( ftp_path="/", ftp_username="", ftp_password="", + max_threads=5, ): - super().__init__(chain, chain_id, folder_name) + super().__init__(chain, chain_id, folder_name, max_threads) self.ftp_host = ftp_host self.ftp_path = ftp_path self.ftp_username = ftp_username @@ -63,7 +64,7 @@ def scrape( self.on_collected_details(files) results = execute_in_parallel( - self.persist_from_ftp, files, max_workers=self.max_workers + self.persist_from_ftp, files, max_threads=self.max_threads ) self.on_download_completed(results=results) self.on_scrape_completed(self.get_storage_path()) diff --git a/il_supermarket_scarper/engines/engine.py b/il_supermarket_scarper/engines/engine.py index 49debb1..b04db4c 100644 --- a/il_supermarket_scarper/engines/engine.py +++ b/il_supermarket_scarper/engines/engine.py @@ -21,7 +21,7 @@ class Engine(ScraperStatus, ABC): """base engine for scraping""" - def __init__(self, chain, chain_id, folder_name=None): + def __init__(self, chain, chain_id, folder_name=None, max_threads=10): assert DumpFolderNames.is_valid_folder_name( chain ), "chain name can contain only abc and -" @@ -29,7 +29,7 @@ def __init__(self, chain, chain_id, folder_name=None): super().__init__(chain.value, "status", folder_name=folder_name) self.chain = chain self.chain_id = chain_id - self.max_workers = 20 + self.max_threads = max_threads self.storage_path = get_output_folder(self.chain.value, folder_name=folder_name) Logger.info(f"Storage path: {self.storage_path}") diff --git a/il_supermarket_scarper/engines/multipage_web.py b/il_supermarket_scarper/engines/multipage_web.py index ad3fcb3..ceafb3a 100644 --- a/il_supermarket_scarper/engines/multipage_web.py +++ b/il_supermarket_scarper/engines/multipage_web.py @@ -31,8 +31,11 @@ def __init__( total_page_xpath="""//*[@id="gridContainer"]/table/ tfoot/tr/td/a[6]/@href""", total_pages_pattern=r"^\/\?page\=([0-9]{3})$", + max_threads=5, ): - super().__init__(chain, chain_id, url=url, folder_name=folder_name) + super().__init__( + chain, 
chain_id, url=url, folder_name=folder_name, max_threads=max_threads + ) self.total_page_xpath = total_page_xpath self.total_pages_pattern = total_pages_pattern @@ -91,7 +94,7 @@ def collect_files_details_from_site( self.process_links_before_download, pages_to_scrape, aggregtion_function=multiple_page_aggregtion, - max_workers=self.max_workers, + max_threads=self.max_threads, ) file_names, download_urls = self.apply_limit_zip( file_names, diff --git a/il_supermarket_scarper/engines/publishprice.py b/il_supermarket_scarper/engines/publishprice.py index a958bb7..c8acd80 100644 --- a/il_supermarket_scarper/engines/publishprice.py +++ b/il_supermarket_scarper/engines/publishprice.py @@ -17,12 +17,21 @@ class PublishPrice(WebBase): but this is not implemented. """ - def __init__(self, chain, chain_id, site_infix, folder_name=None, domain="prices"): + def __init__( + self, + chain, + chain_id, + site_infix, + folder_name=None, + domain="prices", + max_threads=5, + ): super().__init__( chain, chain_id, url=f"https://{domain}.{site_infix}.co.il/", folder_name=folder_name, + max_threads=max_threads, ) self.folder = None diff --git a/il_supermarket_scarper/engines/web.py b/il_supermarket_scarper/engines/web.py index ca272fc..07d9da6 100644 --- a/il_supermarket_scarper/engines/web.py +++ b/il_supermarket_scarper/engines/web.py @@ -12,8 +12,8 @@ class WebBase(Engine): """scrape the file of websites that the only why to download them is via web""" - def __init__(self, chain, chain_id, url, folder_name=None): - super().__init__(chain, chain_id, folder_name) + def __init__(self, chain, chain_id, url, folder_name=None, max_threads=5): + super().__init__(chain, chain_id, folder_name, max_threads=max_threads) self.url = url self.max_retry = 2 @@ -132,7 +132,7 @@ def scrape( results = execute_in_parallel( self.save_and_extract, zip(download_urls, file_names), - max_workers=self.max_workers, + max_threads=self.max_threads, ) else: results = [] diff --git a/il_supermarket_scarper/utils/logger.py b/il_supermarket_scarper/utils/logger.py index bd2d8ff..ce8bf4c 100644 --- a/il_supermarket_scarper/utils/logger.py +++ b/il_supermarket_scarper/utils/logger.py @@ -42,14 +42,12 @@ def info(cls, msg, *args, **kwargs): if cls.enabled: cls.logger.info(msg, *args, **kwargs) - @classmethod def debug(cls, msg, *args, **kwargs): """log info""" if cls.enabled: cls.logger.debug(msg, *args, **kwargs) - @classmethod def error(cls, msg, *args, **kwargs): """log error""" diff --git a/il_supermarket_scarper/utils/loop.py b/il_supermarket_scarper/utils/loop.py index aff36a3..55c355e 100644 --- a/il_supermarket_scarper/utils/loop.py +++ b/il_supermarket_scarper/utils/loop.py @@ -1,6 +1,7 @@ import concurrent.futures from il_supermarket_scarper.utils import Logger + def defualt_aggregtion_function(all_done): """format the scraping result to the final input""" result = [] @@ -26,7 +27,7 @@ def multiple_page_aggregtion(pages_to_scrape): def execute_in_parallel( function_to_execute, iterable, - max_workers=None, + max_threads=None, aggregtion_function=defualt_aggregtion_function, ): """execute a job in the event loop""" @@ -35,24 +36,29 @@ def execute_in_parallel( results = run_tasks( function_to_execute, iterable, - max_workers=max_workers, + max_threads=max_threads, ) - + all_done = aggregtion_function(results) print(f"Done with {len(all_done)} tasks in parallel") return all_done + def run_tasks( function_to_execute, iterable, - max_workers: int = None, + max_threads: int = None, ): """Run tasks in multi-thread or sequentially""" - if 
max_workers: + if max_threads: # Use multi-thread - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers,thread_name_prefix="PullingThread") as executor: + with concurrent.futures.ThreadPoolExecutor( + max_threads=max_threads, thread_name_prefix="PullingThread" + ) as executor: futures = [executor.submit(function_to_execute, arg) for arg in iterable] - return [future.result() for future in concurrent.futures.as_completed(futures)] + return [ + future.result() for future in concurrent.futures.as_completed(futures) + ] else: # Or just iterate over all - return [function_to_execute(arg) for arg in iterable] \ No newline at end of file + return [function_to_execute(arg) for arg in iterable] From 6a7a9148f7e8146340119d1fb03c9e587cfea2fa Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Sat, 28 Sep 2024 15:47:01 +0000 Subject: [PATCH 07/20] . --- il_supermarket_scarper/scrappers/ramilevy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/il_supermarket_scarper/scrappers/ramilevy.py b/il_supermarket_scarper/scrappers/ramilevy.py index 3c203f6..ba43834 100644 --- a/il_supermarket_scarper/scrappers/ramilevy.py +++ b/il_supermarket_scarper/scrappers/ramilevy.py @@ -11,4 +11,5 @@ def __init__(self, folder_name=None): chain_id="7290058140886", folder_name=folder_name, ftp_username="RamiLevi", + max_threads=10, ) From 843cda71c26091353b5abc56d2b7952317bbf8ee Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Sat, 28 Sep 2024 16:06:52 +0000 Subject: [PATCH 08/20] . --- il_supermarket_scarper/engines/cerberus.py | 2 +- .../engines/multipage_web.py | 2 +- il_supermarket_scarper/engines/web.py | 2 +- il_supermarket_scarper/utils/loop.py | 2 +- stress_test.py | 31 +++++++++++++++++++ 5 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 stress_test.py diff --git a/il_supermarket_scarper/engines/cerberus.py b/il_supermarket_scarper/engines/cerberus.py index fbfdb6b..4539f2a 100644 --- a/il_supermarket_scarper/engines/cerberus.py +++ b/il_supermarket_scarper/engines/cerberus.py @@ -64,7 +64,7 @@ def scrape( self.on_collected_details(files) results = execute_in_parallel( - self.persist_from_ftp, files, max_threads=self.max_threads + self.persist_from_ftp, list(files), max_threads=self.max_threads ) self.on_download_completed(results=results) self.on_scrape_completed(self.get_storage_path()) diff --git a/il_supermarket_scarper/engines/multipage_web.py b/il_supermarket_scarper/engines/multipage_web.py index ceafb3a..cb2520a 100644 --- a/il_supermarket_scarper/engines/multipage_web.py +++ b/il_supermarket_scarper/engines/multipage_web.py @@ -92,7 +92,7 @@ def collect_files_details_from_site( download_urls, file_names = execute_in_parallel( self.process_links_before_download, - pages_to_scrape, + list(pages_to_scrape), aggregtion_function=multiple_page_aggregtion, max_threads=self.max_threads, ) diff --git a/il_supermarket_scarper/engines/web.py b/il_supermarket_scarper/engines/web.py index 07d9da6..d8c05e0 100644 --- a/il_supermarket_scarper/engines/web.py +++ b/il_supermarket_scarper/engines/web.py @@ -131,7 +131,7 @@ def scrape( if len(download_urls) > 0: results = execute_in_parallel( self.save_and_extract, - zip(download_urls, file_names), + list(zip(download_urls, file_names)), max_threads=self.max_threads, ) else: diff --git a/il_supermarket_scarper/utils/loop.py b/il_supermarket_scarper/utils/loop.py index 55c355e..51c847f 100644 --- a/il_supermarket_scarper/utils/loop.py +++ b/il_supermarket_scarper/utils/loop.py @@ -53,7 +53,7 @@ def run_tasks( if max_threads: # Use 
multi-thread with concurrent.futures.ThreadPoolExecutor( - max_threads=max_threads, thread_name_prefix="PullingThread" + max_workers=max_threads, thread_name_prefix="PullingThread" ) as executor: futures = [executor.submit(function_to_execute, arg) for arg in iterable] return [ diff --git a/stress_test.py b/stress_test.py new file mode 100644 index 0000000..4989077 --- /dev/null +++ b/stress_test.py @@ -0,0 +1,31 @@ +from il_supermarket_scarper.scrappers_factory import ScraperFactory +import time,json +import datetime + +if __name__ == "__main__": + + result = {} + for scraper in ScraperFactory.all_scrapers_name(): + + def full_execution(): + initer = ScraperFactory.get(scraper)() + return initer.scrape(limit=None) + + execution_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + start_time = time.time() + files = full_execution() + end_time = time.time() + result[scraper] = { + "execution_time":execution_time, + "start_time":start_time, + "end_time":end_time, + "time": end_time - start_time, + "files": len(files) + } + + with open("stress_test_results.json", "w") as f: + json.dump(result, f) + + + + From 02c302e8b50c5503364a428cbb83ebcd3da08b9c Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Sat, 28 Sep 2024 16:09:46 +0000 Subject: [PATCH 09/20] . --- il_supermarket_scarper/utils/logger.py | 3 ++- stress_test.py | 35 +++++++++++++------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/il_supermarket_scarper/utils/logger.py b/il_supermarket_scarper/utils/logger.py index ce8bf4c..63dce82 100644 --- a/il_supermarket_scarper/utils/logger.py +++ b/il_supermarket_scarper/utils/logger.py @@ -10,7 +10,8 @@ def build_logger(): if not logger.handlers: logger.setLevel(logging.DEBUG) # set logger level log_formatter = logging.Formatter( - "%(name)-12s %(asctime)s %(levelname)-8s [%(threadName)s] %(filename)s:%(funcName)s %(message)s" + "%(name)-12s %(asctime)s %(levelname)-8s " + "[%(threadName)s] %(filename)s:%(funcName)s %(message)s" ) console_handler = logging.StreamHandler( sys.stdout diff --git a/stress_test.py b/stress_test.py index 4989077..c3d9efd 100644 --- a/stress_test.py +++ b/stress_test.py @@ -1,31 +1,30 @@ -from il_supermarket_scarper.scrappers_factory import ScraperFactory -import time,json +import time +import json import datetime +from il_supermarket_scarper.scrappers_factory import ScraperFactory + if __name__ == "__main__": result = {} - for scraper in ScraperFactory.all_scrapers_name(): + for scraper_name in ScraperFactory.all_scrapers_name(): - def full_execution(): + def full_execution(scraper): + """full execution of the scraper""" initer = ScraperFactory.get(scraper)() - return initer.scrape(limit=None) - + return initer.scrape() + execution_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") start_time = time.time() - files = full_execution() + files = full_execution(scraper_name) end_time = time.time() - result[scraper] = { - "execution_time":execution_time, - "start_time":start_time, - "end_time":end_time, + result[scraper_name] = { + "execution_time": execution_time, + "start_time": start_time, + "end_time": end_time, "time": end_time - start_time, - "files": len(files) + "files": len(files), } - - with open("stress_test_results.json", "w") as f: - json.dump(result, f) - - - + with open("stress_test_results.json", "w", encoding="utf-8") as f: + json.dump(result, f) From 20d53fee6a7da4260d35e0247ec5dbf1995ddc45 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Sat, 28 Sep 2024 16:33:41 +0000 Subject: [PATCH 10/20] . 
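Another bare subject; the substantive change below is in retry.py, where the per-iteration results were being appended as nested lists instead of extended into the flat result list, plus the stress test now scrapes into a temporary directory. A short plain-Python illustration of the append/extend difference the fix relies on (the dicts are generic placeholders, not the package's result schema):

    all_results = [{"file": "a"}]
    other_results = [{"file": "b"}, {"file": "c"}]

    all_results.append(other_results)
    # -> [{'file': 'a'}, [{'file': 'b'}, {'file': 'c'}]]   (nested list inside the list)

    all_results = [{"file": "a"}]
    all_results.extend(other_results)
    # -> [{'file': 'a'}, {'file': 'b'}, {'file': 'c'}]     (flat)

With append, anything downstream that iterates the collected results (counting files, aggregating statuses) would trip over the embedded list.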
--- il_supermarket_scarper/utils/retry.py | 2 +- stress_test.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/il_supermarket_scarper/utils/retry.py b/il_supermarket_scarper/utils/retry.py index b4b23cb..b898268 100644 --- a/il_supermarket_scarper/utils/retry.py +++ b/il_supermarket_scarper/utils/retry.py @@ -233,7 +233,7 @@ def __retry_files( # next iteration retry_list, other_results = compute_retry(results) - all_results.append(other_results) + all_results.extend(other_results) # if there is not files in the retry list, break if len(retry_list) == 0: break diff --git a/stress_test.py b/stress_test.py index c3d9efd..8afd864 100644 --- a/stress_test.py +++ b/stress_test.py @@ -1,9 +1,11 @@ import time import json import datetime +import tempfile from il_supermarket_scarper.scrappers_factory import ScraperFactory + if __name__ == "__main__": result = {} @@ -11,8 +13,9 @@ def full_execution(scraper): """full execution of the scraper""" - initer = ScraperFactory.get(scraper)() - return initer.scrape() + with tempfile.TemporaryDirectory() as tmpdirname: + initer = ScraperFactory.get(scraper)(folder_name=tmpdirname) + return initer.scrape() execution_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") start_time = time.time() From 000e1823f76ea53dab5a45c55a1a391e5070775c Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Sat, 28 Sep 2024 18:34:58 +0000 Subject: [PATCH 11/20] . --- il_supermarket_scarper/engines/apsx.py | 54 +++++++++++++++++-- .../engines/multipage_web.py | 3 +- il_supermarket_scarper/engines/web.py | 5 +- il_supermarket_scarper/scrappers/yellow.py | 1 + .../utils/databases/json_file.py | 51 +++++++++++++----- .../utils/scraper_status.py | 8 ++- stress_test.py | 18 +++++++ 7 files changed, 118 insertions(+), 22 deletions(-) diff --git a/il_supermarket_scarper/engines/apsx.py b/il_supermarket_scarper/engines/apsx.py index d822799..3c454e8 100644 --- a/il_supermarket_scarper/engines/apsx.py +++ b/il_supermarket_scarper/engines/apsx.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from il_supermarket_scarper.utils import Logger +from il_supermarket_scarper.utils import Logger, FileTypesFilters from .web import WebBase @@ -15,6 +15,20 @@ def __init__( ) self.aspx_page = aspx_page + def file_type_id(self, file_type): + """get the file type id""" + if file_type == FileTypesFilters.STORE_FILE.name: + return 1 + if file_type == FileTypesFilters.PRICE_FILE.name: + return 2 + if file_type == FileTypesFilters.PROMO_FILE.name: + return 3 + if file_type == FileTypesFilters.PRICE_FULL_FILE.name: + return 4 + if file_type == FileTypesFilters.PROMO_FULL_FILE.name: + return 5 + raise ValueError(f"file type {file_type} not supported") + def extract_task_from_entry(self, all_trs): download_urls: list = list( map(lambda x: self.url + self.get_href_from_entry(x), all_trs) @@ -30,18 +44,48 @@ def _build_query_url(self, query_params): res.append(base + self.aspx_page + query_params) return res - def _get_all_possible_query_string_params(self): + def _get_all_possible_query_string_params( + self, files_types=None, store_id=None, when_date=None + ): """get the arguments need to add to the url""" if isinstance(self.chain_id, list): res = [] for c_id in self.chain_id: res.append(f"?code=={c_id}") return res - return [f"?code={self.chain_id}"] + chains_urls = [f"?code={self.chain_id}"] + + # add file types to url + if files_types: + chains_urls_with_types = [] + for files_type in files_types: + file_type_id = self.file_type_id(files_type) + 
chains_urls_with_types.extend( + [ + f"{chain_url}&WFileType={file_type_id}" + for chain_url in chains_urls + ] + ) + chains_urls = chains_urls_with_types + + # add store id + if store_id: + for chain_url in chains_urls: + chain_url += f"&WStore={store_id}" + + # posting date + if when_date: + for chain_url in chains_urls: + chain_url += ( + f"&WDate={when_date.strftime('%d/%m/%Y').reaplce('/','%2F')}" + ) + return chains_urls - def get_request_url(self): + def get_request_url(self, files_types=None, store_id=None, when_date=None): result = [] - for query_params in self._get_all_possible_query_string_params(): + for query_params in self._get_all_possible_query_string_params( + files_types=files_types, store_id=store_id, when_date=when_date + ): result.extend(self._build_query_url(query_params)) Logger.info(f"Request url: {result}") return result diff --git a/il_supermarket_scarper/engines/multipage_web.py b/il_supermarket_scarper/engines/multipage_web.py index cb2520a..81b159a 100644 --- a/il_supermarket_scarper/engines/multipage_web.py +++ b/il_supermarket_scarper/engines/multipage_web.py @@ -74,11 +74,12 @@ def collect_files_details_from_site( limit=None, files_types=None, store_id=None, + when_date=None, only_latest=False, files_names_to_scrape=None, ): self.post_scraping() - url = self.get_request_url() + url = self.get_request_url(files_types=files_types, store_id=store_id, when_date=when_date) total_pages = self.get_number_of_pages(url[0]) Logger.info(f"Found {total_pages} pages") diff --git a/il_supermarket_scarper/engines/web.py b/il_supermarket_scarper/engines/web.py index d8c05e0..d74faec 100644 --- a/il_supermarket_scarper/engines/web.py +++ b/il_supermarket_scarper/engines/web.py @@ -66,11 +66,14 @@ def collect_files_details_from_site( limit=None, files_types=None, store_id=None, + when_date=None, only_latest=False, files_names_to_scrape=None, ): """collect all enteris to download from site""" - urls_to_collect_link_from = self.get_request_url() + urls_to_collect_link_from = self.get_request_url( + files_types, store_id, when_date + ) all_trs = [] for url in urls_to_collect_link_from: diff --git a/il_supermarket_scarper/scrappers/yellow.py b/il_supermarket_scarper/scrappers/yellow.py index 65b60cd..1c14187 100644 --- a/il_supermarket_scarper/scrappers/yellow.py +++ b/il_supermarket_scarper/scrappers/yellow.py @@ -12,4 +12,5 @@ def __init__(self, folder_name=None): folder_name=folder_name, ftp_username="Paz_bo", ftp_password="paz468", + max_threads=10, ) diff --git a/il_supermarket_scarper/utils/databases/json_file.py b/il_supermarket_scarper/utils/databases/json_file.py index f7b2588..98e6ce0 100644 --- a/il_supermarket_scarper/utils/databases/json_file.py +++ b/il_supermarket_scarper/utils/databases/json_file.py @@ -30,21 +30,47 @@ def _get_database_file_path(self): """Get the full path to the database JSON file.""" return os.path.join(self.base_path, self.database_file) - def insert_document(self, collection_name, document): + def _read_database(self): + """Read the JSON database file and return its contents.""" + file_path = self._get_database_file_path() + data = {} + + # Load existing data from the file + if os.path.exists(file_path): + with open(file_path, "r", encoding="utf-8") as file: + try: + data = json.load(file) + except json.JSONDecodeError: + Logger.warning(f"File {file_path} is corrupted, resetting it.") + data = {} + return data + + def _write_database(self, data): + """Write data to the JSON database file.""" + file_path = self._get_database_file_path() + + 
with open(file_path, "w", encoding="utf-8") as file: + json.dump(dict(sorted(data.items())), file, default=str, indent=4) + + def insert_documents(self, collection_name, document): """Insert a document into a collection inside the JSON database.""" if self.collection_status: - file_path = self._get_database_file_path() - data = {} + + data = self._read_database() + # Ensure the collection exists in the database + if collection_name not in data: + data[collection_name] = [] - # Load existing data from the file - if os.path.exists(file_path): - with open(file_path, "r", encoding="utf-8") as file: - try: - data = json.load(file) - except json.JSONDecodeError: - Logger.warning(f"File {file_path} is corrupted, resetting it.") - data = {} + # Add the new document to the collection + data[collection_name].extend(document) + # Save the updated data back to the file + self._write_database(data) + + def insert_document(self, collection_name, document): + """Insert a document into a collection inside the JSON database.""" + if self.collection_status: + data = self._read_database() # Ensure the collection exists in the database if collection_name not in data: data[collection_name] = [] @@ -53,8 +79,7 @@ def insert_document(self, collection_name, document): data[collection_name].append(document) # Save the updated data back to the file - with open(file_path, "w", encoding="utf-8") as file: - json.dump(dict(sorted(data.items())), file, default=str, indent=4) + self._write_database(data) def find_document(self, collection_name, query): """Find a document in a collection based on a query.""" diff --git a/il_supermarket_scarper/utils/scraper_status.py b/il_supermarket_scarper/utils/scraper_status.py index 3a7efbc..a0078a1 100644 --- a/il_supermarket_scarper/utils/scraper_status.py +++ b/il_supermarket_scarper/utils/scraper_status.py @@ -100,12 +100,16 @@ def _add_downloaded_files_to_list(self, results, **_): """Add downloaded files to the MongoDB collection.""" if self.database.is_collection_enabled(): when = _now() + + documents = [] for res in results: if res["extract_succefully"]: - self.database.insert_document( - self.VERIFIED_DOWNLOADS, + documents.append( {"file_name": res["file_name"], "when": when}, ) + self.database.insert_documents( + self.VERIFIED_DOWNLOADS, + documents) @lock_by_string() def on_scrape_completed(self, folder_name): diff --git a/stress_test.py b/stress_test.py index 8afd864..1bddb40 100644 --- a/stress_test.py +++ b/stress_test.py @@ -1,8 +1,14 @@ import time import json +import sys import datetime import tempfile from il_supermarket_scarper.scrappers_factory import ScraperFactory +import pstats +import cProfile +from io import StringIO + + @@ -19,9 +25,21 @@ def full_execution(scraper): execution_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") start_time = time.time() + pr = cProfile.Profile() + pr.enable() + files = full_execution(scraper_name) + + pr.disable() + + stream = StringIO() + ps = pstats.Stats(pr, stream=stream) + ps.print_stats() + stream.seek(0) + end_time = time.time() result[scraper_name] = { + "status": stream.read(), "execution_time": execution_time, "start_time": start_time, "end_time": end_time, From 0b4c19a3fc221717a0f828031f9f1eb7ab7c5c3b Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Sun, 29 Sep 2024 17:24:56 +0000 Subject: [PATCH 12/20] . 
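Mostly lint and formatting cleanup of the previous commit (import order in the stress test, line wrapping, a pylint unused-argument pragma on the base get_request_url). Worth a note in passing: the profiling added to stress_test.py in the previous patch follows the standard cProfile/pstats recipe; a self-contained sketch of the same idea, with a stand-in workload instead of a real scraper run:

    import cProfile
    import pstats
    from io import StringIO

    def workload():
        # stand-in for a full scraper execution
        return sum(i * i for i in range(100_000))

    profiler = cProfile.Profile()
    profiler.enable()
    workload()
    profiler.disable()

    stream = StringIO()
    stats = pstats.Stats(profiler, stream=stream)
    stats.sort_stats("cumulative").print_stats(10)  # top 10 entries by cumulative time
    print(stream.getvalue())

Dumping the unsorted, uncapped print_stats() output into the per-scraper JSON works, but sorting by cumulative time and capping the line count keeps the report readable.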
--- il_supermarket_scarper/engines/apsx.py | 4 ++-- il_supermarket_scarper/engines/multipage_web.py | 4 +++- il_supermarket_scarper/engines/web.py | 4 ++-- il_supermarket_scarper/utils/databases/json_file.py | 4 ++-- il_supermarket_scarper/utils/scraper_status.py | 4 +--- stress_test.py | 6 +----- 6 files changed, 11 insertions(+), 15 deletions(-) diff --git a/il_supermarket_scarper/engines/apsx.py b/il_supermarket_scarper/engines/apsx.py index 3c454e8..72b9a7b 100644 --- a/il_supermarket_scarper/engines/apsx.py +++ b/il_supermarket_scarper/engines/apsx.py @@ -67,12 +67,12 @@ def _get_all_possible_query_string_params( ] ) chains_urls = chains_urls_with_types - + # add store id if store_id: for chain_url in chains_urls: chain_url += f"&WStore={store_id}" - + # posting date if when_date: for chain_url in chains_urls: diff --git a/il_supermarket_scarper/engines/multipage_web.py b/il_supermarket_scarper/engines/multipage_web.py index 81b159a..b42c6b2 100644 --- a/il_supermarket_scarper/engines/multipage_web.py +++ b/il_supermarket_scarper/engines/multipage_web.py @@ -79,7 +79,9 @@ def collect_files_details_from_site( files_names_to_scrape=None, ): self.post_scraping() - url = self.get_request_url(files_types=files_types, store_id=store_id, when_date=when_date) + url = self.get_request_url( + files_types=files_types, store_id=store_id, when_date=when_date + ) total_pages = self.get_number_of_pages(url[0]) Logger.info(f"Found {total_pages} pages") diff --git a/il_supermarket_scarper/engines/web.py b/il_supermarket_scarper/engines/web.py index d74faec..d513e9b 100644 --- a/il_supermarket_scarper/engines/web.py +++ b/il_supermarket_scarper/engines/web.py @@ -22,7 +22,7 @@ def get_data_from_page(self, req_res): soup = BeautifulSoup(req_res.text, features="lxml") return soup.find_all("tr")[1:] - def get_request_url(self): + def get_request_url(self,files_types=None, store_id=None, when_date=None): #pylint: disable=unused-argument """get all links to collect download links from""" return [self.url] @@ -72,7 +72,7 @@ def collect_files_details_from_site( ): """collect all enteris to download from site""" urls_to_collect_link_from = self.get_request_url( - files_types, store_id, when_date + files_types=files_types, store_id=store_id, when_date=when_date ) all_trs = [] diff --git a/il_supermarket_scarper/utils/databases/json_file.py b/il_supermarket_scarper/utils/databases/json_file.py index 98e6ce0..b50ce41 100644 --- a/il_supermarket_scarper/utils/databases/json_file.py +++ b/il_supermarket_scarper/utils/databases/json_file.py @@ -44,7 +44,7 @@ def _read_database(self): Logger.warning(f"File {file_path} is corrupted, resetting it.") data = {} return data - + def _write_database(self, data): """Write data to the JSON database file.""" file_path = self._get_database_file_path() @@ -55,7 +55,7 @@ def _write_database(self, data): def insert_documents(self, collection_name, document): """Insert a document into a collection inside the JSON database.""" if self.collection_status: - + data = self._read_database() # Ensure the collection exists in the database if collection_name not in data: diff --git a/il_supermarket_scarper/utils/scraper_status.py b/il_supermarket_scarper/utils/scraper_status.py index a0078a1..a138540 100644 --- a/il_supermarket_scarper/utils/scraper_status.py +++ b/il_supermarket_scarper/utils/scraper_status.py @@ -107,9 +107,7 @@ def _add_downloaded_files_to_list(self, results, **_): documents.append( {"file_name": res["file_name"], "when": when}, ) - self.database.insert_documents( - 
self.VERIFIED_DOWNLOADS, - documents) + self.database.insert_documents(self.VERIFIED_DOWNLOADS, documents) @lock_by_string() def on_scrape_completed(self, folder_name): diff --git a/stress_test.py b/stress_test.py index 1bddb40..c4017e3 100644 --- a/stress_test.py +++ b/stress_test.py @@ -1,15 +1,11 @@ import time import json -import sys import datetime import tempfile -from il_supermarket_scarper.scrappers_factory import ScraperFactory import pstats import cProfile from io import StringIO - - - +from il_supermarket_scarper.scrappers_factory import ScraperFactory if __name__ == "__main__": From f573a1a3b721aa09af0f8db8b66a676e8e67cf23 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Tue, 1 Oct 2024 20:28:19 +0000 Subject: [PATCH 13/20] change sore --- il_supermarket_scarper/scrappers/tests/test_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/il_supermarket_scarper/scrappers/tests/test_all.py b/il_supermarket_scarper/scrappers/tests/test_all.py index fddbf0e..dd96f72 100644 --- a/il_supermarket_scarper/scrappers/tests/test_all.py +++ b/il_supermarket_scarper/scrappers/tests/test_all.py @@ -6,7 +6,7 @@ class BareketTestCase(make_test_case(ScraperFactory.BAREKET, 5)): """Test case for ScraperFactory.BAREKET.""" -class YaynotBitanTestCase(make_test_case(ScraperFactory.YAYNO_BITAN, 6)): +class YaynotBitanTestCase(make_test_case(ScraperFactory.YAYNO_BITAN, 9032)): """Test case for ScraperFactory.YAYNO_BITAN.""" From 3f59fa946de725eccd1bc9c76834e3d73a97cda8 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Tue, 1 Oct 2024 20:35:54 +0000 Subject: [PATCH 14/20] . --- il_supermarket_scarper/engines/cerberus.py | 6 ++++-- il_supermarket_scarper/engines/engine.py | 4 ++++ il_supermarket_scarper/engines/web.py | 8 +++++++- il_supermarket_scarper/scrapper_runner.py | 2 ++ il_supermarket_scarper/scrappers/tests/test_cases.py | 2 ++ 5 files changed, 19 insertions(+), 3 deletions(-) diff --git a/il_supermarket_scarper/engines/cerberus.py b/il_supermarket_scarper/engines/cerberus.py index 4539f2a..302de4b 100644 --- a/il_supermarket_scarper/engines/cerberus.py +++ b/il_supermarket_scarper/engines/cerberus.py @@ -43,6 +43,8 @@ def scrape( store_id=None, only_latest=False, files_names_to_scrape=None, + filter_null=False, + filter_zero=False, ): files = [] try: @@ -55,8 +57,8 @@ def scrape( files = self.collect_files_details_from_site( limit=limit, files_types=files_types, - filter_null=False, - filter_zero=False, + filter_null=filter_null, + filter_zero=filter_zero, store_id=store_id, only_latest=only_latest, files_names_to_scrape=files_names_to_scrape, diff --git a/il_supermarket_scarper/engines/engine.py b/il_supermarket_scarper/engines/engine.py index b04db4c..b36ecf8 100644 --- a/il_supermarket_scarper/engines/engine.py +++ b/il_supermarket_scarper/engines/engine.py @@ -212,6 +212,8 @@ def scrape( store_id=None, only_latest=False, files_names_to_scrape=None, + filter_null=False, + filter_zero=False, ): """run the scraping logic""" self.post_scraping() @@ -221,6 +223,8 @@ def scrape( store_id=store_id, files_names_to_scrape=files_names_to_scrape, only_latest=only_latest, + filter_null=filter_null, + filter_zero=filter_zero, ) Logger.info(f"Starting scraping for {self.chain}") self.make_storage_path_dir() diff --git a/il_supermarket_scarper/engines/web.py b/il_supermarket_scarper/engines/web.py index d513e9b..c232b56 100644 --- a/il_supermarket_scarper/engines/web.py +++ b/il_supermarket_scarper/engines/web.py @@ -22,7 +22,9 @@ def get_data_from_page(self, req_res): soup = 
BeautifulSoup(req_res.text, features="lxml") return soup.find_all("tr")[1:] - def get_request_url(self,files_types=None, store_id=None, when_date=None): #pylint: disable=unused-argument + def get_request_url( + self, files_types=None, store_id=None, when_date=None + ): # pylint: disable=unused-argument """get all links to collect download links from""" return [self.url] @@ -109,6 +111,8 @@ def scrape( store_id=None, only_latest=False, files_names_to_scrape=None, + filter_null=False, + filter_zero=False, ): """scarpe the files from multipage sites""" download_urls, file_names = [], [] @@ -118,6 +122,8 @@ def scrape( files_types=files_types, store_id=store_id, only_latest=only_latest, + filter_null=filter_null, + filter_zero=filter_zero, ) download_urls, file_names = self.collect_files_details_from_site( diff --git a/il_supermarket_scarper/scrapper_runner.py b/il_supermarket_scarper/scrapper_runner.py index b0a5a34..3e6978b 100644 --- a/il_supermarket_scarper/scrapper_runner.py +++ b/il_supermarket_scarper/scrapper_runner.py @@ -96,6 +96,8 @@ def scrape_one( store_id=store_id, only_latest=only_latest, files_names_to_scrape=None, + filter_null=False, + filter_zero=False, ) Logger.info(f"done scraping {chain_name}") diff --git a/il_supermarket_scarper/scrappers/tests/test_cases.py b/il_supermarket_scarper/scrappers/tests/test_cases.py index 37fab63..79064ce 100644 --- a/il_supermarket_scarper/scrappers/tests/test_cases.py +++ b/il_supermarket_scarper/scrappers/tests/test_cases.py @@ -143,6 +143,8 @@ def __clean_scarpe_delete( "files_types": file_type, "store_id": store_id, "only_latest": only_latest, + "filter_null": True, + "filter_zero": True, } scraper.scrape(**kwarg) From db81cd11961acfecd9da325742fbb7d07cc509f5 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Tue, 1 Oct 2024 20:52:23 +0000 Subject: [PATCH 15/20] . --- il_supermarket_scarper/engines/apsx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/il_supermarket_scarper/engines/apsx.py b/il_supermarket_scarper/engines/apsx.py index 72b9a7b..53a6eca 100644 --- a/il_supermarket_scarper/engines/apsx.py +++ b/il_supermarket_scarper/engines/apsx.py @@ -51,7 +51,7 @@ def _get_all_possible_query_string_params( if isinstance(self.chain_id, list): res = [] for c_id in self.chain_id: - res.append(f"?code=={c_id}") + res.append(f"?code={c_id}") return res chains_urls = [f"?code={self.chain_id}"] From 6297b9bade15b34cd0e74408672d8db270982397 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Tue, 1 Oct 2024 22:34:15 +0000 Subject: [PATCH 16/20] fixed --- il_supermarket_scarper/utils/loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/il_supermarket_scarper/utils/loop.py b/il_supermarket_scarper/utils/loop.py index 51c847f..3a1babc 100644 --- a/il_supermarket_scarper/utils/loop.py +++ b/il_supermarket_scarper/utils/loop.py @@ -1,5 +1,5 @@ import concurrent.futures -from il_supermarket_scarper.utils import Logger +from logger import Logger def defualt_aggregtion_function(all_done): From 01c385913b54a6eabaff4b5c8fc4b7612e7809bf Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Wed, 2 Oct 2024 13:20:02 +0000 Subject: [PATCH 17/20] . 
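The previous patch ("fixed") swapped loop.py over to a bare `from logger import Logger`, which only resolves when the utils directory itself happens to be on sys.path; this one turns it into an explicit relative import. The underlying issue is presumably the circular dependency the absolute form created: utils/__init__.py imports from .loop, so loop.py importing Logger back through il_supermarket_scarper.utils re-enters the package __init__ while it is still initializing. Sketch of the shape of the problem (package and names are stand-ins, contents abridged):

    # pkg/utils/__init__.py
    from .loop import run_tasks        # __init__ pulls in loop ...

    # pkg/utils/loop.py -- circular variant
    from pkg.utils import Logger       # ... and loop re-enters the package __init__

    # pkg/utils/loop.py -- what this patch lands on
    from .logger import Logger         # import the sibling module directly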
--- il_supermarket_scarper/utils/loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/il_supermarket_scarper/utils/loop.py b/il_supermarket_scarper/utils/loop.py index 3a1babc..d04bffb 100644 --- a/il_supermarket_scarper/utils/loop.py +++ b/il_supermarket_scarper/utils/loop.py @@ -1,5 +1,5 @@ import concurrent.futures -from logger import Logger +from .logger import Logger def defualt_aggregtion_function(all_done): From 8d9ce41904161e50f62f343314abacd206903931 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Wed, 2 Oct 2024 16:33:07 +0000 Subject: [PATCH 18/20] . --- il_supermarket_scarper/utils/loop.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/il_supermarket_scarper/utils/loop.py b/il_supermarket_scarper/utils/loop.py index d04bffb..b8c40be 100644 --- a/il_supermarket_scarper/utils/loop.py +++ b/il_supermarket_scarper/utils/loop.py @@ -18,7 +18,10 @@ def multiple_page_aggregtion(pages_to_scrape): download_urls = [] file_names = [] for result in pages_to_scrape: - page_download_urls, page_file_names = result.result() + if hasattr(result, "result"): + page_download_urls, page_file_names = result.result() + else: + page_download_urls, page_file_names = result download_urls.extend(page_download_urls) file_names.extend(page_file_names) return download_urls, file_names From 4020d46a5dd0d9df35ba2d0959902047b56ef1b5 Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Wed, 2 Oct 2024 19:39:54 +0000 Subject: [PATCH 19/20] add execption inforamtion --- il_supermarket_scarper/utils/scraper_status.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/il_supermarket_scarper/utils/scraper_status.py b/il_supermarket_scarper/utils/scraper_status.py index a138540..dd733ab 100644 --- a/il_supermarket_scarper/utils/scraper_status.py +++ b/il_supermarket_scarper/utils/scraper_status.py @@ -1,4 +1,5 @@ import os +import traceback from .logger import Logger from .status import log_folder_details @@ -120,7 +121,10 @@ def on_scrape_completed(self, folder_name): def on_download_fail(self, execption, **additional_info): """report when the scraping in failed""" self._insert_an_update( - ScraperStatus.FAILED, execption=str(execption), **additional_info + ScraperStatus.FAILED, + execption=str(execption), + traceback=traceback.format_exc(), + **additional_info ) def _insert_an_update(self, status, **additional_info): From e9d3b4b94e1517e8472e506574cb012414b45e7d Mon Sep 17 00:00:00 2001 From: Sefi Erlich Date: Wed, 2 Oct 2024 19:40:10 +0000 Subject: [PATCH 20/20] . --- il_supermarket_scarper/utils/scraper_status.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/il_supermarket_scarper/utils/scraper_status.py b/il_supermarket_scarper/utils/scraper_status.py index dd733ab..92ece14 100644 --- a/il_supermarket_scarper/utils/scraper_status.py +++ b/il_supermarket_scarper/utils/scraper_status.py @@ -121,10 +121,10 @@ def on_scrape_completed(self, folder_name): def on_download_fail(self, execption, **additional_info): """report when the scraping in failed""" self._insert_an_update( - ScraperStatus.FAILED, - execption=str(execption), + ScraperStatus.FAILED, + execption=str(execption), traceback=traceback.format_exc(), - **additional_info + **additional_info, ) def _insert_an_update(self, status, **additional_info):
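End-of-series notes on the last three patches: the hasattr(result, "result") guard added to multiple_page_aggregtion presumably exists because execute_in_parallel now resolves futures inside run_tasks (via as_completed and .result()), so the aggregation function receives plain (urls, names) tuples rather than Future objects; the guard keeps it working with either. Patches 19-20 attach the formatted traceback to the FAILED status record. One property of that API worth keeping in mind: traceback.format_exc() formats the exception currently being handled, so it is only meaningful when called from (or beneath) an except block — outside one it returns "NoneType: None\n". A minimal illustration of the reporting shape, with illustrative names and a fake file name:

    import traceback

    def report_failure(exc, **additional_info):
        # same shape as on_download_fail: stringified exception plus the active traceback
        return {
            "exception": str(exc),
            "traceback": traceback.format_exc(),
            **additional_info,
        }

    try:
        1 / 0
    except ZeroDivisionError as err:
        record = report_failure(err, file_name="PriceFull-example.gz")
        print(record["traceback"])  # full "Traceback (most recent call last): ..." text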