v0.4.6 #62

Merged
merged 21 commits
Oct 3, 2024
64 changes: 56 additions & 8 deletions il_supermarket_scarper/engines/apsx.py
@@ -1,16 +1,34 @@
from abc import ABC, abstractmethod
from il_supermarket_scarper.utils import Logger
from il_supermarket_scarper.utils import Logger, FileTypesFilters

from .web import WebBase


class Aspx(WebBase, ABC):
"""class for aspx scapers"""

def __init__(self, chain, chain_id, url, aspx_page, folder_name=None):
super().__init__(chain, chain_id, url, folder_name=folder_name)
def __init__(
self, chain, chain_id, url, aspx_page, folder_name=None, max_threads=5
):
super().__init__(
chain, chain_id, url, folder_name=folder_name, max_threads=max_threads
)
self.aspx_page = aspx_page

def file_type_id(self, file_type):
"""get the file type id"""
if file_type == FileTypesFilters.STORE_FILE.name:
return 1
if file_type == FileTypesFilters.PRICE_FILE.name:
return 2
if file_type == FileTypesFilters.PROMO_FILE.name:
return 3
if file_type == FileTypesFilters.PRICE_FULL_FILE.name:
return 4
if file_type == FileTypesFilters.PROMO_FULL_FILE.name:
return 5
raise ValueError(f"file type {file_type} not supported")

def extract_task_from_entry(self, all_trs):
download_urls: list = list(
map(lambda x: self.url + self.get_href_from_entry(x), all_trs)
@@ -26,18 +44,48 @@ def _build_query_url(self, query_params):
res.append(base + self.aspx_page + query_params)
return res

def _get_all_possible_query_string_params(self):
def _get_all_possible_query_string_params(
self, files_types=None, store_id=None, when_date=None
):
"""get the arguments need to add to the url"""
if isinstance(self.chain_id, list):
res = []
for c_id in self.chain_id:
res.append(f"?code=={c_id}")
res.append(f"?code={c_id}")
return res
return [f"?code={self.chain_id}"]
chains_urls = [f"?code={self.chain_id}"]

# add file types to url
if files_types:
chains_urls_with_types = []
for files_type in files_types:
file_type_id = self.file_type_id(files_type)
chains_urls_with_types.extend(
[
f"{chain_url}&WFileType={file_type_id}"
for chain_url in chains_urls
]
)
chains_urls = chains_urls_with_types

# add store id
if store_id:
chains_urls = [
f"{chain_url}&WStore={store_id}" for chain_url in chains_urls
]

# posting date
if when_date:
when_date_param = when_date.strftime("%d/%m/%Y").replace("/", "%2F")
chains_urls = [
f"{chain_url}&WDate={when_date_param}" for chain_url in chains_urls
]
return chains_urls

def get_request_url(self):
def get_request_url(self, files_types=None, store_id=None, when_date=None):
result = []
for query_params in self._get_all_possible_query_string_params():
for query_params in self._get_all_possible_query_string_params(
files_types=files_types, store_id=store_id, when_date=when_date
):
result.extend(self._build_query_url(query_params))
Logger.info(f"Request url: {result}")
return result
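
For illustration, a minimal standalone sketch (hypothetical chain id, store id, and date) that mirrors the query strings the extended parameters now build:

from datetime import datetime

# Hypothetical inputs, mirroring the new WFileType / WStore / WDate parameters.
chain_id = "7290027600007"
file_type_ids = [2, 3]  # PRICE_FILE and PROMO_FILE per file_type_id()
store_id = 1
when_date = datetime(2024, 10, 3)

urls = [f"?code={chain_id}"]
urls = [f"{u}&WFileType={t}" for t in file_type_ids for u in urls]
urls = [f"{u}&WStore={store_id}" for u in urls]
date_param = when_date.strftime("%d/%m/%Y").replace("/", "%2F")
urls = [f"{u}&WDate={date_param}" for u in urls]
print(urls)
# ['?code=7290027600007&WFileType=2&WStore=1&WDate=03%2F10%2F2024',
#  '?code=7290027600007&WFileType=3&WStore=1&WDate=03%2F10%2F2024']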
15 changes: 9 additions & 6 deletions il_supermarket_scarper/engines/cerberus.py
@@ -4,7 +4,7 @@
from il_supermarket_scarper.utils import (
extract_xml_file_from_gz_file,
Logger,
execute_in_event_loop,
execute_in_parallel,
collect_from_ftp,
fetch_temporary_gz_file_from_ftp,
retry_files,
@@ -26,8 +26,9 @@ def __init__(
ftp_path="/",
ftp_username="",
ftp_password="",
max_threads=5,
):
super().__init__(chain, chain_id, folder_name)
super().__init__(chain, chain_id, folder_name, max_threads)
self.ftp_host = ftp_host
self.ftp_path = ftp_path
self.ftp_username = ftp_username
@@ -42,6 +43,8 @@ def scrape(
store_id=None,
only_latest=False,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
):
files = []
try:
@@ -54,16 +57,16 @@ def scrape(
files = self.collect_files_details_from_site(
limit=limit,
files_types=files_types,
filter_null=True,
filter_zero=True,
filter_null=filter_null,
filter_zero=filter_zero,
store_id=store_id,
only_latest=only_latest,
files_names_to_scrape=files_names_to_scrape,
)
self.on_collected_details(files)

results = execute_in_event_loop(
self.persist_from_ftp, files, max_workers=self.max_workers
results = execute_in_parallel(
self.persist_from_ftp, list(files), max_threads=self.max_threads
)
self.on_download_completed(results=results)
self.on_scrape_completed(self.get_storage_path())
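
execute_in_parallel itself lives in il_supermarket_scarper/utils/loop.py and is not part of this diff; a rough sketch of a helper with this call shape (an assumption, not the actual implementation) could look like:

from concurrent.futures import ThreadPoolExecutor

def execute_in_parallel(function, iterable, max_threads=5, aggregtion_function=list):
    """Sketch only: map `function` over `iterable` with a thread pool and
    aggregate the per-item results (defaults to collecting them into a list)."""
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        results = list(executor.map(function, iterable))
    return aggregtion_function(results)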
8 changes: 6 additions & 2 deletions il_supermarket_scarper/engines/engine.py
@@ -21,15 +21,15 @@
class Engine(ScraperStatus, ABC):
"""base engine for scraping"""

def __init__(self, chain, chain_id, folder_name=None):
def __init__(self, chain, chain_id, folder_name=None, max_threads=10):
assert DumpFolderNames.is_valid_folder_name(
chain
), "chain name can contain only abc and -"

super().__init__(chain.value, "status", folder_name=folder_name)
self.chain = chain
self.chain_id = chain_id
self.max_workers = 5
self.max_threads = max_threads
self.storage_path = get_output_folder(self.chain.value, folder_name=folder_name)
Logger.info(f"Storage path: {self.storage_path}")

@@ -212,6 +212,8 @@ def scrape(
store_id=None,
only_latest=False,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
):
"""run the scraping logic"""
self.post_scraping()
@@ -221,6 +223,8 @@ def scrape(
store_id=store_id,
files_names_to_scrape=files_names_to_scrape,
only_latest=only_latest,
filter_null=filter_null,
filter_zero=filter_zero,
)
Logger.info(f"Starting scraping for {self.chain}")
self.make_storage_path_dir()
18 changes: 12 additions & 6 deletions il_supermarket_scarper/engines/multipage_web.py
@@ -10,7 +10,7 @@

from il_supermarket_scarper.utils import (
Logger,
execute_in_event_loop,
execute_in_parallel,
multiple_page_aggregtion,
)
from .web import WebBase
@@ -31,8 +31,11 @@ def __init__(
total_page_xpath="""//*[@id="gridContainer"]/table/
tfoot/tr/td/a[6]/@href""",
total_pages_pattern=r"^\/\?page\=([0-9]{3})$",
max_threads=5,
):
super().__init__(chain, chain_id, url=url, folder_name=folder_name)
super().__init__(
chain, chain_id, url=url, folder_name=folder_name, max_threads=max_threads
)
self.total_page_xpath = total_page_xpath
self.total_pages_pattern = total_pages_pattern

@@ -71,11 +74,14 @@ def collect_files_details_from_site(
limit=None,
files_types=None,
store_id=None,
when_date=None,
only_latest=False,
files_names_to_scrape=None,
):
self.post_scraping()
url = self.get_request_url()
url = self.get_request_url(
files_types=files_types, store_id=store_id, when_date=when_date
)

total_pages = self.get_number_of_pages(url[0])
Logger.info(f"Found {total_pages} pages")
@@ -87,11 +93,11 @@ def collect_files_details_from_site(
)
)

download_urls, file_names = execute_in_event_loop(
download_urls, file_names = execute_in_parallel(
self.process_links_before_download,
pages_to_scrape,
list(pages_to_scrape),
aggregtion_function=multiple_page_aggregtion,
max_workers=self.max_workers,
max_threads=self.max_threads,
)
file_names, download_urls = self.apply_limit_zip(
file_names,
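
multiple_page_aggregtion (also from utils/loop.py) is likewise not shown in this diff; a plausible sketch (assumption) of how it merges per-page results into the (download_urls, file_names) pair unpacked above:

def multiple_page_aggregtion(pages_results):
    """Sketch only: flatten per-page (download_urls, file_names) tuples
    into one pair of flat lists."""
    download_urls, file_names = [], []
    for page_urls, page_names in pages_results:
        download_urls.extend(page_urls)
        file_names.extend(page_names)
    return download_urls, file_names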
11 changes: 10 additions & 1 deletion il_supermarket_scarper/engines/publishprice.py
@@ -17,12 +17,21 @@ class PublishPrice(WebBase):
but this is not implemented.
"""

def __init__(self, chain, chain_id, site_infix, folder_name=None, domain="prices"):
def __init__(
self,
chain,
chain_id,
site_infix,
folder_name=None,
domain="prices",
max_threads=5,
):
super().__init__(
chain,
chain_id,
url=f"https://{domain}.{site_infix}.co.il/",
folder_name=folder_name,
max_threads=max_threads,
)
self.folder = None

25 changes: 17 additions & 8 deletions il_supermarket_scarper/engines/web.py
@@ -1,7 +1,7 @@
from bs4 import BeautifulSoup
from il_supermarket_scarper.utils import (
Logger,
execute_in_event_loop,
execute_in_parallel,
session_and_check_status,
retry_files,
)
@@ -12,8 +12,8 @@
class WebBase(Engine):
"""scrape the file of websites that the only why to download them is via web"""

def __init__(self, chain, chain_id, url, folder_name=None):
super().__init__(chain, chain_id, folder_name)
def __init__(self, chain, chain_id, url, folder_name=None, max_threads=5):
super().__init__(chain, chain_id, folder_name, max_threads=max_threads)
self.url = url
self.max_retry = 2

@@ -22,7 +22,9 @@ def get_data_from_page(self, req_res):
soup = BeautifulSoup(req_res.text, features="lxml")
return soup.find_all("tr")[1:]

def get_request_url(self):
def get_request_url(
self, files_types=None, store_id=None, when_date=None
): # pylint: disable=unused-argument
"""get all links to collect download links from"""
return [self.url]

@@ -66,11 +68,14 @@ def collect_files_details_from_site(
limit=None,
files_types=None,
store_id=None,
when_date=None,
only_latest=False,
files_names_to_scrape=None,
):
"""collect all enteris to download from site"""
urls_to_collect_link_from = self.get_request_url()
urls_to_collect_link_from = self.get_request_url(
files_types=files_types, store_id=store_id, when_date=when_date
)

all_trs = []
for url in urls_to_collect_link_from:
@@ -106,6 +111,8 @@ def scrape(
store_id=None,
only_latest=False,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
):
"""scarpe the files from multipage sites"""
download_urls, file_names = [], []
@@ -115,6 +122,8 @@ def scrape(
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
filter_null=filter_null,
filter_zero=filter_zero,
)

download_urls, file_names = self.collect_files_details_from_site(
@@ -129,10 +138,10 @@ def scrape(

Logger.info(f"collected {len(download_urls)} to download.")
if len(download_urls) > 0:
results = execute_in_event_loop(
results = execute_in_parallel(
self.save_and_extract,
zip(download_urls, file_names),
max_workers=self.max_workers,
list(zip(download_urls, file_names)),
max_threads=self.max_threads,
)
else:
results = []
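
As a usage note, the new filter flags are now plain keyword arguments on scrape(); a hypothetical call through any WebBase-derived scraper would look like:

# Hypothetical scraper instance; the class name is illustrative only.
# scraper = SomeAspxScraper(folder_name="dumps")
# scraper.scrape(
#     limit=10,
#     files_types=["PRICE_FULL_FILE"],
#     store_id=1,
#     filter_null=False,  # previously hard-coded to True inside Cerberus.scrape
#     filter_zero=False,
# )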
2 changes: 2 additions & 0 deletions il_supermarket_scarper/scrapper_runner.py
@@ -96,6 +96,8 @@ def scrape_one(
store_id=store_id,
only_latest=only_latest,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
)
Logger.info(f"done scraping {chain_name}")

1 change: 1 addition & 0 deletions il_supermarket_scarper/scrappers/ramilevy.py
@@ -11,4 +11,5 @@ def __init__(self, folder_name=None):
chain_id="7290058140886",
folder_name=folder_name,
ftp_username="RamiLevi",
max_threads=10,
)
2 changes: 1 addition & 1 deletion il_supermarket_scarper/scrappers/tests/test_all.py
@@ -6,7 +6,7 @@ class BareketTestCase(make_test_case(ScraperFactory.BAREKET, 5)):
"""Test case for ScraperFactory.BAREKET."""


class YaynotBitanTestCase(make_test_case(ScraperFactory.YAYNO_BITAN, 6)):
class YaynotBitanTestCase(make_test_case(ScraperFactory.YAYNO_BITAN, 9032)):
"""Test case for ScraperFactory.YAYNO_BITAN."""


4 changes: 3 additions & 1 deletion il_supermarket_scarper/scrappers/tests/test_cases.py
@@ -143,6 +143,8 @@ def __clean_scarpe_delete(
"files_types": file_type,
"store_id": store_id,
"only_latest": only_latest,
"filter_null": True,
"filter_zero": True,
}

scraper.scrape(**kwarg)
@@ -199,7 +201,7 @@ def test_scrape_one(self):

def test_scrape_ten(self):
"""scrape ten file and make sure they exists"""
self._clean_scarpe_delete(scraper_enum, limit=10)
self._clean_scarpe_delete(scraper_enum, limit=None)

def test_scrape_promo(self):
"""scrape one promo file and make sure it exists"""
1 change: 1 addition & 0 deletions il_supermarket_scarper/scrappers/yellow.py
@@ -12,4 +12,5 @@ def __init__(self, folder_name=None):
folder_name=folder_name,
ftp_username="Paz_bo",
ftp_password="paz468",
max_threads=10,
)
2 changes: 1 addition & 1 deletion il_supermarket_scarper/utils/__init__.py
@@ -23,7 +23,7 @@
fetch_temporary_gz_file_from_ftp,
wget_file,
)
from .loop import execute_in_event_loop, multiple_page_aggregtion
from .loop import execute_in_parallel, multiple_page_aggregtion
from .exceptions import RestartSessionError
from .retry import retry_files
from .marking import FlakyScraper