Tested locally
0.4.9
erlichsefi authored Oct 11, 2024
2 parents 3bfb65d + 966f57b commit 6fed2c3
Showing 19 changed files with 12,848 additions and 227 deletions.
55 changes: 29 additions & 26 deletions il_supermarket_scarper/engines/apsx.py
@@ -44,43 +44,46 @@ def _build_query_url(self, query_params):
res.append(base + self.aspx_page + query_params)
return res

def _get_all_possible_query_string_params(
def _get_all_possible_query_string_params( # pylint: disable=unused-argument
self, files_types=None, store_id=None, when_date=None
):
"""get the arguments need to add to the url"""
if isinstance(self.chain_id, list):
res = []
for c_id in self.chain_id:
res.append(f"?code={c_id}")
return res
chains_urls = res

chains_urls = [f"?code={self.chain_id}"]

# add file types to url
if files_types:
chains_urls_with_types = []
for files_type in files_types:
file_type_id = self.file_type_id(files_type)
chains_urls_with_types.extend(
[
f"{chain_url}&WFileType={file_type_id}"
for chain_url in chains_urls
]
)
chains_urls = chains_urls_with_types

# add store id
if store_id:
for chain_url in chains_urls:
chain_url += f"&WStore={store_id}"

# posting date
if when_date:
for chain_url in chains_urls:
chain_url += (
f"&WDate={when_date.strftime('%d/%m/%Y').reaplce('/','%2F')}"
)
return chains_urls

# # add file types to url
# if files_types:
# chains_urls_with_types = []
# for files_type in files_types:
# file_type_id = self.file_type_id(files_type)
# chains_urls_with_types.extend(
# [
# f"{chain_url}&WFileType={file_type_id}"
# for chain_url in chains_urls
# ]
# )
# chains_urls = chains_urls_with_types

# # add store id
# if store_id:
# for chain_url in chains_urls:
# chain_url += f"&WStore={store_id}"

# # posting date
# if when_date:
# for chain_url in chains_urls:
# chain_url += (
# f"&WDate={when_date.strftime('%d/%m/%Y').reaplce('/','%2F')}"
# )
# return chains_urls

def get_request_url(self, files_types=None, store_id=None, when_date=None):
result = []
for query_params in self._get_all_possible_query_string_params(
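For context, the new query-string builder in apsx.py composes ?code=<chain> and then optionally appends &WFileType=, &WStore= and &WDate= parameters. Below is a minimal stand-alone sketch of that composition; the parameter names come from the diff, while the function name, chain code and file-type id are made up for illustration.

from datetime import datetime

def build_query_strings(chain_ids, file_type_ids=None, store_id=None, when_date=None):
    """Illustrative re-creation of the query strings the updated apsx.py aims to build."""
    urls = [f"?code={chain_id}" for chain_id in chain_ids]
    if file_type_ids:
        # one URL per (chain, file type) combination
        urls = [f"{url}&WFileType={type_id}" for url in urls for type_id in file_type_ids]
    if store_id:
        urls = [f"{url}&WStore={store_id}" for url in urls]
    if when_date:
        # dd/mm/yyyy with the slashes percent-encoded, matching the diff's strftime/replace chain
        urls = [f"{url}&WDate={when_date.strftime('%d/%m/%Y').replace('/', '%2F')}" for url in urls]
    return urls

# build_query_strings(["7290027600007"], file_type_ids=[1], store_id=5, when_date=datetime(2024, 10, 11))
# -> ['?code=7290027600007&WFileType=1&WStore=5&WDate=11%2F10%2F2024']
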
11 changes: 6 additions & 5 deletions il_supermarket_scarper/engines/cerberus.py
@@ -41,7 +41,7 @@ def scrape(
limit=None,
files_types=None,
store_id=None,
only_latest=False,
when_date=None,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
@@ -52,15 +52,15 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
)
files = self.collect_files_details_from_site(
limit=limit,
files_types=files_types,
filter_null=filter_null,
filter_zero=filter_zero,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
files_names_to_scrape=files_names_to_scrape,
)
self.on_collected_details(files)
@@ -73,6 +73,7 @@ def scrape(
return results
except Exception as e: # pylint: disable=broad-except
self.on_download_fail(e, files=files)
Logger.error_execption(e)
return []

def collect_files_details_from_site(
@@ -82,7 +83,7 @@ def collect_files_details_from_site(
filter_null=False,
filter_zero=False,
store_id=None,
only_latest=False,
when_date=None,
files_names_to_scrape=None,
):
"""collect all files to download from the site"""
@@ -117,7 +118,7 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
files_names_to_scrape=files_names_to_scrape,
)
Logger.info(f"After applying limit: Found {len(files)} files")
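The visible change in cerberus.py is the signature of scrape() and collect_files_details_from_site(): the only_latest flag is replaced by when_date, which is threaded through to apply_limit. A hypothetical call showing the two values the new parameter accepts; the scraper class and its construction are placeholders, not names taken from this repository.

from datetime import datetime

scraper = SomeCerberusBasedScraper(...)  # placeholder; construction details omitted

# keep only the files published on a specific date
scraper.scrape(when_date=datetime(2024, 10, 11))

# or keep only the newest file per store
scraper.scrape(when_date="latest")
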
51 changes: 35 additions & 16 deletions il_supermarket_scarper/engines/engine.py
@@ -1,7 +1,7 @@
from abc import ABC
import os
import re

import datetime

from il_supermarket_scarper.utils import (
get_output_folder,
@@ -38,16 +38,16 @@ def get_storage_path(self):
return self.storage_path

def _is_validate_scraper_found_no_files(
self, limit=None, files_types=None, store_id=None, only_latest=False
self, limit=None, files_types=None, store_id=None, when_date=None
):
Logger.info(
f"check if fail is allowd with, limit={limit},"
f"files_types={files_types},store_id={store_id},only_latest={only_latest}"
f"files_types={files_types},store_id={store_id},when_date={when_date}"
)
return False

def is_validate_scraper_found_no_files(
self, limit=None, files_types=None, store_id=None, only_latest=False
self, limit=None, files_types=None, store_id=None, when_date=None
):
"""return true if its ok the scarper reuturn no enrty"""

@@ -58,7 +58,7 @@ def is_validate_scraper_found_no_files(
for file_type in files_types:
if file_type in FileTypesFilters.all_full_files():
request_only_update_file = False
Logger.info(f"the value of {only_latest} should not affect.")
Logger.info(f"the value of {when_date} should not affect.")
return (
limit == 0
or files_types == []
@@ -68,7 +68,7 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
)
)

@@ -83,13 +83,13 @@ def apply_limit(
files_types=None,
by_function=lambda x: x,
store_id=None,
only_latest=False,
when_date=None,
files_names_to_scrape=None,
):
"""filter the list according to condition"""
assert (
not only_latest or limit is None
), "only_latest flag can't be applied with limit."
# assert (
# when_date is not None or limit is None
# ), "when_date flag can't be applied with limit."

# filter files already downloaded
intreable_ = self.filter_already_downloaded(
@@ -124,13 +124,17 @@
)
Logger.info(f"Number of entry after filter file type id is {len(intreable_)}")

if only_latest:
if isinstance(when_date, datetime.datetime):
intreable_ = self.get_by_date(when_date, by_function, intreable_)
elif isinstance(when_date, str) and when_date == "latest":
intreable_ = self.get_only_latest(by_function, intreable_)
elif when_date is not None:
raise ValueError(f"when_date should be datetime or bool, got {when_date}")

Logger.info(f"Number of entry after filter keeping latast is {len(intreable_)}")

# filter by limit if the 'files_types' filter is not on.
if limit and files_types is None:
if limit:
assert limit > 0, "Limit must be greater than 0"
Logger.info(f"Limit: {limit}")
intreable_ = intreable_[: min(limit, len(list(intreable_)))]
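The hunk above replaces the old only_latest branch with a type-based dispatch on when_date: a datetime routes to get_by_date, the literal string "latest" routes to get_only_latest, and any other non-None value raises. A compressed sketch of just that decision follows; the function name is illustrative.

import datetime

def pick_date_filter(when_date):
    """Mirror of the new when_date dispatch in Engine.apply_limit."""
    if isinstance(when_date, datetime.datetime):
        return "keep files stamped with that exact date"   # routed to get_by_date
    if isinstance(when_date, str) and when_date == "latest":
        return "keep only the newest file per store"        # routed to get_only_latest
    if when_date is not None:
        raise ValueError(f"when_date should be a datetime or 'latest', got {when_date}")
    return "no date filtering"
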
@@ -144,12 +148,12 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
)
):
raise ValueError(
f"No files to download for file files_types={files_types},"
f"limit={limit},store_id={store_id},only_latest={only_latest}"
f"limit={limit},store_id={store_id},when_date={when_date}"
)
return intreable_

@@ -182,6 +186,21 @@ def get_only_latest(self, by_function, intreable_):
groups_value[store_info] = file
return list(groups_value.values())

def get_by_date(self, requested_date, by_function, intreable_):
"""get by date"""
#
date_format = requested_date.strftime("%Y%m%d")
#
groups_value = []
for file in intreable_:
name_split = by_function(file).split("-", maxsplit=2)
date_info = name_split[-1].rsplit(".", maxsplit=1)[0]

if date_info.startswith(date_format):
groups_value.append(file)

return groups_value

@classmethod
def unique(cls, iterable, by_function=lambda x: x):
"""Returns the type of the file."""
@@ -210,7 +229,7 @@ def scrape(
limit=None,
files_types=None,
store_id=None,
only_latest=False,
when_date=None,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
Expand All @@ -222,7 +241,7 @@ def scrape(
files_types=files_types,
store_id=store_id,
files_names_to_scrape=files_names_to_scrape,
only_latest=only_latest,
when_date=when_date,
filter_null=filter_null,
filter_zero=filter_zero,
)
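The new get_by_date helper matches on the date embedded at the end of each file name. A minimal stand-alone sketch of that logic, assuming names of the form <type><chain>-<store>-<YYYYMMDDHHMM>.<ext>; the chain and store numbers below are invented.

from datetime import datetime

def filter_by_date(file_names, when_date):
    """Illustrative stand-alone version of Engine.get_by_date: keep files whose
    trailing timestamp starts with the requested YYYYMMDD date."""
    date_prefix = when_date.strftime("%Y%m%d")
    kept = []
    for name in file_names:
        # e.g. "PriceFull7290027600007-001-202410110300.gz" -> "202410110300"
        timestamp = name.split("-", maxsplit=2)[-1].rsplit(".", maxsplit=1)[0]
        if timestamp.startswith(date_prefix):
            kept.append(name)
    return kept

# filter_by_date(["PriceFull7290027600007-001-202410110300.gz",
#                 "PriceFull7290027600007-001-202410100300.gz"],
#                datetime(2024, 10, 11))
# -> ["PriceFull7290027600007-001-202410110300.gz"]
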
7 changes: 3 additions & 4 deletions il_supermarket_scarper/engines/multipage_web.py
@@ -77,7 +77,6 @@ def collect_files_details_from_site(
files_types=None,
store_id=None,
when_date=None,
only_latest=False,
files_names_to_scrape=None,
):
self.post_scraping()
@@ -109,7 +108,7 @@ def collect_files_details_from_site(
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
files_names_to_scrape=files_names_to_scrape,
)

@@ -125,7 +124,7 @@ def collect_files_details_from_page(self, html):
return links, filenames

def process_links_before_download(
self, page, limit=None, files_types=None, store_id=None, only_latest=None
self, page, limit=None, files_types=None, store_id=None, when_date=None
):
"""additional processing to the links before download"""
response = self.session_with_cookies_by_chain(page)
@@ -141,7 +140,7 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
)

Logger.info(
4 changes: 2 additions & 2 deletions il_supermarket_scarper/engines/publishprice.py
@@ -88,14 +88,14 @@ def get_name_from_herf(x):
return download_urls, file_names

def _is_validate_scraper_found_no_files(
self, limit=None, files_types=None, store_id=None, only_latest=False
self, limit=None, files_types=None, store_id=None, when_date=None
):
return (
super()._is_validate_scraper_found_no_files( # what fails the rest
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
)
or ( # if we are looking for one store file in a weekend or holiday
store_id and (_is_weekend_in_israel() or _is_holiday_in_israel())
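In publishprice.py the behavioural addition is the second clause: an empty result is treated as valid when a single store was requested on an Israeli weekend or holiday. A small sketch of that predicate; the function below is hypothetical, only the _is_weekend_in_israel / _is_holiday_in_israel names come from the diff.

def empty_result_is_expected(base_allows_it, store_id, weekend, holiday):
    """Mirror of the updated _is_validate_scraper_found_no_files condition."""
    return base_allows_it or (bool(store_id) and (weekend or holiday))

# empty_result_is_expected(False, store_id=5, weekend=True, holiday=False) -> True
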
14 changes: 7 additions & 7 deletions il_supermarket_scarper/engines/web.py
@@ -45,7 +45,7 @@ def apply_limit_zip(
files_types=None,
by_function=lambda x: x[0],
store_id=None,
only_latest=False,
when_date=None,
files_names_to_scrape=None,
):
"""apply limit to zip"""
@@ -55,7 +55,7 @@
files_types=files_types,
by_function=by_function,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
files_names_to_scrape=files_names_to_scrape,
)
if len(ziped) == 0:
@@ -69,7 +69,6 @@ def collect_files_details_from_site(
files_types=None,
store_id=None,
when_date=None,
only_latest=False,
files_names_to_scrape=None,
):
"""collect all enteris to download from site"""
@@ -95,7 +94,7 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
files_names_to_scrape=files_names_to_scrape,
)

@@ -109,7 +108,7 @@ def scrape(
limit=None,
files_types=None,
store_id=None,
only_latest=False,
when_date=None,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
@@ -121,7 +120,7 @@
limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
filter_null=filter_null,
filter_zero=filter_zero,
)
@@ -130,7 +129,7 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
files_names_to_scrape=files_names_to_scrape,
)

@@ -153,4 +152,5 @@ def scrape(
return results
except Exception as e: # pylint: disable=broad-except
self.on_download_fail(e, download_urls=download_urls, file_names=file_names)
Logger.error_execption(e)
return []