Merge pull request #61 from erlichsefi/dev
v0.4.5
erlichsefi authored Sep 27, 2024
2 parents 46647f1 + 94025a1 commit babe8e7
Showing 12 changed files with 176 additions and 165 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pylint.yml
@@ -28,4 +28,4 @@ jobs:
pip install pylint
- name: Analysing the code with pylint
run: |
- pylint $(git ls-files '*.py') --disable=E0401,R0801,R0903,W0707
+ pylint $(git ls-files '*.py') --disable=E0401,R0801,R0903,W0707,R0917
4 changes: 3 additions & 1 deletion .github/workflows/test-suite.yml
@@ -42,4 +42,6 @@ jobs:
- name: Build with Docker
run: docker build -t erlichsefi/israeli-supermarket-scarpers:test --target test .
- name: Test with pytest
- run: docker run --rm --name test-run -e DISABLED_SCRAPPERS="${{ env.DISABLED_SCRAPPERS }}" erlichsefi/israeli-supermarket-scarpers:test
+ run: docker rm scraper-test-run || true &&
+   docker run --rm --name scraper-test-run -e DISABLED_SCRAPPERS="${{ env.DISABLED_SCRAPPERS }}" erlichsefi/israeli-supermarket-scarpers:test &&
+   docker builder prune -f
2 changes: 1 addition & 1 deletion README.md
@@ -67,7 +67,7 @@ Quick start

il_supermarket_scarper can be installed using pip:

- python3 pip install il-supermarket-scraper
+ python3 pip install israeli-supermarket-scraper

If you want to run the latest version of the code, you can install it from the
repo directly:
49 changes: 27 additions & 22 deletions il_supermarket_scarper/engines/cerberus.py
@@ -43,29 +43,34 @@ def scrape(
        only_latest=False,
        files_names_to_scrape=None,
    ):
-        super().scrape(
-            limit=limit,
-            files_types=files_types,
-            store_id=store_id,
-            only_latest=only_latest,
-        )
-        files = self.collect_files_details_from_site(
-            limit=limit,
-            files_types=files_types,
-            filter_null=True,
-            filter_zero=True,
-            store_id=store_id,
-            only_latest=only_latest,
-            files_names_to_scrape=files_names_to_scrape,
-        )
-        self.on_collected_details(files)
+        files = []
+        try:
+            super().scrape(
+                limit=limit,
+                files_types=files_types,
+                store_id=store_id,
+                only_latest=only_latest,
+            )
+            files = self.collect_files_details_from_site(
+                limit=limit,
+                files_types=files_types,
+                filter_null=True,
+                filter_zero=True,
+                store_id=store_id,
+                only_latest=only_latest,
+                files_names_to_scrape=files_names_to_scrape,
+            )
+            self.on_collected_details(files)

-        results = execute_in_event_loop(
-            self.persist_from_ftp, files, max_workers=self.max_workers
-        )
-        self.on_download_completed(results=results)
-        self.on_scrape_completed(self.get_storage_path())
-        return results
+            results = execute_in_event_loop(
+                self.persist_from_ftp, files, max_workers=self.max_workers
+            )
+            self.on_download_completed(results=results)
+            self.on_scrape_completed(self.get_storage_path())
+            return results
+        except Exception as e:  # pylint: disable=broad-except
+            self.on_download_fail(e, files=files)
+            return []

    def collect_files_details_from_site(
        self,
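The cerberus.py hunk above (and the matching web.py hunk further down) wraps the whole scrape flow in a broad try/except so that a failing chain reports through on_download_fail and returns an empty result instead of aborting the run. A minimal standalone sketch of that pattern, assuming nothing beyond the hook names visible in the diff (the SafeScraper class and its collect method are illustrative, not part of the repo):

class SafeScraper:
    """Illustrative stand-in for the scrape engines changed in this commit."""

    def collect(self):
        # Stand-in for collect_files_details_from_site(); fails on purpose here.
        raise IOError("FTP listing failed")

    def on_collected_details(self, files):
        print(f"collected {len(files)} files")

    def on_download_fail(self, error, files=None):
        print(f"scrape failed after collecting {len(files or [])} files: {error}")

    def scrape(self):
        files = []  # initialised first so the failure hook always has something to report
        try:
            files = self.collect()
            self.on_collected_details(files)
            return files
        except Exception as error:  # broad on purpose: one chain must not stop the whole run
            self.on_download_fail(error, files=files)
            return []


if __name__ == "__main__":
    assert SafeScraper().scrape() == []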
18 changes: 11 additions & 7 deletions il_supermarket_scarper/engines/engine.py
@@ -21,17 +21,12 @@
class Engine(ScraperStatus, ABC):
    """base engine for scraping"""

-    def __init__(
-        self,
-        chain,
-        chain_id,
-        folder_name=None,
-    ):
+    def __init__(self, chain, chain_id, folder_name=None):
        assert DumpFolderNames.is_valid_folder_name(
            chain
        ), "chain name can contain only abc and -"

-        super().__init__(chain.value, "status")
+        super().__init__(chain.value, "status", folder_name=folder_name)
        self.chain = chain
        self.chain_id = chain_id
        self.max_workers = 5
@@ -100,12 +95,16 @@ def apply_limit(
        intreable_ = self.filter_already_downloaded(
            self.storage_path, files_names_to_scrape, intreable, by_function=by_function
        )
+        Logger.info(
+            f"Number of entry after filter already downloaded is {len(intreable_)}"
+        )
        files_was_filtered_since_already_download = (
            len(list(intreable)) != 0 and len(list(intreable_)) == 0
        )

        # filter unique links
        intreable_ = self.unique(intreable_, by_function=by_function)
+        Logger.info(f"Number of entry after filter unique links is {len(intreable_)}")

        # filter by store id
        if store_id:
@@ -116,15 +115,20 @@ def apply_limit(
                    intreable_,
                )
            )
+        Logger.info(f"Number of entry after filter store id is {len(intreable_)}")

        # filter by file type
        if files_types:
            intreable_ = self.filter_file_types(
                intreable_, limit, files_types, by_function
            )
+        Logger.info(f"Number of entry after filter file type id is {len(intreable_)}")

        if only_latest:
            intreable_ = self.get_only_latest(by_function, intreable_)
+
+        Logger.info(f"Number of entry after filter keeping latast is {len(intreable_)}")
+
        # filter by limit if the 'files_types' filter is not on.
        if limit and files_types is None:
            assert limit > 0, "Limit must be greater than 0"
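The apply_limit hunk above is mostly new progress logging: a count is written after each filter stage so a result set that ends up empty can be traced back to the stage that emptied it. A rough standalone sketch of that idea using the standard logging module rather than the repo's Logger helper (the step names and file names below are made up for illustration):

import logging

logging.basicConfig(level=logging.INFO, format="%(message)s")
log = logging.getLogger("apply_limit_sketch")


def apply_filters(entries, steps):
    """Run entries through named filter steps, logging the remaining count after each one."""
    for name, keep in steps:
        entries = [entry for entry in entries if keep(entry)]
        log.info("Number of entries after %s is %d", name, len(entries))
    return entries


if __name__ == "__main__":
    files = ["PriceFull-001.xml", "Promo-001.xml", "PriceFull-002.xml"]
    kept = apply_filters(
        files,
        [
            ("filter already downloaded", lambda name: not name.endswith("-002.xml")),
            ("filter file type", lambda name: name.startswith("PriceFull")),
        ],
    )
    assert kept == ["PriceFull-001.xml"]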
63 changes: 34 additions & 29 deletions il_supermarket_scarper/engines/web.py
@@ -108,35 +108,40 @@ def scrape(
        files_names_to_scrape=None,
    ):
        """scarpe the files from multipage sites"""
-        super().scrape(
-            limit,
-            files_types=files_types,
-            store_id=store_id,
-            only_latest=only_latest,
-        )
-
-        download_urls, file_names = self.collect_files_details_from_site(
-            limit=limit,
-            files_types=files_types,
-            store_id=store_id,
-            only_latest=only_latest,
-            files_names_to_scrape=files_names_to_scrape,
-        )
-
-        self.on_collected_details(file_names, download_urls)
-
-        Logger.info(f"collected {len(download_urls)} to download.")
-        if len(download_urls) > 0:
-            results = execute_in_event_loop(
-                self.save_and_extract,
-                zip(download_urls, file_names),
-                max_workers=self.max_workers,
-            )
-        else:
-            results = {}
-
-        self.on_download_completed(results=results)
-
-        self.on_scrape_completed(self.get_storage_path())
-        self.post_scraping()
-        return results
+        download_urls, file_names = [], []
+        try:
+            super().scrape(
+                limit,
+                files_types=files_types,
+                store_id=store_id,
+                only_latest=only_latest,
+            )
+
+            download_urls, file_names = self.collect_files_details_from_site(
+                limit=limit,
+                files_types=files_types,
+                store_id=store_id,
+                only_latest=only_latest,
+                files_names_to_scrape=files_names_to_scrape,
+            )
+
+            self.on_collected_details(file_names, download_urls)
+
+            Logger.info(f"collected {len(download_urls)} to download.")
+            if len(download_urls) > 0:
+                results = execute_in_event_loop(
+                    self.save_and_extract,
+                    zip(download_urls, file_names),
+                    max_workers=self.max_workers,
+                )
+            else:
+                results = []
+
+            self.on_download_completed(results=results)
+
+            self.on_scrape_completed(self.get_storage_path())
+            self.post_scraping()
+            return results
+        except Exception as e:  # pylint: disable=broad-except
+            self.on_download_fail(e, download_urls=download_urls, file_names=file_names)
+            return []
1 change: 1 addition & 0 deletions il_supermarket_scarper/scrapper_runner.py
@@ -89,6 +89,7 @@ def scrape_one(
        if self.lookup_in_db:
            scraper.enable_collection_status()
            scraper.enable_aggregation_between_runs()
+
        scraper.scrape(
            limit=limit,
            files_types=files_types,