Merge pull request #61 from erlichsefi/dev
v0.4.5
erlichsefi authored Sep 27, 2024
2 parents 46647f1 + 94025a1 commit babe8e7
Showing 12 changed files with 176 additions and 165 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pylint.yml
@@ -28,4 +28,4 @@ jobs:
pip install pylint
- name: Analysing the code with pylint
run: |
- pylint $(git ls-files '*.py') --disable=E0401,R0801,R0903,W0707
+ pylint $(git ls-files '*.py') --disable=E0401,R0801,R0903,W0707,R0917
4 changes: 3 additions & 1 deletion .github/workflows/test-suite.yml
@@ -42,4 +42,6 @@ jobs:
- name: Build with Docker
run: docker build -t erlichsefi/israeli-supermarket-scarpers:test --target test .
- name: Test with pytest
- run: docker run --rm --name test-run -e DISABLED_SCRAPPERS="${{ env.DISABLED_SCRAPPERS }}" erlichsefi/israeli-supermarket-scarpers:test
+ run: docker rm scraper-test-run || true &&
+   docker run --rm --name scraper-test-run -e DISABLED_SCRAPPERS="${{ env.DISABLED_SCRAPPERS }}" erlichsefi/israeli-supermarket-scarpers:test &&
+   docker builder prune -f
2 changes: 1 addition & 1 deletion README.md
@@ -67,7 +67,7 @@ Quick start

il_supermarket_scarper can be installed using pip:

- python3 pip install il-supermarket-scraper
+ python3 pip install israeli-supermarket-scraper

If you want to run the latest version of the code, you can install it from the
repo directly:
49 changes: 27 additions & 22 deletions il_supermarket_scarper/engines/cerberus.py
@@ -43,29 +43,34 @@ def scrape(
        only_latest=False,
        files_names_to_scrape=None,
    ):
-        super().scrape(
-            limit=limit,
-            files_types=files_types,
-            store_id=store_id,
-            only_latest=only_latest,
-        )
-        files = self.collect_files_details_from_site(
-            limit=limit,
-            files_types=files_types,
-            filter_null=True,
-            filter_zero=True,
-            store_id=store_id,
-            only_latest=only_latest,
-            files_names_to_scrape=files_names_to_scrape,
-        )
-        self.on_collected_details(files)
+        files = []
+        try:
+            super().scrape(
+                limit=limit,
+                files_types=files_types,
+                store_id=store_id,
+                only_latest=only_latest,
+            )
+            files = self.collect_files_details_from_site(
+                limit=limit,
+                files_types=files_types,
+                filter_null=True,
+                filter_zero=True,
+                store_id=store_id,
+                only_latest=only_latest,
+                files_names_to_scrape=files_names_to_scrape,
+            )
+            self.on_collected_details(files)

-        results = execute_in_event_loop(
-            self.persist_from_ftp, files, max_workers=self.max_workers
-        )
-        self.on_download_completed(results=results)
-        self.on_scrape_completed(self.get_storage_path())
-        return results
+            results = execute_in_event_loop(
+                self.persist_from_ftp, files, max_workers=self.max_workers
+            )
+            self.on_download_completed(results=results)
+            self.on_scrape_completed(self.get_storage_path())
+            return results
+        except Exception as e:  # pylint: disable=broad-except
+            self.on_download_fail(e, files=files)
+            return []

    def collect_files_details_from_site(
        self,
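The cerberus.py hunk above (and the matching web.py hunk further down) wraps the whole scrape flow in a broad try/except so that a failing chain reports through on_download_fail and returns an empty result instead of aborting the run. A minimal standalone sketch of that pattern, assuming nothing beyond the hook names visible in the diff (the SafeScraper class and its collect method are illustrative, not part of the repo):

class SafeScraper:
    """Illustrative stand-in for the scrape engines changed in this commit."""

    def collect(self):
        # Stand-in for collect_files_details_from_site(); fails on purpose here.
        raise IOError("FTP listing failed")

    def on_collected_details(self, files):
        print(f"collected {len(files)} files")

    def on_download_fail(self, error, files=None):
        print(f"scrape failed after collecting {len(files or [])} files: {error}")

    def scrape(self):
        files = []  # initialised first so the failure hook always has something to report
        try:
            files = self.collect()
            self.on_collected_details(files)
            return files
        except Exception as error:  # broad on purpose: one chain must not stop the whole run
            self.on_download_fail(error, files=files)
            return []


if __name__ == "__main__":
    assert SafeScraper().scrape() == []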
18 changes: 11 additions & 7 deletions il_supermarket_scarper/engines/engine.py
@@ -21,17 +21,12 @@
class Engine(ScraperStatus, ABC):
    """base engine for scraping"""

-    def __init__(
-        self,
-        chain,
-        chain_id,
-        folder_name=None,
-    ):
+    def __init__(self, chain, chain_id, folder_name=None):
        assert DumpFolderNames.is_valid_folder_name(
            chain
        ), "chain name can contain only abc and -"

-        super().__init__(chain.value, "status")
+        super().__init__(chain.value, "status", folder_name=folder_name)
        self.chain = chain
        self.chain_id = chain_id
        self.max_workers = 5
@@ -100,12 +95,16 @@ def apply_limit(
        intreable_ = self.filter_already_downloaded(
            self.storage_path, files_names_to_scrape, intreable, by_function=by_function
        )
+        Logger.info(
+            f"Number of entry after filter already downloaded is {len(intreable_)}"
+        )
        files_was_filtered_since_already_download = (
            len(list(intreable)) != 0 and len(list(intreable_)) == 0
        )

        # filter unique links
        intreable_ = self.unique(intreable_, by_function=by_function)
+        Logger.info(f"Number of entry after filter unique links is {len(intreable_)}")

        # filter by store id
        if store_id:
@@ -116,15 +115,20 @@ def apply_limit(
                    intreable_,
                )
            )
+        Logger.info(f"Number of entry after filter store id is {len(intreable_)}")

        # filter by file type
        if files_types:
            intreable_ = self.filter_file_types(
                intreable_, limit, files_types, by_function
            )
+        Logger.info(f"Number of entry after filter file type id is {len(intreable_)}")

        if only_latest:
            intreable_ = self.get_only_latest(by_function, intreable_)
+
+        Logger.info(f"Number of entry after filter keeping latast is {len(intreable_)}")
+
        # filter by limit if the 'files_types' filter is not on.
        if limit and files_types is None:
            assert limit > 0, "Limit must be greater than 0"
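The apply_limit hunk above is mostly new progress logging: a count is written after each filter stage so a result set that ends up empty can be traced back to the stage that emptied it. A rough standalone sketch of that idea using the standard logging module rather than the repo's Logger helper (the step names and file names below are made up for illustration):

import logging

logging.basicConfig(level=logging.INFO, format="%(message)s")
log = logging.getLogger("apply_limit_sketch")


def apply_filters(entries, steps):
    """Run entries through named filter steps, logging the remaining count after each one."""
    for name, keep in steps:
        entries = [entry for entry in entries if keep(entry)]
        log.info("Number of entries after %s is %d", name, len(entries))
    return entries


if __name__ == "__main__":
    files = ["PriceFull-001.xml", "Promo-001.xml", "PriceFull-002.xml"]
    kept = apply_filters(
        files,
        [
            ("filter already downloaded", lambda name: not name.endswith("-002.xml")),
            ("filter file type", lambda name: name.startswith("PriceFull")),
        ],
    )
    assert kept == ["PriceFull-001.xml"]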
63 changes: 34 additions & 29 deletions il_supermarket_scarper/engines/web.py
@@ -108,35 +108,40 @@ def scrape(
        files_names_to_scrape=None,
    ):
        """scarpe the files from multipage sites"""
-        super().scrape(
-            limit,
-            files_types=files_types,
-            store_id=store_id,
-            only_latest=only_latest,
-        )
-
-        download_urls, file_names = self.collect_files_details_from_site(
-            limit=limit,
-            files_types=files_types,
-            store_id=store_id,
-            only_latest=only_latest,
-            files_names_to_scrape=files_names_to_scrape,
-        )
-
-        self.on_collected_details(file_names, download_urls)
-
-        Logger.info(f"collected {len(download_urls)} to download.")
-        if len(download_urls) > 0:
-            results = execute_in_event_loop(
-                self.save_and_extract,
-                zip(download_urls, file_names),
-                max_workers=self.max_workers,
-            )
-        else:
-            results = {}
-
-        self.on_download_completed(results=results)
-
-        self.on_scrape_completed(self.get_storage_path())
-        self.post_scraping()
-        return results
+        download_urls, file_names = [], []
+        try:
+            super().scrape(
+                limit,
+                files_types=files_types,
+                store_id=store_id,
+                only_latest=only_latest,
+            )
+
+            download_urls, file_names = self.collect_files_details_from_site(
+                limit=limit,
+                files_types=files_types,
+                store_id=store_id,
+                only_latest=only_latest,
+                files_names_to_scrape=files_names_to_scrape,
+            )
+
+            self.on_collected_details(file_names, download_urls)
+
+            Logger.info(f"collected {len(download_urls)} to download.")
+            if len(download_urls) > 0:
+                results = execute_in_event_loop(
+                    self.save_and_extract,
+                    zip(download_urls, file_names),
+                    max_workers=self.max_workers,
+                )
+            else:
+                results = []
+
+            self.on_download_completed(results=results)
+
+            self.on_scrape_completed(self.get_storage_path())
+            self.post_scraping()
+            return results
+        except Exception as e:  # pylint: disable=broad-except
+            self.on_download_fail(e, download_urls=download_urls, file_names=file_names)
+            return []
1 change: 1 addition & 0 deletions il_supermarket_scarper/scrapper_runner.py
@@ -89,6 +89,7 @@ def scrape_one(
        if self.lookup_in_db:
            scraper.enable_collection_status()
            scraper.enable_aggregation_between_runs()
+
        scraper.scrape(
            limit=limit,
            files_types=files_types,