Tested locally
0.4.9
erlichsefi authored Oct 11, 2024
2 parents 3bfb65d + 966f57b commit 6fed2c3
Showing 19 changed files with 12,848 additions and 227 deletions.
55 changes: 29 additions & 26 deletions il_supermarket_scarper/engines/apsx.py
@@ -44,43 +44,46 @@ def _build_query_url(self, query_params):
res.append(base + self.aspx_page + query_params)
return res

def _get_all_possible_query_string_params(
def _get_all_possible_query_string_params( # pylint: disable=unused-argument
self, files_types=None, store_id=None, when_date=None
):
"""get the arguments need to add to the url"""
if isinstance(self.chain_id, list):
res = []
for c_id in self.chain_id:
res.append(f"?code={c_id}")
return res
chains_urls = res

chains_urls = [f"?code={self.chain_id}"]

# add file types to url
if files_types:
chains_urls_with_types = []
for files_type in files_types:
file_type_id = self.file_type_id(files_type)
chains_urls_with_types.extend(
[
f"{chain_url}&WFileType={file_type_id}"
for chain_url in chains_urls
]
)
chains_urls = chains_urls_with_types

# add store id
if store_id:
for chain_url in chains_urls:
chain_url += f"&WStore={store_id}"

# posting date
if when_date:
for chain_url in chains_urls:
chain_url += (
f"&WDate={when_date.strftime('%d/%m/%Y').reaplce('/','%2F')}"
)
return chains_urls

# # add file types to url
# if files_types:
# chains_urls_with_types = []
# for files_type in files_types:
# file_type_id = self.file_type_id(files_type)
# chains_urls_with_types.extend(
# [
# f"{chain_url}&WFileType={file_type_id}"
# for chain_url in chains_urls
# ]
# )
# chains_urls = chains_urls_with_types

# # add store id
# if store_id:
# for chain_url in chains_urls:
# chain_url += f"&WStore={store_id}"

# # posting date
# if when_date:
# for chain_url in chains_urls:
# chain_url += (
# f"&WDate={when_date.strftime('%d/%m/%Y').reaplce('/','%2F')}"
# )
# return chains_urls

def get_request_url(self, files_types=None, store_id=None, when_date=None):
result = []
for query_params in self._get_all_possible_query_string_params(
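For context, the new query-string builder in apsx.py composes ?code=<chain> and then optionally appends &WFileType=, &WStore= and &WDate= parameters. Below is a minimal stand-alone sketch of that composition; the parameter names come from the diff, while the function name, chain code and file-type id are made up for illustration.

from datetime import datetime

def build_query_strings(chain_ids, file_type_ids=None, store_id=None, when_date=None):
    """Illustrative re-creation of the query strings the updated apsx.py aims to build."""
    urls = [f"?code={chain_id}" for chain_id in chain_ids]
    if file_type_ids:
        # one URL per (chain, file type) combination
        urls = [f"{url}&WFileType={type_id}" for url in urls for type_id in file_type_ids]
    if store_id:
        urls = [f"{url}&WStore={store_id}" for url in urls]
    if when_date:
        # dd/mm/yyyy with the slashes percent-encoded, matching the diff's strftime/replace chain
        urls = [f"{url}&WDate={when_date.strftime('%d/%m/%Y').replace('/', '%2F')}" for url in urls]
    return urls

# build_query_strings(["7290027600007"], file_type_ids=[1], store_id=5, when_date=datetime(2024, 10, 11))
# -> ['?code=7290027600007&WFileType=1&WStore=5&WDate=11%2F10%2F2024']
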
11 changes: 6 additions & 5 deletions il_supermarket_scarper/engines/cerberus.py
@@ -41,7 +41,7 @@ def scrape(
limit=None,
files_types=None,
store_id=None,
only_latest=False,
when_date=None,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
@@ -52,15 +52,15 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
)
files = self.collect_files_details_from_site(
limit=limit,
files_types=files_types,
filter_null=filter_null,
filter_zero=filter_zero,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
files_names_to_scrape=files_names_to_scrape,
)
self.on_collected_details(files)
@@ -73,6 +73,7 @@ def scrape(
return results
except Exception as e: # pylint: disable=broad-except
self.on_download_fail(e, files=files)
Logger.error_execption(e)
return []

def collect_files_details_from_site(
@@ -82,7 +83,7 @@ def collect_files_details_from_site(
filter_null=False,
filter_zero=False,
store_id=None,
only_latest=False,
when_date=None,
files_names_to_scrape=None,
):
"""collect all files to download from the site"""
@@ -117,7 +118,7 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
files_names_to_scrape=files_names_to_scrape,
)
Logger.info(f"After applying limit: Found {len(files)} files")
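The visible change in cerberus.py is the signature of scrape() and collect_files_details_from_site(): the only_latest flag is replaced by when_date, which is threaded through to apply_limit. A hypothetical call showing the two values the new parameter accepts; the scraper class and its construction are placeholders, not names taken from this repository.

from datetime import datetime

scraper = SomeCerberusBasedScraper(...)  # placeholder; construction details omitted

# keep only the files published on a specific date
scraper.scrape(when_date=datetime(2024, 10, 11))

# or keep only the newest file per store
scraper.scrape(when_date="latest")
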
51 changes: 35 additions & 16 deletions il_supermarket_scarper/engines/engine.py
@@ -1,7 +1,7 @@
from abc import ABC
import os
import re

import datetime

from il_supermarket_scarper.utils import (
get_output_folder,
@@ -38,16 +38,16 @@ def get_storage_path(self):
return self.storage_path

def _is_validate_scraper_found_no_files(
self, limit=None, files_types=None, store_id=None, only_latest=False
self, limit=None, files_types=None, store_id=None, when_date=None
):
Logger.info(
f"check if fail is allowd with, limit={limit},"
f"files_types={files_types},store_id={store_id},only_latest={only_latest}"
f"files_types={files_types},store_id={store_id},when_date={when_date}"
)
return False

def is_validate_scraper_found_no_files(
self, limit=None, files_types=None, store_id=None, only_latest=False
self, limit=None, files_types=None, store_id=None, when_date=None
):
"""return true if its ok the scarper reuturn no enrty"""

@@ -58,7 +58,7 @@ def is_validate_scraper_found_no_files(
for file_type in files_types:
if file_type in FileTypesFilters.all_full_files():
request_only_update_file = False
Logger.info(f"the value of {only_latest} should not affect.")
Logger.info(f"the value of {when_date} should not affect.")
return (
limit == 0
or files_types == []
@@ -68,7 +68,7 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
)
)

@@ -83,13 +83,13 @@ def apply_limit(
files_types=None,
by_function=lambda x: x,
store_id=None,
only_latest=False,
when_date=None,
files_names_to_scrape=None,
):
"""filter the list according to condition"""
assert (
not only_latest or limit is None
), "only_latest flag can't be applied with limit."
# assert (
# when_date is not None or limit is None
# ), "when_date flag can't be applied with limit."

# filter files already downloaded
intreable_ = self.filter_already_downloaded(
@@ -124,13 +124,17 @@
)
Logger.info(f"Number of entry after filter file type id is {len(intreable_)}")

if only_latest:
if isinstance(when_date, datetime.datetime):
intreable_ = self.get_by_date(when_date, by_function, intreable_)
elif isinstance(when_date, str) and when_date == "latest":
intreable_ = self.get_only_latest(by_function, intreable_)
elif when_date is not None:
raise ValueError(f"when_date should be datetime or bool, got {when_date}")

Logger.info(f"Number of entry after filter keeping latast is {len(intreable_)}")

# filter by limit if the 'files_types' filter is not on.
if limit and files_types is None:
if limit:
assert limit > 0, "Limit must be greater than 0"
Logger.info(f"Limit: {limit}")
intreable_ = intreable_[: min(limit, len(list(intreable_)))]
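The hunk above replaces the old only_latest branch with a type-based dispatch on when_date: a datetime routes to get_by_date, the literal string "latest" routes to get_only_latest, and any other non-None value raises. A compressed sketch of just that decision follows; the function name is illustrative.

import datetime

def pick_date_filter(when_date):
    """Mirror of the new when_date dispatch in Engine.apply_limit."""
    if isinstance(when_date, datetime.datetime):
        return "keep files stamped with that exact date"   # routed to get_by_date
    if isinstance(when_date, str) and when_date == "latest":
        return "keep only the newest file per store"        # routed to get_only_latest
    if when_date is not None:
        raise ValueError(f"when_date should be a datetime or 'latest', got {when_date}")
    return "no date filtering"
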
@@ -144,12 +148,12 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
)
):
raise ValueError(
f"No files to download for file files_types={files_types},"
f"limit={limit},store_id={store_id},only_latest={only_latest}"
f"limit={limit},store_id={store_id},when_date={when_date}"
)
return intreable_

@@ -182,6 +186,21 @@ def get_only_latest(self, by_function, intreable_):
groups_value[store_info] = file
return list(groups_value.values())

def get_by_date(self, requested_date, by_function, intreable_):
"""get by date"""
#
date_format = requested_date.strftime("%Y%m%d")
#
groups_value = []
for file in intreable_:
name_split = by_function(file).split("-", maxsplit=2)
date_info = name_split[-1].rsplit(".", maxsplit=1)[0]

if date_info.startswith(date_format):
groups_value.append(file)

return groups_value

@classmethod
def unique(cls, iterable, by_function=lambda x: x):
"""Returns the type of the file."""
@@ -210,7 +229,7 @@ def scrape(
limit=None,
files_types=None,
store_id=None,
only_latest=False,
when_date=None,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
Expand All @@ -222,7 +241,7 @@ def scrape(
files_types=files_types,
store_id=store_id,
files_names_to_scrape=files_names_to_scrape,
only_latest=only_latest,
when_date=when_date,
filter_null=filter_null,
filter_zero=filter_zero,
)
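The new get_by_date helper matches on the date embedded at the end of each file name. A minimal stand-alone sketch of that logic, assuming names of the form <type><chain>-<store>-<YYYYMMDDHHMM>.<ext>; the chain and store numbers below are invented.

from datetime import datetime

def filter_by_date(file_names, when_date):
    """Illustrative stand-alone version of Engine.get_by_date: keep files whose
    trailing timestamp starts with the requested YYYYMMDD date."""
    date_prefix = when_date.strftime("%Y%m%d")
    kept = []
    for name in file_names:
        # e.g. "PriceFull7290027600007-001-202410110300.gz" -> "202410110300"
        timestamp = name.split("-", maxsplit=2)[-1].rsplit(".", maxsplit=1)[0]
        if timestamp.startswith(date_prefix):
            kept.append(name)
    return kept

# filter_by_date(["PriceFull7290027600007-001-202410110300.gz",
#                 "PriceFull7290027600007-001-202410100300.gz"],
#                datetime(2024, 10, 11))
# -> ["PriceFull7290027600007-001-202410110300.gz"]
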
7 changes: 3 additions & 4 deletions il_supermarket_scarper/engines/multipage_web.py
@@ -77,7 +77,6 @@ def collect_files_details_from_site(
files_types=None,
store_id=None,
when_date=None,
only_latest=False,
files_names_to_scrape=None,
):
self.post_scraping()
@@ -109,7 +108,7 @@ def collect_files_details_from_site(
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
files_names_to_scrape=files_names_to_scrape,
)

@@ -125,7 +124,7 @@ def collect_files_details_from_page(self, html):
return links, filenames

def process_links_before_download(
self, page, limit=None, files_types=None, store_id=None, only_latest=None
self, page, limit=None, files_types=None, store_id=None, when_date=None
):
"""additional processing to the links before download"""
response = self.session_with_cookies_by_chain(page)
@@ -141,7 +140,7 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
)

Logger.info(
4 changes: 2 additions & 2 deletions il_supermarket_scarper/engines/publishprice.py
@@ -88,14 +88,14 @@ def get_name_from_herf(x):
return download_urls, file_names

def _is_validate_scraper_found_no_files(
self, limit=None, files_types=None, store_id=None, only_latest=False
self, limit=None, files_types=None, store_id=None, when_date=None
):
return (
super()._is_validate_scraper_found_no_files( # what fails the rest
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
)
or ( # if we are looking for one store file in a weekend or holiday
store_id and (_is_weekend_in_israel() or _is_holiday_in_israel())
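In publishprice.py the behavioural addition is the second clause: an empty result is treated as valid when a single store was requested on an Israeli weekend or holiday. A small sketch of that predicate; the function below is hypothetical, only the _is_weekend_in_israel / _is_holiday_in_israel names come from the diff.

def empty_result_is_expected(base_allows_it, store_id, weekend, holiday):
    """Mirror of the updated _is_validate_scraper_found_no_files condition."""
    return base_allows_it or (bool(store_id) and (weekend or holiday))

# empty_result_is_expected(False, store_id=5, weekend=True, holiday=False) -> True
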
14 changes: 7 additions & 7 deletions il_supermarket_scarper/engines/web.py
@@ -45,7 +45,7 @@ def apply_limit_zip(
files_types=None,
by_function=lambda x: x[0],
store_id=None,
only_latest=False,
when_date=None,
files_names_to_scrape=None,
):
"""apply limit to zip"""
@@ -55,7 +55,7 @@
files_types=files_types,
by_function=by_function,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
files_names_to_scrape=files_names_to_scrape,
)
if len(ziped) == 0:
@@ -69,7 +69,6 @@ def collect_files_details_from_site(
files_types=None,
store_id=None,
when_date=None,
only_latest=False,
files_names_to_scrape=None,
):
"""collect all enteris to download from site"""
@@ -95,7 +94,7 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
files_names_to_scrape=files_names_to_scrape,
)

@@ -109,7 +108,7 @@ def scrape(
limit=None,
files_types=None,
store_id=None,
only_latest=False,
when_date=None,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
@@ -121,7 +120,7 @@
limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
filter_null=filter_null,
filter_zero=filter_zero,
)
@@ -130,7 +129,7 @@
limit=limit,
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
when_date=when_date,
files_names_to_scrape=files_names_to_scrape,
)

@@ -153,4 +152,5 @@ def scrape(
return results
except Exception as e: # pylint: disable=broad-except
self.on_download_fail(e, download_urls=download_urls, file_names=file_names)
Logger.error_execption(e)
return []