v0.4.6 #62

Merged
merged 21 commits
Oct 3, 2024
64 changes: 56 additions & 8 deletions il_supermarket_scarper/engines/apsx.py
@@ -1,16 +1,34 @@
from abc import ABC, abstractmethod
from il_supermarket_scarper.utils import Logger
from il_supermarket_scarper.utils import Logger, FileTypesFilters

from .web import WebBase


class Aspx(WebBase, ABC):
"""class for aspx scapers"""

def __init__(self, chain, chain_id, url, aspx_page, folder_name=None):
super().__init__(chain, chain_id, url, folder_name=folder_name)
def __init__(
self, chain, chain_id, url, aspx_page, folder_name=None, max_threads=5
):
super().__init__(
chain, chain_id, url, folder_name=folder_name, max_threads=max_threads
)
self.aspx_page = aspx_page

def file_type_id(self, file_type):
"""get the file type id"""
if file_type == FileTypesFilters.STORE_FILE.name:
return 1
if file_type == FileTypesFilters.PRICE_FILE.name:
return 2
if file_type == FileTypesFilters.PROMO_FILE.name:
return 3
if file_type == FileTypesFilters.PRICE_FULL_FILE.name:
return 4
if file_type == FileTypesFilters.PROMO_FULL_FILE.name:
return 5
raise ValueError(f"file type {file_type} not supported")

def extract_task_from_entry(self, all_trs):
download_urls: list = list(
map(lambda x: self.url + self.get_href_from_entry(x), all_trs)
@@ -26,18 +44,48 @@ def _build_query_url(self, query_params):
res.append(base + self.aspx_page + query_params)
return res

def _get_all_possible_query_string_params(self):
def _get_all_possible_query_string_params(
self, files_types=None, store_id=None, when_date=None
):
"""get the arguments need to add to the url"""
if isinstance(self.chain_id, list):
res = []
for c_id in self.chain_id:
res.append(f"?code=={c_id}")
res.append(f"?code={c_id}")
return res
return [f"?code={self.chain_id}"]
chains_urls = [f"?code={self.chain_id}"]

# add file types to url
if files_types:
chains_urls_with_types = []
for files_type in files_types:
file_type_id = self.file_type_id(files_type)
chains_urls_with_types.extend(
[
f"{chain_url}&WFileType={file_type_id}"
for chain_url in chains_urls
]
)
chains_urls = chains_urls_with_types

# add store id
if store_id:
chains_urls = [
f"{chain_url}&WStore={store_id}" for chain_url in chains_urls
]

# posting date
if when_date:
when_date_param = when_date.strftime("%d/%m/%Y").replace("/", "%2F")
chains_urls = [
f"{chain_url}&WDate={when_date_param}" for chain_url in chains_urls
]
return chains_urls

def get_request_url(self):
def get_request_url(self, files_types=None, store_id=None, when_date=None):
result = []
for query_params in self._get_all_possible_query_string_params():
for query_params in self._get_all_possible_query_string_params(
files_types=files_types, store_id=store_id, when_date=when_date
):
result.extend(self._build_query_url(query_params))
Logger.info(f"Request url: {result}")
return result
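
For illustration, a minimal standalone sketch (hypothetical chain id, store id, and date) that mirrors the query strings the extended parameters now build:

from datetime import datetime

# Hypothetical inputs, mirroring the new WFileType / WStore / WDate parameters.
chain_id = "7290027600007"
file_type_ids = [2, 3]  # PRICE_FILE and PROMO_FILE per file_type_id()
store_id = 1
when_date = datetime(2024, 10, 3)

urls = [f"?code={chain_id}"]
urls = [f"{u}&WFileType={t}" for t in file_type_ids for u in urls]
urls = [f"{u}&WStore={store_id}" for u in urls]
date_param = when_date.strftime("%d/%m/%Y").replace("/", "%2F")
urls = [f"{u}&WDate={date_param}" for u in urls]
print(urls)
# ['?code=7290027600007&WFileType=2&WStore=1&WDate=03%2F10%2F2024',
#  '?code=7290027600007&WFileType=3&WStore=1&WDate=03%2F10%2F2024']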
15 changes: 9 additions & 6 deletions il_supermarket_scarper/engines/cerberus.py
@@ -4,7 +4,7 @@
from il_supermarket_scarper.utils import (
extract_xml_file_from_gz_file,
Logger,
execute_in_event_loop,
execute_in_parallel,
collect_from_ftp,
fetch_temporary_gz_file_from_ftp,
retry_files,
@@ -26,8 +26,9 @@ def __init__(
ftp_path="/",
ftp_username="",
ftp_password="",
max_threads=5,
):
super().__init__(chain, chain_id, folder_name)
super().__init__(chain, chain_id, folder_name, max_threads)
self.ftp_host = ftp_host
self.ftp_path = ftp_path
self.ftp_username = ftp_username
@@ -42,6 +43,8 @@ def scrape(
store_id=None,
only_latest=False,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
):
files = []
try:
@@ -54,16 +57,16 @@ def scrape(
files = self.collect_files_details_from_site(
limit=limit,
files_types=files_types,
filter_null=True,
filter_zero=True,
filter_null=filter_null,
filter_zero=filter_zero,
store_id=store_id,
only_latest=only_latest,
files_names_to_scrape=files_names_to_scrape,
)
self.on_collected_details(files)

results = execute_in_event_loop(
self.persist_from_ftp, files, max_workers=self.max_workers
results = execute_in_parallel(
self.persist_from_ftp, list(files), max_threads=self.max_threads
)
self.on_download_completed(results=results)
self.on_scrape_completed(self.get_storage_path())
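
execute_in_parallel itself lives in il_supermarket_scarper/utils/loop.py and is not part of this diff; a rough sketch of a helper with this call shape (an assumption, not the actual implementation) could look like:

from concurrent.futures import ThreadPoolExecutor

def execute_in_parallel(function, iterable, max_threads=5, aggregtion_function=list):
    """Sketch only: map `function` over `iterable` with a thread pool and
    aggregate the per-item results (defaults to collecting them into a list)."""
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        results = list(executor.map(function, iterable))
    return aggregtion_function(results)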
8 changes: 6 additions & 2 deletions il_supermarket_scarper/engines/engine.py
@@ -21,15 +21,15 @@
class Engine(ScraperStatus, ABC):
"""base engine for scraping"""

def __init__(self, chain, chain_id, folder_name=None):
def __init__(self, chain, chain_id, folder_name=None, max_threads=10):
assert DumpFolderNames.is_valid_folder_name(
chain
), "chain name can contain only abc and -"

super().__init__(chain.value, "status", folder_name=folder_name)
self.chain = chain
self.chain_id = chain_id
self.max_workers = 5
self.max_threads = max_threads
self.storage_path = get_output_folder(self.chain.value, folder_name=folder_name)
Logger.info(f"Storage path: {self.storage_path}")

@@ -212,6 +212,8 @@ def scrape(
store_id=None,
only_latest=False,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
):
"""run the scraping logic"""
self.post_scraping()
@@ -221,6 +223,8 @@ def scrape(
store_id=store_id,
files_names_to_scrape=files_names_to_scrape,
only_latest=only_latest,
filter_null=filter_null,
filter_zero=filter_zero,
)
Logger.info(f"Starting scraping for {self.chain}")
self.make_storage_path_dir()
18 changes: 12 additions & 6 deletions il_supermarket_scarper/engines/multipage_web.py
@@ -10,7 +10,7 @@

from il_supermarket_scarper.utils import (
Logger,
execute_in_event_loop,
execute_in_parallel,
multiple_page_aggregtion,
)
from .web import WebBase
@@ -31,8 +31,11 @@ def __init__(
total_page_xpath="""//*[@id="gridContainer"]/table/
tfoot/tr/td/a[6]/@href""",
total_pages_pattern=r"^\/\?page\=([0-9]{3})$",
max_threads=5,
):
super().__init__(chain, chain_id, url=url, folder_name=folder_name)
super().__init__(
chain, chain_id, url=url, folder_name=folder_name, max_threads=max_threads
)
self.total_page_xpath = total_page_xpath
self.total_pages_pattern = total_pages_pattern

@@ -71,11 +74,14 @@ def collect_files_details_from_site(
limit=None,
files_types=None,
store_id=None,
when_date=None,
only_latest=False,
files_names_to_scrape=None,
):
self.post_scraping()
url = self.get_request_url()
url = self.get_request_url(
files_types=files_types, store_id=store_id, when_date=when_date
)

total_pages = self.get_number_of_pages(url[0])
Logger.info(f"Found {total_pages} pages")
@@ -87,11 +93,11 @@ def collect_files_details_from_site(
)
)

download_urls, file_names = execute_in_event_loop(
download_urls, file_names = execute_in_parallel(
self.process_links_before_download,
pages_to_scrape,
list(pages_to_scrape),
aggregtion_function=multiple_page_aggregtion,
max_workers=self.max_workers,
max_threads=self.max_threads,
)
file_names, download_urls = self.apply_limit_zip(
file_names,
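
multiple_page_aggregtion (also from utils/loop.py) is likewise not shown in this diff; a plausible sketch (assumption) of how it merges per-page results into the (download_urls, file_names) pair unpacked above:

def multiple_page_aggregtion(pages_results):
    """Sketch only: flatten per-page (download_urls, file_names) tuples
    into one pair of flat lists."""
    download_urls, file_names = [], []
    for page_urls, page_names in pages_results:
        download_urls.extend(page_urls)
        file_names.extend(page_names)
    return download_urls, file_names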
11 changes: 10 additions & 1 deletion il_supermarket_scarper/engines/publishprice.py
@@ -17,12 +17,21 @@ class PublishPrice(WebBase):
but this is not implemented.
"""

def __init__(self, chain, chain_id, site_infix, folder_name=None, domain="prices"):
def __init__(
self,
chain,
chain_id,
site_infix,
folder_name=None,
domain="prices",
max_threads=5,
):
super().__init__(
chain,
chain_id,
url=f"https://{domain}.{site_infix}.co.il/",
folder_name=folder_name,
max_threads=max_threads,
)
self.folder = None

25 changes: 17 additions & 8 deletions il_supermarket_scarper/engines/web.py
@@ -1,7 +1,7 @@
from bs4 import BeautifulSoup
from il_supermarket_scarper.utils import (
Logger,
execute_in_event_loop,
execute_in_parallel,
session_and_check_status,
retry_files,
)
@@ -12,8 +12,8 @@
class WebBase(Engine):
"""scrape the file of websites that the only why to download them is via web"""

def __init__(self, chain, chain_id, url, folder_name=None):
super().__init__(chain, chain_id, folder_name)
def __init__(self, chain, chain_id, url, folder_name=None, max_threads=5):
super().__init__(chain, chain_id, folder_name, max_threads=max_threads)
self.url = url
self.max_retry = 2

@@ -22,7 +22,9 @@ def get_data_from_page(self, req_res):
soup = BeautifulSoup(req_res.text, features="lxml")
return soup.find_all("tr")[1:]

def get_request_url(self):
def get_request_url(
self, files_types=None, store_id=None, when_date=None
): # pylint: disable=unused-argument
"""get all links to collect download links from"""
return [self.url]

@@ -66,11 +68,14 @@ def collect_files_details_from_site(
limit=None,
files_types=None,
store_id=None,
when_date=None,
only_latest=False,
files_names_to_scrape=None,
):
"""collect all enteris to download from site"""
urls_to_collect_link_from = self.get_request_url()
urls_to_collect_link_from = self.get_request_url(
files_types=files_types, store_id=store_id, when_date=when_date
)

all_trs = []
for url in urls_to_collect_link_from:
@@ -106,6 +111,8 @@ def scrape(
store_id=None,
only_latest=False,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
):
"""scarpe the files from multipage sites"""
download_urls, file_names = [], []
@@ -115,6 +122,8 @@ def scrape(
files_types=files_types,
store_id=store_id,
only_latest=only_latest,
filter_null=filter_null,
filter_zero=filter_zero,
)

download_urls, file_names = self.collect_files_details_from_site(
@@ -129,10 +138,10 @@ def scrape(

Logger.info(f"collected {len(download_urls)} to download.")
if len(download_urls) > 0:
results = execute_in_event_loop(
results = execute_in_parallel(
self.save_and_extract,
zip(download_urls, file_names),
max_workers=self.max_workers,
list(zip(download_urls, file_names)),
max_threads=self.max_threads,
)
else:
results = []
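
As a usage note, the new filter flags are now plain keyword arguments on scrape(); a hypothetical call through any WebBase-derived scraper would look like:

# Hypothetical scraper instance; the class name is illustrative only.
# scraper = SomeAspxScraper(folder_name="dumps")
# scraper.scrape(
#     limit=10,
#     files_types=["PRICE_FULL_FILE"],
#     store_id=1,
#     filter_null=False,  # previously hard-coded to True inside Cerberus.scrape
#     filter_zero=False,
# )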
2 changes: 2 additions & 0 deletions il_supermarket_scarper/scrapper_runner.py
@@ -96,6 +96,8 @@ def scrape_one(
store_id=store_id,
only_latest=only_latest,
files_names_to_scrape=None,
filter_null=False,
filter_zero=False,
)
Logger.info(f"done scraping {chain_name}")

1 change: 1 addition & 0 deletions il_supermarket_scarper/scrappers/ramilevy.py
@@ -11,4 +11,5 @@ def __init__(self, folder_name=None):
chain_id="7290058140886",
folder_name=folder_name,
ftp_username="RamiLevi",
max_threads=10,
)
2 changes: 1 addition & 1 deletion il_supermarket_scarper/scrappers/tests/test_all.py
@@ -6,7 +6,7 @@ class BareketTestCase(make_test_case(ScraperFactory.BAREKET, 5)):
"""Test case for ScraperFactory.BAREKET."""


class YaynotBitanTestCase(make_test_case(ScraperFactory.YAYNO_BITAN, 6)):
class YaynotBitanTestCase(make_test_case(ScraperFactory.YAYNO_BITAN, 9032)):
"""Test case for ScraperFactory.YAYNO_BITAN."""


4 changes: 3 additions & 1 deletion il_supermarket_scarper/scrappers/tests/test_cases.py
@@ -143,6 +143,8 @@ def __clean_scarpe_delete(
"files_types": file_type,
"store_id": store_id,
"only_latest": only_latest,
"filter_null": True,
"filter_zero": True,
}

scraper.scrape(**kwarg)
@@ -199,7 +201,7 @@ def test_scrape_one(self):

def test_scrape_ten(self):
"""scrape ten file and make sure they exists"""
self._clean_scarpe_delete(scraper_enum, limit=10)
self._clean_scarpe_delete(scraper_enum, limit=None)

def test_scrape_promo(self):
"""scrape one promo file and make sure it exists"""
1 change: 1 addition & 0 deletions il_supermarket_scarper/scrappers/yellow.py
@@ -12,4 +12,5 @@ def __init__(self, folder_name=None):
folder_name=folder_name,
ftp_username="Paz_bo",
ftp_password="paz468",
max_threads=10,
)
2 changes: 1 addition & 1 deletion il_supermarket_scarper/utils/__init__.py
@@ -23,7 +23,7 @@
fetch_temporary_gz_file_from_ftp,
wget_file,
)
from .loop import execute_in_event_loop, multiple_page_aggregtion
from .loop import execute_in_parallel, multiple_page_aggregtion
from .exceptions import RestartSessionError
from .retry import retry_files
from .marking import FlakyScraper