Skip to content

Commit

Permalink
Gov Site update 19.11.2024
Browse files: browse the repository at this point in the history
  • Loading branch information
erlichsefi authored Nov 22, 2024
2 parents 3d8c505 + b580768 commit 80deb17
Show file tree
Hide file tree
Showing 17 changed files with 215 additions and 112 deletions.
2 changes: 1 addition & 1 deletion il_supermarket_scarper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .main import ScarpingTask
from .scrappers_factory import ScraperFactory
from .utils import FileTypesFilters
from .utils import FileTypesFilters, DumpFolderNames
1 change: 1 addition & 0 deletions il_supermarket_scarper/engines/cerberus.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class Cerberus(Engine):
"""scraper for all Cerberus base site. (seems like can't support historical data)"""

target_file_extensions = ["xml", "gz"]
utilize_date_param = False

def __init__(
self,
Expand Down
2 changes: 2 additions & 0 deletions il_supermarket_scarper/engines/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
class Engine(ScraperStatus, ABC):
"""base engine for scraping"""

utilize_date_param = True

def __init__(self, chain, chain_id, folder_name=None, max_threads=10):
assert DumpFolderNames.is_valid_folder_name(
chain
Expand Down
72 changes: 37 additions & 35 deletions il_supermarket_scarper/engines/matrix.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from bs4 import BeautifulSoup
from il_supermarket_scarper.utils import Logger, _now, FileTypesFilters
from il_supermarket_scarper.utils import Logger
from .apsx import Aspx


class Matrix(Aspx):
"""scraper for all matrix base site.
(support adveanced search: follow the instrucation the page)"""

utilize_date_param = False

def __init__(
self,
chain,
Expand All @@ -19,40 +21,40 @@ def __init__(
super().__init__(chain, chain_id, url, aspx_page, folder_name=folder_name)
self.chain_hebrew_name = chain_hebrew_name

def get_file_types_id(self, files_types=None):
"""get the file type id"""
if files_types is None:
return "all"

types = []
for ftype in files_types:
if ftype == FileTypesFilters.STORE_FILE.name:
types.append("storefull")
if ftype == FileTypesFilters.PRICE_FILE.name:
types.append("price")
if ftype == FileTypesFilters.PROMO_FILE.name:
types.append("promo")
if ftype == FileTypesFilters.PRICE_FULL_FILE.name:
types.append("pricefull")
if ftype == FileTypesFilters.PROMO_FULL_FILE.name:
types.append("promofull")
return types

def get_when(self, when_date):
"""get the when date"""
if when_date is None:
when_date = _now()
return when_date.strftime("%d/%m/%Y")

def get_chain_n_stores__id(self, store_id=None, c_id=None):
"""get the store id"""
if store_id is None:
chain_id = str(c_id) # + "001"
store_id = "-1"
else:
chain_id = str(c_id)
store_id = str(c_id) + "001" + str(store_id).zfill(3)
return chain_id, store_id
# def get_file_types_id(self, files_types=None):
# """get the file type id"""
# if files_types is None:
# return "all"

# types = []
# for ftype in files_types:
# if ftype == FileTypesFilters.STORE_FILE.name:
# types.append("storefull")
# if ftype == FileTypesFilters.PRICE_FILE.name:
# types.append("price")
# if ftype == FileTypesFilters.PROMO_FILE.name:
# types.append("promo")
# if ftype == FileTypesFilters.PRICE_FULL_FILE.name:
# types.append("pricefull")
# if ftype == FileTypesFilters.PROMO_FULL_FILE.name:
# types.append("promofull")
# return types

# def get_when(self, when_date):
# """get the when date"""
# if when_date is None:
# when_date = _now()
# return when_date.strftime("%d/%m/%Y")

# def get_chain_n_stores__id(self, store_id=None, c_id=None):
# """get the store id"""
# if store_id is None:
# chain_id = str(c_id) # + "001"
# store_id = "-1"
# else:
# chain_id = str(c_id)
# store_id = str(c_id) + "001" + str(store_id).zfill(3)
# return chain_id, store_id

def _build_query_url(self, query_params, base_urls):
res = []
Expand Down
4 changes: 3 additions & 1 deletion il_supermarket_scarper/engines/multipage_web.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ def get_request_url(
"""get all links to collect download links from"""

results = []
for arguments in self.build_params(files_types=files_types, store_id=store_id):
for arguments in self.build_params(
files_types=files_types, store_id=store_id, when_date=when_date
):
results.append(
{
"url": self.url + arguments,
Expand Down
1 change: 0 additions & 1 deletion il_supermarket_scarper/engines/publishprice.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ def get_request_url(
return [{"url": self.url + formated, "method": "GET"}]

def get_data_from_page(self, req_res):
req_res = self.session_with_cookies_by_chain(self.url)
soup = BeautifulSoup(req_res.text, features="lxml")

# the developer hard-coded the files names in the html
Expand Down
138 changes: 79 additions & 59 deletions il_supermarket_scarper/scraper_stability.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,55 @@
# pylint: disable=arguments-differ,arguments-renamed
from enum import Enum
from il_supermarket_scarper.utils import _is_saturday_in_israel, _now, FileTypesFilters
from il_supermarket_scarper.utils import (
_is_saturday_in_israel,
_now,
datetime_in_tlv,
FileTypesFilters,
hour_files_expected_to_be_accassible,
)


class FullyStable:
"""fully stable is stablity"""

@classmethod
def executes_between_midnight_and_morning_and_requested_today(
cls, when_date=None, **_
cls,
when_date=None,
utilize_date_param=False,
):
"""it is stable if the execution is between midnight
and morning and the requested date is today fails"""
execution_time = _now()
return (
when_date
and execution_time.hour >= 0
and execution_time.hour < 8
and when_date.date() == execution_time.date()
and execution_time.hour < hour_files_expected_to_be_accassible()
and (not utilize_date_param or when_date.date() == execution_time.date())
)

@classmethod
def failire_valid(cls, when_date=None, **_):
def executed_after_date(cls, when_date, date):
"""check if executed after date"""
return when_date > date

@classmethod
def failire_valid(cls, when_date=None, utilize_date_param=True, **_):
"""return true if the parser is stble"""

return cls.executes_between_midnight_and_morning_and_requested_today(
when_date=when_date
when_date=when_date, utilize_date_param=utilize_date_param
)


class SuperFlaky(FullyStable):
"""super flaky is stablity"""

@classmethod
def failire_valid(cls, when_date=None, **_):
def failire_valid(cls, **_):
return True


class Quik(FullyStable):
"""define stability for small chain"""

@classmethod
def executes_early_morning_ask_for_alot_of_files(cls, limit=None, **_):
"""small chain don't upload many files in the morning"""
execution_time = _now()
return limit and execution_time.hour < 12 and limit > 8

@classmethod
def executes_looking_for_store(cls, files_types=None, **_):
"""if the execution is in saturday"""
return files_types and files_types == [FileTypesFilters.STORE_FILE.name]

@classmethod
def failire_valid(cls, when_date=None, limit=None, files_types=None, **_):
"""return true if the parser is stble"""
return (
super().failire_valid(when_date=when_date)
or cls.executes_early_morning_ask_for_alot_of_files(limit=limit)
or cls.executes_looking_for_store(files_types=files_types)
)


class NetivHased(FullyStable):
"""Netiv Hased is stablity"""

Expand All @@ -69,30 +59,40 @@ def executed_in_saturday(cls, when_date=None, **_):
return when_date and _is_saturday_in_israel(when_date)

@classmethod
def failire_valid(cls, when_date=None, **_):
def failire_valid(cls, when_date=None, utilize_date_param=False, **_):
"""return true if the parser is stble"""
return super().failire_valid(when_date=when_date) or cls.executed_in_saturday(
when_date=when_date
)
return super().failire_valid(
when_date=when_date, utilize_date_param=utilize_date_param
) or cls.executed_in_saturday(when_date=when_date)


class SalachDabach(FullyStable):
class CityMarketGivataim(FullyStable):
"""Netiv Hased is stablity"""

@classmethod
def searching_for_store(cls, files_types=None, **_):
def searching_for_update_promo(cls, files_types=None, **_):
"""if the execution is in saturday"""
return files_types and files_types == [FileTypesFilters.STORE_FILE.name]
return files_types and files_types == [FileTypesFilters.PROMO_FILE.name]

@classmethod
def failire_valid(cls, when_date=None, files_types=None, **_):
def failire_valid(
cls, when_date=None, files_types=None, utilize_date_param=True, **_
):
"""return true if the parser is stble"""
return super().failire_valid(when_date=when_date) or cls.searching_for_store(
files_types=files_types
return (
super().failire_valid(when_date=when_date)
or cls.searching_for_update_promo(files_types=files_types)
or when_date
and cls.executed_after_date(
when_date=when_date,
date=datetime_in_tlv(
year=2024, month=11, day=5, hour=0, minute=0, second=0
),
)
)


class CityMarketGivataim(FullyStable):
class CityMarketKiratOno(FullyStable):
"""Netiv Hased is stablity"""

@classmethod
Expand All @@ -101,59 +101,75 @@ def searching_for_update_promo(cls, files_types=None, **_):
return files_types and files_types == [FileTypesFilters.PROMO_FILE.name]

@classmethod
def failire_valid(cls, when_date=None, files_types=None, **_):
def failire_valid(
cls, when_date=None, files_types=None, utilize_date_param=True, **_
):
"""return true if the parser is stble"""
return super().failire_valid(
when_date=when_date
) or cls.searching_for_update_promo(files_types=files_types)


class CityMarketKiratOno(FullyStable):
class CityMarketKiratGat(FullyStable):
"""Netiv Hased is stablity"""

@classmethod
def searching_for_update_promo(cls, files_types=None, **_):
def searching_for_update_promo_full(cls, files_types=None, **_):
"""if the execution is in saturday"""
return files_types and files_types == [FileTypesFilters.PROMO_FILE.name]
return files_types and files_types == [FileTypesFilters.PROMO_FULL_FILE.name]

@classmethod
def failire_valid(cls, when_date=None, files_types=None, **_):
def failire_valid(
cls, when_date=None, files_types=None, utilize_date_param=True, **_
):
"""return true if the parser is stble"""
return super().failire_valid(
when_date=when_date
) or cls.searching_for_update_promo(files_types=files_types)
) or cls.searching_for_update_promo_full(files_types=files_types)


class CityMarketKiratGat(FullyStable):
"""Netiv Hased is stablity"""
class DoNotPublishStores(FullyStable):
"""stablity for chains that doesn't pubish stores"""

@classmethod
def searching_for_update_promo_full(cls, files_types=None, **_):
def searching_for_store_full(cls, files_types=None, **_):
"""if the execution is in saturday"""
return files_types and files_types == [FileTypesFilters.PROMO_FULL_FILE.name]
return files_types and files_types == [FileTypesFilters.STORE_FILE.name]

@classmethod
def failire_valid(cls, when_date=None, files_types=None, **_):
def failire_valid(
cls, when_date=None, files_types=None, utilize_date_param=True, **_
):
"""return true if the parser is stble"""
return super().failire_valid(
when_date=when_date
) or cls.searching_for_update_promo_full(files_types=files_types)
when_date=when_date,
files_types=files_types,
utilize_date_param=utilize_date_param,
) or cls.searching_for_store_full(files_types=files_types)


class ScraperStability(Enum):
"""tracker for the stablity of the scraper"""

COFIX = DoNotPublishStores
NETIV_HASED = NetivHased
QUIK = Quik
SALACH_DABACH = SalachDabach
QUIK = DoNotPublishStores
SALACH_DABACH = DoNotPublishStores
CITY_MARKET_GIVATAYIM = CityMarketGivataim
CITY_MARKET_KIRYATONO = CityMarketKiratOno
CITY_MARKET_KIRYATGAT = CityMarketKiratGat
MESHMAT_YOSEF_1 = SuperFlaky
YOHANANOF = DoNotPublishStores

@classmethod
def is_validate_scraper_found_no_files(
cls, scraper_enum, limit=None, files_types=None, store_id=None, when_date=None
cls,
scraper_enum,
limit=None,
files_types=None,
store_id=None,
when_date=None,
utilize_date_param=False,
):
"""return true if its ok the scarper reuturn no enrty"""

Expand All @@ -162,5 +178,9 @@ def is_validate_scraper_found_no_files(
stabler = ScraperStability[scraper_enum].value

return stabler.failire_valid(
limit=limit, files_types=files_types, store_id=store_id, when_date=when_date
limit=limit,
files_types=files_types,
store_id=store_id,
when_date=when_date,
utilize_date_param=utilize_date_param,
)
1 change: 0 additions & 1 deletion il_supermarket_scarper/scrappers/bareket.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from il_supermarket_scarper.utils import DumpFolderNames


# @FlakyScraper
class Bareket(Bina):
"""scarper for bareket"""

Expand Down
Loading

0 comments on commit 80deb17

Please sign in to comment.