@@ -1,8 +1,12 @@
import json
+import re
+
import requests

from supermarket_chain import SupermarketChain

+FNAME_KEY = "FileNm"
+

class BinaProjectWebClient:
    _date_hour_format = '%Y-%m-%d %H:%M:%S'
@@ -16,8 +20,16 @@ class BinaProjectWebClient:
        url = '/'.join([hostname, self.path_prefix, "MainIO_Hok.aspx"])
        req_res: requests.Response = session.get(url)
        jsons_files = json.loads(req_res.text)
-        suffix = next(cur_json["FileNm"] for cur_json in jsons_files if f'-{store_id:03d}-20' in cur_json["FileNm"]
-                      and category.name.replace('s', '') in cur_json["FileNm"])
+
+        if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
+            filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname \
+                                        and not re.search('full', fname, re.IGNORECASE)
+            if not any(filter_func(cur_json[FNAME_KEY]) for cur_json in jsons_files):
+                return ""  # Could not find non-full Promos/Prices file
+        else:
+            filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname
+        suffix = next(
+            cur_json[FNAME_KEY] for cur_json in jsons_files if filter_func(cur_json[FNAME_KEY]))
        down_url: str = '/'.join([hostname, self.path_prefix, "Download", suffix])
        return down_url
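Note on the filtering above: all three web clients select a file by the same three conditions, namely the zero-padded store id followed by the start of a timestamp (f'-{store_id:03d}-20'), the category name with its 's' stripped (Promos becomes Promo, Prices becomes Price), and, for the non-full categories, the absence of 'full' in the name. A minimal, self-contained sketch of that predicate; the helper name and the sample file names below are made up for illustration only:

```python
import re

def is_wanted_file(fname: str, store_id: int, category_name: str, exclude_full: bool) -> bool:
    """Mirror of the lambdas used by the web clients: match store id and category, optionally skip 'Full' files."""
    if f'-{store_id:03d}-20' not in fname:            # e.g. '-005-2021...' for store 5
        return False
    if category_name.replace('s', '') not in fname:   # 'Promos' -> 'Promo', 'Prices' -> 'Price'
        return False
    if exclude_full and re.search('full', fname, re.IGNORECASE):
        return False
    return True

# Hypothetical file names, for illustration only:
names = ['PromoFull7290058140886-005-202108250200.gz', 'Promo7290058140886-005-202108250300.gz']
print([n for n in names if is_wanted_file(n, 5, 'Promos', exclude_full=True)])  # only the non-full Promo file
```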
@@ -1,4 +1,6 @@
import json
+import re
+
import requests

from supermarket_chain import SupermarketChain
@@ -17,7 +19,13 @@ class CerberusWebClient:
        ajax_dir_payload: dict = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')}
        s: requests.Response = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload)
        s_json: dict = json.loads(s.text)
-        suffix: str = next(d['name'] for d in s_json['aaData'] if f'-{store_id:03d}-20' in d['name'])
+        if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
+            filter_func = lambda d, id: f'-{id:03d}-20' in d['name'] and not re.search('full', d['name'], re.IGNORECASE)
+            if not any(filter_func(d, store_id) for d in s_json['aaData']):
+                return ""  # Could not find non-full Prices/Promos file
+        else:
+            filter_func = lambda d, id: f'-{id:03d}-20' in d['name']
+        suffix: str = next(d['name'] for d in s_json['aaData'] if filter_func(d, store_id))

        download_url: str = hostname + "/file/d/" + suffix
        return download_url
@@ -1,3 +1,4 @@
+import re
from typing import Dict, List
import requests
from bs4 import BeautifulSoup
@@ -21,8 +22,16 @@ class MahsaneiHashook(SupermarketChain):
        url = prefix + "NBCompetitionRegulations.aspx"
        req_res: requests.Response = requests.get(url)
        soup = BeautifulSoup(req_res.text, features='lxml')
-        suffix: str = soup.find('a', href=lambda value: value and category.name.replace('s', '') in value
-                                and f'-{store_id:03d}-20' in value).attrs['href']
+        if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
+            fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \
+                                              and f'-{store_id:03d}-20' in fname \
+                                              and not re.search('full', fname, re.IGNORECASE)
+            if soup.find('a', href=fname_filter_func) is None:
+                return ""  # Could not find non-full Promos/Prices file
+        else:
+            fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \
+                                              and f'-{store_id:03d}-20' in fname
+        suffix: str = soup.find('a', href=fname_filter_func).attrs['href']
        down_url: str = prefix + suffix
        return down_url
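The MahsaneiHashook client leans on BeautifulSoup's callable attribute filters: when a function is passed as the href argument, bs4 calls it with each tag's href value (or None when the attribute is missing), which is why every lambda starts with a truthiness check on fname. A small sketch of that pattern with a made-up HTML snippet:

```python
import re
from bs4 import BeautifulSoup

# Hypothetical page, for illustration: one incremental file link and one 'Full' file link.
html = '''
<a href="Price7290661400001-001-202108250300.gz">daily</a>
<a href="PriceFull7290661400001-001-202108250200.gz">full</a>
'''
soup = BeautifulSoup(html, features='lxml')

# bs4 calls the filter with each tag's href value (None when the attribute is missing),
# hence the leading truthiness check on fname.
non_full = soup.find('a', href=lambda fname: fname and 'Price' in fname
                     and '-001-20' in fname
                     and not re.search('full', fname, re.IGNORECASE))
print(non_full.attrs['href'])  # the non-full Price file
```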
main.py
@@ -1,6 +1,4 @@
import os
import sys
import time
from argparse import ArgumentParser
from datetime import datetime
from pathlib import Path
@@ -33,13 +31,12 @@ from chains import (
    shefa_birkat_hashem,
)

# TODO: fix problem of left-to-right printing

Path(RESULTS_DIRNAME).mkdir(exist_ok=True)
Path(RAW_FILES_DIRNAME).mkdir(exist_ok=True)

chain_dict = {repr(chain): chain() if callable(chain) else None for chain in SupermarketChain.__subclasses__()}

# TODO: change functions arguments to include all necessary parameters (e.g. chain) or split arguments
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--promos',
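chain_dict in main.py is a small registry built from SupermarketChain.__subclasses__(): any chain class that subclasses SupermarketChain is picked up and instantiated without being listed explicitly. A toy sketch of the same construction; the demo classes are invented, and in the real project repr() of a chain class presumably renders the chain's name:

```python
class SupermarketChain:
    """Stand-in for the real base class, just for this sketch."""

class DemoChainA(SupermarketChain):
    pass

class DemoChainB(SupermarketChain):
    pass

# Same shape as the registry in main.py: every direct subclass of SupermarketChain,
# keyed by repr() of the class (however the project renders that) and instantiated once.
chain_dict = {repr(chain): chain() if callable(chain) else None
              for chain in SupermarketChain.__subclasses__()}

for name, instance in chain_dict.items():
    print(name, '->', type(instance).__name__)
```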
promotion.py
@@ -1,13 +1,15 @@
import logging
import re
from datetime import datetime
-from enum import Enum
from typing import Dict, List, Union
from bs4.element import Tag
import csv
import sys
import pandas as pd
import xlsxwriter
+from tqdm import tqdm
+from aenum import Enum
+
from item import Item
from utils import (
    create_bs_object, create_items_dict,
@@ -19,6 +21,8 @@ from supermarket_chain import SupermarketChain
XML_FILES_PROMOTIONS_CATEGORIES = [SupermarketChain.XMLFilesCategory.PromosFull,
                                   SupermarketChain.XMLFilesCategory.Promos]

+PROMOTION_COLS_NUM = 15  # The length of the list returned by get_promotion_row_for_table function
+
INVALID_OR_UNKNOWN_PROMOTION_FUNCTION = -1

PROMOTIONS_TABLE_HEADERS = [
@@ -41,10 +45,19 @@ PROMOTIONS_TABLE_HEADERS = [


class ClubID(Enum):
-    מבצע_רגיל = 0
-    מועדון = 1
-    כרטיס_אשראי = 2
-    אחר = 3
+    _init_ = 'value string'
+
+    REGULAR = 0, 'מבצע רגיל'
+    CLUB = 1, 'מועדון'
+    CREDIT_CARD = 2, 'כרטיס אשראי'
+    OTHER = 3, 'אחר'
+
+    @classmethod
+    def _missing_(cls, value):
+        return ClubID.OTHER
+
+    def __str__(self):
+        return self.string


class RewardType(Enum):
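The rewritten ClubID above uses aenum's _init_ = 'value string' so each member carries both a numeric code and a Hebrew display string, _missing_ so unknown club codes map to OTHER instead of raising, and __str__ so the display string is what ends up in the table. A self-contained sketch of that behaviour, reusing the members from the diff; the sample lookups are illustrative:

```python
from aenum import Enum


class ClubID(Enum):
    _init_ = 'value string'   # each member below is defined as (value, string)

    REGULAR = 0, 'מבצע רגיל'
    CLUB = 1, 'מועדון'
    CREDIT_CARD = 2, 'כרטיס אשראי'
    OTHER = 3, 'אחר'

    @classmethod
    def _missing_(cls, value):
        # Any club code not listed above is treated as OTHER instead of raising ValueError.
        return ClubID.OTHER

    def __str__(self):
        return self.string


print(ClubID(1))        # מועדון
print(ClubID(7))        # אחר  (unknown code falls back to OTHER)
print(ClubID(2).value)  # 2
```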
@@ -57,6 +70,7 @@ class RewardType(Enum):
    SECOND_INSTANCE_SAME_DISCOUNT = 8
    SECOND_INSTANCE_DIFFERENT_DISCOUNT = 9
    DISCOUNT_IN_MULTIPLE_INSTANCES = 10
+    OTHER = 11


class Promotion:
@@ -90,15 +104,15 @@ class Promotion:
        return self.promotion_id == other.promotion_id


-def write_promotions_to_csv(promotions: List[Promotion], output_filename: str) -> None:
+def write_promotions_to_table(promotions: List[Promotion], output_filename: str) -> None:
    """
-    This function writes a promotions table to a given CSV or XLSX output file.
+    This function writes a List of promotions to a csv or xlsx output file.

    :param promotions: A given list of promotions
    :param output_filename: A given file to write to
    """
    log_message_and_time_if_debug('Writing promotions to output file')
-    rows = [get_promotion_row_for_csv(promo, item) for promo in promotions for item in promo.items]
+    rows = [get_promotion_row_for_table(promo, item) for promo in promotions for item in promo.items]
    if output_filename.endswith('.csv'):
        encoding_file = "utf_8_sig" if sys.platform == "win32" else "utf_8"
        with open(output_filename, mode='w', newline='', encoding=encoding_file) as f_out:
@@ -135,28 +149,30 @@ def write_promotions_to_csv(promotions: List[Promotion], output_filename: str) -
        raise ValueError(f"The given output file has an invalid extension:\n{output_filename}")


-def get_promotion_row_for_csv(promo: Promotion, item: Item):
+def get_promotion_row_for_table(promo: Promotion, item: Item) -> List:
    """
    This function returns a row in the promotions XLSX table.

    :param promo: A given Promotion object
    :param item: A given item object participating in the promotion
    """
-    return [promo.content,
-            item.name,
-            item.price,
-            promo.promo_func(item),
-            (item.price - promo.promo_func(item)) / item.price,
-            promo.club_id.name.replace('_', ' '),
-            promo.max_qty,
-            promo.allow_multiple_discounts,
-            promo.start_date <= datetime.now(),
-            promo.start_date,
-            promo.end_date,
-            promo.update_date,
-            item.manufacturer,
-            item.code,
-            promo.reward_type.value]
+    return [
+        promo.content,
+        item.name,
+        item.price,
+        promo.promo_func(item),
+        (item.price - promo.promo_func(item)) / max(item.price, 1),
+        promo.club_id.string,
+        promo.max_qty,
+        promo.allow_multiple_discounts,
+        promo.start_date <= datetime.now(),
+        promo.start_date,
+        promo.end_date,
+        promo.update_date,
+        item.manufacturer,
+        item.code,
+        promo.reward_type.value,
+    ]


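The new denominator max(item.price, 1) protects the discount-rate column from a zero item price, which would otherwise raise ZeroDivisionError and abort the whole table build. A tiny check of that guard with made-up prices:

```python
def discount_ratio(price: float, promo_price: float) -> float:
    # Same guard as in get_promotion_row_for_table: never divide by a zero price.
    return (price - promo_price) / max(price, 1)

print(discount_ratio(10.0, 7.5))  # 0.25, i.e. a 25% discount
print(discount_ratio(0.0, 0.0))   # 0.0 instead of raising ZeroDivisionError
```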
def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bool, load_promos: bool) \
@@ -177,7 +193,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo

    log_message_and_time_if_debug('Creating promotions objects')
    promo_objs = list()
-    for promo in promo_tags:
+    for promo in tqdm(promo_tags, desc='creating_promotions'):
        promotion_id = int(promo.find(re.compile('PromotionId', re.IGNORECASE)).text)
        if promo_objs and promo_objs[-1].promotion_id == promotion_id:
            promo_objs[-1].items.extend(chain.get_items(promo, items_dict))
@@ -243,7 +259,7 @@ def get_discounted_price(promo):
def get_discount_rate(discount_rate: Union[float, None], discount_in_percentage: bool):
    if discount_rate:
        if discount_in_percentage:
-            return int(discount_rate) * (10 ** -(len(str(discount_rate))))
+            return float(discount_rate) * (10 ** -(len(str(discount_rate))))
    return float(discount_rate)


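The only change in get_discount_rate is int(...) to float(...), so fractional percentage values are no longer truncated (and string inputs such as '12.5', if they ever occur, no longer raise). For a rate of 50 the expression scales it into a fraction: len(str(50)) is 2, giving 50 * 10**-2 = 0.5. A quick check of that arithmetic with made-up inputs:

```python
def get_discount_rate(discount_rate, discount_in_percentage: bool):
    # Copy of the updated function, reproduced here only to run the examples below.
    if discount_rate:
        if discount_in_percentage:
            return float(discount_rate) * (10 ** -(len(str(discount_rate))))
    return float(discount_rate)

print(get_discount_rate(50, discount_in_percentage=True))    # 0.5     (50 means 50%)
print(get_discount_rate(12.5, discount_in_percentage=True))  # 0.00125 (the old int() truncated 12.5 to 12)
```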
@@ -271,6 +287,9 @@ def find_promo_function(reward_type: RewardType, remark: str, promo_description:
    if reward_type == RewardType.DISCOUNT_BY_THRESHOLD:
        return lambda item: item.price - discount_rate

+    if reward_type == RewardType.OTHER:
+        return lambda item: item.price
+
    if 'מחיר המבצע הינו המחיר לק"ג' in remark:
        return lambda item: discounted_price

@@ -294,7 +313,7 @@ def main_latest_promos(store_id: int, output_filename, chain: SupermarketChain,
    promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, load_promos)
    promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date -
                                       promo.end_date), reverse=True)
-    write_promotions_to_csv(promotions, output_filename)
+    write_promotions_to_table(promotions, output_filename)


def log_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str, load_prices: bool, load_promos: bool):
@@ -313,7 +332,6 @@ def log_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str,
        logging.info(promo.repr_ltr())


# TODO: change to returning list of Items
def get_all_null_items_in_promos(chain, store_id) -> List[str]:
    """
    This function finds all items appearing in the chain's promotions file but not in the chain's prices file.
@@ -335,7 +353,7 @@ def get_all_promos_tags(chain: SupermarketChain, store_id: int, load_xml: bool)
    :return: A list of promotions tags
    """
    bs_objects = list()
-    for category in XML_FILES_PROMOTIONS_CATEGORIES:
+    for category in tqdm(XML_FILES_PROMOTIONS_CATEGORIES, desc='promotions_files'):
        xml_path = xml_file_gen(chain, store_id, category.name)
        bs_objects.append(create_bs_object(chain, store_id, category, load_xml, xml_path))

@@ -6,3 +6,10 @@ lxml==4.6.1
requests==2.25.0
soupsieve==2.0.1
urllib3==1.26.2
+openpyxl
+tqdm~=4.62.1
+pytest~=6.2.2
+pandas~=1.2.0
+argparse~=1.4.0
+XlsxWriter~=1.4.3
+aenum
@@ -0,0 +1,121 @@
import logging
import os
import pytest
import requests
from tqdm import tqdm
import pandas as pd
import re

from chains.bareket import Bareket
from chains.co_op import CoOp
from chains.dor_alon import DorAlon
from chains.keshet import Keshet
from chains.shuk_hayir import ShukHayir
from chains.stop_market import StopMarket
from chains.tiv_taam import TivTaam
from chains.zol_vebegadol import ZolVebegadol
from promotion import PROMOTION_COLS_NUM, main_latest_promos
from supermarket_chain import SupermarketChain
from chains import (
    bareket,
    mahsaneiHashook,
    dor_alon,
    freshmarket,
    hazi_hinam,
    keshet,
    stop_market,
    tiv_taam,
    shufersal,
    co_op,
    victory,
    yohananof,
    zol_vebegadol,
    rami_levi,
    osher_ad,
    maayan2000,
    shuk_hayir,
    king_store,
    shefa_birkat_hashem,
)

pytest.main(args=['-s', os.path.abspath(__file__)])

chain_dict = {repr(chain): chain() if callable(chain) else None for chain in SupermarketChain.__subclasses__()}

MIN_NUM_OF_PROMOS = 3


def test_searching_for_download_urls():
    """
    Test that get_download_url of each chain returns the correct download url for each category in every chain.
    """
    session = requests.Session()
    for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):

        logging.info(f'Checking download urls in chain {chain_name}')
        store_id: int = valid_store_id_by_chain(chain_name)

        _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PromosFull, r'promo[s]?full', session)
        _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.Promos, r'promo[s]?', session)
        _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PricesFull, r'price[s]?full', session)
        _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.Prices, r'price[s]?', session)


def _test_download_url_helper(chain: SupermarketChain, store_id: int, category: SupermarketChain.XMLFilesCategory,
                              regex_pat: str, session: requests.session):
    download_url: str = chain.get_download_url(store_id, category, session)
    if not download_url:  # Not found non-full Promos/Prices file
        return
    logging.debug(download_url)
    assert re.search(regex_pat, download_url, re.IGNORECASE), f'Invalid {category.name} url in {repr(type(chain))}'
    if category in [chain.XMLFilesCategory.Prices, chain.XMLFilesCategory.Promos]:
        assert not re.search('full', download_url, re.IGNORECASE), \
            f'Downloaded the full {category.name} file mistakenly in {repr(type(chain))}'


def test_promotions_scraping():
    """
    Test scraping of promotions is completed successfully and a valid xlsx file is generated as an output.
    """
    filename = 'temp.xlsx'
    for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):
        logging.info(f'Test scraping promotions from {chain_name}')

        store_id: int = valid_store_id_by_chain(chain_name)
        try:
            main_latest_promos(
                store_id=store_id,
                output_filename=filename,
                chain=chain,
                load_promos=False,
                load_xml=False,
            )
            df = pd.read_excel(filename)
        except Exception as e:
            logging.error(e)
            logging.error(f"Failed loading excel of {chain_name}")
            raise

        assert df.shape[0] > MIN_NUM_OF_PROMOS and df.shape[1] == PROMOTION_COLS_NUM, f"Failed scraping {chain_name}"


def valid_store_id_by_chain(chain_name) -> int:
    """
    This function returns a valid store ID for a given chain.

    :param chain_name: The name of a chain as returned by repr(ChainClassName).
    :return: An integer representing a valid store ID in the given chain
    """
    if chain_name == repr(DorAlon):
        store_id = 501
    elif chain_name in [repr(TivTaam), repr(Bareket), repr(ZolVebegadol)]:
        store_id = 2
    elif chain_name == repr(CoOp):
        store_id = 202
    elif chain_name == repr(ShukHayir):
        store_id = 4
    elif chain_name in [repr(StopMarket), repr(Keshet)]:
        store_id = 5
    else:
        store_id = 1
    return store_id
utils.py
@@ -9,6 +9,8 @@ import requests
from bs4 import BeautifulSoup
from os import path

+from tqdm import tqdm
+
from item import Item
from supermarket_chain import SupermarketChain

@@ -64,6 +66,8 @@ def get_bs_object_from_link(chain: SupermarketChain, store_id: int, category: Su
    """
    session = requests.Session()
    download_url: str = chain.get_download_url(store_id, category, session)
+    if not download_url:
+        return BeautifulSoup()
    response_content = session.get(download_url).content
    try:
        xml_content: AnyStr = gzip.decompress(response_content)
@@ -98,7 +102,7 @@ def create_items_dict(chain: SupermarketChain, store_id: int, load_xml) -> Dict[
    :param store_id: A given store id
    """
    items_dict = dict()
-    for category in [chain.XMLFilesCategory.PricesFull, chain.XMLFilesCategory.Prices]:
+    for category in tqdm([chain.XMLFilesCategory.PricesFull, chain.XMLFilesCategory.Prices], desc='prices_files'):
        xml_path: str = xml_file_gen(chain, store_id, category.name)
        bs_prices: BeautifulSoup = create_bs_object(chain, store_id, category, load_xml, xml_path)
        items_tags = bs_prices.find_all(chain.item_tag_name)
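Returning an empty BeautifulSoup() when get_download_url comes back empty lets callers such as create_items_dict run unchanged: find_all on an empty soup simply yields an empty list, so a missing non-full file is skipped quietly. A quick illustration (the tag name here is arbitrary):

```python
from bs4 import BeautifulSoup

empty_soup = BeautifulSoup()        # what get_bs_object_from_link now returns when no download URL is found
print(empty_soup.find_all('Item'))  # [] -> downstream loops simply have nothing to iterate over
```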