Fixed the bug in cerberus_web_client.py by switching to Selenium. To log in, each chain that uses this client must have a username for the Selenium login. With this mechanism, a path to a gz file is returned instead of a URL.

Added an option in main.py, --prices-with-promos, to output a prices JSON file in which the prices are updated by the latest promotions (the 'final_price' key holds the price after promotions, while 'price' holds the price before promotions).

Fixed a small bug in BinaWebClient by checking that the filename does not contain 'null'.

Changed the class hierarchy of the chains so that it includes the web clients.

Added the date to the output filenames to start storing the data over time.

Black formatting (according to PEP 8 guidelines).

Changed chains_dict in main to a constant (CHAINS_DICT).
This commit is contained in:
korenlazar
2022-10-04 11:42:36 +03:00
parent b5db721a3d
commit ceff48dbd9
28 changed files with 796 additions and 406 deletions

View File

@@ -1,10 +1,11 @@
import logging
import os
import re
import tempfile
import pandas as pd
import pytest
import requests
from tqdm import tqdm
import pandas as pd
import re
from chains.bareket import Bareket
from chains.co_op import CoOp
@@ -14,89 +15,90 @@ from chains.shuk_hayir import ShukHayir
from chains.stop_market import StopMarket
from chains.tiv_taam import TivTaam
from chains.zol_vebegadol import ZolVebegadol
from main import CHAINS_DICT
from promotion import PROMOTION_COLS_NUM, main_latest_promos
from supermarket_chain import SupermarketChain
from chains import (
bareket,
mahsaneiHashook,
dor_alon,
freshmarket,
hazi_hinam,
keshet,
stop_market,
tiv_taam,
shufersal,
co_op,
victory,
yohananof,
zol_vebegadol,
rami_levi,
osher_ad,
maayan2000,
shuk_hayir,
king_store,
shefa_birkat_hashem,
)
pytest.main(args=['-s', os.path.abspath(__file__)])
pytest.main(args=["-s", os.path.abspath(__file__)])
chain_dict = {repr(chain): chain() if callable(chain) else None for chain in SupermarketChain.__subclasses__()}
session = requests.Session()
MIN_NUM_OF_PROMOS = 3
@pytest.mark.parametrize("chain_tuple", CHAINS_DICT.items())
def test_searching_for_download_urls(chain_tuple):
    """
    Test that get_download_url_or_path of the chain returns a valid result for every XML files category.

    :param chain_tuple: A (chain name, chain) pair taken from CHAINS_DICT
    """
    chain_name, chain = chain_tuple
    logging.info(f"Checking download urls in chain {chain_name}")
    store_id: int = valid_store_id_by_chain(chain_name)

    # Each category is paired with the (case-insensitive) pattern its url/path is expected to match.
    categories_and_patterns = (
        (chain.XMLFilesCategory.PromosFull, r"promo[s]?full"),
        (chain.XMLFilesCategory.Promos, r"promo[s]?"),
        (chain.XMLFilesCategory.PricesFull, r"price[s]?full"),
        (chain.XMLFilesCategory.Prices, r"price[s]?"),
    )

    # The context manager closes the session's connections once the checks are done.
    with requests.Session() as session:
        for category, regex_pat in categories_and_patterns:
            _test_download_url_helper(chain, store_id, category, regex_pat, session)
def _test_download_url_helper(
    chain: SupermarketChain,
    store_id: int,
    category: SupermarketChain.XMLFilesCategory,
    regex_pat: str,
    session: requests.Session,
):
    """
    Assert that the url (or path) the chain returns for the given category matches the expected pattern.

    Nothing is checked when the chain returns a falsy value, i.e. no file was found for the category.

    :param chain: The chain whose get_download_url_or_path is tested
    :param store_id: A store id to look the file up for
    :param category: The XML files category to look up
    :param regex_pat: A regex the returned url/path must match (case-insensitively)
    :param session: The session passed on to get_download_url_or_path
    """
    url_or_path: str = chain.get_download_url_or_path(store_id, category, session)
    if not url_or_path:  # Not found non-full Promos/Prices file
        return
    logging.debug(url_or_path)

    assert re.search(
        regex_pat, url_or_path, re.IGNORECASE
    ), f"Invalid {category.name} url in {repr(type(chain))}"

    # The non-full patterns also match the full files' names, so these are checked explicitly.
    if category in [chain.XMLFilesCategory.Prices, chain.XMLFilesCategory.Promos]:
        assert not re.search(
            "full", url_or_path, re.IGNORECASE
        ), f"Downloaded the full {category.name} file mistakenly in {repr(type(chain))}"
@pytest.mark.parametrize("chain_tuple", CHAINS_DICT.items())
def test_promotions_scraping(chain_tuple):
    """
    Test scraping of promotions is completed successfully and a valid xlsx file is generated as an output.

    :param chain_tuple: A (chain name, chain) pair taken from CHAINS_DICT
    """
    chain_name, chain = chain_tuple
    logging.info(f"Test scraping promotions from {chain_name}")
    store_id: int = valid_store_id_by_chain(chain_name)

    # The context manager guarantees the temporary xlsx file is closed (and removed) even on failure.
    with tempfile.NamedTemporaryFile(suffix=".xlsx") as tf:
        try:
            main_latest_promos(
                store_id=store_id,
                output_filename=tf.name,
                chain=chain,
                load_promos=False,
                load_prices=False,
            )
            df = pd.read_excel(tf.name)
        except Exception as e:
            # Log which chain failed, then re-raise so the test still fails with the original error.
            logging.error(e)
            logging.error(f"Failed loading excel of {chain_name}")
            raise

    assert (
        df.shape[0] > MIN_NUM_OF_PROMOS and df.shape[1] == PROMOTION_COLS_NUM
    ), f"Failed scraping {chain_name}"
def valid_store_id_by_chain(chain_name) -> int:
@@ -108,11 +110,11 @@ def valid_store_id_by_chain(chain_name) -> int:
"""
if chain_name == repr(DorAlon):
store_id = 501
elif chain_name in [repr(TivTaam), repr(Bareket), repr(ZolVebegadol)]:
elif chain_name in [repr(TivTaam), repr(Bareket)]:
store_id = 2
elif chain_name == repr(CoOp):
store_id = 202
elif chain_name == repr(ShukHayir):
elif chain_name == [repr(ShukHayir), repr(ZolVebegadol)]:
store_id = 4
elif chain_name in [repr(StopMarket), repr(Keshet)]:
store_id = 5