Files
supermarket-scraping/tests/test_scraping.py
2021-08-18 11:32:04 +03:00

122 lines
4.1 KiB
Python

import logging
import os
import pytest
import requests
from tqdm import tqdm
import pandas as pd
import re
from chains.bareket import Bareket
from chains.co_op import CoOp
from chains.dor_alon import DorAlon
from chains.keshet import Keshet
from chains.shuk_hayir import ShukHayir
from chains.stop_market import StopMarket
from chains.tiv_taam import TivTaam
from chains.zol_vebegadol import ZolVebegadol
from promotion import PROMOTION_COLS_NUM, main_latest_promos
from supermarket_chain import SupermarketChain
from chains import (
bareket,
mahsaneiHashook,
dor_alon,
freshmarket,
hazi_hinam,
keshet,
stop_market,
tiv_taam,
shufersal,
co_op,
victory,
yohananof,
zol_vebegadol,
rami_levi,
osher_ad,
maayan2000,
shuk_hayir,
king_store,
shefa_birkat_hashem,
)
# Allow running this file directly (``python test_scraping.py``). The guard matters:
# without it, pytest.main() would also fire whenever pytest itself imports this module
# during collection, starting a nested pytest session at import time.
if __name__ == '__main__':
    pytest.main(args=['-s', os.path.abspath(__file__)])

# Maps repr(chain class) -> an instance of that chain, for every direct subclass of
# SupermarketChain known at this point (i.e. whose module has already been imported).
chain_dict = {repr(chain): chain() if callable(chain) else None for chain in SupermarketChain.__subclasses__()}

# A scraped promotions sheet must contain strictly more rows than this to be accepted.
MIN_NUM_OF_PROMOS = 3
def test_searching_for_download_urls():
    """
    Test that get_download_url of each chain returns the correct download url for each category in every chain.

    For every entry of chain_dict, the PromosFull, Promos, PricesFull and Prices categories are
    requested for a known-valid store and each returned url is validated by
    _test_download_url_helper.
    """
    # One session is shared by all requests; the context manager closes it (and its
    # pooled connections) even when an assertion fails midway.
    with requests.Session() as session:
        for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):
            logging.info(f'Checking download urls in chain {chain_name}')
            store_id: int = valid_store_id_by_chain(chain_name)
            _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PromosFull, r'promo[s]?full', session)
            _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.Promos, r'promo[s]?', session)
            _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PricesFull, r'price[s]?full', session)
            _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.Prices, r'price[s]?', session)
def _test_download_url_helper(chain: SupermarketChain, store_id: int, category: SupermarketChain.XMLFilesCategory,
                              regex_pat: str, session: requests.Session) -> None:
    """
    Validate the download url a chain reports for one store and one file category.

    :param chain: The chain whose get_download_url is queried.
    :param store_id: The store ID passed to get_download_url.
    :param category: The file category being requested.
    :param regex_pat: A pattern, searched case-insensitively, that the url must contain.
    :param session: The requests session handed to get_download_url.
    :raises AssertionError: If the url does not match regex_pat, or if a Prices/Promos
        (non-full) request produced a url containing 'full'.
    """
    download_url: str = chain.get_download_url(store_id, category, session)
    if not download_url:  # No url was found (e.g. no non-full Promos/Prices file), nothing to validate
        return

    logging.debug(download_url)
    assert re.search(regex_pat, download_url, re.IGNORECASE), f'Invalid {category.name} url in {repr(type(chain))}'

    # The non-full patterns also match the names of full files, so rule those out explicitly.
    if category in (chain.XMLFilesCategory.Prices, chain.XMLFilesCategory.Promos):
        assert not re.search('full', download_url, re.IGNORECASE), \
            f'Downloaded the full {category.name} file mistakenly in {repr(type(chain))}'
def test_promotions_scraping():
    """
    Test scraping of promotions is completed successfully and a valid xlsx file is generated as an output.

    For every entry of chain_dict, main_latest_promos is run for a known-valid store and the
    resulting workbook is loaded back. The sheet must have more than MIN_NUM_OF_PROMOS rows
    and exactly PROMOTION_COLS_NUM columns.
    """
    filename = 'temp.xlsx'
    for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):
        logging.info(f'Test scraping promotions from {chain_name}')
        store_id: int = valid_store_id_by_chain(chain_name)
        try:
            main_latest_promos(
                store_id=store_id,
                output_filename=filename,
                chain=chain,
                load_promos=False,
                load_xml=False,
            )
            df = pd.read_excel(filename)
        except Exception:
            # logging.exception records the message together with the traceback.
            logging.exception(f"Failed loading excel of {chain_name}")
            raise
        finally:
            # Remove the workbook after every chain: otherwise a chain that silently fails to
            # write its output would be validated against the previous chain's stale file,
            # and the temporary file would be left behind after the test run.
            if os.path.exists(filename):
                os.remove(filename)

        assert df.shape[0] > MIN_NUM_OF_PROMOS and df.shape[1] == PROMOTION_COLS_NUM, f"Failed scraping {chain_name}"
def valid_store_id_by_chain(chain_name) -> int:
    """
    Look up a store ID that is known to be valid for the given chain.

    :param chain_name: The name of a chain as returned by repr(ChainClassName).
    :return: An integer representing a valid store ID in the given chain (1 unless the chain is listed below)
    """
    # Ordered (chain classes, store ID) pairs; the first group containing the chain wins.
    known_store_ids = (
        ((DorAlon,), 501),
        ((TivTaam, Bareket, ZolVebegadol), 2),
        ((CoOp,), 202),
        ((ShukHayir,), 4),
        ((StopMarket, Keshet), 5),
    )
    for chain_classes, known_id in known_store_ids:
        if chain_name in [repr(chain_class) for chain_class in chain_classes]:
            return known_id

    # Every chain not listed above uses store 1.
    return 1