diff --git a/tests/test_scraping.py b/tests/test_scraping.py index 484a83a..a696a27 100644 --- a/tests/test_scraping.py +++ b/tests/test_scraping.py @@ -1,9 +1,10 @@ import logging import os - import pytest +import requests from tqdm import tqdm import pandas as pd +import re from chains.bareket import Bareket from chains.co_op import CoOp @@ -44,23 +45,43 @@ chain_dict = {repr(chain): chain() if callable(chain) else None for chain in Sup MIN_NUM_OF_PROMOS = 3 -def test_scraping(): +def test_searching_for_download_urls(): + """ + Test that get_download_url of each chain returns the correct download url for each category: + """ + session = requests.Session() + for chain_name, chain in tqdm(chain_dict.items(), desc='chains'): + + logging.info(f'Finding download url in chain {chain_name}') + store_id: int = valid_store_id_by_chain(chain_name) + + _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PromosFull, r'promo[s]?full', session) + _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.Promos, r'promo[s]?', session) + _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PricesFull, r'price[s]?full', session) + _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.Prices, r'price[s]?', session) + + +def _test_download_url_helper(chain: SupermarketChain, store_id: int, category: SupermarketChain.XMLFilesCategory, + regex_pat: str, session: requests.session): + download_url: str = chain.get_download_url(store_id, category, session) + logging.debug(download_url) + if not download_url: # Not found non-full Promos/Prices file + return + assert re.search(regex_pat, download_url, re.IGNORECASE), f'Invalid {category.name} url in {repr(type(chain))}' + if category in [chain.XMLFilesCategory.Prices, chain.XMLFilesCategory.Promos]: + assert not re.search('full', download_url, re.IGNORECASE), \ + f'Downloaded the full {category.name} file mistakenly in {repr(type(chain))}' + + +def test_promotions_scraping(): + """ + Test scraping of promotions is completed successfully and a valid xlsx file is generated as an output. + """ filename = 'temp.xlsx' for chain_name, chain in tqdm(chain_dict.items(), desc='chains'): logging.info(f'Test scraping promotions from {chain_name}') - if chain_name == repr(DorAlon): - store_id = 501 - elif chain_name in [repr(Keshet), repr(TivTaam), repr(Bareket), repr(ZolVebegadol)]: - store_id = 2 - elif chain_name == repr(CoOp): - store_id = 202 - elif chain_name == repr(ShukHayir): - store_id = 4 - elif chain_name == repr(StopMarket): - store_id = 5 - else: - store_id = 1 + store_id: int = valid_store_id_by_chain(chain_name) try: main_latest_promos( store_id=store_id, @@ -76,3 +97,25 @@ def test_scraping(): raise assert df.shape[0] > MIN_NUM_OF_PROMOS and df.shape[1] == PROMOTION_COLS_NUM, f"Failed scraping {chain_name}" + + +def valid_store_id_by_chain(chain_name) -> int: + """ + This function returns a valid store ID for a given chain. + + :param chain_name: The name of a chain as returned by repr(ChainClassName). + :return: An integer representing a valid store ID in the given chain + """ + if chain_name == repr(DorAlon): + store_id = 501 + elif chain_name in [repr(TivTaam), repr(Bareket), repr(ZolVebegadol)]: + store_id = 2 + elif chain_name == repr(CoOp): + store_id = 202 + elif chain_name == repr(ShukHayir): + store_id = 4 + elif chain_name in [repr(StopMarket), repr(Keshet)]: + store_id = 5 + else: + store_id = 1 + return store_id