Fixed the bug with cerberus_web_client.py by working with Selenium. To log in, each chain that works with it must have a username for logging in with Selenium. In this mechanism, a path to a gz file is returned instead of a URL.

Added the option to output a prices JSON file in main.py under --prices-with-promos, where the prices are updated by the latest promotions (under the 'final_price' key, where 'price' represents the price before promotions). Fixed a small bug in BinaWebClient by checking that the filename does not contain 'null'. Changed the hierarchy of chains such that it includes the web clients. Added the date to the output filenames to start storing the data over time. Black formatting (according to PEP 8 guidelines). Changed the chains_dict in main to a constant one.
2022-10-04 11:42:36 +03:00
parent b5db721a3d
commit ceff48dbd9
28 changed files with 796 additions and 406 deletions
--- a/tests/test_scraping.py
+++ b/tests/test_scraping.py
@@ -1,10 +1,11 @@
 import logging
 import os
+import re
+import tempfile
+
+import pandas as pd
 import pytest
 import requests
-from tqdm import tqdm
-import pandas as pd
-import re

 from chains.bareket import Bareket
 from chains.co_op import CoOp
@@ -14,89 +15,90 @@ from chains.shuk_hayir import ShukHayir
 from chains.stop_market import StopMarket
 from chains.tiv_taam import TivTaam
 from chains.zol_vebegadol import ZolVebegadol
+from main import CHAINS_DICT
 from promotion import PROMOTION_COLS_NUM, main_latest_promos
 from supermarket_chain import SupermarketChain
-from chains import (
-    bareket,
-    mahsaneiHashook,
-    dor_alon,
-    freshmarket,
-    hazi_hinam,
-    keshet,
-    stop_market,
-    tiv_taam,
-    shufersal,
-    co_op,
-    victory,
-    yohananof,
-    zol_vebegadol,
-    rami_levi,
-    osher_ad,
-    maayan2000,
-    shuk_hayir,
-    king_store,
-    shefa_birkat_hashem,
-)

-pytest.main(args=['-s', os.path.abspath(__file__)])
+pytest.main(args=["-s", os.path.abspath(__file__)])

-chain_dict = {repr(chain): chain() if callable(chain) else None for chain in SupermarketChain.__subclasses__()}
+session = requests.Session()

 MIN_NUM_OF_PROMOS = 3


-def test_searching_for_download_urls():
+@pytest.mark.parametrize("chain_tuple", CHAINS_DICT.items())
+def test_searching_for_download_urls(chain_tuple):
    """
    Test that get_download_url of each chain returns the correct download url for each category in every chain.
    """
-    session = requests.Session()
-    for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):
+    chain_name, chain = chain_tuple
+    # for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):

-        logging.info(f'Checking download urls in chain {chain_name}')
-        store_id: int = valid_store_id_by_chain(chain_name)
+    logging.info(f"Checking download urls in chain {chain_name}")
+    store_id: int = valid_store_id_by_chain(chain_name)

-        _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PromosFull, r'promo[s]?full', session)
-        _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.Promos, r'promo[s]?', session)
-        _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PricesFull, r'price[s]?full', session)
-        _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.Prices, r'price[s]?', session)
+    _test_download_url_helper(
+        chain, store_id, chain.XMLFilesCategory.PromosFull, r"promo[s]?full", session
+    )
+    _test_download_url_helper(
+        chain, store_id, chain.XMLFilesCategory.Promos, r"promo[s]?", session
+    )
+    _test_download_url_helper(
+        chain, store_id, chain.XMLFilesCategory.PricesFull, r"price[s]?full", session
+    )
+    _test_download_url_helper(
+        chain, store_id, chain.XMLFilesCategory.Prices, r"price[s]?", session
+    )


-def _test_download_url_helper(chain: SupermarketChain, store_id: int, category: SupermarketChain.XMLFilesCategory,
-                              regex_pat: str, session: requests.session):
-    download_url: str = chain.get_download_url(store_id, category, session)
+def _test_download_url_helper(
+    chain: SupermarketChain,
+    store_id: int,
+    category: SupermarketChain.XMLFilesCategory,
+    regex_pat: str,
+    session: requests.session,
+):
+    download_url: str = chain.get_download_url_or_path(store_id, category, session)
    if not download_url:  # Not found non-full Promos/Prices file
        return
    logging.debug(download_url)
-    assert re.search(regex_pat, download_url, re.IGNORECASE), f'Invalid {category.name} url in {repr(type(chain))}'
+    assert re.search(
+        regex_pat, download_url, re.IGNORECASE
+    ), f"Invalid {category.name} url in {repr(type(chain))}"
    if category in [chain.XMLFilesCategory.Prices, chain.XMLFilesCategory.Promos]:
-        assert not re.search('full', download_url, re.IGNORECASE), \
-            f'Downloaded the full {category.name} file mistakenly in {repr(type(chain))}'
+        assert not re.search(
+            "full", download_url, re.IGNORECASE
+        ), f"Downloaded the full {category.name} file mistakenly in {repr(type(chain))}"


-def test_promotions_scraping():
+@pytest.mark.parametrize("chain_tuple", CHAINS_DICT.items())
+def test_promotions_scraping(chain_tuple):
    """
    Test scraping of promotions is completed successfully and a valid xlsx file is generated as an output.
    """
-    filename = 'temp.xlsx'
-    for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):
-        logging.info(f'Test scraping promotions from {chain_name}')
+    chain_name, chain = chain_tuple
+    tf = tempfile.NamedTemporaryFile(suffix=".xlsx")

-        store_id: int = valid_store_id_by_chain(chain_name)
-        try:
-            main_latest_promos(
-                store_id=store_id,
-                output_filename=filename,
-                chain=chain,
-                load_promos=False,
-                load_xml=False,
-            )
-            df = pd.read_excel(filename)
-        except Exception as e:
-            logging.error(e)
-            logging.error(f"Failed loading excel of {chain_name}")
-            raise
+    logging.info(f"Test scraping promotions from {chain_name}")

-        assert df.shape[0] > MIN_NUM_OF_PROMOS and df.shape[1] == PROMOTION_COLS_NUM, f"Failed scraping {chain_name}"
+    store_id: int = valid_store_id_by_chain(chain_name)
+    try:
+        main_latest_promos(
+            store_id=store_id,
+            output_filename=tf.name,
+            chain=chain,
+            load_promos=False,
+            load_prices=False,
+        )
+        df = pd.read_excel(tf.name)
+    except Exception as e:
+        logging.error(e)
+        logging.error(f"Failed loading excel of {chain_name}")
+        raise
+
+    assert (
+        df.shape[0] > MIN_NUM_OF_PROMOS and df.shape[1] == PROMOTION_COLS_NUM
+    ), f"Failed scraping {chain_name}"


 def valid_store_id_by_chain(chain_name) -> int:
@@ -108,11 +110,11 @@ def valid_store_id_by_chain(chain_name) -> int:
    """
    if chain_name == repr(DorAlon):
        store_id = 501
-    elif chain_name in [repr(TivTaam), repr(Bareket), repr(ZolVebegadol)]:
+    elif chain_name in [repr(TivTaam), repr(Bareket)]:
        store_id = 2
    elif chain_name == repr(CoOp):
        store_id = 202
-    elif chain_name == repr(ShukHayir):
+    elif chain_name == [repr(ShukHayir), repr(ZolVebegadol)]:
        store_id = 4
    elif chain_name in [repr(StopMarket), repr(Keshet)]:
        store_id = 5