From 844a106c576bf54be33b2dd8cd6a42cb413edef7 Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Mon, 16 Aug 2021 23:05:16 +0300
Subject: [PATCH 01/12] Added tqdm

---
 promotion.py | 6 ++++--
 utils.py     | 4 +++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/promotion.py b/promotion.py
index 68e97ad..a34b799 100644
--- a/promotion.py
+++ b/promotion.py
@@ -8,6 +8,8 @@ import csv
 import sys
 import pandas as pd
 import xlsxwriter
+from tqdm import tqdm
+
 from item import Item
 from utils import (
     create_bs_object, create_items_dict,
@@ -177,7 +179,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo
     log_message_and_time_if_debug('Creating promotions objects')
 
     promo_objs = list()
-    for promo in promo_tags:
+    for promo in tqdm(promo_tags, desc='creating_promotions'):
         promotion_id = int(promo.find(re.compile('PromotionId', re.IGNORECASE)).text)
         if promo_objs and promo_objs[-1].promotion_id == promotion_id:
             promo_objs[-1].items.extend(chain.get_items(promo, items_dict))
@@ -335,7 +337,7 @@ def get_all_promos_tags(chain: SupermarketChain, store_id: int, load_xml: bool)
     :return: A list of promotions tags
     """
     bs_objects = list()
-    for category in XML_FILES_PROMOTIONS_CATEGORIES:
+    for category in tqdm(XML_FILES_PROMOTIONS_CATEGORIES, desc='promotions_files'):
         xml_path = xml_file_gen(chain, store_id, category.name)
         bs_objects.append(create_bs_object(chain, store_id, category, load_xml, xml_path))
 
diff --git a/utils.py b/utils.py
index f809ba3..4fa2c75 100644
--- a/utils.py
+++ b/utils.py
@@ -9,6 +9,8 @@ import requests
 from bs4 import BeautifulSoup
 from os import path
 
+from tqdm import tqdm
+
 from item import Item
 from supermarket_chain import SupermarketChain
 
@@ -98,7 +100,7 @@ def create_items_dict(chain: SupermarketChain, store_id: int, load_xml) -> Dict[
     :param store_id: A given store id
     """
     items_dict = dict()
-    for category in [chain.XMLFilesCategory.PricesFull, chain.XMLFilesCategory.Prices]:
+    for category in tqdm([chain.XMLFilesCategory.PricesFull, chain.XMLFilesCategory.Prices], desc='prices_files'):
         xml_path: str = xml_file_gen(chain, store_id, category.name)
         bs_prices: BeautifulSoup = create_bs_object(chain, store_id, category, load_xml, xml_path)
         items_tags = bs_prices.find_all(chain.item_tag_name)

From 1a6707341d17498d0b132d3441fc911243bfa7f9 Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Mon, 16 Aug 2021 23:07:07 +0300
Subject: [PATCH 02/12] Logical fixes in promotions scraping and calculation.
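
Two of the fixes guard against promotion XML codes the enums do not know.
Feeding an unknown ClubId straight into the ClubID constructor raises a
ValueError (standard Enum behaviour), presumably aborting the whole scrape;
unknown reward types had the same gap.  A rough sketch of the guarded
conversion introduced below (the code 17 is a made-up example; the members
are the ones defined in promotion.py):

    raw_club_id = 17  # hypothetical code not listed in ClubID
    if raw_club_id in [member.value for member in ClubID]:
        club_id = ClubID(raw_club_id)
    else:
        club_id = ClubID.אחר  # fall back to the catch-all member

RewardType likewise gains a catch-all (OTHER = 11) whose promotion function
leaves the item's price unchanged.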
---
 promotion.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/promotion.py b/promotion.py
index a34b799..c02e9d4 100644
--- a/promotion.py
+++ b/promotion.py
@@ -59,6 +59,7 @@ class RewardType(Enum):
     SECOND_INSTANCE_SAME_DISCOUNT = 8
     SECOND_INSTANCE_DIFFERENT_DISCOUNT = 9
     DISCOUNT_IN_MULTIPLE_INSTANCES = 10
+    OTHER = 11
 
 
 class Promotion:
@@ -226,7 +227,11 @@ def create_new_promo_instance(chain: SupermarketChain, items_dict: Dict[str, Ite
                                       chain.date_hour_format)
     promo_update_time = datetime.strptime(promo.find(chain.promotion_update_tag_name).text,
                                           chain.update_date_format)
-    club_id = ClubID(int(promo.find(re.compile('ClubId', re.IGNORECASE)).text))
+    club_id = int(promo.find(re.compile('ClubId', re.IGNORECASE)).text)
+    if club_id in [club_id.value for club_id in ClubID]:
+        club_id = ClubID(club_id)
+    else:
+        club_id = ClubID(ClubID.אחר)
     multiple_discounts_allowed = bool(int(promo.find('AllowMultipleDiscounts').text))
     items = chain.get_items(promo, items_dict)
 
@@ -245,7 +250,7 @@ def get_discounted_price(promo):
 def get_discount_rate(discount_rate: Union[float, None], discount_in_percentage: bool):
     if discount_rate:
         if discount_in_percentage:
-            return int(discount_rate) * (10 ** -(len(str(discount_rate))))
+            return float(discount_rate) * (10 ** -(len(str(discount_rate))))
         return float(discount_rate)
 
 
@@ -273,6 +278,9 @@ def find_promo_function(reward_type: RewardType, remark: str, promo_description:
     if reward_type == RewardType.DISCOUNT_BY_THRESHOLD:
         return lambda item: item.price - discount_rate
 
+    if reward_type == RewardType.OTHER:
+        return lambda item: item.price
+
     if 'מחיר המבצע הינו המחיר לק"ג' in remark:
         return lambda item: discounted_price
 

From 9b0ab013c9683dd29f51277637accbcaa0d2acb7 Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Mon, 16 Aug 2021 23:07:32 +0300
Subject: [PATCH 03/12] Added requirements to requirements.txt

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 98bb28c..94b3eb2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,5 @@ lxml==4.6.1
 requests==2.25.0
 soupsieve==2.0.1
 urllib3==1.26.2
+openpyxl
+tqdm
\ No newline at end of file

From 1a88ed6e01670a4495514e10eedb2b645df2f558 Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Mon, 16 Aug 2021 23:08:04 +0300
Subject: [PATCH 04/12] minor changes

---
 main.py      |  4 ----
 promotion.py | 36 +++++++++++++++++++-----------------
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/main.py b/main.py
index 84736ea..6baeb30 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,4 @@
 import os
-import sys
-import time
 from argparse import ArgumentParser
 from datetime import datetime
 from pathlib import Path
@@ -33,8 +31,6 @@ from chains import (
     shefa_birkat_hashem,
 )
 
-# TODO: fix problem of left-to-right printing
-
 Path(RESULTS_DIRNAME).mkdir(exist_ok=True)
 Path(RAW_FILES_DIRNAME).mkdir(exist_ok=True)
 
diff --git a/promotion.py b/promotion.py
index c02e9d4..9ee2bd6 100644
--- a/promotion.py
+++ b/promotion.py
@@ -101,7 +101,7 @@ def write_promotions_to_csv(promotions: List[Promotion], output_filename: str) -
     :param output_filename: A given file to write to
     """
     log_message_and_time_if_debug('Writing promotions to output file')
-    rows = [get_promotion_row_for_csv(promo, item) for promo in promotions for item in promo.items]
+    rows = [get_promotion_row_for_table(promo, item) for promo in promotions for item in promo.items]
     if output_filename.endswith('.csv'):
         encoding_file = "utf_8_sig" if sys.platform == "win32" else "utf_8"
"utf_8" with open(output_filename, mode='w', newline='', encoding=encoding_file) as f_out: @@ -138,28 +138,30 @@ def write_promotions_to_csv(promotions: List[Promotion], output_filename: str) - raise ValueError(f"The given output file has an invalid extension:\n{output_filename}") -def get_promotion_row_for_csv(promo: Promotion, item: Item): +def get_promotion_row_for_table(promo: Promotion, item: Item) -> List: """ This function returns a row in the promotions XLSX table. :param promo: A given Promotion object :param item: A given item object participating in the promotion """ - return [promo.content, - item.name, - item.price, - promo.promo_func(item), - (item.price - promo.promo_func(item)) / item.price, - promo.club_id.name.replace('_', ' '), - promo.max_qty, - promo.allow_multiple_discounts, - promo.start_date <= datetime.now(), - promo.start_date, - promo.end_date, - promo.update_date, - item.manufacturer, - item.code, - promo.reward_type.value] + return [ + promo.content, + item.name, + item.price, + promo.promo_func(item), + (item.price - promo.promo_func(item)) / max(item.price, 1), + promo.club_id.name.replace('_', ' '), + promo.max_qty, + promo.allow_multiple_discounts, + promo.start_date <= datetime.now(), + promo.start_date, + promo.end_date, + promo.update_date, + item.manufacturer, + item.code, + promo.reward_type.value, + ] def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bool, load_promos: bool) \ From c1281cb3122f04f7432f6554e422725b2c1f065b Mon Sep 17 00:00:00 2001 From: KorenLazar Date: Mon, 16 Aug 2021 23:09:10 +0300 Subject: [PATCH 05/12] Added a test for scraping the promotions and exporting them to xlsx files. --- promotion.py | 2 + tests/test_scraping.py | 84 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/promotion.py b/promotion.py index 9ee2bd6..61fafbb 100644 --- a/promotion.py +++ b/promotion.py @@ -21,6 +21,8 @@ from supermarket_chain import SupermarketChain XML_FILES_PROMOTIONS_CATEGORIES = [SupermarketChain.XMLFilesCategory.PromosFull, SupermarketChain.XMLFilesCategory.Promos] +PROMOTION_COLS_NUM = 15 # The length of the list returned by get_promotion_row_for_table function + INVALID_OR_UNKNOWN_PROMOTION_FUNCTION = -1 PROMOTIONS_TABLE_HEADERS = [ diff --git a/tests/test_scraping.py b/tests/test_scraping.py index e69de29..474323e 100644 --- a/tests/test_scraping.py +++ b/tests/test_scraping.py @@ -0,0 +1,84 @@ +import logging +import os + +import pytest +from tqdm import tqdm +import pandas as pd + +from chains.bareket import Bareket +from chains.co_op import CoOp +from chains.dor_alon import DorAlon +from chains.freshmarket import Freshmarket +from chains.hazi_hinam import HaziHinam +from chains.keshet import Keshet +from chains.maayan2000 import Maayan2000 +from chains.mahsaneiHashook import MahsaneiHashook +from chains.shuk_hayir import ShukHayir +from chains.stop_market import StopMarket +from chains.tiv_taam import TivTaam +from chains.zol_vebegadol import ZolVebegadol +from promotion import PROMOTION_COLS_NUM, main_latest_promos +from supermarket_chain import SupermarketChain +from chains import ( + bareket, + mahsaneiHashook, + dor_alon, + freshmarket, + hazi_hinam, + keshet, + stop_market, + tiv_taam, + shufersal, + co_op, + victory, + yohananof, + zol_vebegadol, + rami_levi, + osher_ad, + maayan2000, + shuk_hayir, + king_store, + shefa_birkat_hashem, +) + +pytest.main(args=['-s', os.path.abspath(__file__)]) + +chain_dict = {repr(chain): chain() if callable(chain) else None for 
+
+MIN_NUM_OF_PROMOS = 3
+
+
+def test_scraping():
+    filename = 'temp.xlsx'
+    for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):
+        if chain_name in [repr(Maayan2000), repr(ZolVebegadol), repr(ShukHayir)]:
+            continue
+        logging.info(f'Test scraping promotions from {chain_name}')
+        if chain_name == repr(DorAlon):
+            store_id = 501
+        elif chain_name in [repr(Keshet), repr(TivTaam), repr(Bareket), repr(ZolVebegadol)]:
+            store_id = 2
+        elif chain_name == repr(CoOp):
+            store_id = 202
+        elif chain_name == repr(ShukHayir):
+            store_id = 4
+        elif chain_name == repr(StopMarket):
+            store_id = 5
+        else:
+            store_id = 1
+
+        try:
+            main_latest_promos(
+                store_id=store_id,
+                output_filename=filename,
+                chain=chain,
+                load_promos=False,
+                load_xml=False
+            )
+            df = pd.read_excel(filename)
+        except Exception as e:
+            logging.error(e)
+            logging.info(f"Failed loading excel of {chain_name}")
+            raise
+
+        assert df.shape[0] > MIN_NUM_OF_PROMOS and df.shape[1] == PROMOTION_COLS_NUM, f"Failed scraping {chain_name}"

From 63fec1490ccba953d93de948da28646af9f33055 Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Tue, 17 Aug 2021 09:18:45 +0300
Subject: [PATCH 06/12] Added new requirements to requirements.txt

---
 requirements.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 94b3eb2..807cc2d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,8 @@ requests==2.25.0
 soupsieve==2.0.1
 urllib3==1.26.2
 openpyxl
-tqdm
\ No newline at end of file
+tqdm~=4.62.1
+pytest~=6.2.2
+pandas~=1.2.0
+argparse~=1.4.0
+XlsxWriter~=1.4.3
\ No newline at end of file

From 3770352d04e2df41a5c700f907cbc79ae81a2335 Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Tue, 17 Aug 2021 09:35:20 +0300
Subject: [PATCH 07/12] Removed chain skips and unused imports from the
 scraping test

---
 tests/test_scraping.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/tests/test_scraping.py b/tests/test_scraping.py
index 474323e..484a83a 100644
--- a/tests/test_scraping.py
+++ b/tests/test_scraping.py
@@ -8,11 +8,7 @@ import pandas as pd
 from chains.bareket import Bareket
 from chains.co_op import CoOp
 from chains.dor_alon import DorAlon
-from chains.freshmarket import Freshmarket
-from chains.hazi_hinam import HaziHinam
 from chains.keshet import Keshet
-from chains.maayan2000 import Maayan2000
-from chains.mahsaneiHashook import MahsaneiHashook
 from chains.shuk_hayir import ShukHayir
 from chains.stop_market import StopMarket
 from chains.tiv_taam import TivTaam
 from chains.zol_vebegadol import ZolVebegadol
@@ -51,8 +47,6 @@ MIN_NUM_OF_PROMOS = 3
 def test_scraping():
     filename = 'temp.xlsx'
     for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):
-        if chain_name in [repr(Maayan2000), repr(ZolVebegadol), repr(ShukHayir)]:
-            continue
         logging.info(f'Test scraping promotions from {chain_name}')
         if chain_name == repr(DorAlon):
             store_id = 501
@@ -73,12 +67,12 @@ def test_scraping():
                 output_filename=filename,
                 chain=chain,
                 load_promos=False,
-                load_xml=False
+                load_xml=False,
             )
             df = pd.read_excel(filename)
         except Exception as e:
             logging.error(e)
-            logging.info(f"Failed loading excel of {chain_name}")
+            logging.error(f"Failed loading excel of {chain_name}")
             raise
 
         assert df.shape[0] > MIN_NUM_OF_PROMOS and df.shape[1] == PROMOTION_COLS_NUM, f"Failed scraping {chain_name}"

From cffdd84086cd439f4923dd0fd29854c7e76c04c4 Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Tue, 17 Aug 2021 13:06:42 +0300
Subject: [PATCH 08/12] Added specific searching for the download url of
 non-full promotions and prices files. Changed return value of
 get_download_url accordingly.
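
In rough terms, the non-full lookup in all three web clients keeps the
existing store-id and category checks but additionally rejects any filename
containing 'full', case-insensitively, and get_download_url now returns an
empty string when no such file exists.  A minimal illustration of that
filter (the filenames are made-up examples shaped like the
'-{store_id:03d}-20' pattern the clients match on):

    import re

    names = ['PromoFull1234567890123-005-202108170300.gz',
             'Promo1234567890123-005-202108170300.gz']
    non_full = [n for n in names if not re.search('full', n, re.IGNORECASE)]
    # non_full == ['Promo1234567890123-005-202108170300.gz']

Callers such as get_bs_object_from_link in utils.py treat the empty string
as "no file available" (see the last hunk below).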
---
 chains/binaproject_web_client.py | 16 ++++++++++++++--
 chains/cerberus_web_client.py    | 10 +++++++++-
 chains/mahsaneiHashook.py        | 13 +++++++++++--
 utils.py                         |  2 ++
 4 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/chains/binaproject_web_client.py b/chains/binaproject_web_client.py
index 8f5ecb7..ec23e4e 100644
--- a/chains/binaproject_web_client.py
+++ b/chains/binaproject_web_client.py
@@ -1,8 +1,12 @@
 import json
+import re
+
 import requests
 
 from supermarket_chain import SupermarketChain
 
+FNAME_KEY = "FileNm"
+
 
 class BinaProjectWebClient:
     _date_hour_format = '%Y-%m-%d %H:%M:%S'
@@ -16,8 +20,16 @@ class BinaProjectWebClient:
         url = '/'.join([hostname, self.path_prefix, "MainIO_Hok.aspx"])
         req_res: requests.Response = session.get(url)
         jsons_files = json.loads(req_res.text)
-        suffix = next(cur_json["FileNm"] for cur_json in jsons_files if f'-{store_id:03d}-20' in cur_json["FileNm"]
-                      and category.name.replace('s', '') in cur_json["FileNm"])
+
+        if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
+            filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname \
+                                        and not re.search('full', fname, re.IGNORECASE)
+            if not any(filter_func(cur_json[FNAME_KEY]) for cur_json in jsons_files):
+                return ""  # Could not find non-full Promos/Prices file
+        else:
+            filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname
+        suffix = next(
+            cur_json[FNAME_KEY] for cur_json in jsons_files if filter_func(cur_json[FNAME_KEY]))
         down_url: str = '/'.join([hostname, self.path_prefix, "Download", suffix])
         return down_url
 
diff --git a/chains/cerberus_web_client.py b/chains/cerberus_web_client.py
index ae8ef98..f02f75e 100644
--- a/chains/cerberus_web_client.py
+++ b/chains/cerberus_web_client.py
@@ -1,4 +1,6 @@
 import json
+import re
+
 import requests
 
 from supermarket_chain import SupermarketChain
@@ -17,7 +19,13 @@ class CerberusWebClient:
         ajax_dir_payload: dict = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')}
         s: requests.Response = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload)
         s_json: dict = json.loads(s.text)
-        suffix: str = next(d['name'] for d in s_json['aaData'] if f'-{store_id:03d}-20' in d['name'])
+        if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
+            filter_func = lambda d, id: f'-{id:03d}-20' in d['name'] and not re.search('full', d['name'], re.IGNORECASE)
+            if not any(filter_func(d, store_id) for d in s_json['aaData']):
+                return ""  # Could not find non-full Prices/Promos file
+        else:
+            filter_func = lambda d, id: f'-{id:03d}-20' in d['name']
+        suffix: str = next(d['name'] for d in s_json['aaData'] if filter_func(d, store_id))
         download_url: str = hostname + "/file/d/" + suffix
         return download_url
 
diff --git a/chains/mahsaneiHashook.py b/chains/mahsaneiHashook.py
index 9cb7c5b..b11e387 100644
--- a/chains/mahsaneiHashook.py
+++ b/chains/mahsaneiHashook.py
@@ -1,3 +1,4 @@
+import re
 from typing import Dict, List
 import requests
 from bs4 import BeautifulSoup
@@ -21,8 +22,16 @@ class MahsaneiHashook(SupermarketChain):
         url = prefix + "NBCompetitionRegulations.aspx"
         req_res: requests.Response = requests.get(url)
         soup = BeautifulSoup(req_res.text, features='lxml')
-        suffix: str = soup.find('a', href=lambda value: value and category.name.replace('s', '') in value
-                                and f'-{store_id:03d}-20' in value).attrs['href']
+        if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
+            fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \
+                                              and f'-{store_id:03d}-20' in fname \
+                                              and not re.search('full', fname, re.IGNORECASE)
+            if soup.find('a', href=fname_filter_func) is None:
+                return ""  # Could not find non-full Promos/Prices file
+        else:
+            fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \
+                                              and f'-{store_id:03d}-20' in fname
+        suffix: str = soup.find('a', href=fname_filter_func).attrs['href']
         down_url: str = prefix + suffix
         return down_url
 
diff --git a/utils.py b/utils.py
index 4fa2c75..4d3e3b2 100644
--- a/utils.py
+++ b/utils.py
@@ -66,6 +66,8 @@ def get_bs_object_from_link(chain: SupermarketChain, store_id: int, category: Su
     """
     session = requests.Session()
    download_url: str = chain.get_download_url(store_id, category, session)
+    if not download_url:
+        return BeautifulSoup()
     response_content = session.get(download_url).content
     try:
         xml_content: AnyStr = gzip.decompress(response_content)

From 294dee8cc2dfb92345fad3db219d2b69f92068a1 Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Tue, 17 Aug 2021 13:08:39 +0300
Subject: [PATCH 09/12] Added a test for searching different files' urls.
 Specifically, asserting that searching for non-full files does not yield
 urls of full files.

---
 tests/test_scraping.py | 71 +++++++++++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 14 deletions(-)

diff --git a/tests/test_scraping.py b/tests/test_scraping.py
index 484a83a..a696a27 100644
--- a/tests/test_scraping.py
+++ b/tests/test_scraping.py
@@ -1,9 +1,10 @@
 import logging
 import os
-
 import pytest
+import requests
 from tqdm import tqdm
 import pandas as pd
+import re
 
 from chains.bareket import Bareket
 from chains.co_op import CoOp
@@ -44,23 +45,43 @@ chain_dict = {repr(chain): chain() if callable(chain) else None for chain in Sup
 MIN_NUM_OF_PROMOS = 3
 
 
-def test_scraping():
+def test_searching_for_download_urls():
+    """
+    Test that get_download_url of each chain returns the correct download url for each category:
+    """
+    session = requests.Session()
+    for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):
+
+        logging.info(f'Finding download url in chain {chain_name}')
+        store_id: int = valid_store_id_by_chain(chain_name)
+
+        _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PromosFull, r'promo[s]?full', session)
+        _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.Promos, r'promo[s]?', session)
+        _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PricesFull, r'price[s]?full', session)
+        _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.Prices, r'price[s]?', session)
+
+
+def _test_download_url_helper(chain: SupermarketChain, store_id: int, category: SupermarketChain.XMLFilesCategory,
+                              regex_pat: str, session: requests.session):
+    download_url: str = chain.get_download_url(store_id, category, session)
+    logging.debug(download_url)
+    if not download_url:  # Not found non-full Promos/Prices file
+        return
+    assert re.search(regex_pat, download_url, re.IGNORECASE), f'Invalid {category.name} url in {repr(type(chain))}'
+    if category in [chain.XMLFilesCategory.Prices, chain.XMLFilesCategory.Promos]:
+        assert not re.search('full', download_url, re.IGNORECASE), \
+            f'Downloaded the full {category.name} file mistakenly in {repr(type(chain))}'
+
+
+def test_promotions_scraping():
+    """
+    Test scraping of promotions is completed successfully and a valid xlsx file is generated as an output.
+    """
     filename = 'temp.xlsx'
     for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):
         logging.info(f'Test scraping promotions from {chain_name}')
-        if chain_name == repr(DorAlon):
-            store_id = 501
-        elif chain_name in [repr(Keshet), repr(TivTaam), repr(Bareket), repr(ZolVebegadol)]:
-            store_id = 2
-        elif chain_name == repr(CoOp):
-            store_id = 202
-        elif chain_name == repr(ShukHayir):
-            store_id = 4
-        elif chain_name == repr(StopMarket):
-            store_id = 5
-        else:
-            store_id = 1
+        store_id: int = valid_store_id_by_chain(chain_name)
         try:
             main_latest_promos(
                 store_id=store_id,
                 output_filename=filename,
                 chain=chain,
                 load_promos=False,
                 load_xml=False,
             )
             df = pd.read_excel(filename)
         except Exception as e:
             logging.error(e)
             logging.error(f"Failed loading excel of {chain_name}")
             raise
 
         assert df.shape[0] > MIN_NUM_OF_PROMOS and df.shape[1] == PROMOTION_COLS_NUM, f"Failed scraping {chain_name}"
+
+
+def valid_store_id_by_chain(chain_name) -> int:
+    """
+    This function returns a valid store ID for a given chain.
+
+    :param chain_name: The name of a chain as returned by repr(ChainClassName).
+    :return: An integer representing a valid store ID in the given chain
+    """
+    if chain_name == repr(DorAlon):
+        store_id = 501
+    elif chain_name in [repr(TivTaam), repr(Bareket), repr(ZolVebegadol)]:
+        store_id = 2
+    elif chain_name == repr(CoOp):
+        store_id = 202
+    elif chain_name == repr(ShukHayir):
+        store_id = 4
+    elif chain_name in [repr(StopMarket), repr(Keshet)]:
+        store_id = 5
+    else:
+        store_id = 1
+    return store_id

From 322995ba15bcb24d079aaa98c22a6d6c0ac06698 Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Wed, 18 Aug 2021 11:16:25 +0300
Subject: [PATCH 10/12] Added TODO for ordering the argparse

---
 main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main.py b/main.py
index 6baeb30..9251f15 100644
--- a/main.py
+++ b/main.py
@@ -36,6 +36,7 @@ Path(RAW_FILES_DIRNAME).mkdir(exist_ok=True)
 
 chain_dict = {repr(chain): chain() if callable(chain) else None for chain in SupermarketChain.__subclasses__()}
 
+# TODO: change functions arguments to include all necessary parameters (e.g. chain) or split arguments
 if __name__ == '__main__':
     parser = ArgumentParser()
     parser.add_argument('--promos',

From 87b6fbe2b0df2f0b9b77a0cb87cc5d4fea24100d Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Wed, 18 Aug 2021 11:30:31 +0300
Subject: [PATCH 11/12] Changed ClubID enum class to include a string field
 used for printing, and define ClubID.OTHER as a default value for the class
 to handle invalid inputs.
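
For context: the string field relies on aenum's multi-value members
(_init_ = 'value string'), which the stdlib enum module does not provide
out of the box, and _missing_ lets the class itself absorb unknown codes,
replacing the explicit guard added in patch 02.  A quick sketch of the
intended behaviour (members and values mirror the diff below):

    from aenum import Enum

    class ClubID(Enum):
        _init_ = 'value string'
        REGULAR = 0, 'מבצע רגיל'
        OTHER = 3, 'אחר'

        @classmethod
        def _missing_(cls, value):
            return cls.OTHER

        def __str__(self):
            return self.string

    ClubID(0)            # -> ClubID.REGULAR
    ClubID(42)           # -> ClubID.OTHER rather than a ValueError (42 is made up)
    str(ClubID.REGULAR)  # -> 'מבצע רגיל'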
---
 promotion.py     | 27 ++++++++++++++++-----------
 requirements.txt |  3 ++-
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/promotion.py b/promotion.py
index 61fafbb..51e2e22 100644
--- a/promotion.py
+++ b/promotion.py
@@ -1,7 +1,6 @@
 import logging
 import re
 from datetime import datetime
-from enum import Enum
 from typing import Dict, List, Union
 from bs4.element import Tag
 import csv
@@ -9,6 +8,7 @@ import sys
 import pandas as pd
 import xlsxwriter
 from tqdm import tqdm
+from aenum import Enum
 
 from item import Item
 from utils import (
@@ -45,10 +45,19 @@ PROMOTIONS_TABLE_HEADERS = [
 
 
 class ClubID(Enum):
-    מבצע_רגיל = 0
-    מועדון = 1
-    כרטיס_אשראי = 2
-    אחר = 3
+    _init_ = 'value string'
+
+    REGULAR = 0, 'מבצע רגיל'
+    CLUB = 1, 'מועדון'
+    CREDIT_CARD = 2, 'כרטיס אשראי'
+    OTHER = 3, 'אחר'
+
+    @classmethod
+    def _missing_(cls, value):
+        return ClubID.OTHER
+
+    def __str__(self):
+        return self.string
 
 
 class RewardType(Enum):
@@ -153,7 +162,7 @@ def get_promotion_row_for_table(promo: Promotion, item: Item) -> List:
         item.price,
         promo.promo_func(item),
         (item.price - promo.promo_func(item)) / max(item.price, 1),
-        promo.club_id.name.replace('_', ' '),
+        promo.club_id.string,
         promo.max_qty,
         promo.allow_multiple_discounts,
         promo.start_date <= datetime.now(),
@@ -231,11 +240,7 @@ def create_new_promo_instance(chain: SupermarketChain, items_dict: Dict[str, Ite
                                       chain.date_hour_format)
     promo_update_time = datetime.strptime(promo.find(chain.promotion_update_tag_name).text,
                                           chain.update_date_format)
-    club_id = int(promo.find(re.compile('ClubId', re.IGNORECASE)).text)
-    if club_id in [club_id.value for club_id in ClubID]:
-        club_id = ClubID(club_id)
-    else:
-        club_id = ClubID(ClubID.אחר)
+    club_id = ClubID(int(promo.find(re.compile('ClubId', re.IGNORECASE)).text))
     multiple_discounts_allowed = bool(int(promo.find('AllowMultipleDiscounts').text))
     items = chain.get_items(promo, items_dict)
 
diff --git a/requirements.txt b/requirements.txt
index 807cc2d..35740e8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@ tqdm~=4.62.1
 pytest~=6.2.2
 pandas~=1.2.0
 argparse~=1.4.0
-XlsxWriter~=1.4.3
\ No newline at end of file
+XlsxWriter~=1.4.3
+aenum
\ No newline at end of file

From 90cab0a2e142628d8ee20387b95137e6bd3230e1 Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Wed, 18 Aug 2021 11:32:04 +0300
Subject: [PATCH 12/12] Minor changes

---
 promotion.py           | 7 +++----
 tests/test_scraping.py | 6 +++---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/promotion.py b/promotion.py
index 51e2e22..4023185 100644
--- a/promotion.py
+++ b/promotion.py
@@ -104,9 +104,9 @@ class Promotion:
         return self.promotion_id == other.promotion_id
 
 
-def write_promotions_to_csv(promotions: List[Promotion], output_filename: str) -> None:
+def write_promotions_to_table(promotions: List[Promotion], output_filename: str) -> None:
     """
-    This function writes a promotions table to a given CSV or XLSX output file.
+    This function writes a List of promotions to a csv or xlsx output file.
 
     :param promotions: A given list of promotions
     :param output_filename: A given file to write to
@@ -313,7 +313,7 @@ def main_latest_promos(store_id: int, output_filename, chain: SupermarketChain,
     promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, load_promos)
     promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()),
                                        promo.start_date - promo.end_date), reverse=True)
-    write_promotions_to_csv(promotions, output_filename)
+    write_promotions_to_table(promotions, output_filename)
 
 
 def log_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str, load_prices: bool, load_promos: bool):
@@ -332,7 +332,6 @@ def log_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str,
         logging.info(promo.repr_ltr())
 
 
-# TODO: change to returning list of Items
 def get_all_null_items_in_promos(chain, store_id) -> List[str]:
     """
     This function finds all items appearing in the chain's promotions file but not in the chain's prices file.
diff --git a/tests/test_scraping.py b/tests/test_scraping.py
index a696a27..f2e3d9d 100644
--- a/tests/test_scraping.py
+++ b/tests/test_scraping.py
@@ -47,12 +47,12 @@ MIN_NUM_OF_PROMOS = 3
 
 def test_searching_for_download_urls():
     """
-    Test that get_download_url of each chain returns the correct download url for each category:
+    Test that get_download_url of each chain returns the correct download url for each category in every chain.
     """
     session = requests.Session()
     for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):
 
-        logging.info(f'Finding download url in chain {chain_name}')
+        logging.info(f'Checking download urls in chain {chain_name}')
         store_id: int = valid_store_id_by_chain(chain_name)
 
         _test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PromosFull, r'promo[s]?full', session)
@@ -64,9 +64,9 @@ def _test_download_url_helper(chain: SupermarketChain, store_id: int, category:
                               regex_pat: str, session: requests.session):
     download_url: str = chain.get_download_url(store_id, category, session)
-    logging.debug(download_url)
     if not download_url:  # Not found non-full Promos/Prices file
         return
+    logging.debug(download_url)
     assert re.search(regex_pat, download_url, re.IGNORECASE), f'Invalid {category.name} url in {repr(type(chain))}'
     if category in [chain.XMLFilesCategory.Prices, chain.XMLFilesCategory.Promos]:
         assert not re.search('full', download_url, re.IGNORECASE), \