diff --git a/main.py b/main.py index 6c33e2d..b5f98ea 100644 --- a/main.py +++ b/main.py @@ -1,204 +1,20 @@ -from typing import List, Dict -from bs4 import BeautifulSoup -from datetime import datetime -import requests -import gzip -from enum import Enum -from argparse import ArgumentParser, ArgumentTypeError +from argparse import ArgumentParser import logging -import time +from promotion import main_latest_promos +from store import get_store_id, store_id_type +from utils import get_products_prices +# import json +# from bs4 import BeautifulSoup +# import requests -PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'צלחות', 'כוסות', 'מאגים', 'מגבת', 'מפות'] - -STORE_ID_NOT_FOUND = -1 - - -class ShufersalCategories(Enum): - All, Prices, PricesFull, Promos, PromosFull, Stores = range(6) - - -def xml_file_gen(category_name: str, store_id: int) -> str: - """ - This function generate an xml filename given a store id and a category_name - If the given store_id is invalid, it is ignored in the returned string. - - :param store_id: A given store_id - :param category_name: A given category name - :return: An xml filename - """ - store_id_str = f"-{str(store_id)}" if is_valid_store_id(store_id) else "" - return f"{category_name}{store_id_str}.xml" - - -class Promotion: - """ - A class of a promotion in Shufersal. - It contains only part of the available information in Shufersal's data. - """ - - def __init__(self, content: str, start_date: datetime, end_date: datetime, update_date: datetime, - code_items: List[str]): - self.content: str = content - self.start_date = start_date - self.end_date: datetime = end_date - self.update_date: datetime = update_date - self.code_items: List[str] = code_items - - def __str__(self): - title = self.content - dates_range = f"Between {self.start_date.date()} and {self.end_date.date()}" - update_line = f"Updated at {self.update_date.date()}" - items = '\n'.join(str(item) for item in self.code_items) - return '\n'.join([title, dates_range, update_line, items]) + '\n' - - -def get_download_url(store_id: int, cat_id: int) -> str: - """ - This function scrapes Shufersal's website and returns a url that contains the data for a given store and category. - For info about the categories, see ShufersalCategories. - - :param store_id: A given id of a store - :param cat_id: A given id of a category - :return: A downloadable link of the data for a given store and category - """ - url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={cat_id}" - if is_valid_store_id(store_id): - url += f"&storeId={store_id}" - req_res = requests.get(url) - soup = BeautifulSoup(req_res.text, features='lxml') - return soup.find('a', text="לחץ להורדה")['href'] - - -def create_bs_object(xml_path, download_url: str) -> BeautifulSoup: - """ - This function creates a BeautifulSoup object according to the given xml_path and download_url. - In case the given download_url is an empty string, the function tries to read from the given xml_path, - otherwise it downloads the gzip from the download link and extract it. - - :param xml_path: A given path to an xml file - :param download_url: A string that may represent a link (described above) - :return: A BeautifulSoup object with xml content (either from a file or a link). - """ - if download_url: - xml_content = gzip.decompress(requests.get(download_url).content) - with open(xml_path, 'wb') as f_out: - f_out.write(xml_content) - return BeautifulSoup(xml_content, features='xml') - else: - with open(xml_path, 'rb') as f_in: - return BeautifulSoup(f_in, features='xml') - - -def get_available_promos(store_id: int, load_xml: bool) -> List[Promotion]: - """ - This function return the available promotions given a BeautifulSoup object. - - :param store_id: A given store id - :param load_xml: A boolean representing whether to load an existing xml or load an already saved one - :return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available - """ - start = time.time() - items_dict = create_items_dict(store_id, load_xml) - - down_url = get_download_url(store_id, ShufersalCategories.PromosFull.value) - bs_promos = create_bs_object(xml_file_gen(ShufersalCategories.PromosFull.name, store_id), down_url) - - promo_objs = list() - for cur_promo in bs_promos.find_all("Promotion"): - cur_promo = Promotion( - content=cur_promo.find('PromotionDescription').text, - start_date=datetime.strptime(cur_promo.find('PromotionStartDate').text, '%Y-%m-%d'), - end_date=datetime.strptime(cur_promo.find('PromotionEndDate').text, '%Y-%m-%d'), - update_date=datetime.strptime(cur_promo.find('PromotionUpdateDate').text, '%Y-%m-%d %H:%M'), - code_items=[items_dict.get(item.find('ItemCode').text) for item in cur_promo.find_all('Item') - if items_dict.get(item.find('ItemCode').text)], - ) - if is_valid_promo(cur_promo): - promo_objs.append(cur_promo) - print(f"Finished getting available promos in {time.time() - start}") - return promo_objs - - -def is_valid_promo(promo: Promotion): - today_date = datetime.now() - not_expired = promo.end_date.date() >= today_date.date() - has_started = promo.start_date <= today_date - has_products = len(promo.code_items) > 0 - in_promo_ignore_list = any(product in promo.content for product in PRODUCTS_TO_IGNORE) - return not_expired and has_started and has_products and not in_promo_ignore_list - - -def create_items_dict(store_id: int, load_xml) -> Dict: - """ - This function creates a dictionary where every key is an item code and its value is the item's name and price. - - :param load_xml: A boolean representing whether to load an existing prices xml file - :param store_id: A given store id - :return: A dictionary where the firs - """ - down_url = "" if load_xml else get_download_url(store_id, ShufersalCategories.PricesFull.value) - xml_path = xml_file_gen(ShufersalCategories.PricesFull.name, store_id) - bs_prices = create_bs_object(xml_path, down_url) - return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all('Item')} - - -def get_item_info(item): - return str((item.find('ItemName').text, item.find('ManufacturerName').text, item.find('ItemPrice').text)) - - -def main_latest_promos(store_id: int, load_xml: bool): - """ - This function logs the available promos in a Shufersal store with a given id sorted by their update date. - - :param store_id: A given store id - :param load_xml: A boolean representing whether to load an existing prices xml file - """ - - promotions = get_available_promos(store_id, load_xml) - promotions.sort(key=lambda promo: max(promo.update_date, promo.start_date), reverse=True) - logger.info('\n'.join(str(promotion) for promotion in promotions)) - - -def get_store_id(city: str, load_xml: bool): - """ - This function returns the id of a Shufersal store according to a given city. - The city must match exactly to its spelling in Shufersal's website. - - :param load_xml: A boolean representing whether to load an existing xml or load an already saved one - :param city: A string representing the city of the requested store. - """ - down_url = "" if load_xml else get_download_url(-1, ShufersalCategories.Stores.value) - bs = create_bs_object(xml_file_gen(ShufersalCategories.Stores.name, -1), down_url) - - for store in bs.find_all("STORE"): - if store.find("CITY").text == city: - logger.info((store.find("ADDRESS").text, store.find("STOREID").text, store.find("SUBCHAINNAME").text)) - - -def get_products_prices(store_id: int, product_name: str, load_xml: bool): - """ - This function logs the products in a given Shufersal store which contains a given product_name. - - :param store_id: A given Shufersal store id - :param product_name: A given product name - :param load_xml: A boolean representing whether to load an existing xml or load an already saved one - """ - down_url = "" if load_xml else get_download_url(store_id, ShufersalCategories.PricesFull.value) - bs = create_bs_object(xml_file_gen(ShufersalCategories.PricesFull.name, store_id), down_url) - prods = [item for item in bs.find_all("Item") if product_name in item.find("ItemName").text] - prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text)) - for prod in prods: - logger.info(get_item_info(prod)) - - -def is_valid_store_id(store_id: int): - return isinstance(store_id, int) and store_id >= 0 - - -def store_id_type(store_id: str): - if not is_valid_store_id(int(store_id)): - raise ArgumentTypeError(f"Given store_id: {store_id} is not a valid store_id.") - return store_id +# def get_coupons(): +# coupons_json = requests.get('https://www.shufersal.co.il/online/he/my-account/coupons/my-coupons') +# # with open('C:\\Users\\user\\Downloads\\my-coupons.json', "rb") as f: +# # coupons_json = json.load(f) +# bs_coupons = [BeautifulSoup(coup['display'], 'xml') for coup in coupons_json['myCoupons']] +# return [bs_coupon.find("img", src=lambda value: value and value.startswith( +# "https://res.cloudinary.com/shufersal/image/upload/f_auto," +# "q_auto/v1551800918/prod/product_images/products_medium")).contents[1] for bs_coupon in bs_coupons] if __name__ == '__main__': @@ -234,23 +50,39 @@ if __name__ == '__main__': handler = logging.FileHandler(filename=f'promos_{arg_store_id}.log', mode='w', encoding='utf-8') logger.addHandler(handler) try: - main_latest_promos(store_id=arg_store_id, load_xml=args.load_xml) + main_latest_promos(store_id=arg_store_id, + load_xml=args.load_xml, + logger=logger) except FileNotFoundError: - main_latest_promos(store_id=arg_store_id, load_xml=False) + main_latest_promos(store_id=arg_store_id, + load_xml=False, + logger=logger) elif args.price: handler = logging.FileHandler(filename='products_prices.log', mode='w', encoding='utf-8') logger.addHandler(handler) try: - get_products_prices(store_id=args.price[0], product_name=args.price[1], load_xml=args.load_xml) + get_products_prices(store_id=args.price[0], + product_name=args.price[1], + load_xml=args.load_xml, + logger=logger) except FileNotFoundError: - get_products_prices(store_id=args.price[0], product_name=args.price[1], load_xml=False) + get_products_prices(store_id=args.price[0], + product_name=args.price[1], + load_xml=False, + logger=logger) elif args.find_store: arg_city = args.find_store[0] - handler = logging.FileHandler(filename=f'stores_{arg_city}.log', mode='w', encoding='utf-8') + handler = logging.FileHandler(filename=f'stores_{arg_city}.log', + mode='w', + encoding='utf-8') logger.addHandler(handler) try: - get_store_id(city=arg_city, load_xml=args.load_xml) + get_store_id(city=arg_city, + load_xml=args.load_xml, + logger=logger) except FileNotFoundError: - get_store_id(city=arg_city, load_xml=False) + get_store_id(city=arg_city, + load_xml=False, + logger=logger) diff --git a/promotion.py b/promotion.py new file mode 100644 index 0000000..cb7cbd8 --- /dev/null +++ b/promotion.py @@ -0,0 +1,77 @@ +from datetime import datetime +from typing import List +from utils import ShufersalCategories, create_bs_object, create_items_dict, get_download_url, xml_file_gen + +PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'צלחות', 'כוסות', 'מאגים', 'מגבת', 'מפות', 'פסטיגל'] + + +class Promotion: + """ + A class of a promotion in Shufersal. + It contains only part of the available information in Shufersal's data. + """ + + def __init__(self, content: str, start_date: datetime, end_date: datetime, update_date: datetime, + code_items: List[str]): + self.content: str = content + self.start_date = start_date + self.end_date: datetime = end_date + self.update_date: datetime = update_date + self.code_items: List[str] = code_items + + def __str__(self): + title = self.content + dates_range = f"Between {self.start_date.date()} and {self.end_date.date()}" + update_line = f"Updated at {self.update_date.date()}" + items = '\n'.join(str(item) for item in self.code_items) + return '\n'.join([title, dates_range, update_line, items]) + '\n' + + +def get_available_promos(store_id: int, load_xml: bool) -> List[Promotion]: + """ + This function return the available promotions given a BeautifulSoup object. + + :param store_id: A given store id + :param load_xml: A boolean representing whether to load an existing xml or load an already saved one + :return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available + """ + items_dict = create_items_dict(store_id, load_xml) + + down_url = get_download_url(store_id, ShufersalCategories.PromosFull.value) + bs_promos = create_bs_object(xml_file_gen(ShufersalCategories.PromosFull.name, store_id), down_url) + + promo_objs = list() + for cur_promo in bs_promos.find_all("Promotion"): + cur_promo = Promotion( + content=cur_promo.find('PromotionDescription').text, + start_date=datetime.strptime(cur_promo.find('PromotionStartDate').text, '%Y-%m-%d'), + end_date=datetime.strptime(cur_promo.find('PromotionEndDate').text, '%Y-%m-%d'), + update_date=datetime.strptime(cur_promo.find('PromotionUpdateDate').text, '%Y-%m-%d %H:%M'), + code_items=[items_dict.get(item.find('ItemCode').text) for item in cur_promo.find_all('Item') + if items_dict.get(item.find('ItemCode').text)], + ) + if is_valid_promo(cur_promo): + promo_objs.append(cur_promo) + return promo_objs + + +def is_valid_promo(promo: Promotion): + today_date = datetime.now() + not_expired = promo.end_date.date() >= today_date.date() + has_started = promo.start_date <= today_date + has_products = len(promo.code_items) > 0 + in_promo_ignore_list = any(product in promo.content for product in PRODUCTS_TO_IGNORE) + return not_expired and has_started and has_products and not in_promo_ignore_list + + +def main_latest_promos(store_id: int, load_xml: bool, logger): + """ + This function logs the available promos in a Shufersal store with a given id sorted by their update date. + + :param store_id: A given store id + :param load_xml: A boolean representing whether to load an existing prices xml file + """ + + promotions = get_available_promos(store_id, load_xml) + promotions.sort(key=lambda promo: max(promo.update_date, promo.start_date), reverse=True) + logger.info('\n'.join(str(promotion) for promotion in promotions)) \ No newline at end of file diff --git a/store.py b/store.py new file mode 100644 index 0000000..d5b7b5c --- /dev/null +++ b/store.py @@ -0,0 +1,25 @@ +from argparse import ArgumentTypeError + +from utils import ShufersalCategories, create_bs_object, get_download_url, is_valid_store_id, xml_file_gen + + +def store_id_type(store_id: str): + if not is_valid_store_id(int(store_id)): + raise ArgumentTypeError(f"Given store_id: {store_id} is not a valid store_id.") + return store_id + + +def get_store_id(city: str, load_xml: bool, logger): + """ + This function returns the id of a Shufersal store according to a given city. + The city must match exactly to its spelling in Shufersal's website. + + :param load_xml: A boolean representing whether to load an existing xml or load an already saved one + :param city: A string representing the city of the requested store. + """ + down_url = "" if load_xml else get_download_url(-1, ShufersalCategories.Stores.value) + bs = create_bs_object(xml_file_gen(ShufersalCategories.Stores.name, -1), down_url) + + for store in bs.find_all("STORE"): + if store.find("CITY").text == city: + logger.info((store.find("ADDRESS").text, store.find("STOREID").text, store.find("SUBCHAINNAME").text)) \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..bb34d8e --- /dev/null +++ b/utils.py @@ -0,0 +1,98 @@ +import gzip +from enum import Enum +from typing import Dict + +import requests +from bs4 import BeautifulSoup + + +class ShufersalCategories(Enum): + All, Prices, PricesFull, Promos, PromosFull, Stores = range(6) + + +def xml_file_gen(category_name: str, store_id: int) -> str: + """ + This function generate an xml filename given a store id and a category_name + If the given store_id is invalid, it is ignored in the returned string. + + :param store_id: A given store_id + :param category_name: A given category name + :return: An xml filename + """ + store_id_str = f"-{str(store_id)}" if is_valid_store_id(store_id) else "" + return f"{category_name}{store_id_str}.xml" + + +def get_download_url(store_id: int, cat_id: int) -> str: + """ + This function scrapes Shufersal's website and returns a url that contains the data for a given store and category. + For info about the categories, see ShufersalCategories. + + :param store_id: A given id of a store + :param cat_id: A given id of a category + :return: A downloadable link of the data for a given store and category + """ + url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={cat_id}" + if is_valid_store_id(store_id): + url += f"&storeId={store_id}" + req_res = requests.get(url) + soup = BeautifulSoup(req_res.text, features='lxml') + return soup.find('a', text="לחץ להורדה")['href'] + + +def create_bs_object(xml_path, download_url: str) -> BeautifulSoup: + """ + This function creates a BeautifulSoup object according to the given xml_path and download_url. + In case the given download_url is an empty string, the function tries to read from the given xml_path, + otherwise it downloads the gzip from the download link and extract it. + + :param xml_path: A given path to an xml file + :param download_url: A string that may represent a link (described above) + :return: A BeautifulSoup object with xml content (either from a file or a link). + """ + if download_url: + xml_content = gzip.decompress(requests.get(download_url).content) + with open(xml_path, 'wb') as f_out: + f_out.write(xml_content) + return BeautifulSoup(xml_content, features='xml') + else: + with open(xml_path, 'rb') as f_in: + return BeautifulSoup(f_in, features='xml') + + +def create_items_dict(store_id: int, load_xml) -> Dict: + """ + This function creates a dictionary where every key is an item code and its value is the item's name and price. + + :param load_xml: A boolean representing whether to load an existing prices xml file + :param store_id: A given store id + :return: A dictionary where the firs + """ + down_url = "" if load_xml else get_download_url(store_id, ShufersalCategories.PricesFull.value) + xml_path = xml_file_gen(ShufersalCategories.PricesFull.name, store_id) + bs_prices = create_bs_object(xml_path, down_url) + return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all('Item')} + + +def get_item_info(item): + return str((item.find('ItemName').text, item.find('ManufacturerName').text, item.find('ItemPrice').text)) + + +def get_products_prices(store_id: int, product_name: str, load_xml: bool, logger): + """ + This function logs the products in a given Shufersal store which contains a given product_name. + + :param store_id: A given Shufersal store id + :param product_name: A given product name + :param load_xml: A boolean representing whether to load an existing xml or load an already saved one + """ + down_url = "" if load_xml else get_download_url(store_id, ShufersalCategories.PricesFull.value) + bs = create_bs_object(xml_file_gen(ShufersalCategories.PricesFull.name, store_id), down_url) + prods = [item for item in bs.find_all("Item") if product_name in item.find("ItemName").text] + prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text)) + for prod in prods: + logger.info(get_item_info(prod)) + + +def is_valid_store_id(store_id: int): + return isinstance(store_id, int) and store_id >= 0 \ No newline at end of file