From d7e5b709f8cf602c3f986f76d0a6f597a7ebdaf7 Mon Sep 17 00:00:00 2001 From: KorenLazar Date: Thu, 28 Jan 2021 16:25:38 +0200 Subject: [PATCH] has added a CSV format promotions file when running --promos. Item.py was added for moduling. --- README.md | 9 +++---- co_op.py | 4 ++- item.py | 13 ++++++++++ promotion.py | 58 ++++++++++++++++++++++++++++++++++++-------- shufersal.py | 4 ++- supermarket_chain.py | 13 +++++++--- utils.py | 19 +++++++++------ zol_vebegadol.py | 3 ++- 8 files changed, 94 insertions(+), 29 deletions(-) create mode 100644 item.py diff --git a/README.md b/README.md index 5481f49..9798b6b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Supermarket basic scraping -The library supports scraping from Shufersal, Co-Op and Zol Vebegadol +The library supports scraping from Shufersal, Co-Op and Zol Vebegadol. ## Installation clone: @@ -24,15 +24,14 @@ python main.py --find_store ירושלים --chain Shufersal In case you want a different supermarket chain, just change 'Shufersal' to a different name (the options will be printed in case of misspelling). -After running the command, you'll be able to see the different stores in Jerusalem with their IDs in "results\Shufersal-Stores.xml". +The output of the last command - the different Shufersal stores in Jerusalem with their IDs - should be printed. -Now, that we have the store's ID, we can get the store's relevant promotions sorted by their start date, last update -, and length. +Now, that we have the store's ID, we can get the store's relevant promotions sorted by their start date, last update and length. ```cmd script python main.py --promos 5 --chain Shufersal ``` * We assumed that the store's ID is 5. -Now, you can find the promos in "results\Shufersal_promos_5.log". +Now, you can find the promos in both "results\Shufersal_promos_5.csv" and "results\Shufersal_promos_5.log". For other documentation and commands, you can run ```cmd script diff --git a/co_op.py b/co_op.py index eba45e9..eecbc59 100644 --- a/co_op.py +++ b/co_op.py @@ -2,6 +2,8 @@ from typing import Dict, List import requests from bs4 import BeautifulSoup from bs4.element import Tag + +from item import Item from supermarket_chain import SupermarketChain @@ -31,6 +33,6 @@ class CoOp(SupermarketChain): return 'Co-Op' @staticmethod - def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]: + def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]: promo_item = items_dict.get(promo.find('ItemCode').text) return [promo_item] if promo_item else [] diff --git a/item.py b/item.py new file mode 100644 index 0000000..2dbd016 --- /dev/null +++ b/item.py @@ -0,0 +1,13 @@ +class Item: + """ + A class representing a product in some supermarket. + """ + + def __init__(self, name: str, price: float, manufacturer: str, code: int): + self.name: str = name + self.price: float = price + self.manufacturer: str = manufacturer + self.code: int = code + + def __repr__(self): + return str((self.name, self.price, self.manufacturer, self.code)) diff --git a/promotion.py b/promotion.py index b289760..7cae83a 100644 --- a/promotion.py +++ b/promotion.py @@ -1,6 +1,8 @@ from datetime import datetime from typing import Dict, List +import csv +from item import Item from utils import ( create_items_dict, xml_file_gen, @@ -17,12 +19,13 @@ class Promotion: It contains only part of the available information in Shufersal's data. """ - def __init__(self, content: str, start_date: datetime, end_date: datetime, update_date: datetime, items: List[str]): + def __init__(self, content: str, start_date: datetime, end_date: datetime, update_date: datetime, + items: List[Item]): self.content: str = content - self.start_date = start_date + self.start_date: datetime = start_date self.end_date: datetime = end_date self.update_date: datetime = update_date - self.items: List[str] = items + self.items: List[Item] = items def __repr__(self): title = self.content @@ -42,6 +45,40 @@ class Promotion: return self.content == other.content and self.start_date == other.start_date and self.end_date == other.end_date +def write_promotions_to_csv(promotions: List[Promotion], output_filename: str) -> None: + """ + This function writes a given list of promotions to a given output file in a CSV format. + + :param promotions: A given list of promotions + :param output_filename: A given file to write to + """ + with open(output_filename, mode='w', newline='') as f_out: + promos_writer = csv.writer(f_out) + promos_writer.writerow([ + 'תיאור המבצע', + 'הפריט המשתתף במבצע', + 'מחיר לפני המבצע', + 'זמן תחילת המבצע', + 'זמן סיום המבצע', + 'זמן עדכון אחרון', + 'יצרן', + 'ברקוד של הפריט' + ]) + + for promo in promotions: + promos_writer.writerows( + [[promo.content, + item.name, + item.price, + promo.start_date, + promo.end_date, + promo.update_date, + item.manufacturer, + item.code] + for item in promo.items] + ) + + def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bool, load_promos) -> List[Promotion]: """ This function return the available promotions given a BeautifulSoup object. @@ -52,7 +89,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo :param load_prices: A boolean representing whether to load an existing xml or load an already saved one :return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available """ - items_dict: Dict[str, str] = create_items_dict(chain, load_prices, store_id) + items_dict: Dict[str, Item] = create_items_dict(chain, load_prices, store_id) xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PromosFull.name) bs_promos = create_bs_object(xml_path, chain, store_id, load_promos, chain.XMLFilesCategory.PromosFull) @@ -78,10 +115,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo def is_valid_promo(promo: Promotion): """ - This function checks if a given promo object is valid. - - :param promo: A given promotion - :return: True iff the given Promotion is valid. + This function returns whether a given Promotion object is currently valid. """ today_date: datetime = datetime.now() not_expired: bool = promo.end_date >= today_date @@ -93,7 +127,7 @@ def is_valid_promo(promo: Promotion): def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain): """ - This function logs the available promos in a store with a given id sorted by their update date. + This function logs the available promotions in a store with a given id sorted by their update date. :param chain: The name of the requested supermarket chain :param store_id: A given store id @@ -105,6 +139,7 @@ def main_latest_promos(store_id: int, load_xml: bool, logger, chain: Supermarket promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date - promo.end_date), reverse=True) logger.info('\n'.join(str(promotion) for promotion in promotions)) + write_promotions_to_csv(promotions, f'results/{chain}_promos_{store_id}.csv') def get_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str, load_prices: bool, load_promos: bool): @@ -124,7 +159,10 @@ def get_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str, def get_all_null_items_in_promos(chain, store_id): - items_dict: Dict[str, str] = create_items_dict(chain, True, store_id) + """ + This function finds all items appearing in the chain's promotions file but not in the chain's prices file. + """ + items_dict: Dict[str, Item] = create_items_dict(chain, True, store_id) xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PromosFull.name) bs_promos = create_bs_object(xml_path, chain, store_id, True, chain.XMLFilesCategory.PromosFull) diff --git a/shufersal.py b/shufersal.py index d3bd0f7..97ed15c 100644 --- a/shufersal.py +++ b/shufersal.py @@ -2,6 +2,8 @@ from typing import Dict, List import requests from bs4 import BeautifulSoup from bs4.element import Tag + +from item import Item from supermarket_chain import SupermarketChain @@ -30,7 +32,7 @@ class ShuferSal(SupermarketChain): return 'Shufersal' @staticmethod - def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]: + def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]: items = list() for item in promo.find_all('Item'): item_code = item.find('ItemCode').text diff --git a/supermarket_chain.py b/supermarket_chain.py index 0b0cafc..dc3c58a 100644 --- a/supermarket_chain.py +++ b/supermarket_chain.py @@ -4,6 +4,8 @@ from argparse import ArgumentTypeError from typing import Dict, List from bs4.element import Tag +from item import Item + class SupermarketChain: """ @@ -72,17 +74,20 @@ class SupermarketChain: @staticmethod @abstractmethod - def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]: + def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]: """ - This method returns a list of the items that participate in a given promo + This method returns a list of the items that participate in a given promotion. - :param promo: A given promo + :param promo: A given promotion :param items_dict: A given dictionary of products """ pass @staticmethod - def get_null_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]: + def get_null_items(promo: Tag, items_dict: Dict[str, Item]) -> List[str]: + """ + This function returns all the items in a given promotion which do not appear in the given items_dict. + """ return [item.find('ItemCode').text for item in promo.find_all('Item') if not items_dict.get(item.find('ItemCode').text)] diff --git a/utils.py b/utils.py index 6d7c684..1397f86 100644 --- a/utils.py +++ b/utils.py @@ -1,11 +1,13 @@ import gzip import io import zipfile -from typing import AnyStr, Dict, List +from typing import AnyStr, Dict import requests from bs4 import BeautifulSoup from bs4.element import Tag from os import path + +from item import Item from supermarket_chain import SupermarketChain import re @@ -83,26 +85,29 @@ def create_bs_object_from_xml(xml_path: str) -> BeautifulSoup: return BeautifulSoup(f_in, features='xml') -def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[str, str]: +def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[str, Item]: """ - This function creates a dictionary where every key is an item code and its value is the item's name and price. + This function creates a dictionary where every key is an item code and its value is its corresponding Item instance. :param chain: A given supermarket chain :param load_xml: A boolean representing whether to load an existing prices xml file :param store_id: A given store id - :return: A dictionary where the firs """ xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name) bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull) return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)} -def get_item_info(item: Tag) -> List[str]: +def get_item_info(item: Tag) -> Item: """ This function returns a string containing important information about a given supermarket's product. """ - return [item.find(re.compile(r'ItemN[a]?m[e]?')).text, item.find(re.compile(r'Manufacture[r]?Name')).text, - item.find('ItemPrice').text, item.find('ItemCode').text] + return Item( + name=item.find(re.compile(r'ItemN[a]?m[e]?')).text, + price=item.find('ItemPrice').text, + manufacturer=item.find(re.compile(r'Manufacture[r]?Name')).text, + code=item.find('ItemCode').text + ) def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None: diff --git a/zol_vebegadol.py b/zol_vebegadol.py index eb86bb0..ccceeaa 100644 --- a/zol_vebegadol.py +++ b/zol_vebegadol.py @@ -3,6 +3,7 @@ from typing import Dict, List import requests from bs4.element import Tag +from item import Item from supermarket_chain import SupermarketChain @@ -32,7 +33,7 @@ class ZolVebegadol(SupermarketChain): return down_url @staticmethod - def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]: + def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]: items = list() for item in promo.find_all('Item'): item_code = item.find('ItemCode').text