diff --git a/README.md b/README.md index 9798b6b..7642e6b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Supermarket basic scraping -The library supports scraping from Shufersal, Co-Op and Zol Vebegadol. +The library supports scraping from Shufersal, CoOp and Zol Vebegadol. ## Installation clone: diff --git a/co_op.py b/co_op.py index eecbc59..900676a 100644 --- a/co_op.py +++ b/co_op.py @@ -8,20 +8,25 @@ from supermarket_chain import SupermarketChain class CoOp(SupermarketChain): + promotion_tag_name = 'Sale' promotion_update_tag_name = 'PriceUpdateDate' date_format = '%Y/%m/%d' date_hour_format = '%Y/%m/%d %H:%M:%S' item_tag_name = 'Product' + @property + def update_date_format(self): + return CoOp.date_hour_format + @staticmethod - def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str: + def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str: prefix = "http://matrixcatalog.co.il/" url = prefix + "NBCompetitionRegulations.aspx" req_res: requests.Response = requests.get(url) soup = BeautifulSoup(req_res.text, features='lxml') suffix: str = soup.find('a', href=lambda value: value and category.name.replace('s', '') in value - and f'-{store_id:03d}-20' in value).attrs['href'] + and f'-{store_id:03d}-20' in value).attrs['href'] down_url = prefix + suffix print(down_url) return down_url @@ -30,7 +35,7 @@ class CoOp(SupermarketChain): All, Promos, PromosFull, Prices, PricesFull, Stores = range(6) def __repr__(self): - return 'Co-Op' + return 'CoOp' @staticmethod def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]: diff --git a/main.py b/main.py index b42d965..8a408a2 100644 --- a/main.py +++ b/main.py @@ -7,6 +7,7 @@ from supermarket_chain import SupermarketChain from shufersal import ShuferSal from co_op import CoOp from zol_vebegadol import ZolVebegadol +from rami_levi import RamiLevi from pathlib import Path # TODO: fix problem of left-to-right printing @@ -16,8 +17,9 @@ Path(RAW_FILES_DIRNAME).mkdir(exist_ok=True) chain_dict = { 'Shufersal': ShuferSal(), - 'Co-Op': CoOp(), - 'Zol-Vebegadol': ZolVebegadol() + 'CoOp': CoOp(), + 'Zol-Vebegadol': ZolVebegadol(), + 'RamiLevi': RamiLevi(), } if __name__ == '__main__': @@ -75,7 +77,8 @@ if __name__ == '__main__': handler = logging.FileHandler(filename=f'{RESULTS_DIRNAME}/{args.chain}_promos_{arg_store_id}.log', mode='w', encoding='utf-8') logger.addHandler(handler) - main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain) + main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain, + load_promos=args.load_promos) elif args.price: get_products_prices(chain, store_id=args.price[0], load_xml=args.load_prices, product_name=args.price[1]) diff --git a/promotion.py b/promotion.py index 7cae83a..118e400 100644 --- a/promotion.py +++ b/promotion.py @@ -10,7 +10,7 @@ from utils import ( ) from supermarket_chain import SupermarketChain -PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'צלחות', 'כוסות', 'מאגים', 'מגבת', 'מפות', 'פסטיגל'] +PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'מגבת', 'מפות', 'פסטיגל', 'ביגי'] class Promotion: @@ -102,7 +102,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo chain.date_hour_format), end_date=datetime.strptime(promo.find( 'PromotionEndDate').text + ' ' + promo.find('PromotionEndHour').text, chain.date_hour_format), - update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.date_hour_format), + update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.update_date_format), items=chain.get_items(promo, items_dict), ) if is_valid_promo(promo): @@ -125,7 +125,7 @@ def is_valid_promo(promo: Promotion): return not_expired and has_started and has_products and not in_promo_ignore_list -def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain): +def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain, load_promos: bool): """ This function logs the available promotions in a store with a given id sorted by their update date. @@ -135,7 +135,7 @@ def main_latest_promos(store_id: int, load_xml: bool, logger, chain: Supermarket :param logger: A given logger """ - promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, False) + promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, load_promos) promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date - promo.end_date), reverse=True) logger.info('\n'.join(str(promotion) for promotion in promotions)) diff --git a/rami_levi.py b/rami_levi.py new file mode 100644 index 0000000..9106843 --- /dev/null +++ b/rami_levi.py @@ -0,0 +1,66 @@ +import json +from typing import Dict, List +import requests +from bs4.element import Tag + +from item import Item +from supermarket_chain import SupermarketChain + + +class RamiLevi(SupermarketChain): + @property + def promotion_tag_name(self): + return 'Promotion' + + @property + def promotion_update_tag_name(self): + return 'PromotionUpdateDate' + + @property + def date_format(self): + return '%Y-%m-%d' + + @property + def date_hour_format(self): + return '%Y-%m-%d %H:%M:%S' + + @property + def update_date_format(self): + return '%Y-%m-%d %H:%M' + + @property + def item_tag_name(self): + return 'Item' + + @staticmethod + def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str: + hostname = "https://publishedprices.co.il" + + # Post the payload to the site to log in + session.post(hostname + "/login/user", data={'username': 'ramilevi'}) + + # Scrape the data + ajax_dir_payload = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')} + s = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload) + s_json = json.loads(s.text) + suffix = next(d['name'] for d in s_json['aaData'] if f'-{store_id:03d}-20' in d['name']) + + download_url = hostname + "/file/d/" + suffix + print(download_url) + return download_url + + @staticmethod + def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]: + items = list() + for item in promo.find_all('Item'): + item_code = item.find('ItemCode').text + full_item_info = items_dict.get(item_code) + if full_item_info: + items.append(full_item_info) + return items + + class XMLFilesCategory(SupermarketChain.XMLFilesCategory): + All, Promos, PromosFull, Prices, PricesFull, Stores = range(6) + + def __repr__(self): + return 'RamiLevi' diff --git a/shufersal.py b/shufersal.py index 97ed15c..5656fbc 100644 --- a/shufersal.py +++ b/shufersal.py @@ -14,8 +14,12 @@ class ShuferSal(SupermarketChain): date_hour_format = '%Y-%m-%d %H:%M' item_tag_name = 'Item' + @property + def update_date_format(self): + return ShuferSal.date_hour_format + @staticmethod - def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str: + def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str: url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={category.value}" if SupermarketChain.is_valid_store_id(int(store_id)): url += f"&storeId={store_id}" diff --git a/supermarket_chain.py b/supermarket_chain.py index dc3c58a..cd989c3 100644 --- a/supermarket_chain.py +++ b/supermarket_chain.py @@ -1,7 +1,10 @@ +import re from abc import abstractmethod from enum import Enum from argparse import ArgumentTypeError from typing import Dict, List + +import requests from bs4.element import Tag from item import Item @@ -35,6 +38,10 @@ class SupermarketChain: @abstractmethod def date_hour_format(self): pass + @property + @abstractmethod + def update_date_format(self): pass + @property @abstractmethod def item_tag_name(self): pass @@ -62,10 +69,11 @@ class SupermarketChain: @staticmethod @abstractmethod - def get_download_url(store_id: int, category: XMLFilesCategory) -> str: + def get_download_url(store_id: int, category: XMLFilesCategory, session: requests.Session) -> str: """ This method scrapes supermarket's website and returns a url containing the data for a given store and category. + :param session: :param store_id: A given id of a store :param category: A given category :return: A downloadable link of the data for a given store and category @@ -91,6 +99,18 @@ class SupermarketChain: return [item.find('ItemCode').text for item in promo.find_all('Item') if not items_dict.get(item.find('ItemCode').text)] + @staticmethod + def get_item_info(item: Tag) -> Item: + """ + This function returns a string containing important information about a given supermarket's product. + """ + return Item( + name=item.find(re.compile(r'ItemN[a]?m[e]?')).text, + price=item.find('ItemPrice').text, + manufacturer=item.find(re.compile(r'Manufacture[r]?Name')).text, + code=item.find('ItemCode').text + ) + @abstractmethod def __repr__(self): pass diff --git a/utils.py b/utils.py index 1397f86..70726b6 100644 --- a/utils.py +++ b/utils.py @@ -4,12 +4,10 @@ import zipfile from typing import AnyStr, Dict import requests from bs4 import BeautifulSoup -from bs4.element import Tag from os import path from item import Item from supermarket_chain import SupermarketChain -import re RESULTS_DIRNAME = "results" RAW_FILES_DIRNAME = "raw_files" @@ -60,8 +58,9 @@ def create_bs_object_from_link(xml_path: str, chain: SupermarketChain, category: :param category: A given category :return: A BeautifulSoup object with xml content. """ - download_url: str = chain.get_download_url(store_id, category) - response_content = requests.get(download_url).content + session = requests.Session() + download_url: str = chain.get_download_url(store_id, category, session) + response_content = session.get(download_url).content try: xml_content: AnyStr = gzip.decompress(response_content) except gzip.BadGzipFile: @@ -95,19 +94,7 @@ def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[ """ xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name) bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull) - return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)} - - -def get_item_info(item: Tag) -> Item: - """ - This function returns a string containing important information about a given supermarket's product. - """ - return Item( - name=item.find(re.compile(r'ItemN[a]?m[e]?')).text, - price=item.find('ItemPrice').text, - manufacturer=item.find(re.compile(r'Manufacture[r]?Name')).text, - code=item.find('ItemCode').text - ) + return {item.find('ItemCode').text: chain.get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)} def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None: @@ -124,5 +111,10 @@ def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, prods = [item for item in bs_prices.find_all("Item") if product_name in item.find("ItemName").text] prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text)) for prod in prods: - print((prod.find('ItemName').text[::-1], prod.find('ManufacturerName').text[::-1], - prod.find('ItemPrice').text)) + print( + ( + prod.find('ItemName').text[::-1], + prod.find('ManufacturerName').text[::-1], + prod.find('ItemPrice').text + ) + ) diff --git a/zol_vebegadol.py b/zol_vebegadol.py index ccceeaa..3879a57 100644 --- a/zol_vebegadol.py +++ b/zol_vebegadol.py @@ -20,8 +20,12 @@ class ZolVebegadol(SupermarketChain): date_hour_format = '%Y-%m-%d %H:%M:%S' item_tag_name = 'Item' + @property + def update_date_format(self): + return ZolVebegadol.date_hour_format + @staticmethod - def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str: + def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session:requests.Session) -> str: prefix = "http://zolvebegadol.binaprojects.com" url = prefix + "/MainIO_Hok.aspx" req_res: requests.Response = requests.get(url)