has added RamiLevi to the chains collection

2021-02-06 14:41:04 +02:00
parent d7e5b709f8
commit 3a57edf5af
9 changed files with 127 additions and 33 deletions
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # Supermarket basic scraping
-The library supports scraping from Shufersal, Co-Op and Zol Vebegadol.
+The library supports scraping from Shufersal, CoOp and Zol Vebegadol.
 ## Installation
 clone:
--- a/co_op.py
+++ b/co_op.py
@@ -8,20 +8,25 @@ from supermarket_chain import SupermarketChain
 class CoOp(SupermarketChain):
    promotion_tag_name = 'Sale'
    promotion_update_tag_name = 'PriceUpdateDate'
    date_format = '%Y/%m/%d'
    date_hour_format = '%Y/%m/%d %H:%M:%S'
    item_tag_name = 'Product'
    @property
    def update_date_format(self):
        return CoOp.date_hour_format
    @staticmethod
-    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
+    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
        prefix = "http://matrixcatalog.co.il/"
        url = prefix + "NBCompetitionRegulations.aspx"
        req_res: requests.Response = requests.get(url)
        soup = BeautifulSoup(req_res.text, features='lxml')
        suffix: str = soup.find('a', href=lambda value: value and category.name.replace('s', '') in value
-                                and f'-{store_id:03d}-20' in value).attrs['href']
+                                                        and f'-{store_id:03d}-20' in value).attrs['href']
        down_url = prefix + suffix
        print(down_url)
        return down_url
@@ -30,7 +35,7 @@ class CoOp(SupermarketChain):
        All, Promos, PromosFull, Prices, PricesFull, Stores = range(6)
    def __repr__(self):
-        return 'Co-Op'
+        return 'CoOp'
    @staticmethod
    def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
--- a/main.py
+++ b/main.py
@@ -7,6 +7,7 @@ from supermarket_chain import SupermarketChain
 from shufersal import ShuferSal
 from co_op import CoOp
 from zol_vebegadol import ZolVebegadol
 from rami_levi import RamiLevi
 from pathlib import Path
 # TODO: fix problem of left-to-right printing
@@ -16,8 +17,9 @@ Path(RAW_FILES_DIRNAME).mkdir(exist_ok=True)
 chain_dict = {
    'Shufersal': ShuferSal(),
-    'Co-Op': CoOp(),
+    'CoOp': CoOp(),
-    'Zol-Vebegadol': ZolVebegadol()
+    'Zol-Vebegadol': ZolVebegadol(),
    'RamiLevi': RamiLevi(),
 }
 if __name__ == '__main__':
@@ -75,7 +77,8 @@ if __name__ == '__main__':
        handler = logging.FileHandler(filename=f'{RESULTS_DIRNAME}/{args.chain}_promos_{arg_store_id}.log', mode='w',
                                      encoding='utf-8')
        logger.addHandler(handler)
-        main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain)
+        main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain,
                           load_promos=args.load_promos)
    elif args.price:
        get_products_prices(chain, store_id=args.price[0], load_xml=args.load_prices, product_name=args.price[1])
--- a/promotion.py
+++ b/promotion.py
@@ -10,7 +10,7 @@ from utils import (
 )
 from supermarket_chain import SupermarketChain
-PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'צלחות', 'כוסות', 'מאגים', 'מגבת', 'מפות', 'פסטיגל']
+PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'מגבת', 'מפות', 'פסטיגל', 'ביגי']
 class Promotion:
@@ -102,7 +102,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo
                chain.date_hour_format),
            end_date=datetime.strptime(promo.find(
                'PromotionEndDate').text + ' ' + promo.find('PromotionEndHour').text, chain.date_hour_format),
-            update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.date_hour_format),
+            update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.update_date_format),
            items=chain.get_items(promo, items_dict),
        )
        if is_valid_promo(promo):
@@ -125,7 +125,7 @@ def is_valid_promo(promo: Promotion):
    return not_expired and has_started and has_products and not in_promo_ignore_list
-def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain):
+def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain, load_promos: bool):
    """
    This function logs the available promotions in a store with a given id sorted by their update date.
@@ -135,7 +135,7 @@ def main_latest_promos(store_id: int, load_xml: bool, logger, chain: Supermarket
    :param logger: A given logger
    """
-    promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, False)
+    promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, load_promos)
    promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date -
                                       promo.end_date), reverse=True)
    logger.info('\n'.join(str(promotion) for promotion in promotions))
--- a/rami_levi.py
+++ b/rami_levi.py
@@ -0,0 +1,66 @@
 import json
 from typing import Dict, List
 import requests
 from bs4.element import Tag
 from item import Item
 from supermarket_chain import SupermarketChain
 class RamiLevi(SupermarketChain):
    """
    Scraping configuration for the Rami Levi chain.

    Supplies the XML tag names and the strptime-style date formats used when parsing the chain's files, and
    resolves download links through https://publishedprices.co.il.
    """

    @property
    def promotion_tag_name(self):
        """Name of the promotion tag in the chain's XML files."""
        return 'Promotion'

    @property
    def promotion_update_tag_name(self):
        """Name of the tag, inside a promotion, that holds the promotion's update date."""
        return 'PromotionUpdateDate'

    @property
    def date_format(self):
        """Format of dates given without an hour."""
        return '%Y-%m-%d'

    @property
    def date_hour_format(self):
        """Format of a date followed by an hour, including seconds."""
        return '%Y-%m-%d %H:%M:%S'

    @property
    def update_date_format(self):
        """Format of the promotion update date; unlike date_hour_format it carries no seconds."""
        return '%Y-%m-%d %H:%M'

    @property
    def item_tag_name(self):
        """Name of the tag holding a single product in the chain's XML files."""
        return 'Item'

    @staticmethod
    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
        """
        This method logs in to the published-prices site and returns a url containing the data for a given store
        and category.

        :param store_id: A given id of a store
        :param category: A given category
        :param session: The session through which the login and the file-listing requests are sent
        :return: A downloadable link of the data for a given store and category
        :raises ValueError: If none of the listed files matches the given store
        """
        hostname = "https://publishedprices.co.il"

        # Post the payload to the site to log in
        session.post(hostname + "/login/user", data={'username': 'ramilevi'})

        # Ask the site's file listing for the files matching the category.
        # Every lowercase 's' is removed from the category name to build the search term (e.g. 'Promos' -> 'Promo').
        ajax_dir_payload = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')}
        listing = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload)
        listed_files = json.loads(listing.text)['aaData']

        # The first listed file whose name contains '-<3-digit store id>-20' is taken.
        store_marker = f'-{store_id:03d}-20'
        suffix = next((entry['name'] for entry in listed_files if store_marker in entry['name']), None)
        if suffix is None:
            # Fail with context instead of leaking a bare StopIteration out of next()
            raise ValueError(f'No {category.name} file was found for store {store_id}')

        download_url = hostname + "/file/d/" + suffix
        print(download_url)
        return download_url

    @staticmethod
    def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
        """
        This method returns the items of a given promotion that are known in a given items dictionary.

        :param promo: A given promotion tag
        :param items_dict: A mapping from an item's code to its Item
        :return: The Items of the promotion's item codes, skipping codes with no (truthy) entry in items_dict
        """
        item_codes = (item.find('ItemCode').text for item in promo.find_all('Item'))
        looked_up = (items_dict.get(item_code) for item_code in item_codes)
        return [full_item_info for full_item_info in looked_up if full_item_info]

    class XMLFilesCategory(SupermarketChain.XMLFilesCategory):
        # The member names are used (with every 's' removed) as the search term in get_download_url
        All, Promos, PromosFull, Prices, PricesFull, Stores = range(6)

    def __repr__(self):
        return 'RamiLevi'
--- a/shufersal.py
+++ b/shufersal.py
@@ -14,8 +14,12 @@ class ShuferSal(SupermarketChain):
    date_hour_format = '%Y-%m-%d %H:%M'
    item_tag_name = 'Item'
    @property
    def update_date_format(self):
        return ShuferSal.date_hour_format
    @staticmethod
-    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
+    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
        url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={category.value}"
        if SupermarketChain.is_valid_store_id(int(store_id)):
            url += f"&storeId={store_id}"
--- a/supermarket_chain.py
+++ b/supermarket_chain.py
@@ -1,7 +1,10 @@
 import re
 from abc import abstractmethod
 from enum import Enum
 from argparse import ArgumentTypeError
 from typing import Dict, List
 import requests
 from bs4.element import Tag
 from item import Item
@@ -35,6 +38,10 @@ class SupermarketChain:
    @abstractmethod
    def date_hour_format(self): pass
    @property
    @abstractmethod
    def update_date_format(self): pass
    @property
    @abstractmethod
    def item_tag_name(self): pass
@@ -62,10 +69,11 @@ class SupermarketChain:
    @staticmethod
    @abstractmethod
-    def get_download_url(store_id: int, category: XMLFilesCategory) -> str:
+    def get_download_url(store_id: int, category: XMLFilesCategory, session: requests.Session) -> str:
        """
        This method scrapes supermarket's website and returns a url containing the data for a given store and category.
        :param session: A requests session that implementations may use for their requests (e.g. to log in first)
        :param store_id: A given id of a store
        :param category: A given category
        :return: A downloadable link of the data for a given store and category
@@ -91,6 +99,18 @@ class SupermarketChain:
        return [item.find('ItemCode').text for item in promo.find_all('Item')
                if not items_dict.get(item.find('ItemCode').text)]
    @staticmethod
    def get_item_info(item: Tag) -> Item:
        """
        This function builds an Item out of the tag of a given supermarket's product.

        :param item: A given product tag
        :return: An Item holding the text of the product's name, price, manufacturer and code tags
        """
        # The name and manufacturer tags are located by regex, so spelling variants of the tag names
        # (e.g. 'ItemNm', 'ManufactureName') are matched as well.
        name_pattern = re.compile(r'ItemN[a]?m[e]?')
        manufacturer_pattern = re.compile(r'Manufacture[r]?Name')

        name = item.find(name_pattern).text
        price = item.find('ItemPrice').text
        manufacturer = item.find(manufacturer_pattern).text
        code = item.find('ItemCode').text
        return Item(name=name, price=price, manufacturer=manufacturer, code=code)
    @abstractmethod
    def __repr__(self):
        pass
--- a/utils.py
+++ b/utils.py
@@ -4,12 +4,10 @@ import zipfile
 from typing import AnyStr, Dict
 import requests
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from os import path
 from item import Item
 from supermarket_chain import SupermarketChain
 import re
 RESULTS_DIRNAME = "results"
 RAW_FILES_DIRNAME = "raw_files"
@@ -60,8 +58,9 @@ def create_bs_object_from_link(xml_path: str, chain: SupermarketChain, category:
    :param category: A given category
    :return: A BeautifulSoup object with xml content.
    """
-    download_url: str = chain.get_download_url(store_id, category)
+    session = requests.Session()
-    response_content = requests.get(download_url).content
+    download_url: str = chain.get_download_url(store_id, category, session)
    response_content = session.get(download_url).content
    try:
        xml_content: AnyStr = gzip.decompress(response_content)
    except gzip.BadGzipFile:
@@ -95,19 +94,7 @@ def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[
    """
    xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
    bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
-    return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
+    return {item.find('ItemCode').text: chain.get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
 def get_item_info(item: Tag) -> Item:
    """
    This function collects the important information about a given supermarket's product into an Item.

    :param item: A given product tag
    :return: An Item built from the text of the product's name, price, manufacturer and code tags
    """
    # Regex lookups tolerate spelling variants of the tag names (e.g. 'ItemNm', 'ManufactureName').
    name_tag = item.find(re.compile(r'ItemN[a]?m[e]?'))
    name = name_tag.text
    price_tag = item.find('ItemPrice')
    price = price_tag.text
    manufacturer_tag = item.find(re.compile(r'Manufacture[r]?Name'))
    manufacturer = manufacturer_tag.text
    code_tag = item.find('ItemCode')
    code = code_tag.text
    return Item(name=name, price=price, manufacturer=manufacturer, code=code)
 def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None:
@@ -124,5 +111,10 @@ def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool,
    prods = [item for item in bs_prices.find_all("Item") if product_name in item.find("ItemName").text]
    prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text))
    for prod in prods:
-        print((prod.find('ItemName').text[::-1], prod.find('ManufacturerName').text[::-1],
+        print(
-               prod.find('ItemPrice').text))
+            (
                prod.find('ItemName').text[::-1],
                prod.find('ManufacturerName').text[::-1],
                prod.find('ItemPrice').text
            )
        )
--- a/zol_vebegadol.py
+++ b/zol_vebegadol.py
@@ -20,8 +20,12 @@ class ZolVebegadol(SupermarketChain):
    date_hour_format = '%Y-%m-%d %H:%M:%S'
    item_tag_name = 'Item'
    @property
    def update_date_format(self):
        return ZolVebegadol.date_hour_format
    @staticmethod
-    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
+    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session:requests.Session) -> str:
        prefix = "http://zolvebegadol.binaprojects.com"
        url = prefix + "/MainIO_Hok.aspx"
        req_res: requests.Response = requests.get(url)