has added RamiLevi to the chains collection

This commit is contained in:
KorenLazar
2021-02-06 14:41:04 +02:00
parent d7e5b709f8
commit 3a57edf5af
9 changed files with 127 additions and 33 deletions

View File

@@ -1,5 +1,5 @@
# Supermarket basic scraping # Supermarket basic scraping
The library supports scraping from Shufersal, Co-Op and Zol Vebegadol. The library supports scraping from Shufersal, CoOp and Zol Vebegadol.
## Installation ## Installation
clone: clone:

View File

@@ -8,20 +8,25 @@ from supermarket_chain import SupermarketChain
class CoOp(SupermarketChain): class CoOp(SupermarketChain):
promotion_tag_name = 'Sale' promotion_tag_name = 'Sale'
promotion_update_tag_name = 'PriceUpdateDate' promotion_update_tag_name = 'PriceUpdateDate'
date_format = '%Y/%m/%d' date_format = '%Y/%m/%d'
date_hour_format = '%Y/%m/%d %H:%M:%S' date_hour_format = '%Y/%m/%d %H:%M:%S'
item_tag_name = 'Product' item_tag_name = 'Product'
@property
def update_date_format(self):
    """Format of the promotion update-date text; CoOp reuses its date-and-hour format."""
    return CoOp.date_hour_format
@staticmethod @staticmethod
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str: def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
prefix = "http://matrixcatalog.co.il/" prefix = "http://matrixcatalog.co.il/"
url = prefix + "NBCompetitionRegulations.aspx" url = prefix + "NBCompetitionRegulations.aspx"
req_res: requests.Response = requests.get(url) req_res: requests.Response = requests.get(url)
soup = BeautifulSoup(req_res.text, features='lxml') soup = BeautifulSoup(req_res.text, features='lxml')
suffix: str = soup.find('a', href=lambda value: value and category.name.replace('s', '') in value suffix: str = soup.find('a', href=lambda value: value and category.name.replace('s', '') in value
and f'-{store_id:03d}-20' in value).attrs['href'] and f'-{store_id:03d}-20' in value).attrs['href']
down_url = prefix + suffix down_url = prefix + suffix
print(down_url) print(down_url)
return down_url return down_url
@@ -30,7 +35,7 @@ class CoOp(SupermarketChain):
All, Promos, PromosFull, Prices, PricesFull, Stores = range(6) All, Promos, PromosFull, Prices, PricesFull, Stores = range(6)
def __repr__(self): def __repr__(self):
return 'Co-Op' return 'CoOp'
@staticmethod @staticmethod
def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]: def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:

View File

@@ -7,6 +7,7 @@ from supermarket_chain import SupermarketChain
from shufersal import ShuferSal from shufersal import ShuferSal
from co_op import CoOp from co_op import CoOp
from zol_vebegadol import ZolVebegadol from zol_vebegadol import ZolVebegadol
from rami_levi import RamiLevi
from pathlib import Path from pathlib import Path
# TODO: fix problem of left-to-right printing # TODO: fix problem of left-to-right printing
@@ -16,8 +17,9 @@ Path(RAW_FILES_DIRNAME).mkdir(exist_ok=True)
chain_dict = { chain_dict = {
'Shufersal': ShuferSal(), 'Shufersal': ShuferSal(),
'Co-Op': CoOp(), 'CoOp': CoOp(),
'Zol-Vebegadol': ZolVebegadol() 'Zol-Vebegadol': ZolVebegadol(),
'RamiLevi': RamiLevi(),
} }
if __name__ == '__main__': if __name__ == '__main__':
@@ -75,7 +77,8 @@ if __name__ == '__main__':
handler = logging.FileHandler(filename=f'{RESULTS_DIRNAME}/{args.chain}_promos_{arg_store_id}.log', mode='w', handler = logging.FileHandler(filename=f'{RESULTS_DIRNAME}/{args.chain}_promos_{arg_store_id}.log', mode='w',
encoding='utf-8') encoding='utf-8')
logger.addHandler(handler) logger.addHandler(handler)
main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain) main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain,
load_promos=args.load_promos)
elif args.price: elif args.price:
get_products_prices(chain, store_id=args.price[0], load_xml=args.load_prices, product_name=args.price[1]) get_products_prices(chain, store_id=args.price[0], load_xml=args.load_prices, product_name=args.price[1])

View File

@@ -10,7 +10,7 @@ from utils import (
) )
from supermarket_chain import SupermarketChain from supermarket_chain import SupermarketChain
PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'צלחות', 'כוסות', 'מאגים', 'מגבת', 'מפות', 'פסטיגל'] PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'מגבת', 'מפות', 'פסטיגל', 'ביגי']
class Promotion: class Promotion:
@@ -102,7 +102,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo
chain.date_hour_format), chain.date_hour_format),
end_date=datetime.strptime(promo.find( end_date=datetime.strptime(promo.find(
'PromotionEndDate').text + ' ' + promo.find('PromotionEndHour').text, chain.date_hour_format), 'PromotionEndDate').text + ' ' + promo.find('PromotionEndHour').text, chain.date_hour_format),
update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.date_hour_format), update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.update_date_format),
items=chain.get_items(promo, items_dict), items=chain.get_items(promo, items_dict),
) )
if is_valid_promo(promo): if is_valid_promo(promo):
@@ -125,7 +125,7 @@ def is_valid_promo(promo: Promotion):
return not_expired and has_started and has_products and not in_promo_ignore_list return not_expired and has_started and has_products and not in_promo_ignore_list
def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain): def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain, load_promos: bool):
""" """
This function logs the available promotions in a store with a given id sorted by their update date. This function logs the available promotions in a store with a given id sorted by their update date.
@@ -135,7 +135,7 @@ def main_latest_promos(store_id: int, load_xml: bool, logger, chain: Supermarket
:param logger: A given logger :param logger: A given logger
""" """
promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, False) promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, load_promos)
promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date - promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date -
promo.end_date), reverse=True) promo.end_date), reverse=True)
logger.info('\n'.join(str(promotion) for promotion in promotions)) logger.info('\n'.join(str(promotion) for promotion in promotions))

66
rami_levi.py Normal file
View File

@@ -0,0 +1,66 @@
import json
from typing import Dict, List
import requests
from bs4.element import Tag
from item import Item
from supermarket_chain import SupermarketChain
class RamiLevi(SupermarketChain):
    """
    Rami Levi chain: XML tag names, date formats and the lookup of downloadable data files.
    """

    @property
    def promotion_tag_name(self):
        """XML tag name of a promotion."""
        return 'Promotion'

    @property
    def promotion_update_tag_name(self):
        """XML tag name holding a promotion's update date."""
        return 'PromotionUpdateDate'

    @property
    def date_format(self):
        """strptime-style format of a date without a time part."""
        return '%Y-%m-%d'

    @property
    def date_hour_format(self):
        """strptime-style format of a date followed by hours, minutes and seconds."""
        return '%Y-%m-%d %H:%M:%S'

    @property
    def update_date_format(self):
        """strptime-style format of the update date, which has no seconds part."""
        return '%Y-%m-%d %H:%M'

    @property
    def item_tag_name(self):
        """XML tag name of a single item."""
        return 'Item'

    @staticmethod
    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
        """
        This method logs in to the published-prices site, lists the files of the given category and returns a url
        of the first listed file that belongs to the given store.

        :param store_id: A given id of a store (an int, or a numeric string)
        :param category: A given category
        :param session: The session through which the login and the listing requests are sent
        :return: A downloadable link of the data for a given store and category
        :raises ValueError: If no listed file matches the given store and category
        """
        hostname = "https://publishedprices.co.il"
        # Log in first; the listing request below goes through the same session.
        session.post(hostname + "/login/user", data={'username': 'ramilevi'})
        # The listing is searched by the category name without its 's' (e.g. 'PromosFull' -> 'PromoFull').
        ajax_dir_payload = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')}
        listing = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload)
        listed_files = json.loads(listing.text)['aaData']
        # NOTE(review): '-20' presumably matches the start of a 20xx timestamp in the file name - confirm.
        store_marker = f'-{int(store_id):03d}-20'
        # A default is passed to next() so that a missing file does not leak a bare StopIteration.
        suffix = next((entry['name'] for entry in listed_files if store_marker in entry['name']), None)
        if suffix is None:
            raise ValueError(f'No {category.name} file was found for store {store_id} at {hostname}')
        download_url = hostname + "/file/d/" + suffix
        print(download_url)
        return download_url

    @staticmethod
    def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
        """
        This method collects the known items that take part in a given promotion.

        :param promo: A given promotion tag
        :param items_dict: A dictionary that maps an item code to its Item
        :return: The Items of the promotion whose codes appear in items_dict (unknown codes are skipped)
        """
        item_codes = (item.find('ItemCode').text for item in promo.find_all('Item'))
        return [items_dict[item_code] for item_code in item_codes if items_dict.get(item_code)]

    class XMLFilesCategory(SupermarketChain.XMLFilesCategory):
        All, Promos, PromosFull, Prices, PricesFull, Stores = range(6)

    def __repr__(self):
        return 'RamiLevi'

View File

@@ -14,8 +14,12 @@ class ShuferSal(SupermarketChain):
date_hour_format = '%Y-%m-%d %H:%M' date_hour_format = '%Y-%m-%d %H:%M'
item_tag_name = 'Item' item_tag_name = 'Item'
@property
def update_date_format(self):
    """Format of the promotion update-date text; ShuferSal reuses its date-and-hour format."""
    return ShuferSal.date_hour_format
@staticmethod @staticmethod
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str: def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={category.value}" url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={category.value}"
if SupermarketChain.is_valid_store_id(int(store_id)): if SupermarketChain.is_valid_store_id(int(store_id)):
url += f"&storeId={store_id}" url += f"&storeId={store_id}"

View File

@@ -1,7 +1,10 @@
import re
from abc import abstractmethod from abc import abstractmethod
from enum import Enum from enum import Enum
from argparse import ArgumentTypeError from argparse import ArgumentTypeError
from typing import Dict, List from typing import Dict, List
import requests
from bs4.element import Tag from bs4.element import Tag
from item import Item from item import Item
@@ -35,6 +38,10 @@ class SupermarketChain:
@abstractmethod @abstractmethod
def date_hour_format(self): pass def date_hour_format(self): pass
@property
@abstractmethod
def update_date_format(self):
    """The strptime-style format of a promotion's update-date text; every chain must provide it."""
@property @property
@abstractmethod @abstractmethod
def item_tag_name(self): pass def item_tag_name(self): pass
@@ -62,10 +69,11 @@ class SupermarketChain:
@staticmethod @staticmethod
@abstractmethod @abstractmethod
def get_download_url(store_id: int, category: XMLFilesCategory) -> str: def get_download_url(store_id: int, category: XMLFilesCategory, session: requests.Session) -> str:
""" """
This method scrapes supermarket's website and returns a url containing the data for a given store and category. This method scrapes supermarket's website and returns a url containing the data for a given store and category.
:param session:
:param store_id: A given id of a store :param store_id: A given id of a store
:param category: A given category :param category: A given category
:return: A downloadable link of the data for a given store and category :return: A downloadable link of the data for a given store and category
@@ -91,6 +99,18 @@ class SupermarketChain:
return [item.find('ItemCode').text for item in promo.find_all('Item') return [item.find('ItemCode').text for item in promo.find_all('Item')
if not items_dict.get(item.find('ItemCode').text)] if not items_dict.get(item.find('ItemCode').text)]
@staticmethod
def get_item_info(item: Tag) -> Item:
    """
    This function builds an Item out of a given supermarket's product tag.

    :param item: A given product tag that holds name, price, manufacturer and code sub-tags
    :return: An Item filled with the text of these sub-tags
    """
    # The patterns tolerate spelling variants of the tag names (e.g. 'ItemNm', 'ManufactureName').
    name = item.find(re.compile(r'ItemN[a]?m[e]?')).text
    price = item.find('ItemPrice').text
    manufacturer = item.find(re.compile(r'Manufacture[r]?Name')).text
    code = item.find('ItemCode').text
    return Item(name=name, price=price, manufacturer=manufacturer, code=code)
@abstractmethod @abstractmethod
def __repr__(self): def __repr__(self):
pass pass

View File

@@ -4,12 +4,10 @@ import zipfile
from typing import AnyStr, Dict from typing import AnyStr, Dict
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag
from os import path from os import path
from item import Item from item import Item
from supermarket_chain import SupermarketChain from supermarket_chain import SupermarketChain
import re
RESULTS_DIRNAME = "results" RESULTS_DIRNAME = "results"
RAW_FILES_DIRNAME = "raw_files" RAW_FILES_DIRNAME = "raw_files"
@@ -60,8 +58,9 @@ def create_bs_object_from_link(xml_path: str, chain: SupermarketChain, category:
:param category: A given category :param category: A given category
:return: A BeautifulSoup object with xml content. :return: A BeautifulSoup object with xml content.
""" """
download_url: str = chain.get_download_url(store_id, category) session = requests.Session()
response_content = requests.get(download_url).content download_url: str = chain.get_download_url(store_id, category, session)
response_content = session.get(download_url).content
try: try:
xml_content: AnyStr = gzip.decompress(response_content) xml_content: AnyStr = gzip.decompress(response_content)
except gzip.BadGzipFile: except gzip.BadGzipFile:
@@ -95,19 +94,7 @@ def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[
""" """
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name) xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull) bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)} return {item.find('ItemCode').text: chain.get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
def get_item_info(item: Tag) -> Item:
    """
    This function builds an Item out of a given supermarket's product tag.

    :param item: A given product tag that holds name, price, manufacturer and code sub-tags
    :return: An Item filled with the text of these sub-tags
    """
    # The patterns tolerate spelling variants of the tag names (e.g. 'ItemNm', 'ManufactureName').
    name = item.find(re.compile(r'ItemN[a]?m[e]?')).text
    price = item.find('ItemPrice').text
    manufacturer = item.find(re.compile(r'Manufacture[r]?Name')).text
    code = item.find('ItemCode').text
    return Item(name=name, price=price, manufacturer=manufacturer, code=code)
def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None: def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None:
@@ -124,5 +111,10 @@ def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool,
prods = [item for item in bs_prices.find_all("Item") if product_name in item.find("ItemName").text] prods = [item for item in bs_prices.find_all("Item") if product_name in item.find("ItemName").text]
prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text)) prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text))
for prod in prods: for prod in prods:
print((prod.find('ItemName').text[::-1], prod.find('ManufacturerName').text[::-1], print(
prod.find('ItemPrice').text)) (
prod.find('ItemName').text[::-1],
prod.find('ManufacturerName').text[::-1],
prod.find('ItemPrice').text
)
)

View File

@@ -20,8 +20,12 @@ class ZolVebegadol(SupermarketChain):
date_hour_format = '%Y-%m-%d %H:%M:%S' date_hour_format = '%Y-%m-%d %H:%M:%S'
item_tag_name = 'Item' item_tag_name = 'Item'
@property
def update_date_format(self):
    """Format of the promotion update-date text; ZolVebegadol reuses its date-and-hour format."""
    return ZolVebegadol.date_hour_format
@staticmethod @staticmethod
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str: def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session:requests.Session) -> str:
prefix = "http://zolvebegadol.binaprojects.com" prefix = "http://zolvebegadol.binaprojects.com"
url = prefix + "/MainIO_Hok.aspx" url = prefix + "/MainIO_Hok.aspx"
req_res: requests.Response = requests.get(url) req_res: requests.Response = requests.get(url)