Added RamiLevi to the chains collection

This commit is contained in:
KorenLazar
2021-02-06 14:41:04 +02:00
parent d7e5b709f8
commit 3a57edf5af
9 changed files with 127 additions and 33 deletions

View File

@@ -1,5 +1,5 @@
# Supermarket basic scraping
The library supports scraping from Shufersal, Co-Op and Zol Vebegadol.
The library supports scraping from Shufersal, CoOp and Zol Vebegadol.
## Installation
clone:

View File

@@ -8,20 +8,25 @@ from supermarket_chain import SupermarketChain
class CoOp(SupermarketChain):
promotion_tag_name = 'Sale'
promotion_update_tag_name = 'PriceUpdateDate'
date_format = '%Y/%m/%d'
date_hour_format = '%Y/%m/%d %H:%M:%S'
item_tag_name = 'Product'
# Format passed to datetime.strptime when get_available_promos parses the text of the
# promotion update-date tag. CoOp reuses its date_hour_format ('%Y/%m/%d %H:%M:%S').
@property
def update_date_format(self):
return CoOp.date_hour_format
@staticmethod
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
prefix = "http://matrixcatalog.co.il/"
url = prefix + "NBCompetitionRegulations.aspx"
req_res: requests.Response = requests.get(url)
soup = BeautifulSoup(req_res.text, features='lxml')
suffix: str = soup.find('a', href=lambda value: value and category.name.replace('s', '') in value
and f'-{store_id:03d}-20' in value).attrs['href']
and f'-{store_id:03d}-20' in value).attrs['href']
down_url = prefix + suffix
print(down_url)
return down_url
@@ -30,7 +35,7 @@ class CoOp(SupermarketChain):
All, Promos, PromosFull, Prices, PricesFull, Stores = range(6)
def __repr__(self):
return 'Co-Op'
return 'CoOp'
@staticmethod
def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:

View File

@@ -7,6 +7,7 @@ from supermarket_chain import SupermarketChain
from shufersal import ShuferSal
from co_op import CoOp
from zol_vebegadol import ZolVebegadol
from rami_levi import RamiLevi
from pathlib import Path
# TODO: fix problem of left-to-right printing
@@ -16,8 +17,9 @@ Path(RAW_FILES_DIRNAME).mkdir(exist_ok=True)
chain_dict = {
'Shufersal': ShuferSal(),
'Co-Op': CoOp(),
'Zol-Vebegadol': ZolVebegadol()
'CoOp': CoOp(),
'Zol-Vebegadol': ZolVebegadol(),
'RamiLevi': RamiLevi(),
}
if __name__ == '__main__':
@@ -75,7 +77,8 @@ if __name__ == '__main__':
handler = logging.FileHandler(filename=f'{RESULTS_DIRNAME}/{args.chain}_promos_{arg_store_id}.log', mode='w',
encoding='utf-8')
logger.addHandler(handler)
main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain)
main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain,
load_promos=args.load_promos)
elif args.price:
get_products_prices(chain, store_id=args.price[0], load_xml=args.load_prices, product_name=args.price[1])

View File

@@ -10,7 +10,7 @@ from utils import (
)
from supermarket_chain import SupermarketChain
PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'צלחות', 'כוסות', 'מאגים', 'מגבת', 'מפות', 'פסטיגל']
PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'מגבת', 'מפות', 'פסטיגל', 'ביגי']
class Promotion:
@@ -102,7 +102,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo
chain.date_hour_format),
end_date=datetime.strptime(promo.find(
'PromotionEndDate').text + ' ' + promo.find('PromotionEndHour').text, chain.date_hour_format),
update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.date_hour_format),
update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.update_date_format),
items=chain.get_items(promo, items_dict),
)
if is_valid_promo(promo):
@@ -125,7 +125,7 @@ def is_valid_promo(promo: Promotion):
return not_expired and has_started and has_products and not in_promo_ignore_list
def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain):
def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain, load_promos: bool):
"""
This function logs the available promotions in a store with a given id sorted by their update date.
@@ -135,7 +135,7 @@ def main_latest_promos(store_id: int, load_xml: bool, logger, chain: Supermarket
:param logger: A given logger
"""
promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, False)
promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, load_promos)
promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date -
promo.end_date), reverse=True)
logger.info('\n'.join(str(promotion) for promotion in promotions))

66
rami_levi.py Normal file
View File

@@ -0,0 +1,66 @@
import json
from typing import Dict, List
import requests
from bs4.element import Tag
from item import Item
from supermarket_chain import SupermarketChain
class RamiLevi(SupermarketChain):
    """
    The RamiLevi supermarket chain.

    The chain's files are listed and downloaded through https://publishedprices.co.il
    after a login request carrying the chain's username.
    """

    @property
    def promotion_tag_name(self):
        """Tag name used for a promotion in the chain's promotions file."""
        return 'Promotion'

    @property
    def promotion_update_tag_name(self):
        """Tag name whose text holds the update date of a promotion."""
        return 'PromotionUpdateDate'

    @property
    def date_format(self):
        """strptime format of a date without an hour."""
        return '%Y-%m-%d'

    @property
    def date_hour_format(self):
        """strptime format of a date followed by an hour, including seconds."""
        return '%Y-%m-%d %H:%M:%S'

    @property
    def update_date_format(self):
        """strptime format of the promotion update date (no seconds, unlike date_hour_format)."""
        return '%Y-%m-%d %H:%M'

    @property
    def item_tag_name(self):
        """Tag name used for a product in the chain's prices file."""
        return 'Item'

    @staticmethod
    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
        """
        This method logs in to the chain's files site and returns a url of the file of a given store and category.

        :param store_id: A given id of a store (an int, or anything int() accepts)
        :param category: A given category
        :param session: The session used for the login and listing requests; presumably the caller should
                        download the returned url with the same session so the login still applies
        :return: A downloadable link of the data for a given store and category
        :raises FileNotFoundError: If the site lists no file matching the given store and category
        """
        hostname = "https://publishedprices.co.il"

        # Post the payload to the site to log in
        session.post(hostname + "/login/user", data={'username': 'ramilevi'})

        # Ask for the files listing, filtered by the category name with every 's' removed
        # (e.g. 'PromosFull' -> 'PromoFull'); presumably this matches the file-name prefixes on the site.
        ajax_dir_payload = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')}
        listing_response = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload)
        listed_files = json.loads(listing_response.text)['aaData']

        # int() lets a store id that arrives as a numeric string be zero-padded as well.
        # The trailing '-20' is presumably the beginning of the file's timestamp (year 20xx).
        store_marker = f'-{int(store_id):03d}-20'
        suffix = next((entry['name'] for entry in listed_files if store_marker in entry['name']), None)
        if suffix is None:
            # An explicit error instead of the bare StopIteration that next() would raise.
            raise FileNotFoundError(
                f"No {category.name} file was found for store {store_id} of RamiLevi at {hostname}"
            )

        download_url = hostname + "/file/d/" + suffix
        print(download_url)
        return download_url

    @staticmethod
    def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
        """
        This method returns the known items which take part in a given promotion.

        :param promo: A given promotion tag
        :param items_dict: A dictionary mapping item codes to their Item objects
        :return: The Items of the promotion whose codes have a truthy entry in items_dict, in their order
                 of appearance in the promotion
        """
        looked_up = (items_dict.get(item.find('ItemCode').text) for item in promo.find_all('Item'))
        return [full_item_info for full_item_info in looked_up if full_item_info]

    class XMLFilesCategory(SupermarketChain.XMLFilesCategory):
        # The categories of files which can be requested for this chain.
        All, Promos, PromosFull, Prices, PricesFull, Stores = range(6)

    def __repr__(self):
        return 'RamiLevi'

View File

@@ -14,8 +14,12 @@ class ShuferSal(SupermarketChain):
date_hour_format = '%Y-%m-%d %H:%M'
item_tag_name = 'Item'
# Format passed to datetime.strptime when get_available_promos parses the text of the
# promotion update-date tag. ShuferSal reuses its date_hour_format ('%Y-%m-%d %H:%M').
@property
def update_date_format(self):
return ShuferSal.date_hour_format
@staticmethod
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={category.value}"
if SupermarketChain.is_valid_store_id(int(store_id)):
url += f"&storeId={store_id}"

View File

@@ -1,7 +1,10 @@
import re
from abc import abstractmethod
from enum import Enum
from argparse import ArgumentTypeError
from typing import Dict, List
import requests
from bs4.element import Tag
from item import Item
@@ -35,6 +38,10 @@ class SupermarketChain:
@abstractmethod
def date_hour_format(self): pass
# Abstract property: the strptime format of the promotion update date.
# get_available_promos uses it to parse the text of the promotion_update_tag_name tag.
@property
@abstractmethod
def update_date_format(self): pass
# Abstract property: the tag name of a single product.
# create_items_dict passes it to find_all on the parsed prices file.
@property
@abstractmethod
def item_tag_name(self): pass
@@ -62,10 +69,11 @@ class SupermarketChain:
@staticmethod
@abstractmethod
def get_download_url(store_id: int, category: XMLFilesCategory) -> str:
def get_download_url(store_id: int, category: XMLFilesCategory, session: requests.Session) -> str:
"""
This method scrapes supermarket's website and returns a url containing the data for a given store and category.
:param session:
:param store_id: A given id of a store
:param category: A given category
:return: A downloadable link of the data for a given store and category
@@ -91,6 +99,18 @@ class SupermarketChain:
return [item.find('ItemCode').text for item in promo.find_all('Item')
if not items_dict.get(item.find('ItemCode').text)]
@staticmethod
def get_item_info(item: Tag) -> Item:
    """
    This function builds an Item out of a given supermarket's product tag.

    :param item: A given tag of a single product
    :return: An Item holding the name, price, manufacturer and code read from the product's sub-tags
    """
    # The patterns also accept shortened tag names (e.g. 'ItemNm', 'ManufactureName').
    item_name = item.find(re.compile(r'ItemN[a]?m[e]?')).text
    item_price = item.find('ItemPrice').text
    item_manufacturer = item.find(re.compile(r'Manufacture[r]?Name')).text
    item_code = item.find('ItemCode').text
    return Item(name=item_name, price=item_price, manufacturer=item_manufacturer, code=item_code)
# Abstract: every chain supplies its own representation; the chains shown here
# return the chain's name (e.g. 'CoOp', 'RamiLevi').
@abstractmethod
def __repr__(self):
pass

View File

@@ -4,12 +4,10 @@ import zipfile
from typing import AnyStr, Dict
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from os import path
from item import Item
from supermarket_chain import SupermarketChain
import re
RESULTS_DIRNAME = "results"
RAW_FILES_DIRNAME = "raw_files"
@@ -60,8 +58,9 @@ def create_bs_object_from_link(xml_path: str, chain: SupermarketChain, category:
:param category: A given category
:return: A BeautifulSoup object with xml content.
"""
download_url: str = chain.get_download_url(store_id, category)
response_content = requests.get(download_url).content
session = requests.Session()
download_url: str = chain.get_download_url(store_id, category, session)
response_content = session.get(download_url).content
try:
xml_content: AnyStr = gzip.decompress(response_content)
except gzip.BadGzipFile:
@@ -95,19 +94,7 @@ def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[
"""
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
def get_item_info(item: Tag) -> Item:
    """
    This function builds an Item out of a given supermarket's product tag.

    :param item: A given tag of a single product
    :return: An Item holding the name, price, manufacturer and code read from the product's sub-tags
    """
    # The patterns also accept shortened tag names (e.g. 'ItemNm', 'ManufactureName').
    product_name = item.find(re.compile(r'ItemN[a]?m[e]?')).text
    product_price = item.find('ItemPrice').text
    product_manufacturer = item.find(re.compile(r'Manufacture[r]?Name')).text
    product_code = item.find('ItemCode').text
    return Item(name=product_name, price=product_price, manufacturer=product_manufacturer, code=product_code)
return {item.find('ItemCode').text: chain.get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None:
@@ -124,5 +111,10 @@ def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool,
prods = [item for item in bs_prices.find_all("Item") if product_name in item.find("ItemName").text]
prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text))
for prod in prods:
print((prod.find('ItemName').text[::-1], prod.find('ManufacturerName').text[::-1],
prod.find('ItemPrice').text))
print(
(
prod.find('ItemName').text[::-1],
prod.find('ManufacturerName').text[::-1],
prod.find('ItemPrice').text
)
)

View File

@@ -20,8 +20,12 @@ class ZolVebegadol(SupermarketChain):
date_hour_format = '%Y-%m-%d %H:%M:%S'
item_tag_name = 'Item'
# Format passed to datetime.strptime when get_available_promos parses the text of the
# promotion update-date tag. ZolVebegadol reuses its date_hour_format ('%Y-%m-%d %H:%M:%S').
@property
def update_date_format(self):
return ZolVebegadol.date_hour_format
@staticmethod
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session:requests.Session) -> str:
prefix = "http://zolvebegadol.binaprojects.com"
url = prefix + "/MainIO_Hok.aspx"
req_res: requests.Response = requests.get(url)